Python で dict を numpy の array に変換するには、scikit-learn の DictVectorizer を使うとよい
(試した scikit-learn の version: 1.0.1)
import numpy as np
from sklearn.feature_extraction import DictVectorizer
# Sample input: a list of feature dicts. Keys are feature names, values are
# the numeric feature values. Not every record has every key.
D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]

# --- Sparse output -------------------------------------------------------
# sparse=True is the default; it is spelled out here for contrast with the
# dense variant further down.
v_sparse = DictVectorizer(dtype=int, sparse=True)
X_sparse = v_sparse.fit_transform(D)
print(type(X_sparse))
# <class 'scipy.sparse.csr.csr_matrix'>
# (the exact module path shown may differ between SciPy releases)
print(X_sparse)
# (0, 0) 2
# (0, 2) 1
# (1, 1) 1
# (1, 2) 3

# --- Dense output --------------------------------------------------------
# sparse=False makes fit_transform return a plain numpy.ndarray; keys that
# are missing from a record show up as 0 in that record's row.
v_not_sparse = DictVectorizer(dtype=int, sparse=False)
X_not_sparse = v_not_sparse.fit_transform(D)
print(type(X_not_sparse))
# <class 'numpy.ndarray'>
print(X_not_sparse)
# [[2 0 1]
#  [0 1 3]]

# Sanity check: both vectorizers encode the same values; only the container
# differs. (The original referenced the undefined name `X_parse` here, which
# raised NameError before any of the lines below could run.)
assert np.array_equal(X_sparse.toarray(), X_not_sparse)

# Column index -> feature name.
print(v_sparse.feature_names_)
# ['bar', 'baz', 'foo']

# Feature name -> column index (the inverse mapping of feature_names_).
print(v_not_sparse.vocabulary_)
# {'foo': 2, 'bar': 0, 'baz': 1}

# Transforming new data with the fitted vectorizer: a key that was not seen
# during fit ('unseen_feature') is dropped, as the output below shows.
print(v_not_sparse.transform({'foo': 4, 'unseen_feature': 3}))
# [[0 0 4]]
コメント