Pythonでdictをarrayに変換する (DictVectorizer)

Python で dict（またはそのリスト）を numpy の array に変換するには、scikit-learn の DictVectorizer を使うとよい。

（試した scikit-learn の version: 1.0.1）

import numpy as np
from sklearn.feature_extraction import DictVectorizer

# Demo: turn a list of feature dicts into a matrix with DictVectorizer.
# Each dict maps feature name -> value and becomes one row of the result;
# a feature missing from a dict shows up as 0 in that row.
D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]

# sparse=True is the default, so fit_transform returns a SciPy sparse matrix.
v_sparse = DictVectorizer(dtype=int, sparse=True)
X_sparse = v_sparse.fit_transform(D)
print(type(X_sparse))
# <class 'scipy.sparse.csr.csr_matrix'>
# (the exact module path in this repr may differ between SciPy versions)
print(X_sparse)
#   (0, 0)        2
#   (0, 2)        1
#   (1, 1)        1
#   (1, 2)        3

# sparse=False makes fit_transform return a dense numpy.ndarray instead.
v_not_sparse = DictVectorizer(dtype=int, sparse=False)
X_not_sparse = v_not_sparse.fit_transform(D)
print(type(X_not_sparse))
# <class 'numpy.ndarray'>
print(X_not_sparse)
# [[2 0 1]
#  [0 1 3]]

# Both vectorizers must encode the same data identically.
# (Fixed: this line previously referenced the undefined name `X_parse`,
# which raised NameError before the check could run.)
assert np.array_equal(X_sparse.toarray(), X_not_sparse)

# feature_names_ lists the column order; vocabulary_ maps name -> column index.
print(v_sparse.feature_names_)
# ['bar', 'baz', 'foo']
print(v_not_sparse.vocabulary_)
# {'foo': 2, 'bar': 0, 'baz': 1}

# Features not seen during fit ('unseen_feature' here) are silently dropped
# by transform; only the known columns are filled in.
print(v_not_sparse.transform({'foo': 4, 'unseen_feature': 3}))
# [[0 0 4]]