FFMをxLearnで試す

Field-aware Factorization Machines (FFM) を Python で試すために、xLearn を使ってみた。

環境によってインストールできたりできなかったりして、よく分からない。メンテナンスもされていなさそう。

とりあえず、分類モデルを作ってみる。

xLearn Python Package Guide

ライブラリの import

from pathlib import Path
import random

from matplotlib import pyplot as plt
import numpy as np
import polars as pl
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
import xlearn as xl

適当にデータを生成する。ユーザとアイテムの組み合わせに対して CV（コンバージョン）するかどうかを予測するケースを想定。ユーザには年齢（簡単のため20歳以上60歳以下）、職業（'A' から 'J' の10種類）の情報があり、アイテムにはカテゴリー（'A' から 'J' の10種類）の情報がある。ちなみに Polars については、以前 Qiita で記事を書いている。

# Seed both RNGs so the synthetic data set is reproducible.
random.seed(1)
np.random.seed(1)

n_users = 1000
n_items = 100
users = [f'user{i:05}' for i in range(n_users)]
items = [f'item{i:04}' for i in range(n_items)]

# Occupations and item categories share the same ten single-letter codes.
letters = list('ABCDEFGHIJ')

# NOTE: the draw order (ages, then occupations, then categories) determines
# the generated data for a given seed, so it must not be rearranged.
ages = [random.randint(20, 60) for _ in range(n_users)]
occupations = [random.choice(letters) for _ in range(n_users)]
categories = [random.choice(letters) for _ in range(n_items)]

df_users = pl.DataFrame({
    'user': users,
    'age': ages,
    'occupation': occupations,
})
df_items = pl.DataFrame({
    'item': items,
    'category': categories,
})

# One row per (user, item) pair.
df = df_users.join(df_items, how='cross')

# Numeric drivers of the synthetic conversion score:
#   x: age centred at 40 and scaled by 10
#   y: occupation letter as an offset from 'D'
#   z: category letter as an offset from 'H'
x = (df['age'] - 40) / 10
y = df['occupation'].apply(lambda letter: ord(letter) - ord('D'))
z = df['category'].apply(lambda letter: ord(letter) - ord('H'))

score = x * y - x * z + y * z / 10 - 10
noise = np.random.normal(scale=5, size=df.height)
noisy_score = score.to_numpy() + noise

# To eyeball the distribution of the noisy score: plt.hist(noisy_score)

# Label is 1 when the noisy score is positive, 0 otherwise.
df['cv'] = noisy_score > 0
df['cv'] = df['cv'].cast(pl.Int32)

train/valid/test に分割。valid は early stopping に使う。

# Two successive stratified 80/20 splits over row positions:
# all rows -> (dev, test), then dev -> (train, valid).
split_options = {'test_size': 0.2, 'random_state': 1}

row_ids = range(df.height)
dev_idx, test_idx = train_test_split(
    row_ids, stratify=df['cv'], **split_options)

# Stratify the second split on the labels of the dev rows only.
dev_labels = df[dev_idx]['cv']
train_idx, valid_idx = train_test_split(
    dev_idx, stratify=dev_labels, **split_options)

FFM の入力には独特のフォーマット（libffm 形式: `label field:feature:value ...`）があるので、その変換のための関数を定義する。上記データ用にハードコーディングしている。予測用データにはラベルはなくてもよい。あと for loop は分かりやすいので使っているが、当然ながら遅いので、大きいデータを扱うには向かない。例えば df を直接いじって to_csv(has_header=False, sep=' ') とかすれば速くなる。

def ffm_format(df: pl.DataFrame) -> str:
    """Serialize *df* into libffm-style text for xLearn.

    Each row becomes one newline-terminated line of the form::

        label field_1:feature_1:value_1 field_2:feature_2:value_2 ...

    The encoding is hard-coded for this data set:

    * field 0 -- ``age``: feature index ``age - 20`` (ages 20..60 -> 0..40)
    * field 1 -- ``occupation``: ``'A'..'J'`` -> 41..50
    * field 2 -- ``category``: ``'A'..'J'`` -> 51..60

    Every feature is one-hot, so its value is always ``1``.

    Args:
        df: Frame with an integer ``age`` column and single-letter
            ``occupation`` and ``category`` columns.  The ``cv`` label column
            is optional: when it is missing (e.g. unlabeled prediction
            input) the label token is left out of every line.

    Returns:
        The formatted text; an empty string for a frame without rows.
    """
    age_base = 20                  # youngest age maps to feature index 0
    occupation_offset = 41         # first index after the 41 age features
    category_offset = occupation_offset + 10  # after the 10 occupations
    code_base = ord('A')

    # Read each column once instead of indexing the frame cell by cell.
    ages = df['age'].to_list()
    occupations = df['occupation'].to_list()
    categories = df['category'].to_list()
    labels = df['cv'].to_list() if 'cv' in df.columns else None

    lines = []
    for i, (age, occupation, category) in enumerate(
            zip(ages, occupations, categories)):
        tokens = [] if labels is None else [str(labels[i])]
        tokens.append(f'0:{age - age_base}:1')
        tokens.append(f'1:{ord(occupation) - code_base + occupation_offset}:1')
        tokens.append(f'2:{ord(category) - code_base + category_offset}:1')
        lines.append(' '.join(tokens) + '\n')

    # Join once at the end rather than growing a string with ``+=``.
    return ''.join(lines)

適当にディレクトリを作って、フォーマットしたテキストデータを入れる。

# Working directory for the libffm text files (and later the model output).
out_dir = Path('xlearn_ffm')
out_dir.mkdir(exist_ok=True)

# Kept as plain strings: these paths are handed to xLearn further down.
train_path = 'xlearn_ffm/train.txt'
valid_path = 'xlearn_ffm/valid.txt'
test_path = 'xlearn_ffm/test.txt'

# Write each split in the order train, valid, test.
for split_path, split_idx in (
        (train_path, train_idx),
        (valid_path, valid_idx),
        (test_path, test_idx),
):
    with open(split_path, 'w') as f:
        f.write(ffm_format(df[split_idx]))

学習。pathlib.Path は受け付けないので注意。

# Training configuration: binary classification scored by AUC.
param = {
    'task': 'binary',
    'metric': 'auc',
    'lr': 0.2,
    'lambda': 0.002,
    'k': 4,
    'epoch': 10,
}

# Where the fitted model is stored; reused by the prediction step.
model_path = 'xlearn_ffm/model.out'

ffm_model = xl.create_ffm()
ffm_model.setTrain(train_path)
# The validation file is what the metric above is evaluated on.
ffm_model.setValidate(valid_path)
ffm_model.fit(param, model_path)

予測。

# Score the held-out test file with the stored model; predictions are
# written to ``predict_path``.
predict_path = 'xlearn_ffm/predict.txt'
ffm_model.setTest(test_path)
# Sigmoid output: presumably maps raw scores to (0, 1) -- see xLearn docs.
ffm_model.setSigmoid()
ffm_model.predict(model_path, predict_path)

精度評価。AUC で 0.91 くらいになった。

# The true labels are the first space-separated column of the test file.
label_frame = pl.read_csv(
    test_path, sep=' ', has_header=False, columns=[0])
# The prediction file is read as a headerless CSV; its first column is used.
score_frame = pl.read_csv(predict_path, has_header=False)

y_true = label_frame[:, 0].to_numpy()
y_pred = score_frame[:, 0].to_numpy()

auc = roc_auc_score(y_true, y_pred)
print(f'AUC = {auc:.4f}')

ROC curve (https://note.nkmk.me/python-sklearn-roc-curve-auc-score/)

# ROC curve points for the test predictions.
fpr, tpr, thresholds = roc_curve(y_true, y_pred)

# Plot TPR against FPR, one marker per ROC point.
plt.plot(fpr, tpr, marker='o')
plt.grid()
plt.xlabel('FPR: False positive rate')
plt.ylabel('TPR: True positive rate')

以上。