XGB+FM and XGB+FFM code implementations (FM and FFM via xlearn and pylibfm)

XGB+FM works on the same principle as GBDT+LR; for an explanation of the principle, see the article below. This post only shares demo code for XGB+FM.

https://blog.csdn.net/qq_42363032/article/details/112756687

Synthetic data

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import f1_score, accuracy_score, log_loss, roc_auc_score
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack
from xgboost import XGBClassifier
import xlearn as xl
import pickle
import time  # used by the timestamped prints below
import os    # used when saving the models
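The fit calls below pass eval_metric=custom_eval, but the post never shows its definition. A minimal stand-in (my assumption, not the author's metric), using the old xgboost sklearn callable signature func(y_pred, dtrain) -> (name, value):

def custom_eval(y_pred, dtrain):
    # hypothetical placeholder metric: F1 at a 0.5 threshold on the predicted probabilities
    y_true = dtrain.get_label()
    return 'f1', f1_score(y_true, y_pred > 0.5)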

X, y = make_classification(n_samples=80000)
x_train, y_train = X, y  # this demo trains and evaluates on the full synthetic set

print(X)
print(y)
print()
[[ 0.09290313 -1.5338069   2.49097541 ... -0.74785285  0.85073894
   1.67822484]
 [ 2.84160485  0.35751684  1.16678272 ... -0.85795697  0.303276
  -1.10934523]
 [ 0.32030069 -0.12157832  0.95744782 ...  1.93378667 -0.69885375
   1.04388881]
 ...
 [-1.11610414 -2.02439974 -2.08584046 ...  1.53547347  0.95855299
  -0.92907583]
 [-0.21338069 -0.18964935 -0.52669662 ...  1.19182673 -2.43832808
  -0.77222773]
 [ 0.52265496 -0.77998801 -1.1404401  ... -0.43649192 -0.13329408
   1.7156541 ]]
[0 1 0 ... 1 1 0]

XGB+FM (xlearn)

base = XGBClassifier(booster='gbtree',
                      objective='binary:logistic',
                      n_jobs=4,
                      tree_method="hist",
                      importance_type='gain',
                      n_estimators=100,
                      learning_rate=0.05,
                      max_depth=15,
                      gamma=0.1, reg_alpha=0.2, reg_lambda=0.1,
                      min_child_weight=20, subsample=0.9, colsample_bytree=0.9,
                      # scale_pos_weight=10,
                      )
base.fit(x_train, y_train,
          eval_set=[(x_train, y_train)],
          # early_stopping_rounds=100,
          eval_metric=custom_eval,
          # feature_weights=feature_weights,
          verbose=50)

# apply: returns the predicted leaf of every tree for each sample; note the shape differs from GBDT's apply, here it is [n_samples, n_trees]
print(base.apply(x_train).shape)

leafs = base.apply(x_train)  # get the leaf indices
print('leafs', leafs)

oneenc = OneHotEncoder(handle_unknown='ignore')  # ignore unseen leaf indices at inference time (the 4.19 update below does the same)
oneenc.fit(leafs)

# lr = LogisticRegression(solver='lbfgs', max_iter=100)  # 1000
# lr.fit(oneenc.transform(base.apply(x_train)), y_train)
#
# ypreba = lr.predict_proba(oneenc.transform(base.apply(x_train)))[:, 1]
# print(ypreba)

# concatenate the one-hot leaf vectors with the raw features
x_train_stack = hstack([oneenc.transform(base.apply(x_train)), x_train]).tocsr()  # hstack yields COO; convert to CSR for the downstream model
print(x_train_stack.shape)
print(x_train_stack)

fm = xl.FMModel()
# xl.FFMModel()
fm.fit(x_train_stack, y_train,
        eval_set=[x_train_stack, y_train],
        is_quiet=False)

print(fm.predict(x_train_stack))  # these are probabilities
(80000, 100)
leafs [[ 57.  75.  49. ...  41.  23.  36.]
 [ 86.  55. 146. ...  46.  32.  85.]
 [ 57.  75.  49. ...  42.  26.  50.]
 ...
 [ 86.  55.  95. ...  56.  78.  67.]
 [356. 146. 274. ...  45.  79.  67.]
 [ 57.  75.  49. ...  41.  38.  47.]]

(80000, 16674)

[ ACTION     ] Start to train ...
[------------] Epoch      Train log_loss       Test log_loss            Test AUC     Time cost (sec)
[   20%      ]     1            0.310002            0.306044            0.985898                0.14
[   40%      ]     2            0.300465            0.292990            0.985904                0.12
[   60%      ]     3            0.300018            0.294255            0.985920                0.12
[   80%      ]     4            0.300247            0.304429            0.985930                0.12
[  100%      ]     5            0.300019            0.306976            0.986089                0.12

[0.250724 0.870463 0.245593 ... 0.867491 0.856433 0.238356]
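At serving time a new sample must pass through the same three stages in order: XGB leaf indices, one-hot encoding, then the FM. A sketch, where x_new is a hypothetical placeholder for unseen feature rows:

# score new rows with the trained chain (x_new is hypothetical)
x_new_stack = hstack([oneenc.transform(base.apply(x_new)), x_new]).tocsr()
print(fm.predict(x_new_stack))  # positive-class probabilities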

XGB+FFM (xlearn)

https://blog.csdn.net/Spirit_6275/article/details/111694502
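The linked post covers the details. One note: FFM additionally needs a field id per feature, which the stacked matrix above does not carry, so the usual route is xlearn's native API with libffm-format files. A minimal sketch with placeholder file names:

ffm_model = xl.create_ffm()               # FFM task
ffm_model.setTrain("./train.ffm")         # libffm format: label field:feature:value ...
ffm_model.setValidate("./valid.ffm")
param = {'task': 'binary', 'lr': 0.2, 'lambda': 0.002, 'metric': 'auc'}
ffm_model.fit(param, "./ffm_model.out")   # train and write the model file
ffm_model.setTest("./test.ffm")
ffm_model.setSigmoid()                    # map raw scores to probabilities
ffm_model.predict("./ffm_model.out", "./output.txt")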

pylibfm implementation (recommended)

FM/FFM can also be implemented with lightfm or libffm.

Update (4.19)

# Dockerfile: pyFM builds a Cython extension, so gcc is required
RUN apt-get update
RUN apt-get install -y gcc
RUN pip install git+https://github.com/coreylynch/pyFM

from pyfm import pylibfm

# dataTrain: the author's own training DataFrame with a 'y' label column
data_X = dataTrain.iloc[:, :-1]
data_y = dataTrain['y']

X_train, X_val, y_train, y_val = train_test_split(data_X, data_y, test_size=0.01, random_state=0, stratify=data_y)

X_train, y_train = X_train.values, y_train.values
X_val, y_val = X_val.values, y_val.values

base = XGBClassifier(booster='gbtree', objective='binary:logistic', n_jobs=6,
                      tree_method="hist",
                      importance_type='gain',
                      n_estimators=300, learning_rate=0.01, max_depth=13,
                      gamma=0.2, reg_alpha=0.2, reg_lambda=0.3,
                      min_child_weight=5, subsample=0.9, colsample_bytree=0.9,
                      max_delta_step=10,
                      )

base.fit(X_train, y_train,
         eval_set=[(X_val, y_val)],
         eval_metric=custom_eval,
         verbose=50)

leafs = base.apply(X_train)
print(' leafs shape ', leafs.shape)

oneenc = OneHotEncoder(handle_unknown='ignore')
oneenc.fit(leafs)

print(' leaf node vectorization is complete ', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time())))

# concatenate the one-hot leaf vectors with the raw features; CSR for pylibfm
x_train_stack = hstack([oneenc.transform(leafs), X_train]).tocsr()
print(' concat shape ', x_train_stack.shape)

fm = pylibfm.FM(num_factors=6,
                 num_iter=100,
                 k0=True,
                 k1=True,
                 init_stdev=0.01,
                 validation_size=0.001,
                 initial_learning_rate=0.01,
                 verbose=True
                 )

fm.fit(x_train_stack, y_train)
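The held-out split from earlier can sanity-check the model. This assumes pylibfm's predict returns positive-class probabilities for the classification task, as in the pyFM README:

# evaluate on the validation split with the same transform chain as training
x_val_stack = hstack([oneenc.transform(base.apply(X_val)), X_val]).tocsr()
val_pred = fm.predict(x_val_stack)
print(' val AUC ', roc_auc_score(y_val, val_pred))
print(' val log_loss ', log_loss(y_val, val_pred))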

Saving the models

xlearn models cannot be saved with pickle; pyfm models can.

os.makedirs(model_PATH, exist_ok=True)  # os_mkdir in the original post; standard os.makedirs used here
with open(saveModel_PATH_base_model, mode='wb') as f:
    pickle.dump(base, f)
with open(saveModel_PATH_one_model, mode='wb') as f:
    pickle.dump(oneenc, f)
with open(saveModel_PATH_cross_model, mode='wb') as f:
    pickle.dump(fm, f)
print(' models saved successfully ', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time())))
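Loading mirrors the dumps; unpickling the three objects restores the full XGB, one-hot, FM chain (path names reuse the placeholders above):

with open(saveModel_PATH_base_model, mode='rb') as f:
    base = pickle.load(f)
with open(saveModel_PATH_one_model, mode='rb') as f:
    oneenc = pickle.load(f)
with open(saveModel_PATH_cross_model, mode='rb') as f:
    fm = pickle.load(f)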



For XGB+LR, see this article:

https://blog.csdn.net/anshuai_aw1/article/details/82983997

Reposted from blog.csdn.net/qq_42363032/article/details/124104826