XGB+FM and XGB+FFM code implementations (FM and FFM via xlearn and pylibfm)

XGB+FM works on the same principle as GBDT+LR; for an explanation of the principle, see the article below. This post only shares demo code for XGB+FM.

https://blog.csdn.net/qq_42363032/article/details/112756687

Synthetic data

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import f1_score, accuracy_score, log_loss, roc_auc_score
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack
from xgboost import XGBClassifier
import xlearn as xl
import pickle
import time  # used by the timestamped prints below
import os    # used when saving the models
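The fit calls below pass eval_metric=custom_eval, but the post never shows its definition. A minimal stand-in (my assumption, not the author's metric), using the old xgboost sklearn callable signature func(y_pred, dtrain) -> (name, value):

def custom_eval(y_pred, dtrain):
    # hypothetical placeholder metric: F1 at a 0.5 threshold on the predicted probabilities
    y_true = dtrain.get_label()
    return 'f1', f1_score(y_true, y_pred > 0.5)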

X, y = make_classification(n_samples=80000)
x_train, y_train = X, y  # this demo trains and evaluates on the full synthetic set

print(X)
print(y)
print()
[[ 0.09290313 -1.5338069   2.49097541 ... -0.74785285  0.85073894
   1.67822484]
 [ 2.84160485  0.35751684  1.16678272 ... -0.85795697  0.303276
  -1.10934523]
 [ 0.32030069 -0.12157832  0.95744782 ...  1.93378667 -0.69885375
   1.04388881]
 ...
 [-1.11610414 -2.02439974 -2.08584046 ...  1.53547347  0.95855299
  -0.92907583]
 [-0.21338069 -0.18964935 -0.52669662 ...  1.19182673 -2.43832808
  -0.77222773]
 [ 0.52265496 -0.77998801 -1.1404401  ... -0.43649192 -0.13329408
   1.7156541 ]]
[0 1 0 ... 1 1 0]

XGB+FM (xlearn)

base = XGBClassifier(booster='gbtree',
                      objective='binary:logistic',
                      n_jobs=4,
                      tree_method="hist",
                      importance_type='gain',
                      n_estimators=100,
                      learning_rate=0.05,
                      max_depth=15,
                      gamma=0.1, reg_alpha=0.2, reg_lambda=0.1,
                      min_child_weight=20, subsample=0.9, colsample_bytree=0.9,
                      # scale_pos_weight=10,
                      )
base.fit(x_train, y_train,
          eval_set=[(x_train, y_train)],
          # early_stopping_rounds=100,
          eval_metric=custom_eval,
          # feature_weights=feature_weights,
          verbose=50)

# apply: returns the predicted leaf of every tree for each sample; note the shape differs from GBDT's apply, here it is [n_samples, n_trees]
print(base.apply(x_train).shape)

leafs = base.apply(x_train)  # get the leaf indices
print('leafs', leafs)

oneenc = OneHotEncoder(handle_unknown='ignore')  # ignore unseen leaf indices at inference time (the 4.19 update below does the same)
oneenc.fit(leafs)

# lr = LogisticRegression(solver='lbfgs', max_iter=100)  # 1000
# lr.fit(oneenc.transform(base.apply(x_train)), y_train)
#
# ypreba = lr.predict_proba(oneenc.transform(base.apply(x_train)))[:, 1]
# print(ypreba)

# concatenate the one-hot leaf vectors with the raw features
x_train_stack = hstack([oneenc.transform(base.apply(x_train)), x_train]).tocsr()  # hstack yields COO; convert to CSR for the downstream model
print(x_train_stack.shape)
print(x_train_stack)

fm = xl.FMModel()
# xl.FFMModel()
fm.fit(x_train_stack, y_train,
        eval_set=[x_train_stack, y_train],
        is_quiet=False)

print(fm.predict(x_train_stack))  # these are probabilities
(80000, 100)
leafs [[ 57.  75.  49. ...  41.  23.  36.]
 [ 86.  55. 146. ...  46.  32.  85.]
 [ 57.  75.  49. ...  42.  26.  50.]
 ...
 [ 86.  55.  95. ...  56.  78.  67.]
 [356. 146. 274. ...  45.  79.  67.]
 [ 57.  75.  49. ...  41.  38.  47.]]

(80000, 16674)

[ ACTION     ] Start to train ...
[------------] Epoch      Train log_loss       Test log_loss            Test AUC     Time cost (sec)
[   20%      ]     1            0.310002            0.306044            0.985898                0.14
[   40%      ]     2            0.300465            0.292990            0.985904                0.12
[   60%      ]     3            0.300018            0.294255            0.985920                0.12
[   80%      ]     4            0.300247            0.304429            0.985930                0.12
[  100%      ]     5            0.300019            0.306976            0.986089                0.12

[0.250724 0.870463 0.245593 ... 0.867491 0.856433 0.238356]
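At serving time a new sample must pass through the same three stages in order: XGB leaf indices, one-hot encoding, then the FM. A sketch, where x_new is a hypothetical placeholder for unseen feature rows:

# score new rows with the trained chain (x_new is hypothetical)
x_new_stack = hstack([oneenc.transform(base.apply(x_new)), x_new]).tocsr()
print(fm.predict(x_new_stack))  # positive-class probabilities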

XGB+FFM (xlearn)

https://blog.csdn.net/Spirit_6275/article/details/111694502
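The linked post covers the details. One note: FFM additionally needs a field id per feature, which the stacked matrix above does not carry, so the usual route is xlearn's native API with libffm-format files. A minimal sketch with placeholder file names:

ffm_model = xl.create_ffm()               # FFM task
ffm_model.setTrain("./train.ffm")         # libffm format: label field:feature:value ...
ffm_model.setValidate("./valid.ffm")
param = {'task': 'binary', 'lr': 0.2, 'lambda': 0.002, 'metric': 'auc'}
ffm_model.fit(param, "./ffm_model.out")   # train and write the model file
ffm_model.setTest("./test.ffm")
ffm_model.setSigmoid()                    # map raw scores to probabilities
ffm_model.predict("./ffm_model.out", "./output.txt")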

pylibfm implementation (recommended)

FM/FFM can also be implemented with lightfm or libffm.

Update (4.19)

# Dockerfile: pyFM builds a Cython extension, so gcc is required
RUN apt-get update
RUN apt-get install -y gcc
RUN pip install git+https://github.com/coreylynch/pyFM

from pyfm import pylibfm

# dataTrain: the author's own training DataFrame with a 'y' label column
data_X = dataTrain.iloc[:, :-1]
data_y = dataTrain['y']

X_train, X_val, y_train, y_val = train_test_split(data_X, data_y, test_size=0.01, random_state=0, stratify=data_y)

X_train, y_train = X_train.values, y_train.values
X_val, y_val = X_val.values, y_val.values

base = XGBClassifier(booster='gbtree', objective='binary:logistic', n_jobs=6,
                      tree_method="hist",
                      importance_type='gain',
                      n_estimators=300, learning_rate=0.01, max_depth=13,
                      gamma=0.2, reg_alpha=0.2, reg_lambda=0.3,
                      min_child_weight=5, subsample=0.9, colsample_bytree=0.9,
                      max_delta_step=10,
                      )

base.fit(X_train, y_train,
         eval_set=[(X_val, y_val)],
         eval_metric=custom_eval,
         verbose=50)

leafs = base.apply(X_train)
print(' leafs shape ', leafs.shape)

oneenc = OneHotEncoder(handle_unknown='ignore')
oneenc.fit(leafs)

print(' leaf node vectorization is complete ', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time())))

# concatenate the one-hot leaf vectors with the raw features; CSR for pylibfm
x_train_stack = hstack([oneenc.transform(leafs), X_train]).tocsr()
print(' concat shape ', x_train_stack.shape)

fm = pylibfm.FM(num_factors=6,
                 num_iter=100,
                 k0=True,
                 k1=True,
                 init_stdev=0.01,
                 validation_size=0.001,
                 initial_learning_rate=0.01,
                 verbose=True
                 )

fm.fit(x_train_stack, y_train)
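The held-out split from earlier can sanity-check the model. This assumes pylibfm's predict returns positive-class probabilities for the classification task, as in the pyFM README:

# evaluate on the validation split with the same transform chain as training
x_val_stack = hstack([oneenc.transform(base.apply(X_val)), X_val]).tocsr()
val_pred = fm.predict(x_val_stack)
print(' val AUC ', roc_auc_score(y_val, val_pred))
print(' val log_loss ', log_loss(y_val, val_pred))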

Saving the models

xlearn models cannot be saved with pickle; pyfm models can.

os.makedirs(model_PATH, exist_ok=True)  # os_mkdir in the original post; standard os.makedirs used here
with open(saveModel_PATH_base_model, mode='wb') as f:
    pickle.dump(base, f)
with open(saveModel_PATH_one_model, mode='wb') as f:
    pickle.dump(oneenc, f)
with open(saveModel_PATH_cross_model, mode='wb') as f:
    pickle.dump(fm, f)
print(' models saved successfully ', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time())))
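Loading mirrors the dumps; unpickling the three objects restores the full XGB, one-hot, FM chain (path names reuse the placeholders above):

with open(saveModel_PATH_base_model, mode='rb') as f:
    base = pickle.load(f)
with open(saveModel_PATH_one_model, mode='rb') as f:
    oneenc = pickle.load(f)
with open(saveModel_PATH_cross_model, mode='rb') as f:
    fm = pickle.load(f)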



For XGB+LR, see this article:

https://blog.csdn.net/anshuai_aw1/article/details/82983997

Reposted from blog.csdn.net/qq_42363032/article/details/124104826