MLR: Principle and a DeepCTR Implementation

MLR

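MLR (Mixed Logistic Regression, published by Alibaba as LS-PLM) follows a divide-and-conquer idea: a softmax gate partitions the feature space into m regions, an ordinary logistic regression is fit within each region, and the final prediction is the gate-weighted mixture of the per-region LRs:

$$p(y=1 \mid x) \;=\; \sum_{i=1}^{m} \frac{e^{u_i^{\top} x}}{\sum_{j=1}^{m} e^{u_j^{\top} x}} \cdot \frac{1}{1 + e^{-w_i^{\top} x}}$$

Here m is the number of regions (the region_num argument in deepctr, default 4), the u_i are the region/clustering parameters, and the w_i are the per-region LR weights; with m = 1 the model reduces to plain LR. The articles below walk through the derivation in more detail.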

References:

https://zhuanlan.zhihu.com/p/100532677

https://blog.csdn.net/fyneru_xiaohui/article/details/106390266

Implementing MLR with DeepCTR

import time
import pickle
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from deepctr.models import MLR
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names
from deepctr.layers import custom_objects
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import save_model, load_model
from tensorflow.keras.optimizers import Adam

from toolsnn import *
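
# `toolsnn` is the author's project-local helper module and is not shown in the
# post. The stand-ins below are guesses reconstructed from the call sites so the
# listing is self-contained; they are NOT the author's implementations. The path
# constants (train_path_ascii, test_path_ascii, feature_encode_path, save_path,
# loss_plt_path) and full_evaluate2() remain project-specific and are left as-is.

def transformDF(df, cols, dtype):
    # cast the listed columns to the given dtype (e.g. double -> float)
    for c in cols:
        df[c] = df[c].astype(dtype)
    return df

def negBpow(df, name=''):
    # print positive/negative sample counts and ratio for label column `y`
    pos, neg = int((df['y'] == 1).sum()), int((df['y'] == 0).sum())
    print(' {}: positives {}, negatives {}, ratio 1 : {:.4f}'.format(name, pos, neg, neg / max(pos, 1)))

def getRata2(df, num=1):
    # train/test split; the original semantics are unknown, so a plain
    # 80/20 random split is used here as a placeholder
    return train_test_split(df, test_size=0.2, random_state=num, shuffle=True)

def GeneratorRandomPatchs(x, y, batch_size, total, feature_names):
    # endless generator yielding random mini-batches for fit_generator
    while True:
        idx = np.random.randint(0, total, batch_size)
        yield {name: x[name][idx] for name in feature_names}, y[idx]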


def train_MLR():
    print('MLR model training started ', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time())))
    start_time_start = time.time()

    # pdtrain: 565,485 positives, 1,133,910 negatives, ratio 1 : 2.0052
    # pdtest: 565,485 positives, 1,134,505 negatives, ratio 1 : 2.0063
    # pdeval_full: 46 positives, 8,253 negatives, ratio 1 : 179.413
    pdtrain = pd.read_csv(train_path_ascii)
    pdtest = pd.read_csv(test_path_ascii)
    data = pd.concat([pdtrain, pdtest[pdtest['y'] == 0]], axis=0, ignore_index=True)

    data = data.drop(['WilsonClickRate_all', 'WilsonClickRate_yesterday', 'WilsonAd_clickRate_all',
                      'WilsonAd_clickRate_yesterday'], axis=1)

    # Numericize the string-valued ids (`suuid` user id, `advertisement` ad id,
    # `user_modelMake` device model) by summing the ASCII code of each character,
    # so they can be fed to an embedding layer. Note this is a crude hash:
    # different strings can collide (e.g. 'ab' and 'ba' map to the same value).
    data['suuid'] = data['suuid'].apply(lambda x: sum([ord(i) for i in x]))
    data['advertisement'] = data['advertisement'].apply(lambda x: sum([ord(i) for i in x]))
    # data['position'] = data['position'].apply(lambda x: sum([ord(i) for i in x]))     # the ad slot id is already a float, embedded directly
    data['user_modelMake'] = data['user_modelMake'].apply(lambda x: sum([ord(i) for i in x]))
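    # A lower-collision alternative to the ASCII sum (a suggestion, not in the
    # original post) is a stable hash bucket, e.g.:
    #   import zlib
    #   data['suuid'] = data['suuid'].apply(
    #       lambda s: zlib.crc32(s.encode('utf-8')) % 100000)
    # in which case vocabulary_size for the embedding would be the bucket count.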

    # double -> float
    data = transformDF(data, ['reserve_price', 'reserve_price_cpc', 'clickRate_all', 'clickRate_yesterday',
                              'ad_clickRate_yesterday'], float)

    '''   Feature processing   '''
    global sparsecols, densecols
    # sparse features -> one-hot
    sparsecols = ['hour', 'advert_place', 'province_id', 'port_type', 'user_osID', 'is_holidays', 'is_being',
                  'is_outflow', 'advertiser', 'ad_from', 'payment']
    # string ids already reduced to ASCII sums -> embedded directly
    sparse_ascii = ['suuid', 'advertisement', 'position', 'user_modelMake']
    # dense features -> min-max normalization
    densecols = ['W', 'H', 'reserve_price', 'reserve_price_cpc', 'is_rest_click', 'clickPerHour_yesterday',
                 'display_nums_all', 'click_nums_all', 'display_nums_yesterday', 'click_nums_yesterday',
                 'ad_display_all', 'ad_click_all', 'ad_display_yesterday', 'ad_click_yesterday']
    # dense click-rate features (not rescaled)
    ratecols = ['WHrate', 'clickRate_all', 'clickRate_yesterday', 'ad_clickRate_yesterday']

    global namesoh
    namesoh = {}  # maps each sparse column to the names of its one-hot columns
    for sparse in sparsecols:
        onehot = OneHotEncoder()
        arrays = onehot.fit_transform(np.array(data[sparse]).reshape(-1, 1))
        # splice the one-hot columns back into the dataframe in place of the original
        arrays = arrays.toarray()
        names = [sparse + '_' + str(n) for n in range(len(arrays[0]))]
        namesoh[sparse] = names
        data = pd.concat([data, pd.DataFrame(arrays, columns=names)], axis=1)
        data = data.drop([sparse], axis=1)
        # persist the fitted encoder for serving time (see the reload sketch after this function)
        with open(feature_encode_path.format(sparse) + '.pkl', 'wb') as f:
            pickle.dump(onehot, f)
        # print(' {} one-hot done'.format(sparse))
    print(' one-hot encoding done', time.strftime("%H:%M:%S", time.localtime(time.time())))

    for dense in densecols:
        mms = MinMaxScaler(feature_range=(0, 1))
        data[dense] = mms.fit_transform(np.array(data[dense]).reshape(-1, 1))
        # persist the fitted scaler as well
        with open(feature_encode_path.format(dense) + '.pkl', 'wb') as f:
            pickle.dump(mms, f)
        # print(' {} normalized'.format(dense))
    print(' normalization done', time.strftime("%H:%M:%S", time.localtime(time.time())))

    print(' columns: ', len(list(data.columns)))

    '''   Train / validation / test split   '''
    train_data, test_data = getRata2(data, num=1)
    _, val_data = train_test_split(test_data, test_size=0.2, random_state=1, shuffle=True)

    train_data = shuffle(train_data)
    test_data = shuffle(test_data)
    val_data = shuffle(val_data)
    negBpow(train_data, 'train set')
    negBpow(val_data, 'validation set')
    negBpow(test_data, 'test set')

    print(' train_data shape: ', train_data.shape)
    print(' val_data shape: ', val_data.shape)
    print(' test_data shape: ', test_data.shape)


    sparse_features = []
    for value in namesoh.values():
        sparse_features.extend(value)
    dense_features = densecols + ratecols

    # NOTE: vocabulary_size is taken from train_data only; for the ASCII-sum ids an
    # unseen, larger value at test time would index out of the embedding range
    sparse_feature_columns1 = [SparseFeat(feat, vocabulary_size=int(train_data[feat].max() + 1), embedding_dim=4)
                               for feat in sparse_features]
    sparse_feature_columns2 = [SparseFeat(feat, vocabulary_size=int(train_data[feat].max() + 1), embedding_dim=4)
                               for feat in sparse_ascii]
    sparse_feature_columns = sparse_feature_columns1 + sparse_feature_columns2
    dense_feature_columns = [DenseFeat(feat, 1) for feat in dense_features]

    print(' sparse_features count: ', len(sparse_features))
    print(' dense_features count: ', len(dense_features))

    linear_feature_columns = sparse_feature_columns + dense_feature_columns
    # user-side columns, usable as region (clustering) features; built here but not
    # used in the MLR call below, which passes all features as region features
    tmp_user = ['hour', 'province_id', 'user_osID', 'is_holidays', 'is_being', 'is_outflow']
    user_onehot_names = [v for key in tmp_user for v in namesoh[key]]
    region_feature_columns = [fc for fc in sparse_feature_columns1 if fc.name in user_onehot_names]
    base_feature_columns = linear_feature_columns

    global feature_names
    feature_names = get_feature_names(linear_feature_columns)

    print(' feature_names: ', feature_names)

    '''   Feed inputs   '''
    train_x = {name: train_data[name].values for name in feature_names}
    test_x = {name: test_data[name].values for name in feature_names}
    val_x = {name: val_data[name].values for name in feature_names}
    train_y = train_data[['y']].values
    test_y = test_data[['y']].values
    val_y = val_data[['y']].values

    print(' Data preparation done', time.strftime("%H:%M:%S", time.localtime(time.time())))

    '''
    region_feature_columns: features used to cluster users into regions
    base_feature_columns: features for the per-region base LR models; can be all
        features, or only the ad-side features used for training
    l2_reg_linear: L2 regularization strength of the LR parts
    bias_feature_columns: bias features; users cluster into groups, and users
        within a group share similar ad-click preferences
    '''
    # all features double as region features here; base_feature_columns is omitted,
    # in which case deepctr falls back to the region features
    deep = MLR(region_feature_columns=base_feature_columns, region_num=4,
               l2_reg_linear=1e-5, task='binary')
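    # To use the user/ad separation the docstring describes instead (untested here;
    # region_feature_columns is the user-side subset built above):
    #   deep = MLR(region_feature_columns=region_feature_columns,
    #              base_feature_columns=linear_feature_columns,
    #              region_num=4, l2_reg_linear=1e-5, task='binary')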

    optimizer = Adam(learning_rate=1e-4, beta_1=0.95, beta_2=0.96)
    deep.compile(optimizer=optimizer, loss='binary_crossentropy',
                 metrics=['AUC', 'Precision', 'Recall'])

    print(' Model built', time.strftime("%H:%M:%S", time.localtime(time.time())))
    print(' Training started ', time.strftime("%H:%M:%S", time.localtime(time.time())))
    start_time = time.time()

    '''   Training   '''
    # early stopping: stop once val precision improves by less than min_delta
    # for `patience` consecutive epochs
    earlystop_callback = EarlyStopping(
        monitor='val_precision', min_delta=0.001, mode='max',
        verbose=2, patience=3)

    generator_flag = False  # use fit
    # generator_flag = True   # use fit_generator
    if not generator_flag:
        history = deep.fit(
            train_x, train_y, validation_data=(val_x, val_y),
            batch_size=2000,
            epochs=3,
            verbose=2,
            shuffle=True,
            # callbacks=[earlystop_callback]
        )
    else:
        batch_size = 2000
        train_nums = len(train_data)
        # note: fit_generator is deprecated in TF2; deep.fit accepts generators directly
        history = deep.fit_generator(
            GeneratorRandomPatchs(train_x, train_y, batch_size, train_nums, feature_names),
            validation_data=(val_x, val_y),
            steps_per_epoch=train_nums // batch_size,
            epochs=3000,
            verbose=2,
            shuffle=True,
            # callbacks=[earlystop_callback]
        )

    end_time = time.time()
    print(' Training finished', time.strftime("%H:%M:%S", time.localtime(time.time())))
    print((' Training time: {:.0f} min {:.0f} s'.format((end_time - start_time) // 60, (end_time - start_time) % 60)))

    # save the trained model
    save_model(deep, save_path)
    print(' Model saved', time.strftime("%H:%M:%S", time.localtime(time.time())))
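    # the saved model can later be restored with deepctr's custom_objects registry:
    #   deep = load_model(save_path, custom_objects)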

    # # training-curve visualization
    # visualization(history, saveflag=True, showflag=False, path1=loss_plt_path.format('loss_auc.jpg'),
    #               path2=loss_plt_path.format('precision_recall.jpg'))

    # test-set evaluation
    scores = deep.evaluate(test_x, test_y, verbose=0)
    print(' %s: %.4f' % (deep.metrics_names[0], scores[0]))
    print(' %s: %.4f' % (deep.metrics_names[1], scores[1]))
    print(' %s: %.4f' % (deep.metrics_names[2], scores[2]))
    print(' %s: %.4f' % (deep.metrics_names[3], scores[3]))
    print(' %s: %.4f' % ('F1', (2 * scores[2] * scores[3]) / (scores[2] + scores[3])))
    print(' Test-set evaluation done', time.strftime("%H:%M:%S", time.localtime(time.time())))

    # full evaluation (project-specific helper from toolsnn)
    full_evaluate2()

    end_time_end = time.time()
    print(('MLR total training run time: {:.0f} min {:.0f} s'.format((end_time_end - start_time_start) // 60,
                                                                     (end_time_end - start_time_start) % 60)))
    print(('{:.0f} hours'.format((end_time_end - start_time_start) // 60 / 60)))
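
At serving time, the pickled encoders and scalers saved above are reloaded and applied with transform() instead of fit_transform(). A minimal sketch (assuming the same sparsecols/densecols globals and feature_encode_path pattern; this helper is not in the original post):

def apply_saved_encoders(df):
    # re-apply the persisted one-hot encoders and min-max scalers to new data
    for col in sparsecols:
        with open(feature_encode_path.format(col) + '.pkl', 'rb') as f:
            onehot = pickle.load(f)
        arrays = onehot.transform(np.array(df[col]).reshape(-1, 1)).toarray()
        names = [col + '_' + str(n) for n in range(arrays.shape[1])]
        df = pd.concat([df, pd.DataFrame(arrays, columns=names, index=df.index)], axis=1)
        df = df.drop([col], axis=1)
    for col in densecols:
        with open(feature_encode_path.format(col) + '.pkl', 'rb') as f:
            mms = pickle.load(f)
        df[col] = mms.transform(np.array(df[col]).reshape(-1, 1))
    return df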

Reposted from blog.csdn.net/qq_42363032/article/details/121425254