5 Model Building:模型构建
划分数据集及相关函数构建
1. 加载相应库
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score, roc_curve, scorer, f1_score
import statsmodels.api as sm
from sklearn.metrics import precision_score, recall_score
from yellowbrick.classifier import DiscriminationThreshold
2. 划分数据
# Feature columns: every column that is neither an ID nor the target.
cols = [name for name in telcom.columns if name not in Id_col+target_col]
# Hold out 25% of the rows for testing; the seed is fixed for repeatability.
train, test = train_test_split(telcom, test_size=0.25, random_state=111)
# Split each partition into feature matrix (X) and target frame (Y).
train_X, train_Y = train[cols], train[target_col]
test_X, test_Y = test[cols], test[target_col]
3. 建模函数及可视化函数构建
3.1 建模函数变量说明
# 构建方程元素
# dataframe -用于构建模型的数据表
# Algorithm -使用的算法
# training_x -用于训练的数据
# testing_x -用于预测的测试集
# training_y -目标变量(用于训练)
# testing_y -目标变量(用于测试)
# cf -['coefficients', 'features']:'coefficients'读取模型系数(coef_),'features'读取特征重要性(feature_importances_)
# thredshold_plot-如果为True,绘制模型的阈值图
3.2 构建函数,实现传入算法及数据集,输出模型表现指标
def telecom_churn_prediction(algorithm, training_x, testing_x, training_y, testing_y, cols, cf, thredshold_plot):
    """Fit ``algorithm``, print its test-set metrics and plot a dashboard.

    The dashboard is a 2x2 plotly figure: a confusion-matrix heatmap, the ROC
    curve with the chance diagonal, and a full-width bar chart of the
    per-feature weights.

    Args:
        algorithm: Classifier exposing ``fit``, ``predict`` and
            ``predict_proba``, plus ``coef_`` or ``feature_importances_``
            depending on ``cf``.
        training_x: Features used for fitting.
        testing_x: Features used for evaluation.
        training_y: Target used for fitting.
        testing_y: Target used for evaluation.
        cols: Feature names, positionally aligned with the model's weights.
        cf: ``'coefficients'`` to read ``algorithm.coef_`` or ``'features'``
            to read ``algorithm.feature_importances_``.
        thredshold_plot: If truthy, also draw a ``DiscriminationThreshold``
            plot fitted on the training data.

    Raises:
        ValueError: If ``cf`` is neither ``'coefficients'`` nor ``'features'``.
    """
    # Reject an unknown mode up front; previously it only surfaced later as a
    # NameError on the never-assigned weights frame, after the model was fit.
    if cf not in ('coefficients', 'features'):
        raise ValueError("cf must be 'coefficients' or 'features', got %r" % (cf,))

    # Fit and predict.
    algorithm.fit(training_x, training_y)
    preds = algorithm.predict(testing_x)
    prob = algorithm.predict_proba(testing_x)

    # Per-feature weights, joined to the feature names by position.
    if cf == 'coefficients':
        coefficients = pd.DataFrame(algorithm.coef_.ravel())
    else:
        coefficients = pd.DataFrame(algorithm.feature_importances_)
    column_df = pd.DataFrame(cols)
    coef_sumry = pd.merge(coefficients, column_df, left_index=True,
                          right_index=True, how='left')
    coef_sumry.columns = ['coefficients', 'features']
    coef_sumry = coef_sumry.sort_values(by='coefficients', ascending=False)

    print(algorithm)
    print('\n Classification report: \n', classification_report(testing_y, preds))
    print('Accuracy Score: ', accuracy_score(testing_y, preds))

    # Confusion matrix.
    conf_matrix = confusion_matrix(testing_y, preds)

    # AUC must be computed from the positive-class scores, not from the hard
    # 0/1 predictions, so that it matches the ROC curve drawn below.
    model_roc_auc = roc_auc_score(testing_y, prob[:, 1])
    print('Area under curve: ', model_roc_auc, '\n')
    fpr, tpr, thredsholds = roc_curve(testing_y, prob[:, 1])

    # Confusion-matrix heatmap.
    trace1 = go.Heatmap(z=conf_matrix, x=['Not Churn', 'Churn'], y=['Not Churn', 'Churn'],
                        showscale=False, colorscale='Picnic', name='Matrix')
    # ROC curve and the chance diagonal.
    trace2 = go.Scatter(x=fpr, y=tpr, name='Roc: '+str(model_roc_auc),
                        line=dict(color='rgb(22,96,167)', width=2))
    trace3 = go.Scatter(x=[0, 1], y=[0, 1], line=dict(color=('rgb(205, 12, 24)'),
                                                      width=2, dash='dot'))
    # Feature-weight bar chart.
    trace4 = go.Bar(x=coef_sumry['features'], y=coef_sumry['coefficients'],
                    name='Coefficients',
                    marker=dict(color=coef_sumry['coefficients'],
                                colorscale='Picnic',
                                line=dict(width=0.6, color='black')))

    # Assemble the subplots: two panels on the first row, one spanning the second.
    fig = tls.make_subplots(rows=2, cols=2, specs=[[{}, {}], [{'colspan': 2}, None]],
                            subplot_titles=('Confusion Matrix', 'Receiver operating characteristic',
                                            'Feature Importances'))
    fig.append_trace(trace1, 1, 1)
    fig.append_trace(trace2, 1, 2)
    fig.append_trace(trace3, 1, 2)
    fig.append_trace(trace4, 2, 1)
    fig['layout'].update(showlegend=False, title='Model performance', autosize=False,
                         height=900, width=800,
                         plot_bgcolor='rgba(240,240,240,0.95)', paper_bgcolor='rgba(240,240,240,0.95)',
                         margin=dict(b=195))
    fig['layout']['xaxis2'].update(dict(title='false positive rate'))
    fig['layout']['yaxis2'].update(dict(title='true positive rate'))
    fig['layout']['xaxis3'].update(dict(showgrid=True, tickfont=dict(size=10),
                                        tickangle=90))
    py.iplot(fig)

    if thredshold_plot:
        visualizer = DiscriminationThreshold(algorithm)
        visualizer.fit(training_x, training_y)
        visualizer.poof()
5.1 基本线性模型(Baseline Model)
# Baseline logistic regression.  'liblinear' is a solver name, not a valid
# class_weight value, so it is passed as ``solver`` and the classes are left
# unweighted (same settings as the SMOTE model further down).
logit = LogisticRegression(C=1.0, class_weight=None, solver='liblinear', tol=0.0001,
                           verbose=0, warm_start=False)
telecom_churn_prediction(logit, train_X, test_X, train_Y, test_Y, cols,
                         'coefficients', thredshold_plot=True)
** 结果输出:**
- 模型准确率得分0.7998分
- 不知道为什么混淆矩阵也有点飘,求大神解答
5.2 人工少数类过抽样技术(Synthetic Minority Oversampling Technique (SMOTE))
5.2.1 使用过抽样区分数据集和测试集
from imblearn.over_sampling import SMOTE
# Feature columns: every column that is neither an ID nor the target.
cols = [i for i in telcom.columns if i not in Id_col+target_col]
smote_X = telcom[cols]
smote_Y = telcom[target_col]
# Train/test split (25% held out, fixed seed).
smote_train_X, smote_test_X, smote_train_Y, smote_test_Y = train_test_split(
    smote_X, smote_Y, test_size=.25, random_state=111)
# Oversample the training partition only; the test partition is left untouched.
# NOTE(review): the name ``os`` shadows the stdlib module of the same name; it
# is kept because code outside this view may reference it.
os = SMOTE(random_state=0)
# ``fit_resample`` replaces the ``fit_sample`` alias, which newer
# imbalanced-learn releases no longer provide.
os_smote_X, os_smote_Y = os.fit_resample(smote_train_X, smote_train_Y)
os_smote_X = pd.DataFrame(data=os_smote_X, columns=cols)
os_smote_Y = pd.DataFrame(data=os_smote_Y, columns=target_col)
5.2.2 对数据集使用基本线性模型
# Logistic regression trained on the SMOTE-oversampled training set and
# evaluated on the untouched test set.
logit_smote = LogisticRegression(
    C=1.0,
    class_weight=None,
    dual=False,
    fit_intercept=True,
    max_iter=100,
    multi_class='ovr',
    n_jobs=1,
    penalty='l2',
    random_state=None,
    solver='liblinear',
    tol=0.0001,
    verbose=0,
    warm_start=False,
)
telecom_churn_prediction(
    algorithm=logit_smote,
    training_x=os_smote_X,
    testing_x=test_X,
    training_y=os_smote_Y,
    testing_y=test_Y,
    cols=cols,
    cf='coefficients',
    thredshold_plot=True,
)
5.3 递归式特征消除法(Recursive Feature Elimination(RFE))
结果输出:
5.4 单变量特征选取(Univariate Selection)
5.4.1 特征选取代码
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
# Candidate features: every column that is neither an ID nor the target.
cols = [name for name in telcom.columns if name not in Id_col + target_col]
# Take features and target from the frame holding non-negative values
# (per the original note; presumably required by the chi2 score -- confirm).
df_x, df_y = df_telcom_og[cols], df_telcom_og[target_col]
# Chi-squared univariate selector keeping the 3 best features.
select = SelectKBest(score_func=chi2, k=3)
fit = select.fit(df_x, df_y)
5.4.2 结果显示
# Print the raw chi2 scores and p-values of the fitted selector.
print("scores")
print(fit.scores_)
print("P - Values")
print(fit.pvalues_)

# One row per feature, best score first.
score = pd.DataFrame(
    {"features": cols, "scores": fit.scores_, "p_values": fit.pvalues_}
).sort_values(by="scores", ascending=False)

# Label each feature as numerical or categorical.
score['feature_type'] = np.where(score['features'].isin(num_cols), 'Numerical', 'Categorical')

# Split once so each trace reads from its own subset.
categorical_scores = score[score['feature_type'] == 'Categorical']
numerical_scores = score[score['feature_type'] == 'Numerical']

# Categorical features as a line on the main axes.
trace = go.Scatter(
    x=categorical_scores['features'],
    y=categorical_scores['scores'],
    name='Categorical',
    mode='lines+markers',
    marker=dict(color='red', line=dict(width=1)),
)
# Numerical features as bars on the secondary axes (right-hand panel).
trace1 = go.Bar(
    x=numerical_scores['features'],
    y=numerical_scores['scores'],
    name='Numerical',
    marker=dict(color='royalblue', line=dict(width=1)),
    xaxis='x2',
    yaxis='y2',
)
layout = go.Layout({
    'title': 'Scores for Categorical & Numerical features',
    'plot_bgcolor': 'rgb(243, 243, 243)',
    'paper_bgcolor': 'rgb(243, 243, 243)',
    'xaxis': dict(gridcolor='rgb(255, 255, 255)', tickfont=dict(size=10),
                  domain=[0, 0.7], ticklen=5, gridwidth=2),
    'yaxis': dict(gridcolor='rgb(255, 255, 255)', title='scores', zerolinewidth=1,
                  ticklen=5, gridwidth=2),
    'margin': dict(b=200),
    'xaxis2': dict(gridcolor='rgb(255, 255, 255)', domain=[0.8, 1], tickangle=90),
    'yaxis2': dict(anchor='x2', gridcolor='rgb(255, 255, 255)'),
})
data = [trace, trace1]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)
** 结果输出:**
5.5 决策树模型可视化
5.5.1 使用上述特征选取结果的前三名来构建决策树——gini
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import graphviz
# Three highest-scoring categorical features (``score`` is already sorted).
features_cat = score.loc[score['feature_type'] == 'Categorical', 'features'].iloc[:3].tolist()
# Three highest-scoring numerical features.
features_num = score.loc[score['feature_type'] == 'Numerical', 'features'].iloc[:3].tolist()
** 构建决策树所需变量说明:**
# 函数元素
# columns -选择的列
# maximum_depth -树的最大深度
# criterion_type -['gini'或者'entropy']
# split_type -['best'或者'random']
# filename -输出目录名(位于D:\temp-pdf\下)
# model_performance -为True时输出模型表现
决策树及指标可视化
def plot_DT(columns, maximum_depth, criterion_type, split_type, filename, model_performance=None,
            output_dir='D:\\temp-pdf\\'):
    """Fit a decision tree on the selected columns and render it with graphviz.

    Args:
        columns: Feature names used to train the tree.
        maximum_depth: ``max_depth`` of the tree.
        criterion_type: Split criterion, ``'gini'`` or ``'entropy'``.
        split_type: Splitter strategy, ``'best'`` or ``'random'``.
        filename: Name appended to ``output_dir`` to form the render directory.
        model_performance: If truthy, also report model performance through
            ``telecom_churn_prediction``.
        output_dir: Prefix of the render directory. It is concatenated with
            ``filename`` verbatim, so it should end with a path separator.
            Defaults to the previously hard-coded Windows location.
    """
    dtc_x = df_x[columns]
    dtc_y = df_y[target_col]

    # Build and fit the model.
    dt_classifier = DecisionTreeClassifier(max_depth=maximum_depth,
                                           splitter=split_type,
                                           criterion=criterion_type)
    dt_classifier = dt_classifier.fit(dtc_x, dtc_y)

    # Export the fitted tree as DOT source.
    data = tree.export_graphviz(dt_classifier, out_file=None, rounded=True,
                                proportion=False, feature_names=columns,
                                precision=2, class_names=['Not churn', 'churn'],
                                filled=True)

    # Render the tree to disk instead of displaying it inline.
    graph = graphviz.Source(data)
    path = output_dir + filename
    graph.render(directory=path)

    if model_performance:
        # NOTE(review): the tree is trained on df_x but evaluated on test_X,
        # which is built from a different frame earlier in the file -- confirm
        # that the two share the same feature encoding and that test rows are
        # not also part of df_x.
        telecom_churn_prediction(dt_classifier, dtc_x, test_X[columns],
                                 dtc_y, test_Y, columns, 'features', thredshold_plot=True)
# Depth-3 gini tree on the top numerical features; render only, no report.
plot_DT(
    columns=features_num,
    maximum_depth=3,
    criterion_type='gini',
    split_type='best',
    filename='tree_D_num',
)
结果输出:
5.5.2 使用上述特征选取结果的前三名来构建决策树——entropy
# Depth-3 entropy tree on the top categorical features, with the performance report.
plot_DT(
    columns=features_cat,
    maximum_depth=3,
    criterion_type='entropy',
    split_type='best',
    filename='tree_DT_cat',
    model_performance=True,
)
** 结果输出:**
5.6 KNN模型
5.6.1 构建模型可视化函数
def telecom_churn_prediction_alg(algorithm, training_x, testing_x, training_y, testing_y, thredshold_plot=True):
    """Fit ``algorithm``, print its test-set metrics and plot ROC + confusion matrix.

    Variant of ``telecom_churn_prediction`` for models that expose neither
    ``coef_`` nor ``feature_importances_``: no feature-weight chart is drawn.

    Args:
        algorithm: Classifier exposing ``fit``, ``predict`` and ``predict_proba``.
        training_x: Features used for fitting.
        testing_x: Features used for evaluation.
        training_y: Target used for fitting.
        testing_y: Target used for evaluation.
        thredshold_plot: If truthy, also draw a ``DiscriminationThreshold``
            plot fitted on the training data.
    """
    # Fit and predict.
    algorithm.fit(training_x, training_y)
    preds = algorithm.predict(testing_x)
    probs = algorithm.predict_proba(testing_x)

    print(algorithm)
    print('\n Classification report: \n', classification_report(testing_y, preds))
    print('Accuracy Score: ', accuracy_score(testing_y, preds))

    # Confusion matrix.  (A stray trailing comma used to wrap this in a
    # 1-tuple, handing the heatmap a wrongly nested array.)
    cm = confusion_matrix(testing_y, preds)

    # AUC must be computed from the positive-class scores, not from the hard
    # 0/1 predictions, so that it matches the ROC curve drawn below.
    model_roc_auc = roc_auc_score(testing_y, probs[:, 1])
    print('Area under curve:', model_roc_auc)
    fpr, tpr, thresholds = roc_curve(testing_y, probs[:, 1])

    # ROC curve and the chance diagonal on the main axes.
    trace1 = go.Scatter(x=fpr, y=tpr, name='Roc'+str(model_roc_auc),
                        line=dict(color=('rgb(22, 96, 167)'), width=2))
    trace2 = go.Scatter(x=[0, 1], y=[0, 1],
                        line=dict(color=('rgb(205, 12, 24)'), width=2, dash='dot'))
    # Confusion-matrix heatmap on the secondary axes (right-hand panel).
    trace3 = go.Heatmap(z=cm, x=['Not churn', 'churn'], y=['Not churn', 'churn'],
                        showscale=False, colorscale='Blues', name='Matrix',
                        xaxis='x2', yaxis='y2')
    layout = go.Layout(dict(title='Model performance', autosize=False,
                            height=500, width=800, showlegend=False,
                            plot_bgcolor='rgb(243, 243, 243)', paper_bgcolor='rgb(243, 243, 243)',
                            xaxis=dict(title='false positive rate', gridcolor='rgb(255, 255, 255)',
                                       domain=[0, 0.6], ticklen=5, gridwidth=2),
                            yaxis=dict(title='true positive rate', gridcolor='rgb(255, 255, 255)',
                                       zerolinewidth=1, ticklen=5, gridwidth=2),
                            margin=dict(b=200),
                            xaxis2=dict(domain=[0.7, 1], tickangle=90, gridcolor='rgb(255, 255, 255)'),
                            yaxis2=dict(anchor='x2', gridcolor='rgb(255, 255, 255)')))
    data = [trace1, trace2, trace3]
    fig = go.Figure(data=data, layout=layout)
    py.iplot(fig)

    if thredshold_plot:
        vz = DiscriminationThreshold(algorithm)
        vz.fit(training_x, training_y)
        vz.poof()
5.6.2 实例化KNN模型,调用函数进行可视化展现
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier trained on the SMOTE-oversampled data.
knn = KNeighborsClassifier(
    algorithm='auto',
    leaf_size=30,
    metric='minkowski',
    metric_params=None,
    n_jobs=1,
    n_neighbors=5,
    weights='uniform',
)
telecom_churn_prediction_alg(
    algorithm=knn,
    training_x=os_smote_X,
    testing_x=test_X,
    training_y=os_smote_Y,
    testing_y=test_Y,
    thredshold_plot=True,
)
** 结果输出:**
5.7 随机森林分类器单个树可视化展现
5.7.1 导入库,函数构建变量说明及函数构建
from sklearn.ensemble import RandomForestClassifier
# 函数元素:
# columns - 使用到的列
# nf_estimators -随机森林的树的数量
# estimated_tree -要可视化的那棵树的索引
# maximum_depth -树的最大深度
# criterion_type -使用'gini'还是'entropy'来构建决策树
# tree_name -输出目录名(位于D:\temp-pdf\下)
# model_pf -为True时打印模型的表现
def plot_tree_rf(columns, nf_estimators, estimated_tree, maximum_depth,
                 criterion_type, tree_name, model_pf=None, output_dir='D:\\temp-pdf\\'):
    """Fit a random forest and render one of its trees with graphviz.

    Args:
        columns: Feature names; any target column among them is ignored.
        nf_estimators: Number of trees in the forest.
        estimated_tree: Index (into ``estimators_``) of the tree to render.
        maximum_depth: ``max_depth`` of every tree.
        criterion_type: Split criterion, ``'gini'`` or ``'entropy'``.
        tree_name: Name appended to ``output_dir`` to form the render directory.
        model_pf: If truthy, also report the forest's performance through
            ``telecom_churn_prediction``.
        output_dir: Prefix of the render directory. It is concatenated with
            ``tree_name`` verbatim, so it should end with a path separator.
            Defaults to the previously hard-coded Windows location.

    Raises:
        IndexError: If ``estimated_tree`` is not a valid tree index.
    """
    # Use one filtered feature list everywhere so the training frame, the
    # exported feature names and the evaluation frame cannot disagree.
    feature_cols = [name for name in columns if name not in target_col]
    df = df_telcom_og[feature_cols + target_col].copy()

    # Features and target.
    rf_x = df[feature_cols]
    rf_y = df[target_col]

    # Build and fit the forest.
    rfc = RandomForestClassifier(n_estimators=nf_estimators, max_depth=maximum_depth,
                                 criterion=criterion_type)
    rfc.fit(rf_x, rf_y)

    # Pick the requested tree without rebinding the ``estimated_tree`` parameter.
    selected_tree = rfc.estimators_[estimated_tree]
    dot_source = tree.export_graphviz(selected_tree, out_file=None, rounded=True,
                                      proportion=False, feature_names=feature_cols, precision=2,
                                      class_names=['Not churn', 'churn'], filled=True)
    graph = graphviz.Source(dot_source)
    path = output_dir + tree_name
    graph.render(directory=path)  # written to disk as a PDF

    # Model performance.
    if model_pf:
        telecom_churn_prediction(rfc, rf_x, test_X[feature_cols], rf_y, test_Y,
                                 feature_cols, 'features', thredshold_plot=True)
- 原文中,是用display方法将决策树显示出来。但是运行的时候,发现这样太慢。
- 所以改成用render,将决策树存储为pdf文件,放在D盘中。
结果输出:
5.7.2 实例化随机森林模型,调用函数进行可视化
调用函数:
# Feature columns of the training frame, excluding target and ID columns.
cols1 = [name for name in train_X.columns if name not in target_col+Id_col]
# 100-tree entropy forest of depth 3; render the last tree and report performance.
plot_tree_rf(
    columns=cols1,
    nf_estimators=100,
    estimated_tree=99,
    maximum_depth=3,
    criterion_type='entropy',
    tree_name='rf-tree1',
    model_pf=True,
)
结果输出:
5.8 构建有十棵树的随机森林分类器
# Render trees 0..9 of a 10-tree random forest, one output per tree.
# NOTE(review): plot_tree_rf fits a forest on every call, so each rendered
# tree comes from a separately trained forest.
n = list(range(10))
cols1 = [name for name in train_X.columns if name not in target_col+Id_col]
for i in n:
    tree_name = 'tree-rf-%d' % i
    plot_tree_rf(
        columns=cols1,
        nf_estimators=10,
        estimated_tree=i,
        maximum_depth=3,
        criterion_type='entropy',
        tree_name=tree_name,
        model_pf=False,
    )
- 以上代码将调用函数的model_pf参数设置为False,不显示每棵树的report、模型表现、特征重要性和阈值得分变化表。
- 输出结果,就是10棵树的PDF文件。
5.9 朴素贝叶斯模型(NB)
# 朴素贝叶斯方法
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with no explicit class priors, trained on the
# SMOTE-oversampled data (threshold plot left at its default, True).
gnb = GaussianNB(priors=None)
telecom_churn_prediction_alg(
    algorithm=gnb,
    training_x=os_smote_X,
    testing_x=test_X,
    training_y=os_smote_Y,
    testing_y=test_Y,
)
结果输出:
5.10 支持向量机(SVC)
# 支持向量机SVM
from sklearn.svm import SVC
# 支持向量机分类器
# 使用线性超平面
# Linear-kernel SVC; probability=True enables predict_proba for the ROC curve.
svc_lin = SVC(
    C=1.0,
    cache_size=200,
    class_weight=None,
    coef0=0.0,
    decision_function_shape='ovr',
    degree=3,
    gamma=1.0,
    kernel='linear',
    max_iter=-1,
    probability=True,
    random_state=None,
    shrinking=True,
    tol=0.001,
    verbose=False,
)
# Feature columns: every column that is neither the target nor an ID.
cols = [name for name in telcom.columns if name not in target_col+Id_col]
telecom_churn_prediction(
    algorithm=svc_lin,
    training_x=os_smote_X,
    testing_x=test_X,
    training_y=os_smote_Y,
    testing_y=test_Y,
    cols=cols,
    cf='coefficients',
    thredshold_plot=False,
)
结果输出:
5.11 SVC调参
# 使用非线性超平面‘rbf’
# RBF-kernel SVC; no per-feature weights are requested, so the report
# variant without the feature chart is used.
svc_rbf = SVC(
    C=1.0,
    cache_size=200,
    class_weight=None,
    coef0=0.0,
    decision_function_shape='ovr',
    degree=3,
    gamma=1.0,
    kernel='rbf',
    max_iter=-1,
    probability=True,
    random_state=None,
    shrinking=True,
    tol=0.001,
    verbose=False,
)
telecom_churn_prediction_alg(
    algorithm=svc_rbf,
    training_x=os_smote_X,
    testing_x=test_X,
    training_y=os_smote_Y,
    testing_y=test_Y,
    thredshold_plot=False,
)
** 结果输出:**
5.12 LightGBM分类器
# 使用LightGBMClassifier
from lightgbm import LGBMClassifier
# LightGBM gradient-boosted trees, trained on the SMOTE-oversampled data.
lgbm_c = LGBMClassifier(
    boosting_type='gbdt',
    class_weight=None,
    colsample_bytree=1.0,
    learning_rate=0.5,
    max_depth=7,
    min_child_samples=20,
    min_child_weight=0.001,
    min_split_gain=0.0,
    n_estimators=100,
    n_jobs=-1,
    num_leaves=500,
    objective='binary',
    random_state=None,
    reg_alpha=0.0,
    reg_lambda=0.0,
    silent=True,
    subsample=1.0,
    subsample_for_bin=200000,
    subsample_freq=0,
)
# Feature columns: every column that is neither the target nor an ID.
cols = [name for name in telcom.columns if name not in target_col+Id_col]
telecom_churn_prediction(
    algorithm=lgbm_c,
    training_x=os_smote_X,
    testing_x=test_X,
    training_y=os_smote_Y,
    testing_y=test_Y,
    cols=cols,
    cf='features',
    thredshold_plot=True,
)
** 结果输出:**
5.13 XGBoost分类器
# XGBoost 算法
from xgboost import XGBClassifier
# XGBoost gradient-boosted trees, trained on the SMOTE-oversampled data.
xgc = XGBClassifier(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bytree=1,
    gamma=0,
    learning_rate=0.9,
    max_delta_step=0,
    max_depth=7,
    min_child_weight=1,
    missing=None,
    n_estimators=100,
    n_jobs=1,
    nthread=None,
    objective='binary:logistic',
    random_state=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=None,
    silent=True,
    subsample=1,
)
telecom_churn_prediction(
    algorithm=xgc,
    training_x=os_smote_X,
    testing_x=test_X,
    training_y=os_smote_Y,
    testing_y=test_Y,
    cols=cols,
    cf='features',
    thredshold_plot=True,
)
** 结果输出:**