6 Model Performance:模型表现
构建指标计算函数
from sklearn.metrics import (average_precision_score, cohen_kappa_score,
                             f1_score, precision_recall_curve, roc_curve)
# Build a one-row model report table.
def model_report_df(model, training_x, testing_x, training_y, testing_y, name):
    """Fit *model* on the training split and return a one-row DataFrame of
    test-set classification metrics.

    Parameters
    ----------
    model : sklearn-style estimator exposing ``fit`` and ``predict``.
    training_x, training_y : training features / labels used to fit.
    testing_x, testing_y : held-out features / labels the metrics use.
    name : str, display name placed in the ``Model`` column.

    Returns
    -------
    pd.DataFrame with columns Model, Accuracy, Recall_score, Precision,
    F1_score, Area_under_curve, Kappa_metric.
    """
    model.fit(training_x, training_y)
    preds = model.predict(testing_x)
    accuracy = accuracy_score(testing_y, preds)  # was misspelled 'accracy'
    recallscore = recall_score(testing_y, preds)
    precision = precision_score(testing_y, preds)
    # NOTE(review): AUC here is computed from hard 0/1 predictions, not
    # probabilities. predict_proba would give the usual ROC-AUC, but some of
    # the models fed in (e.g. linear SVM) may not expose it — kept as-is.
    roc_auc = roc_auc_score(testing_y, preds)
    f1score = f1_score(testing_y, preds)
    kappa_metric = cohen_kappa_score(testing_y, preds)
    return pd.DataFrame({
        'Model': [name],
        'Accuracy': [accuracy],
        'Recall_score': [recallscore],
        'Precision': [precision],
        'F1_score': [f1score],
        'Area_under_curve': [roc_auc],
        'Kappa_metric': [kappa_metric],
    })
6.1 模型表现指标计算
# Compute the metric row for every model. SMOTE-based models are fitted on
# the oversampled training set (os_smote_X / os_smote_Y) but all models are
# evaluated on the same held-out test split.
model1 = model_report_df(logit, train_X, test_X, train_Y, test_Y,
                         'Logistic Regression(Baseline_model)')
model2 = model_report_df(logit_smote, os_smote_X, test_X, os_smote_Y, test_Y,
                         'Logistic Regression(SMOTE)')
model3 = model_report_df(logit_rfe, train_X, test_X, train_Y, test_Y,
                         'Logistic Regression(RFE)')
DTree = DecisionTreeClassifier(max_depth=9, random_state=123, splitter='best', criterion='gini')
model4 = model_report_df(DTree, train_X, test_X, train_Y, test_Y,
                         'Decision Tree')
model5 = model_report_df(knn, os_smote_X, test_X, os_smote_Y, test_Y,
                         'KNN Classifier')
rfc = RandomForestClassifier(n_estimators=1000, random_state=123, max_depth=9, criterion='gini')
model6 = model_report_df(rfc, train_X, test_X, train_Y, test_Y,
                         'Random Forest Classifier')
model7 = model_report_df(gnb, os_smote_X, test_X, os_smote_Y, test_Y,
                         'Naive Bayes')
# Typo fix: label was 'SVM Clasifier Linear'; corrected so it matches the
# 'SVM Classifier Linear' naming used in the later plotting sections.
model8 = model_report_df(svc_lin, os_smote_X, test_X, os_smote_Y, test_Y,
                         'SVM Classifier Linear')
model9 = model_report_df(svc_rbf, os_smote_X, test_X, os_smote_Y, test_Y,
                         'SVM Classifier RBF')
model10 = model_report_df(lgbm_c, os_smote_X, test_X, os_smote_Y, test_Y,
                          'LGBM Classifier')
model11 = model_report_df(xgc, os_smote_X, test_X, os_smote_Y, test_Y,
                          'XGBoost Classifier')
6.2 模型指标表格及柱状图
6.2.1 表格输出
# Combine the per-model metric rows into one summary table.
# ignore_index=True replaces the old reset_index() + drop(columns='index')
# pair — same result, one step.
model_performances = pd.concat([model1, model2, model3, model4, model5, model6,
                                model7, model8, model9, model10, model11],
                               ignore_index=True)
# Round to 4 decimals for display and render as a plotly table.
table = ff.create_table(np.round(model_performances, 4))
py.iplot(table)
结果输出:
6.2.2 模型指标水平柱状图
def output_trace(metric, color):
    """Return a horizontal bar trace of *metric*, one bar per model row in
    the module-level ``model_performances`` DataFrame, drawn in *color*."""
    marker_style = dict(line=dict(width=0.7), color=color)
    return go.Bar(
        x=model_performances[metric],
        y=model_performances['Model'],
        orientation='h',
        name=metric,
        marker=marker_style,
    )
# Shared layout: light grey canvas with white gridlines; the wide left
# margin leaves room for the long model names on the y axis.
layout = go.Layout(dict(
    title='Model Performances',
    plot_bgcolor='rgb(243, 243, 243)',
    paper_bgcolor='rgb(243, 243, 243)',
    xaxis=dict(gridcolor='rgb(255, 255, 255)',
               title='metric',
               zerolinewidth=1,
               ticklen=5,
               gridwidth=2),
    yaxis=dict(gridcolor='rgb(255, 255, 255)',
               zerolinewidth=1,
               ticklen=5,
               gridwidth=2),
    margin=dict(l=250),
    height=780,
))
# One horizontal-bar trace per metric, each with a fixed color.
trace1 = output_trace('Accuracy', '#6699FF')
trace2 = output_trace('Recall_score', 'red')
trace3 = output_trace('Precision', '#33CC99')
trace4 = output_trace('F1_score', 'lightgrey')
trace5 = output_trace('Kappa_metric', '#FFCC99')
data = [trace1, trace2, trace3, trace4, trace5]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)
结果输出:
6.3 模型混淆矩阵
# The ten fitted models and their display names, in matching order.
lst = [logit, logit_smote, DTree, knn, rfc, gnb, svc_lin, svc_rbf, lgbm_c, xgc]
# Label fixes: 'SVM Classifier RBG' -> 'RBF', 'RandomForest Classifier' ->
# 'Random Forest Classifier' (consistent with the section 6.1 table).
mods = ['Logistic Regression(Baseline_model)', 'Logistic Regression(SMOTE)',
        'Decision Tree', 'KNN Classifier', 'Random Forest Classifier', 'Naive Bayes',
        'SVM Classifier Linear', 'SVM Classifier RBF', 'LGBM Classifier', 'XGBoost Classifier']
fig = plt.figure(figsize=(13, 15))
fig.set_facecolor('#F3F3F3')
# One heat-mapped confusion matrix per model in a 4x3 grid.
# enumerate(zip(...)) replaces zip_longest + range: the lists are the same
# length, and zip_longest would have fed None to predict() on a mismatch.
for j, (model, title) in enumerate(zip(lst, mods)):
    plt.subplot(4, 3, j + 1)
    preds = model.predict(test_X)
    cm = confusion_matrix(test_Y, preds)
    sns.heatmap(cm, annot=True, fmt='d', square=True,
                xticklabels=['Not churn', 'churn'],
                yticklabels=['Not churn', 'churn'],
                linewidths=2, linecolor='w', cmap='Set1')
    plt.title(title, color='b')
plt.subplots_adjust(wspace=.3, hspace=.3)
结果输出:
6.4 模型ROC曲线
# Section 6.4 is titled "ROC curves", but the original cell was a verbatim
# copy/paste of the confusion-matrix cell above and never plotted a ROC
# curve. Replaced with actual ROC curves, in the same per-model grid style
# as the precision-recall cell in section 6.5 (which already relies on
# predict_proba being available on every model).
lst = [logit, logit_smote, DTree, knn, rfc, gnb, svc_lin, svc_rbf, lgbm_c, xgc]
mods = ['Logistic Regression(Baseline_model)', 'Logistic Regression(SMOTE)',
        'Decision Tree', 'KNN Classifier', 'Random Forest Classifier', 'Naive Bayes',
        'SVM Classifier Linear', 'SVM Classifier RBF', 'LGBM Classifier', 'XGBoost Classifier']
fig = plt.figure(figsize=(13, 15))
fig.set_facecolor('#F3F3F3')
for j, (model, title) in enumerate(zip(lst, mods)):
    plt.subplot(4, 3, j + 1)
    # Positive-class probability scores.
    probas = model.predict_proba(test_X)[:, 1]
    fpr, tpr, _ = roc_curve(test_Y, probas)
    auc = roc_auc_score(test_Y, probas)
    plt.plot(fpr, tpr, linewidth=1.5, label='AUC:' + str(np.around(auc, 3)))
    plt.plot([0, 1], [0, 1], linestyle='dashed')  # chance diagonal
    plt.legend(loc='lower right', prop={'size': 10})
    plt.title(title, color='b')
    plt.xlabel('false positive rate', fontsize=7)
    plt.ylabel('true positive rate', fontsize=7)
plt.subplots_adjust(wspace=.3, hspace=.3)
结果输出:
6.5 模型精确率-召回率曲线(Precision-Recall Curves (PR))
# Precision-recall curves, one subplot per model. Bug fixes vs the original:
#  * sklearn's precision_recall_curve returns (precision, recall, thresholds);
#    the original unpacked them as (recall, precision, ...), silently swapping
#    the two axes of every plot.
#  * the y-axis label said 'recall'; the y axis is precision.
#  * unused `preds = i.predict(test_X)` removed.
lst = [logit, logit_smote, DTree, knn, rfc, gnb, svc_lin, svc_rbf, lgbm_c, xgc]
mods = ['Logistic Regression(Baseline_model)', 'Logistic Regression(SMOTE)',
        'Decision Tree', 'KNN Classifier', 'Random Forest Classifier', 'Naive Bayes',
        'SVM Classifier Linear', 'SVM Classifier RBF', 'LGBM Classifier', 'XGBoost Classifier']
plt.style.use('dark_background')
fig = plt.figure(figsize=(13, 17))
fig.set_facecolor('#F3F3F3')
for j, (model, title) in enumerate(zip(lst, mods)):
    qx = plt.subplot(4, 3, j + 1)
    # Positive-class probability scores.
    probas = model.predict_proba(test_X)[:, 1]
    precision, recall, _ = precision_recall_curve(test_Y, probas)
    avg_pcn = average_precision_score(test_Y, probas)
    plt.plot(recall, precision, linewidth=1.5,
             label=('avg_pcn:' + str(np.around(avg_pcn, 3))))
    plt.plot([0, 1], [0, 0], linestyle='dashed')
    plt.fill_between(recall, precision, alpha=0.2)
    plt.legend(loc='lower left', prop={'size': 10})
    qx.set_facecolor('k')
    plt.grid(True, alpha=0.15)
    plt.title(title, color='b')
    plt.xlabel('recall', fontsize=7)
    plt.ylabel('precision', fontsize=7)
    plt.xlim([0.25, 1])
    plt.yticks(np.arange(0, 1, 0.3))
结果输出: