Kaggle案例精选——电信客户流失预测(Telecom Customer Churn Prediction)Part Four:模型表现对比

6 Model Performance:模型表现

构建指标计算函数

from sklearn.metrics import (average_precision_score, cohen_kappa_score,
                             f1_score, precision_recall_curve, roc_curve)

# 设置模型报告表格
# Build a one-row performance report for a single model
def model_report_df(model, training_x, testing_x, training_y, testing_y, name):
    """Fit *model* on the training data and score it on the test data.

    Parameters
    ----------
    model : estimator with the sklearn fit/predict API
    training_x, training_y : training features and labels
    testing_x, testing_y : held-out features and labels used for all metrics
    name : str, display label placed in the 'Model' column

    Returns
    -------
    pd.DataFrame
        One row with Accuracy, Recall, Precision, F1, AUC and Kappa.
    """
    model.fit(training_x, training_y)
    preds = model.predict(testing_x)
    accuracy = accuracy_score(testing_y, preds)  # fixed local-name typo "accracy"
    recallscore = recall_score(testing_y, preds)
    precision = precision_score(testing_y, preds)
    # NOTE(review): AUC here is computed from hard 0/1 predictions, not
    # probabilities, so it is only a coarse estimate of the true ROC AUC.
    roc_auc = roc_auc_score(testing_y, preds)
    f1score = f1_score(testing_y, preds)
    kappa_metric = cohen_kappa_score(testing_y, preds)

    df = pd.DataFrame({
        'Model': [name],
        'Accuracy': [accuracy],
        'Recall_score': [recallscore],
        'Precision': [precision],
        'F1_score': [f1score],
        'Area_under_curve': [roc_auc],
        'Kappa_metric': [kappa_metric]
    })
    return df

6.1 模型表现指标计算

# Compute the report row for every model. Models trained on SMOTE-resampled
# data are still evaluated on the same untouched test set.
model1 = model_report_df(logit, train_X, test_X, train_Y, test_Y,
                         'Logistic Regression(Baseline_model)')
model2 = model_report_df(logit_smote, os_smote_X, test_X, os_smote_Y, test_Y,
                         'Logistic Regression(SMOTE)')
model3 = model_report_df(logit_rfe, train_X, test_X, train_Y, test_Y,
                         'Logistic Regression(RFE)')

DTree = DecisionTreeClassifier(max_depth=9, random_state=123, splitter='best', criterion='gini')
model4 = model_report_df(DTree, train_X, test_X, train_Y, test_Y,
                         'Decision Tree')
model5 = model_report_df(knn, os_smote_X, test_X, os_smote_Y, test_Y,
                         'KNN Classifier')

rfc = RandomForestClassifier(n_estimators=1000, random_state=123, max_depth=9, criterion='gini')
model6 = model_report_df(rfc, train_X, test_X, train_Y, test_Y,
                         'Random Forest Classifier')
model7 = model_report_df(gnb, os_smote_X, test_X, os_smote_Y, test_Y,
                         'Naive Bayes')
# Label typo fixed ("Clasifier" -> "Classifier") so it matches the
# names used in the later confusion-matrix / curve plots.
model8 = model_report_df(svc_lin, os_smote_X, test_X, os_smote_Y, test_Y,
                         'SVM Classifier Linear')
model9 = model_report_df(svc_rbf, os_smote_X, test_X, os_smote_Y, test_Y,
                         'SVM Classifier RBF')
model10 = model_report_df(lgbm_c, os_smote_X, test_X, os_smote_Y, test_Y,
                          'LGBM Classifier')
model11 = model_report_df(xgc, os_smote_X, test_X, os_smote_Y, test_Y,
                          'XGBoost Classifier')

6.2 模型指标表格及柱状图

6.2.1 表格输出

# Stack the per-model report rows into one comparison table.
# reset_index(drop=True) replaces the original reset_index() +
# drop(columns='index') two-step, which created and then deleted a column.
model_performances = pd.concat([model1, model2, model3, model4, model5, model6,
                                model7, model8, model9, model10, model11],
                               axis=0).reset_index(drop=True)

# Render the metrics (rounded to 4 decimals) as a plotly table.
table = ff.create_table(np.round(model_performances, 4))
py.iplot(table)

结果输出:
模型指标表格

6.2.2 模型指标水平柱状图

def output_trace(metric, color):
    """Return a horizontal bar trace of *metric* for every model row."""
    return go.Bar(
        y=model_performances['Model'],
        x=model_performances[metric],
        orientation='h',
        name=metric,
        marker=dict(line=dict(width=0.7), color=color),
    )

# Shared plotly layout for the metric comparison chart.
# The large left margin (l=250) leaves room for the long model names
# on the horizontal-bar y axis.
layout = go.Layout(dict(
    title = 'Model Performances',
    plot_bgcolor = 'rgb(243, 243, 243)',
    paper_bgcolor = 'rgb(243, 243, 243)',
    xaxis = dict(
        gridcolor = 'rgb(255, 255, 255)',
        title = 'metric',
        zerolinewidth = 1,
        ticklen = 5,
        gridwidth=2
    ),
    yaxis = dict(
        gridcolor = 'rgb(255, 255, 255)',
        zerolinewidth = 1,
        ticklen = 5,
        gridwidth = 2),
    margin = dict(l = 250),
    height = 780
    )
)

# One horizontal-bar trace per metric (AUC is omitted from the chart;
# it is visible in the table above).
trace1 = output_trace('Accuracy', '#6699FF')
trace2 = output_trace('Recall_score', 'red')
trace3 = output_trace('Precision', '#33CC99')
trace4 = output_trace('F1_score', 'lightgrey')
trace5 = output_trace('Kappa_metric', '#FFCC99')

# Combine the traces with the shared layout and render inline.
data = [trace1, trace2, trace3, trace4, trace5]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

结果输出:
模型指标对比水平柱状图

6.3 模型混淆矩阵


# Confusion matrix of every fitted model on the shared test set.
lst = [logit, logit_smote, DTree, knn, rfc, gnb, svc_lin, svc_rbf, lgbm_c, xgc]

length = len(lst)

# Display names aligned with the model_performances table
# (fixed "RandomForest" spacing and "RBG" -> "RBF" typos).
mods = ['Logistic Regression(Baseline_model)', 'Logistic Regression(SMOTE)',
        'Decision Tree', 'KNN Classifier', 'Random Forest Classifier', 'Naive Bayes',
        'SVM Classifier Linear', 'SVM Classifier RBF', 'LGBM Classifier', 'XGBoost Classifier']

fig = plt.figure(figsize=(13, 15))
fig.set_facecolor('#F3F3F3')
# plain zip suffices: lst and mods have the same length, so zip_longest
# added nothing but the risk of a None model on a mismatch.
for j, (model, title) in enumerate(zip(lst, mods)):
    plt.subplot(4, 3, j + 1)
    preds = model.predict(test_X)
    cm = confusion_matrix(test_Y, preds)
    sns.heatmap(cm, annot=True, fmt='d', square=True,
                xticklabels=['Not churn', 'churn'],
                yticklabels=['Not churn', 'churn'],
                linewidths=2, linecolor='w', cmap='Set1')
    plt.title(title, color='b')
    plt.subplots_adjust(wspace=.3, hspace=.3)

结果输出:
CM矩阵

6.4 模型ROC曲线

# ROC curve of every fitted model on the shared test set.
# BUG FIX: the original cell was a verbatim copy-paste of the
# confusion-matrix loop and never plotted a ROC curve, contradicting the
# section heading. Rebuilt with sklearn.metrics.roc_curve.
lst = [logit, logit_smote, DTree, knn, rfc, gnb, svc_lin, svc_rbf, lgbm_c, xgc]

length = len(lst)

mods = ['Logistic Regression(Baseline_model)', 'Logistic Regression(SMOTE)',
        'Decision Tree', 'KNN Classifier', 'Random Forest Classifier', 'Naive Bayes',
        'SVM Classifier Linear', 'SVM Classifier RBF', 'LGBM Classifier', 'XGBoost Classifier']

fig = plt.figure(figsize=(13, 15))
fig.set_facecolor('#F3F3F3')
for j, (model, title) in enumerate(zip(lst, mods)):
    plt.subplot(4, 3, j + 1)
    # NOTE(review): assumes every model exposes predict_proba (the SVMs
    # must have been built with probability=True) — the PR-curve cell
    # below makes the same assumption.
    probas = model.predict_proba(test_X)[:, 1]
    fpr, tpr, thresholds = roc_curve(test_Y, probas)
    plt.plot(fpr, tpr, linewidth=1.5,
             label='AUC:' + str(np.around(roc_auc_score(test_Y, probas), 3)))
    plt.plot([0, 1], [0, 1], linestyle='dashed')  # chance diagonal
    plt.legend(loc='lower right', prop={'size': 10})
    plt.title(title, color='b')
    plt.xlabel('false positive rate', fontsize=7)
    plt.ylabel('true positive rate', fontsize=7)
    plt.subplots_adjust(wspace=.3, hspace=.3)

结果输出:
ROC曲线

6.5 模型精确率-召回率曲线(Precision-Recall curves,PR曲线)

# Precision-Recall curve of every fitted model on the shared test set.
lst = [logit, logit_smote, DTree, knn, rfc, gnb, svc_lin, svc_rbf, lgbm_c, xgc]

length = len(lst)

# Display names aligned with the model_performances table
# (fixed "RandomForest" spacing and "RBG" -> "RBF" typos).
mods = ['Logistic Regression(Baseline_model)', 'Logistic Regression(SMOTE)',
        'Decision Tree', 'KNN Classifier', 'Random Forest Classifier', 'Naive Bayes',
        'SVM Classifier Linear', 'SVM Classifier RBF', 'LGBM Classifier', 'XGBoost Classifier']

plt.style.use('dark_background')
fig = plt.figure(figsize=(13, 17))
fig.set_facecolor('#F3F3F3')

for j, (model, title) in enumerate(zip(lst, mods)):
    qx = plt.subplot(4, 3, j + 1)
    probas = model.predict_proba(test_X)
    # BUG FIX: precision_recall_curve returns (precision, recall,
    # thresholds) — the original unpacked them in the wrong order, so the
    # plotted axes were silently swapped.
    precision, recall, thresholds = precision_recall_curve(test_Y, probas[:, 1])

    plt.plot(recall, precision, linewidth=1.5,
             label=('avg_pcn:' + str(np.around(average_precision_score(test_Y, probas[:, 1]), 3))))
    plt.plot([0, 1], [0, 0], linestyle='dashed')
    plt.fill_between(recall, precision, alpha=0.2)
    plt.legend(loc='lower left', prop={'size': 10})
    qx.set_facecolor('k')
    plt.grid(True, alpha=0.15)
    plt.title(title, color='b')
    plt.xlabel('recall', fontsize=7)
    plt.ylabel('precision', fontsize=7)  # fixed: y axis was mislabelled 'recall'
    plt.xlim([0.25, 1])
    plt.yticks(np.arange(0, 1, 0.3))

结果输出:
PR曲线

转载地址:

kaggle典型客户流失数据分析预测精品案例!

数据集下载地址:

数据集及更多notebook浏览地址

猜你喜欢

转载自blog.csdn.net/Haoyu_xie/article/details/108575919