scikit-learn中GridSearchCV的使用:多模型,可视化

步骤:

1.选择并构建训练模型model

2.将训练模型model投入到GridSearchCV中,得到GridSearchCV模型grid_model

3.用grid_model拟合训练集数据,选择在validation_dataset上效果最好的参数的模型best_estimator

4.1.用best_estimator拟合训练集(得到的结果应该与之前不同,因为之前用交叉验证等方法对训练集进行了分割)

4.2.用best_estimator拟合测试集

5.结果可视化:AUC曲线,AUPR曲线


一.数据

【数据准备】

       特征 X 的 Size   标签 y 的 Size
训练集 (1206, 294) (1206,)
测试集 (64, 294) (64,)

二.主模型

【搭建环境】

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV,cross_val_score
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.metrics import auc,roc_auc_score,roc_curve,precision_recall_curve
from sklearn.metrics import confusion_matrix,make_scorer

【模型准备】

# Fix the RNG seed so the classifiers' internal random choices are reproducible.
seed = 1231
np.random.seed(seed)
# x_train, y_train, x_test, y_test are assumed to already be in scope
# (per the data section: train (1206, 294)/(1206,), test (64, 294)/(64,)).
# The original self-assignment `x_train, y_train, ... = x_train, y_train, ...`
# was a no-op and has been removed.

names = ['Decision Tree', 'Random Forest']
classifiers = [DecisionTreeClassifier(), RandomForestClassifier()]

# One hyper-parameter grid per classifier, in the same order as `classifiers`.
# NOTE(review): max_features='auto' is deprecated/removed in recent
# scikit-learn releases — confirm the installed version still accepts it.
parameter_dtc = {'max_features': ['auto', 'sqrt', 'log2', None],
                 'max_depth': range(3, 100, 2)}
parameter_rfc = {'n_estimators': range(5, 200, 5),
                 'max_features': ['auto', 'sqrt', 'log2', None],
                 'max_depth': range(3, 100, 2)}

# BUG FIX: the original list included parameter_ada_dtc and parameter_mlp,
# which are never defined anywhere in this file and do not correspond to the
# two classifiers above — iterating zip(classifiers, parameters, names) would
# raise NameError. The list must parallel `classifiers`.
parameters = [parameter_dtc, parameter_rfc]

# Metrics GridSearchCV evaluates on every CV fold; `refit` selects by 'roc_auc'.
scoring = {'roc_auc': 'roc_auc', 'accuracy': 'accuracy',
           'precision': 'precision', 'recall': 'recall', 'f1': 'f1'}

【主模型函数】

def gird_search_model(clf, param, name, x_train, y_train, x_test, y_test):
    """Grid-search `clf` over the grid `param`, print the cross-validation
    report, then score the refit best estimator on train and test sets.

    Relies on module-level names: `scoring` (metric dict for GridSearchCV),
    `pd` (pandas) and `aupr` (area under the precision-recall curve).

    Returns (cv_results, score_list, y_train_pred, y_test_pred) where
    score_list is [train_metrics, test_metrics].
    """
    searcher = GridSearchCV(clf, param, cv=5, verbose=2, scoring=scoring,
                            refit='roc_auc', n_jobs=-1,
                            return_train_score=True)
    fit = searcher.fit(x_train, y_train)
    # Predictions from the estimator refit on the full training set.
    best = fit.best_estimator_
    y_train_pred = best.predict(x_train)
    y_test_pred = best.predict(x_test)

    # Full CV table, indexed by the parameter combination.
    cv_results = pd.DataFrame(fit.cv_results_).set_index(['params'])
    metric_names = ['accuracy', 'f1', 'precision', 'recall', 'roc_auc']
    mean_cols = (['mean_train_' + m for m in metric_names]
                 + ['mean_test_' + m for m in metric_names])
    std_cols = (['std_train_' + m for m in metric_names]
                + ['std_test_' + m for m in metric_names])

# -------------------model results report------------------------------------------------------

    print('MODEL : %r' % name)
    print('Best cv_test_roc_auc: %f using %s' % (fit.best_score_, fit.best_params_))
    print(cv_results[mean_cols])
    print(cv_results[std_cols])

    # Evaluate the refit estimator with plain metric functions (columns of
    # score_df below follow this order).
    metric_funcs = [accuracy_score, precision_score, recall_score,
                    f1_score, roc_auc_score, aupr]
    train_scores = [metric(y_train, y_train_pred) for metric in metric_funcs]
    test_scores = [metric(y_test, y_test_pred) for metric in metric_funcs]
    score_list = [train_scores, test_scores]
    score_df = pd.DataFrame(score_list, index=['train', 'test'],
                            columns=['accuracy', 'precision', 'recall',
                                     'f1', 'roc_auc', 'aupr'])
    print('EVALUATE_METRICS:')
    print(score_df)
    return cv_results, score_list, y_train_pred, y_test_pred

 【单个模型执行过程】

【单个模型执行结果】

 


【多个模型循环执行】

# Run the grid search for every (classifier, grid, name) triple, collecting
# per-model metric rows and the predictions needed later for plotting.
train_score_list, test_score_list = [], []
y_train_pred_list, y_test_pred_list = [], []
for clf, param, name in zip(classifiers, parameters, names):
    cv_result, score_list, y_train_pred, y_test_pred = gird_search_model(
        clf, param, name, x_train, y_train, x_test, y_test)
    train_scores, test_scores = score_list
    train_score_list.append(train_scores)
    test_score_list.append(test_scores)
    y_train_pred_list.append(y_train_pred)
    y_test_pred_list.append(y_test_pred)
    print('-------------------------------------------------------------------------------------------------------------------------------')
# Summarize all models side by side, one row per classifier.
score_columns = ['acc', 'pre', 'rec', 'f1', 'roc_auc', 'aupr']
train_score_df = pd.DataFrame(train_score_list, index=names, columns=score_columns)
test_score_df = pd.DataFrame(test_score_list, index=names, columns=score_columns)
print('TRAIN_SCORE:')
print(train_score_df)
print()
print('TEST_SCORE:')
print(test_score_df)

【多个模型执行结果】


三.画AUC和PRC图

【主函数】

# For each fitted model, draw the ROC and PR curves on the training split
# first, then on the test split.
for clf_name, y_train_pred, y_test_pred in zip(names, y_train_pred_list, y_test_pred_list):
    for y_true, y_pred, is_train in ((y_train, y_train_pred, True),
                                     (y_test, y_test_pred, False)):
        show_curve(y_true, y_pred, clf_name, is_train)

【结果】


四.子函数(主程序内的,应该写在最前面,本文为便于理解,放在最后)

1.模型评估函数里有一个aupr(precision-recall-curve的曲线下面积):当正负样本不平衡时使用aupr评估比auc好。

def aupr(y_true, y_pred):
    """Area under the precision-recall curve.

    Preferred over ROC-AUC when the positive/negative classes are
    imbalanced (see the note in the surrounding text).
    """
    prec, rec, _ = precision_recall_curve(y_true, y_pred)
    return auc(rec, prec)

2.如果想使用混淆矩阵作为GridSearchCV模型中的scoring,需要用make_scorer转换一下。

# Confusion-matrix cell extractors. Each returns one scalar cell so it can be
# wrapped by make_scorer and used as a GridSearchCV `scoring` entry.
def tn(y_true, y_pred):
    """True negatives: cell [0, 0] of the confusion matrix."""
    return confusion_matrix(y_true, y_pred)[0, 0]

def fp(y_true, y_pred):
    """False positives: cell [0, 1] of the confusion matrix."""
    return confusion_matrix(y_true, y_pred)[0, 1]

def fn(y_true, y_pred):
    """False negatives: cell [1, 0] of the confusion matrix."""
    return confusion_matrix(y_true, y_pred)[1, 0]

def tp(y_true, y_pred):
    """True positives: cell [1, 1] of the confusion matrix."""
    return confusion_matrix(y_true, y_pred)[1, 1]

make_score = {key: make_scorer(func)
              for key, func in (('tp', tp), ('tn', tn), ('fp', fp), ('fn', fn))}

3.画图_步1:AUC 与 PRC 曲线的绘制子函数

import matplotlib.pyplot as plt
def show_roc(roc_auc, fpr, tpr):
    """Plot a ROC curve with its AUC shown in the legend.

    roc_auc: area under the ROC curve (scalar).
    fpr, tpr: arrays as returned by sklearn.metrics.roc_curve.
    """
    plt.figure(1)
    # Chance-level diagonal, drawn for reference only.
    plt.plot([0, 1], [0, 1], 'k--')
    # BUG FIX: the AUC label was previously attached to the dashed diagonal,
    # so the legend described the reference line instead of the ROC curve.
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    plt.show()

def show_roc_pr(roc_aupr, recall, precision):
    """Plot a precision-recall curve with its AUPR shown in the legend.

    roc_aupr: area under the PR curve (scalar).
    recall, precision: arrays as returned by
        sklearn.metrics.precision_recall_curve.
    """
    plt.figure(1)
    # NOTE(review): the y = x diagonal is not a meaningful baseline in PR
    # space (the no-skill baseline is a horizontal line at the positive-class
    # rate); kept only for visual continuity with show_roc.
    plt.plot([0, 1], [0, 1], 'k--')
    # BUG FIX: the label was previously attached to the dashed diagonal, and
    # wrongly read "ROC curve" on a PR plot. Attach the corrected label to
    # the PR curve itself.
    plt.plot(recall, precision, label='PR curve (area = %0.2f)' % roc_aupr)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('ROC_PR curve')
    plt.legend(loc='best')
    plt.show()
    print()

4.画图_步2:计算并绘制 AUC 与 PRC 曲线的入口函数

def show_curve(y_true, y_pred, clf_name, train=True):
    """Compute ROC and PR curves for one set of predictions and plot both.

    `train` only changes the printed header ("train" vs "test"); plotting
    is delegated to the module-level show_roc / show_roc_pr helpers.
    """
    fpr, tpr, _ = roc_curve(y_true, y_pred)
    precision, recall, _ = precision_recall_curve(y_true, y_pred)
    area_under_roc = auc(fpr, tpr)
    area_under_pr = auc(recall, precision)
    split = "train" if train == True else "test"
    print('%s  (%s)' % (clf_name, split))
    show_roc(area_under_roc, fpr, tpr)
    print()
    show_roc_pr(area_under_pr, recall, precision)

猜你喜欢

转载自blog.csdn.net/weixin_41171061/article/details/83859856