sklearn模型的保存和加载API--案例癌症(逻辑回归)分类预测

sklearn模型的保存和加载API

import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23

- 保存:joblib.dump(estimator, 'test.pkl')
- 加载:estimator = joblib.load('test.pkl')

将训练好的模型保存下来,下次有新的待预测数据传进来时,直接加载模型进行预测,就不用每次都重新训练了。

案例:癌症(逻辑回归)分类预测–良/恶性

import pandas as pd
import numpy as np

# Step 1: read the data.
# The column names are supplied explicitly, so read_csv treats the first line
# of the file as data rather than as a header row.
path = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
column_name = [
    "Sample code number",
    "Clump Thickness",
    "Uniformity of Cell Size",
    "Uniformity of Cell Shape",
    "Marginal Adhesion",
    "Single Epithelial Cell Size",
    "Bare Nuclei",
    "Bland Chromatin",
    "Normal Nucleoli",
    "Mitoses",
    "Class",
]
data = pd.read_csv(path, header=None, names=column_name)


# Step 2: data preprocessing -- missing-value handling.
# 1) Missing entries are encoded as "?"; turn them into NaN so pandas can
#    recognise them as missing.
data = data.replace(to_replace="?", value=np.nan)
# 2) Drop every sample (row) that has a missing value.
data = data.dropna()
# 3) A column that contained "?" was parsed as strings (object dtype) and
#    stays that way after the rows are dropped. Convert every column back to
#    a numeric dtype so later steps work on numbers, not numeric strings.
data = data.apply(pd.to_numeric)


# Step 3: split the data set into training and test subsets.
from sklearn.model_selection import train_test_split

# Features: every column between the first one ("Sample code number") and
# the last one ("Class"). Target: the "Class" column itself.
x = data.iloc[:, 1:-1]
y = data.loc[:, "Class"]

# Default split proportions; no random_state is set, so each run produces a
# different split.
(x_train, x_test,
 y_train, y_test) = train_test_split(x, y)


# Step 4: feature engineering -- standardization.
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# scaler to both the training and the test features.
transfer = StandardScaler()
transfer.fit(x_train)
x_train = transfer.transform(x_train)
x_test = transfer.transform(x_test)


# Step 5: estimator workflow -- train a logistic-regression classifier on the
# standardized training features.
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training chain.
estimator = LogisticRegression().fit(x_train, y_train)

# Save the trained model.
# sklearn.externals.joblib was removed in scikit-learn 0.23; import the
# standalone joblib package and fall back to the bundled copy only on old
# installations that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Writes the fitted estimator to LogisticRegression.pkl in the current
# working directory.
joblib.dump(estimator, 'LogisticRegression.pkl')

# Show the fitted logistic-regression parameters: coefficients and intercept.
print('回归系数:',estimator.coef_)
print("误差(偏置):",estimator.intercept_)


# Step 6: model evaluation.
from sklearn.metrics import classification_report

# Approach 1: look at the predictions directly (an element-wise
# `y_test == y_predict` comparison can be printed for inspection).
y_predict = estimator.predict(x_test)
print("y_predict(预测值):\n", y_predict)

# Approach 2: accuracy on the test split.
score = estimator.score(x_test, y_test)
print("准确率为:", score)

# Precision, recall and F1-score per class; label 2 is reported under the
# first target name and label 4 under the second.
report = classification_report(
    y_test,
    y_predict,
    labels=[2, 4],
    target_names=["良性", "恶性"],
)
print("查看精确率、召回率、F1-score:\n",report)


# ROC curve and AUC metric.
from sklearn.metrics import roc_auc_score

# roc_auc_score needs binary ground truth: 1 for the positive class and 0 for
# the negative class. Class values above 3 (i.e. 4) are mapped to 1, the
# rest (i.e. 2) to 0.
y_true = np.where(y_test > 3, 1, 0)

# AUC measures how well the model *ranks* samples, so it must be given a
# continuous score. Passing the hard 2/4 predictions collapses the ROC curve
# to a single operating point. decision_function returns the signed
# confidence for the larger class label (4), which matches the positive
# class encoded in y_true.
y_score = estimator.decision_function(x_test)
AUC = roc_auc_score(y_true, y_score)  # closer to 1 is better
print("AUC:",AUC)

回归系数: [[1.47945227 0.07579265 0.59505721 0.69195463 0.33274168 1.16446335
  1.16645995 0.92205206 0.72380317]]
误差(偏置): [-0.93015988]
y_predict(预测值):
 [4 4 2 4 2 2 4 4 4 2 2 4 2 2 4 2 4 4 2 2 2 2 4 2 2 4 2 2 4 2 4 4 2 4 2 4 2
 4 2 2 2 4 4 2 4 2 2 4 2 2 2 4 4 2 2 2 4 2 2 2 4 2 2 2 4 2 4 2 4 4 4 2 4 2
 2 4 2 2 2 4 2 4 2 2 2 4 4 4 2 4 2 2 4 2 4 4 2 2 2 2 2 2 2 4 2 2 2 4 2 2 2
 2 2 2 2 2 2 2 2 4 2 2 2 2 2 2 4 4 4 4 2 2 2 2 2 2 2 2 2 2 2 2 4 2 2 2 2 4
 2 2 2 2 2 2 2 4 2 4 2 2 2 4 4 4 2 2 2 2 4 2 2]
准确率为: 0.9766081871345029
查看精确率、召回率、F1-score:
              precision    recall  f1-score   support

         良性       0.98      0.98      0.98       114
         恶性       0.96      0.96      0.96        57

avg / total       0.98      0.98      0.98       171

AUC: 0.9736842105263157
# Load the saved model and use it for prediction.

# Feature rows to predict. For this demo, 50 rows are drawn at random from
# the feature columns of the data set loaded earlier.
x = data.iloc[:, 1:-1].sample(50)

# Standardize with `transfer`, the scaler that was already fitted on the
# training split earlier in this script. Fitting a fresh StandardScaler on
# the rows to be predicted would scale them with a different mean/std than
# the model was trained on and distort the predictions.
# NOTE: when predicting in a separate process, persist the fitted scaler as
# well (e.g. joblib.dump(transfer, ...)) and load it next to the model.
x_s = transfer.transform(x)

# sklearn.externals.joblib was removed in scikit-learn 0.23; import the
# standalone joblib package and fall back to the bundled copy only on old
# installations that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# joblib.load unpickles the file, which can execute arbitrary code: only load
# model files from a trusted source. The estimator class is imported
# automatically while unpickling.
estimator = joblib.load('LogisticRegression.pkl')
y_predict = estimator.predict(x_s)
print("y_predict(预测值):\n", y_predict)

# Present the result as a table: the unscaled features plus the prediction.
# A copy is used so neither `x` nor the original `data` is modified, which
# keeps this section safe to re-run.
result = x.copy()
result["y_predict"] = y_predict
display(result.head())  # display() is provided by IPython/Jupyter
y_predict(预测值):
 [2 2 4 2 2 4 4 4 2 2 2 2 2 4 4 4 2 2 2 4 4 2 2 2 2 2 2 2 2 2 2 2 2 4 2 2 2
 2 2 4 2 4 4 4 2 4 2 2 2 2]
Clump Thickness Uniformity of Cell Size Uniformity of Cell Shape Marginal Adhesion Single Epithelial Cell Size Bare Nuclei Bland Chromatin Normal Nucleoli Mitoses y_predict
429 2 1 1 1 2 1 2 1 1 2
219 6 1 3 1 2 1 3 1 1 2
320 7 6 3 2 5 10 7 4 6 4
423 5 1 3 1 2 1 2 1 1 2
678 1 1 1 1 2 1 1 1 1 2
发布了130 篇原创文章 · 获赞 144 · 访问量 2万+

猜你喜欢

转载自blog.csdn.net/weixin_41685388/article/details/104515820
今日推荐