# 版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/u014727529/article/details/79805319
#内容为实现小象学院的相关代码
#数据链接:https://www.kaggle.com/uciml/biomechanical-features-of-orthopedic-patients
import os
import pickle  # persists the trained model to disk

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # older scikit-learn releases only ship the legacy module
    from sklearn.cross_validation import train_test_split
# Load the dataset from the local CSV file.
data_path = r'C:\Users\Administrator\Desktop\codereader-master\machinelearning\python\codes\data.csv'
data = pd.read_csv(data_path)

# Optional exploration steps, left disabled:
#   print(data.head())      # preview the data
#   print(data.info())      # general information about the frame
#   print(data.describe())  # summary statistics, to spot outliers before cleaning
#   sns.pairplot(data=data, hue='class')

# Encode the textual class as an integer target: Abnormal -> 1, Normal -> 0.
label_by_class = {'Abnormal': 1, 'Normal': 0}
data['label'] = data['class'].map(label_by_class)
# print(data.head())

# Feature matrix: every row, first six columns, as a bare array without the header
# (presumably the six measurement columns -- confirm against the CSV layout).
X = data.iloc[:, :6].values
# Target vector: the encoded label column.
y = data['label'].values
# print(X)
# print(len(y))

# Split into training and test sets, holding out one third of the rows;
# random_state is pinned to 10.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1 / 3, random_state=10
)

# Baseline: k-NN built with the constructor's default arguments.
knn_model = KNeighborsClassifier().fit(X_train, y_train)
y_pred = knn_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(acc)
# Evaluate k-NN for several values of k (the original note states the default is 5).
k_list = [1, 5, 10, 20, 30]
acc_list = []    # test accuracy per k, in k_list order
model_list = []  # fitted classifier per k, in k_list order
for k in k_list:
    # Every statement below belongs to the loop body: each k gets its own
    # classifier, its own accuracy entry and its own report line.
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(X_train, y_train)
    y_pred = knn_model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    acc_list.append(acc)
    model_list.append(knn_model)
    print('k={},准确率={:.2f}'.format(k, acc))
#对结果画图,更加直观地进行查看
# =============================================================================
# plt.figure(figsize=(8,8))
# plt.plot(acc_list)
# plt.title('kNN with different k Values')
# plt.xlabel('k value')
# plt.xticks(range(len(k_list)),k_list)
# plt.ylabel('accuracy')
# plt.ylim([0.8,1.0])
# plt.show()
# =============================================================================
# Persist the best model.
# np.argmax gives the position of the highest accuracy (the first one on ties);
# the same position in model_list holds the classifier that produced it.
best_k_index = np.argmax(acc_list)
best_model = model_list[best_k_index]
model_file = r'C:\Users\Administrator\Desktop\a\knn\model.pkl'
# Create the target directory first so open() does not fail when it is missing.
# The guard skips the call when the path has no directory component.
model_dir = os.path.dirname(model_file)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)
with open(model_file, 'wb') as f:
    pickle.dump(best_model, f)
# Spot check: draw n random rows (patients) from the original data.
n = 5
random_sample_data = data.sample(n)
print(random_sample_data)
# Reload the model from the file written earlier.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(model_file, 'rb') as f:
    trained_model = pickle.load(f)
# Predict from the first six columns, the same slice used to build X for training.
print(trained_model.predict(random_sample_data.iloc[:, :6].values))