GBDT、KNN数据建模分析步骤

from sklearn.neighbors import KNeighborsClassifier
from sklearn import ensemble
# learning_curve lives in sklearn.model_selection (the old
# sklearn.learning_curve module was removed in sklearn 0.20).
from sklearn.model_selection import train_test_split, learning_curve
from sklearn import preprocessing
from sklearn.svm import SVC
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

第一步: 加载数据集

# Load the dataset; 'Sex' is the target column, everything else is a feature.
# (ASCII quotes restored: the scraped typographic quotes were a SyntaxError.)
data = pd.read_excel('/home/kesci/input/data_9096/data.xlsx')
X = data.drop('Sex', axis=1)
y = data.Sex

第二步: 数据集标准化, 划分测试集、训练集

# Standardize features to zero mean / unit variance, then hold out
# 30% of the samples for testing (fixed seed for reproducibility).
X = preprocessing.scale(X)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

第三步: 训练模型

KNN 模型

# --- KNN: fit on the training split, then compare predictions
# against the ground truth and report mean accuracy.
knn = KNeighborsClassifier().fit(X_train, y_train)
print(knn.predict(X_test))        # predicted labels on the held-out set
print(y_test)                     # ground-truth labels for comparison
print(knn.score(X_test, y_test))  # mean accuracy on the test set

GBDT 模型

# --- GBDT: same protocol as KNN above — fit, predict, score.
clf = ensemble.GradientBoostingClassifier()
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print(predictions)                # predicted labels on the held-out set
print(y_test)                     # ground-truth labels for comparison
print(clf.score(X_test, y_test))  # mean accuracy on the test set

交叉验证

# --- Learning curves on the digits dataset: plot train vs. CV loss
# as a function of training-set size to diagnose bias/variance.
from sklearn.datasets import load_digits  # digits dataset

digits = load_digits()
X = digits.data
y = digits.target
train_sizes, train_loss, test_loss = learning_curve(
    KNeighborsClassifier(n_neighbors=5), X, y, cv=2,
    # modern sklearn name; 'mean_squared_error' was renamed and the
    # returned scores are negated MSE (hence the minus signs below)
    scoring='neg_mean_squared_error',
    train_sizes=[0.1, 0.25, 0.5, 0.75, 1])
# Mean squared error averaged over the CV folds at each of the 5
# training sizes (10%, 25%, 50%, 75%, 100% of the data); negate the
# neg-MSE scores to obtain positive loss values.
train_loss_mean = -np.mean(train_loss, axis=1)
test_loss_mean = -np.mean(test_loss, axis=1)
plt.plot(train_sizes, train_loss_mean, 'o-', color="r",
         label="Training")
plt.plot(train_sizes, test_loss_mean, 'o-', color="g",
         label="Cross-validation")

plt.xlabel("Training examples")
plt.ylabel("Loss")
plt.legend(loc="best")
plt.show()

转载自 blog.csdn.net/luzaofa/article/details/80387764