引言
这里使用鸢尾花数据进行实验
鸢尾花数据集——提取码:1234
一、决策树分类
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import pydotplus
from sklearn.metrics import accuracy_score
# 加载数据
def loaddata():
    """Load the iris dataset from 'data/iris.data'.

    Returns
    -------
    X : DataFrame of shape (n_samples, 2) — first two features only.
    y : ndarray of int codes — class label encoded as 0..n_classes-1.
    y_type : list[str] — one name per *class* (in code order), suitable for
        export_graphviz's ``class_names`` parameter.
    """
    columns = ['sepal_length', 'speal_width', 'petal_length', 'petal_width', 'type']
    data = pd.read_csv('data/iris.data', header=None, names=columns)
    X = data.iloc[:, :2]  # keep only the first two features for 2-D plotting
    # Encode the string labels as integer codes.
    cat = pd.Categorical(data.iloc[:, 4])
    y = cat.codes
    # BUGFIX: the original returned one string per sample (150 entries);
    # export_graphviz expects n_classes names, so tree nodes were mislabeled
    # (the first entries were all 'Iris-setosa'). Return unique class names
    # in the same order as the integer codes.
    y_type = [str(c) for c in cat.categories]
    return X, y, y_type
# 决策树可视化
def plotDT(model, feature, y_type):
    """Render the fitted decision tree to 'yuanweihua_show.png'.

    Parameters
    ----------
    model : fitted DecisionTreeClassifier to visualize.
    feature : list[str] — feature names shown at the split nodes.
    y_type : class names shown at the leaves.
    """
    dot_source = tree.export_graphviz(
        model,
        out_file=None,
        feature_names=feature,
        class_names=y_type,
        filled=True,
        rounded=True,
        special_characters=True,
    )
    graph = pydotplus.graph_from_dot_data(dot_source)
    graph.write_png('yuanweihua_show.png')
if __name__ == '__main__':
    # Use a font that has CJK glyphs so the Chinese titles render correctly.
    mpl.rcParams["font.sans-serif"] = ['simHei']
    mpl.rcParams['axes.unicode_minus'] = False
    feature = ['sepal_length', 'speal_width']
    # Load the data (only the first two features are kept).
    X, y, y_type = loaddata()
    # 70/30 train/test split with a fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=0)
    # Gini-impurity tree, capped at depth 6 to limit overfitting.
    model = DecisionTreeClassifier(criterion='gini', max_depth=6)
    model.fit(X_train, y_train)
    # Export the fitted tree as a PNG.
    plotDT(model, feature, y_type)
    # Held-out accuracy.
    y_pred = model.predict(X_test)
    # Argument order normalized to (y_true, y_pred); the metric is symmetric
    # but this matches the scikit-learn signature.
    print('准确率=', accuracy_score(y_test, y_pred))

    # ---- Decision-boundary plot over a 500x500 grid ----
    N, M = 500, 500
    x1_min, x1_max = min(X.iloc[:, 0]) - 0.5, max(X.iloc[:, 0]) + 0.5
    x2_min, x2_max = min(X.iloc[:, 1]) - 0.5, max(X.iloc[:, 1]) + 0.5
    t1 = np.linspace(x1_min, x1_max, N)
    t2 = np.linspace(x2_min, x2_max, M)
    # Grid sample points; x1.flat / x2.flat iterate the mesh element-wise.
    x1, x2 = np.meshgrid(t1, t2)
    x_show = np.stack((x1.flat, x2.flat), axis=1)  # shape (N*M, 2)
    # Light colors for the background, dark ones for the sample points.
    cm_light = mpl.colors.ListedColormap(['#A0FFA0', '#FFA0A0', '#A0A0FF'])
    cm_dark = mpl.colors.ListedColormap(["g", 'r', 'b'])
    # Predict every grid point, then reshape back to the mesh layout.
    y_show_pred = model.predict(x_show).reshape(x1.shape)
    fig = plt.figure(facecolor='w')
    fig.subplots()
    # Pseudo-color background shows the predicted class of each grid cell,
    # which makes the decision boundary visible.
    plt.pcolormesh(x1, x2, y_show_pred, cmap=cm_light)
    # Test samples as large stars on top of all samples as small dots.
    plt.scatter(X_test.iloc[:, 0], X_test.iloc[:, 1], c=y_test.ravel(), edgecolors='k',
                s=150, zorder=10, cmap=cm_dark, marker='*')
    plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y.ravel(), edgecolors='k', s=40, cmap=cm_dark)
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)
    plt.xticks(np.arange(x1_min, x1_max, 0.2))
    plt.yticks(np.arange(x2_min, x2_max, 0.2))
    plt.xlabel('sepal_length', fontsize=15)
    plt.ylabel('speal_width', fontsize=15)
    plt.title('鸢尾花数据决策树分类', fontsize=18)
    # BUGFIX: grid(b=True) raises TypeError on Matplotlib >= 3.9 (the 'b'
    # keyword was removed); the positional form works on every version.
    plt.grid(True, ls=":")
    plt.show()

    # ---- Error rate vs. tree depth (over/under-fitting analysis) ----
    depth = np.arange(2, 15, 1)
    err_list = []
    for d in depth:
        model = DecisionTreeClassifier(criterion='gini', max_depth=d)
        model.fit(X_train, y_train)
        y_hat = model.predict(X_test)
        err_list.append(1 - accuracy_score(y_test, y_hat))
    plt.figure(facecolor='w')
    # BUGFIX: the curve now carries a label; legend() with no labeled artists
    # warns and renders an empty legend.
    plt.plot(depth, err_list, 'ro-', lw=2, label='error rate')
    plt.xlabel('决策树深度', fontsize=15)
    plt.ylabel('错误率', fontsize=15)
    plt.title('决策树深度与过拟合', fontsize=18)
    plt.legend(loc='upper right')
    plt.grid(True)
    plt.show()
准确率= 0.7777777777777778
决策树可视化结果(yuanweihua_show.png)如下图所示:
二、决策树回归—决策树用于拟合
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.tree import DecisionTreeRegressor
# 构造数据
def makedata():
    """Generate a noisy sine curve for 1-D regression.

    Returns
    -------
    X : ndarray of shape (500,) — sorted inputs drawn uniformly from [-3, 3).
    y : ndarray of shape (500,) — sin(X) plus Gaussian noise (sigma = 0.05).
    X_test : ndarray of shape (50, 1) — evenly spaced evaluation points.
    """
    n_samples = 500
    X = np.random.rand(n_samples) * 6 - 3  # uniform on [-3, 3)
    X.sort()
    noise = np.random.randn(n_samples) * 0.05  # Gaussian noise
    y = np.sin(X) + noise
    X_test = np.linspace(-3, 3, 50).reshape(-1, 1)
    return X, y, X_test
if __name__ == '__main__':
    X, y, X_test = makedata()
    # BUGFIX: criterion='mse' was renamed 'squared_error' in scikit-learn 1.0
    # and removed in 1.2; 'mse' now raises InvalidParameterError.
    model = DecisionTreeRegressor(criterion='squared_error', max_depth=9)
    # The regressor expects a 2-D feature matrix, hence the reshape.
    model.fit(X.reshape(-1, 1), y)
    y_test_hat = model.predict(X_test)
    # Scatter of the noisy samples vs. the tree's piecewise-constant fit.
    plt.figure(facecolor='w')
    plt.plot(X, y, 'r*', ms=5, lw=3, label='Actual')  # fixed 'Accual' typo
    plt.plot(X_test, y_test_hat, 'g-', lw=2, label='Predict')
    plt.legend(loc='upper right')
    plt.show()
    # ---- Effect of tree depth: deeper trees start fitting the noise ----
    depth = np.arange(1, 10, 2)
    colors = 'rgbmy'  # one color per depth
    for d, c in zip(depth, colors):
        model.set_params(max_depth=d)
        model.fit(X.reshape(-1, 1), y)
        y_hat = model.predict(X_test)
        plt.plot(X_test, y_hat, '-', color=c, lw=2, label='Depth=%d' % d)
    plt.legend(loc='best')
    # BUGFIX: grid(b=True) raises TypeError on Matplotlib >= 3.9.
    plt.grid(True)
    plt.show()
三、多输出决策树回归
import warnings
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
# 构造数据
def makedata():
    """Build a dataset with one input and two regression targets.

    The two targets trace a parametric heart curve with additive noise.

    Returns
    -------
    X : ndarray of shape (500, 1) — sorted inputs drawn uniformly from [-4, 4).
    y : ndarray of shape (500, 2) — the two noisy target coordinates.
    """
    n_samples = 500
    X = np.random.rand(n_samples) * 8 - 4  # uniform on [-4, 4)
    X.sort()
    first = 16 * np.sin(X) ** 3 + np.random.randn(n_samples)
    second = (13 * np.cos(X) ** 3 - 5 * np.cos(2 * X) - 2 * np.cos(3 * X)
              - np.cos(4 * X) + 0.1 * np.random.randn(n_samples))
    y = np.column_stack((first, second))
    return X.reshape(-1, 1), y
if __name__ == '__main__':
    # Silence library warnings for a clean demo output.
    warnings.filterwarnings(action='ignore')
    # Print floats without scientific notation.
    np.set_printoptions(suppress=True)
    # Build the multi-output dataset: X is (500, 1), y is (500, 2).
    X, y = makedata()
    deep = 8
    # BUGFIX: criterion='mse' was renamed 'squared_error' in scikit-learn 1.0
    # and removed in 1.2; 'mse' now raises InvalidParameterError.
    model = DecisionTreeRegressor(criterion='squared_error', max_depth=deep)
    # A single tree fits both outputs jointly (multi-output regression).
    model.fit(X, y)
    X_test = np.linspace(-4, 4, num=1000).reshape(-1, 1)
    y_pred = model.predict(X_test)  # shape (1000, 2)
    # Plot in target space: actual (y1, y2) pairs vs. predicted pairs.
    plt.figure(facecolor='w')
    plt.scatter(y[:, 0], y[:, 1], c='r', marker='s', s=60, label='Actual')
    plt.scatter(y_pred[:, 0], y_pred[:, 1], c='g', marker='o', edgecolors='g',
                s=30, label='Depth=%d' % deep, alpha=0.7)
    plt.legend(loc='best')
    plt.xlabel('y1', fontsize=15)
    plt.ylabel('y2', fontsize=15)
    # BUGFIX: grid(b=True) raises TypeError on Matplotlib >= 3.9.
    plt.grid(True)
    plt.show()