3.3 Decision Trees in Practice: Classification and Regression

Introduction

The experiments in this section use the iris dataset.
Iris dataset download — extraction code: 1234
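
If the download link is unavailable, the same dataset ships with scikit-learn and can be loaded directly. A minimal sketch (not part of the original post, assuming scikit-learn >= 0.23 for as_frame):

from sklearn.datasets import load_iris

iris = load_iris(as_frame=True)       # bundled copy of the iris data
X = iris.data.iloc[:, :2]             # first two features, matching the code below
y = iris.target.values                # integer class codes 0, 1, 2
y_type = list(iris.target_names)      # ['setosa', 'versicolor', 'virginica']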

1. Decision Tree Classification

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import pydotplus
from sklearn.metrics import accuracy_score


# Load the iris data
def loaddata():
    columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'type']
    data = pd.read_csv('data/iris.data', header=None, names=columns)
    X = data.iloc[:, :2]  # keep only the first two features for this analysis
    # Encode the string labels as integer codes, keeping the class names for the tree plot
    cats = pd.Categorical(data.iloc[:, 4])
    y = cats.codes
    y_type = list(cats.categories)  # unique class names, aligned with the integer codes
    return X, y, y_type


# Visualize the fitted decision tree with Graphviz
def plotDT(model, feature, y_type):
    dot_data = tree.export_graphviz(model, out_file=None, feature_names=feature,
                                    class_names=y_type, filled=True,
                                    rounded=True, special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_png('yuanweihua_show.png')


if __name__ == '__main__':
    # Use a CJK-capable font so that any Chinese text in the plots renders correctly
    mpl.rcParams["font.sans-serif"] = ['SimHei']
    mpl.rcParams['axes.unicode_minus'] = False

    feature = ['sepal_length', 'sepal_width']
    # Load the data
    X, y, y_type = loaddata()
    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=0)
    # Fit the model
    model = DecisionTreeClassifier(criterion='gini', max_depth=6)
    model.fit(X_train, y_train)
    # Export the tree as an image
    plotDT(model, feature, y_type)
    # Predict on the test set
    y_pred = model.predict(X_test)
    # Accuracy
    print('Accuracy =', accuracy_score(y_test, y_pred))

    # Plot the decision boundary
    N, M = 500, 500
    x1_min, x1_max = min(X.iloc[:, 0]) - 0.5, max(X.iloc[:, 0]) + 0.5
    x2_min, x2_max = min(X.iloc[:, 1]) - 0.5, max(X.iloc[:, 1]) + 0.5
    t1 = np.linspace(x1_min, x1_max, N)
    t2 = np.linspace(x2_min, x2_max, M)
    # Build a grid of sample points covering the feature space
    x1, x2 = np.meshgrid(t1, t2)
    # Flatten the grid into an (N*M, 2) array of query points
    # x1.flat is an iterator over the grid values
    x_show = np.stack((x1.flat, x2.flat), axis=1)

    # Colormaps: light for the background regions, dark for the points
    cm_light = mpl.colors.ListedColormap(['#A0FFA0', '#FFA0A0', '#A0A0FF'])
    cm_dark = mpl.colors.ListedColormap(["g", 'r', 'b'])

    # Predict the class of every grid point
    y_show_pred = model.predict(x_show)
    # Reshape the predictions back to the grid shape
    y_show_pred = y_show_pred.reshape(x1.shape)

    fig = plt.figure(facecolor='w')
    fig.subplots()
    # Pseudo-color plot of the predicted classes, which shows the decision boundary
    plt.pcolormesh(x1, x2, y_show_pred, cmap=cm_light, shading='auto')
    # Test points
    plt.scatter(X_test.iloc[:, 0], X_test.iloc[:, 1], c=y_test.ravel(), edgecolors='k', s=150, zorder=10, cmap=cm_dark,
                marker='*')
    # All samples
    plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y.ravel(), edgecolors='k', s=40, cmap=cm_dark)
    # Axis limits
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)
    # Tick spacing
    plt.xticks(np.arange(x1_min, x1_max, 0.2))
    plt.yticks(np.arange(x2_min, x2_max, 0.2))
    # Axis labels
    plt.xlabel('sepal_length', fontsize=15)
    plt.ylabel('sepal_width', fontsize=15)
    plt.title('Decision tree classification of the iris data', fontsize=18)
    plt.grid(True, ls=":")
    plt.show()

    # Effect of tree depth on test accuracy
    depth = np.arange(2, 15, 1)
    err_list = []
    for d in depth:
        model = DecisionTreeClassifier(criterion='gini', max_depth=d)
        model.fit(X_train, y_train)
        # Predict on the test set
        y_hat = model.predict(X_test)
        err_list.append(1 - accuracy_score(y_hat, y_test))

    # Plot the error rate against depth
    plt.figure(facecolor='w')
    plt.plot(depth, err_list, 'ro-', lw=2, label='Test error rate')
    plt.xlabel('Tree depth', fontsize=15)
    plt.ylabel('Error rate', fontsize=15)
    plt.title('Tree depth and overfitting', fontsize=18)
    plt.legend(loc='upper right')
    plt.grid(True)
    plt.show()

Accuracy = 0.7777777777777778
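
With only two of the four features and a fixed max_depth=6, the accuracy stays modest. GridSearchCV is imported at the top of the script but never used; the following is a minimal sketch of how it could tune the depth (the grid and cv=5 are illustrative choices, and X_train, X_test come from the split above):

param_grid = {'max_depth': list(range(2, 15))}
grid = GridSearchCV(DecisionTreeClassifier(criterion='gini'), param_grid, cv=5)
grid.fit(X_train, y_train)
print('best max_depth =', grid.best_params_['max_depth'])
print('test accuracy  =', grid.score(X_test, y_test))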

Visualizations: the exported decision tree (yuanweihua_show.png), the decision boundary with the training and test points, and the test error rate as a function of tree depth.
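
The plotDT helper above needs pydotplus and a local Graphviz installation. If neither is available, scikit-learn's built-in tree.plot_tree draws a comparable figure with matplotlib alone; a minimal sketch, reusing model, feature and y_type from the script above:

plt.figure(figsize=(12, 8))
tree.plot_tree(model, feature_names=feature, class_names=y_type,
               filled=True, rounded=True)
plt.show()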

2. Decision Tree Regression: Fitting a Curve with a Tree

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.tree import DecisionTreeRegressor

# Generate synthetic data: a noisy sine curve
def makedata():
    N = 500
    X = np.random.rand(N) * 6 - 3
    X.sort()
    y = np.sin(X) + np.random.randn(N) * 0.05  # Gaussian noise
    X_test = np.linspace(-3, 3, 50).reshape(-1, 1)
    return X, y, X_test


if __name__ == '__main__':
    X, y, X_test = makedata()

    model = DecisionTreeRegressor(criterion='squared_error', max_depth=9)  # 'mse' before scikit-learn 1.0
    model.fit(X.reshape(-1, 1), y)
    # Predict on the evenly spaced test grid
    y_test_hat = model.predict(X_test)

    # Plot the fit
    plt.figure(facecolor='w')
    plt.plot(X, y, 'r*', ms=5, lw=3, label='Actual')
    plt.plot(X_test, y_test_hat, 'g-', lw=2, label='Predict')
    plt.legend(loc='upper right')
    plt.show()

    # Effect of tree depth on the fit
    depth = np.arange(1, 10, 2)
    cm_dark = 'rgbmy'

    for d, c in zip(depth, cm_dark):
        model.set_params(max_depth=d)
        model.fit(X.reshape(-1, 1), y)
        y_hat = model.predict(X_test)
        plt.plot(X_test, y_hat, '-', color=c, lw=2, label='Depth=%d' % d)
    plt.legend(loc='best')
    plt.grid(True)
    plt.show()

Figures: the depth-9 regression fit to the noisy sine data, and the fitted curves for depths 1, 3, 5, 7 and 9.
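
The depth comparison shows shallow trees underfitting and deep trees chasing the noise. One way to quantify this is to score each depth on a freshly generated noisy sample; a minimal sketch (the evaluation sample and mean_squared_error are additions, not part of the original post, and X, y come from makedata() above):

from sklearn.metrics import mean_squared_error

# Fresh noisy sample from the same sine curve, used as a test set
X_eval = np.random.rand(200) * 6 - 3
y_eval = np.sin(X_eval) + np.random.randn(200) * 0.05

for d in (1, 3, 5, 7, 9):
    reg = DecisionTreeRegressor(criterion='squared_error', max_depth=d)
    reg.fit(X.reshape(-1, 1), y)
    mse = mean_squared_error(y_eval, reg.predict(X_eval.reshape(-1, 1)))
    print('depth=%d  test MSE=%.4f' % (d, mse))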

3. Multi-output Decision Tree Regression

import warnings
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor


# Generate synthetic data with two target outputs per sample
def makedata():
    N = 500
    X = np.random.rand(N) * 8 - 4
    X.sort()
    y1 = 16 * np.sin(X) ** 3 + np.random.randn(N)
    y2 = 13 * np.cos(X) ** 3 - 5 * np.cos(2 * X) - 2 * np.cos(3 * X) - np.cos(4 * X) + 0.1 * np.random.randn(N)
    y = np.vstack((y1, y2)).T
    X = X.reshape(-1, 1)
    return X, y


if __name__ == '__main__':
    # Suppress warnings
    warnings.filterwarnings(action='ignore')
    # Print floats without scientific notation
    np.set_printoptions(suppress=True)
    # Generate the data
    X, y = makedata()
    # Fit a single tree on the two-column target
    deep = 8
    model = DecisionTreeRegressor(criterion='squared_error', max_depth=deep)  # 'mse' before scikit-learn 1.0
    model.fit(X, y)
    # Predict both outputs on an evenly spaced grid
    X_test = np.linspace(-4, 4, num=1000).reshape(-1, 1)
    y_pred = model.predict(X_test)

    # Plot the predicted (y1, y2) pairs against the actual ones
    plt.figure(facecolor='w')
    plt.scatter(y[:, 0], y[:, 1], c='r', marker='s', s=60, label='Actual')
    plt.scatter(y_pred[:, 0], y_pred[:, 1], c='g', marker='o', edgecolors='g', s=30, label='Depth=%d' % deep, alpha=0.7)
    plt.legend(loc='best')
    plt.xlabel('y1', fontsize=15)
    plt.ylabel('y2', fontsize=15)
    plt.grid(True)
    plt.show()

Figure: the (y1, y2) pairs predicted by the depth-8 multi-output tree plotted over the actual noisy points.
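
The single tree above is fitted on the two-column y, so both targets share one split structure. For comparison, one could fit two independent single-output trees, one per column; a minimal sketch (not part of the original post, reusing X, y, X_test and deep from the script above):

m1 = DecisionTreeRegressor(criterion='squared_error', max_depth=deep).fit(X, y[:, 0])
m2 = DecisionTreeRegressor(criterion='squared_error', max_depth=deep).fit(X, y[:, 1])
y_pred_sep = np.column_stack((m1.predict(X_test), m2.predict(X_test)))

plt.figure(facecolor='w')
plt.scatter(y[:, 0], y[:, 1], c='r', marker='s', s=60, label='Actual')
plt.scatter(y_pred_sep[:, 0], y_pred_sep[:, 1], c='b', marker='o', s=30,
            label='Two single-output trees', alpha=0.7)
plt.legend(loc='best')
plt.show()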


Source: blog.csdn.net/weixin_46649052/article/details/112449545