李航《统计学习方法》第2版 第5章 编程生成决策树(ID3算法、C4.5算法)、调用sklearn模块实现(CART算法)(课本77页例题5.3)


ID3生成决策树


import numpy as np
from math import log

def loadData():
    """Build the loan-application sample table used by the ID3 demo.

    Returns:
        A ``(datasets, labels)`` tuple. ``datasets`` is a list of 15 rows,
        each a list of four feature values followed by the class value.
        ``labels`` is the list of the five column names, in column order.
    """
    # One sample per string; the five columns are separated by single spaces.
    raw_rows = (
        '青年 否 否 一般 否',
        '青年 否 否 好 否',
        '青年 是 否 好 是',
        '青年 是 是 一般 是',
        '青年 否 否 一般 否',
        '中年 否 否 一般 否',
        '中年 否 否 好 否',
        '中年 是 是 好 是',
        '中年 否 是 非常好 是',
        '中年 否 是 非常好 是',
        '老年 否 是 非常好 是',
        '老年 否 是 好 是',
        '老年 是 否 好 是',
        '老年 是 否 非常好 是',
        '老年 否 否 一般 否',
    )
    datasets = [row.split(' ') for row in raw_rows]
    # Name of every column, the class column included.
    labels = ['年龄', '有工作', '有自己的房子', '信贷情况', '类别']
    return datasets, labels

def calc_entropy(datasets):
    """Return the base-2 empirical entropy of the class column.

    The class of a row is its last element.

    Args:
        datasets: sequence of rows (indexable sequences).

    Returns:
        The entropy in bits; ``0`` when ``datasets`` is empty.
    """
    total = len(datasets)
    class_counts = {}
    for row in datasets:
        class_counts[row[-1]] = class_counts.get(row[-1], 0) + 1

    terms = []
    for count in class_counts.values():
        prob = count / total
        terms.append(prob * log(prob, 2))
    return -sum(terms)

def calc_conditional_entropy(datasets, index=0):
    """Return the class entropy conditioned on the feature in column ``index``.

    Rows are grouped by their value in column ``index``; the result is the
    sum of each group's ``calc_entropy`` weighted by the group's share of
    the rows.

    Args:
        datasets: sequence of rows (indexable sequences).
        index: column whose values define the groups.

    Returns:
        The weighted sum of the per-group entropies (``0`` for no rows).
    """
    groups = {}
    for row in datasets:
        groups.setdefault(row[index], []).append(row)

    sample_total = len(datasets)
    condEntropy = 0
    for subset in groups.values():
        condEntropy += (len(subset) / sample_total) * calc_entropy(subset)
    return condEntropy

def info_gain(entropy, condEntropy):
    """Return the information gain: ``entropy`` minus ``condEntropy``."""
    gain = entropy - condEntropy
    return gain

def info_gain_train_childTree(datasets, labels):
    """Choose the feature with the largest information gain for one node.

    Every column except the last one is treated as a feature. The gain of
    each feature is printed, followed by the name of the winner.

    Args:
        datasets: non-empty sequence of rows; the last column is the class.
        labels: column names, indexed like the row columns.

    Returns:
        The ``(column_index, gain)`` tuple of the selected feature. On ties
        the feature with the smallest column index is returned.
    """
    base_entropy = calc_entropy(datasets)
    feature_count = len(datasets[0]) - 1  # the last column holds the class
    scored = []
    for col in range(feature_count):
        cond = calc_conditional_entropy(datasets, col)
        gain = info_gain(base_entropy, cond)
        scored.append((col, gain))
        print("特征({})的信息增益为{:.3f}".format(labels[col], gain))
    best_feature = max(scored, key=lambda pair: pair[1])
    chosen_name = labels[best_feature[0]]
    print("特征({})的信息增益最大,选择为当前节点特征".format(chosen_name))
    return best_feature

def info_gain_train(datasets, labels):
    """Recursively grow an ID3 tree, printing every decision as it is made.

    At each node the rows are split on the feature selected by
    ``info_gain_train_childTree`` and the function recurses into every
    resulting subset. Nothing is returned; the tree is only printed.

    Recursion stops when:
      * ``datasets`` is empty (nothing is printed),
      * all rows share one class (that class is printed), or
      * the rows cannot be separated any further, either because there is
        no feature column or because the selected feature has a single
        value on these rows. The majority class is then printed.

    Args:
        datasets: sequence of rows; the last column is the class.
        labels: column names, indexed like the row columns.
    """
    if not datasets:
        # No rows: there is no first row to read the column count from.
        return

    label_count = {}
    for dataset in datasets:
        label_count[dataset[-1]] = label_count.get(dataset[-1], 0) + 1
    if len(label_count) == 1:
        key = next(iter(label_count))
        print("此时类别均为{}".format(key))
        return

    # Fallback leaf used whenever the rows cannot be split any further.
    majority = max(label_count, key=label_count.get)
    majority_message = "样本无法继续划分,此时类别取多数类{}".format(majority)

    if len(datasets[0]) < 2:
        # Only the class column is present, so there is no feature to test.
        print(majority_message)
        return

    best_feature = info_gain_train_childTree(datasets, labels)

    feature_data = {}
    for dataset in datasets:
        feature_data.setdefault(dataset[best_feature[0]], []).append(dataset)

    if len(feature_data) == 1:
        # The split would hand the very same rows to the recursive call,
        # which would never terminate; emit a majority-class leaf instead.
        print(majority_message)
        return

    for value, subset in feature_data.items():
        print("当{}为{}".format(labels[best_feature[0]], value))
        info_gain_train(subset, labels)


# Script entry point: load the sample table and print the ID3 tree construction.
if __name__ == "__main__":
    datasets, labels = loadData()
    info_gain_train(datasets, labels)


C4.5生成决策树


import numpy as np
from math import log

def loadData():
    """Return the loan-application samples and column names for the C4.5 demo.

    Returns:
        A ``(datasets, labels)`` tuple. ``datasets`` holds 15 rows, each a
        list of four feature values followed by the class value. ``labels``
        lists the five column names in column order.
    """
    # Each string is one sample; columns are separated by single spaces.
    encoded_rows = (
        '青年 否 否 一般 否',
        '青年 否 否 好 否',
        '青年 是 否 好 是',
        '青年 是 是 一般 是',
        '青年 否 否 一般 否',
        '中年 否 否 一般 否',
        '中年 否 否 好 否',
        '中年 是 是 好 是',
        '中年 否 是 非常好 是',
        '中年 否 是 非常好 是',
        '老年 否 是 非常好 是',
        '老年 否 是 好 是',
        '老年 是 否 好 是',
        '老年 是 否 非常好 是',
        '老年 否 否 一般 否',
    )
    datasets = [encoded.split(' ') for encoded in encoded_rows]
    # Name of every column, the class column included.
    labels = ['年龄', '有工作', '有自己的房子', '信贷情况', '类别']
    return datasets, labels

def calc_entropy(datasets, index=-1):
    """Return the base-2 entropy of the values found in column ``index``.

    With the default ``index=-1`` the last column is used, which is where
    the rows keep their class.

    Args:
        datasets: sequence of rows (indexable sequences).
        index: column whose value distribution is measured.

    Returns:
        The entropy in bits; ``0`` when ``datasets`` is empty.
    """
    total = len(datasets)
    value_counts = {}
    for row in datasets:
        value_counts[row[index]] = value_counts.get(row[index], 0) + 1

    terms = []
    for count in value_counts.values():
        prob = count / total
        terms.append(prob * log(prob, 2))
    return -sum(terms)

def calc_conditional_entropy(datasets, index=0):
    """Return the class entropy conditioned on the feature in column ``index``.

    Rows are bucketed by their value in column ``index``; each bucket's
    ``calc_entropy`` is weighted by the bucket's share of the rows and the
    weighted values are added up.

    Args:
        datasets: sequence of rows (indexable sequences).
        index: column whose values define the buckets.

    Returns:
        The weighted sum of the per-bucket entropies (``0`` for no rows).
    """
    buckets = {}
    for row in datasets:
        buckets.setdefault(row[index], []).append(row)

    row_total = len(datasets)
    condEntropy = 0
    for bucket in buckets.values():
        condEntropy += (len(bucket) / row_total) * calc_entropy(bucket)
    return condEntropy

def info_gain(entropy, condEntropy):
    """Return the difference ``entropy - condEntropy`` (the information gain)."""
    difference = entropy - condEntropy
    return difference

def info_gain_ratio(c_info_gain, c_entropy):
    """Return the information gain ratio ``c_info_gain / c_entropy``.

    Args:
        c_info_gain: information gain of a feature.
        c_entropy: entropy of that feature's own value distribution.

    Returns:
        ``0`` when the gain is zero or when ``c_entropy`` is zero (the
        ratio is undefined there, and dividing would raise
        ``ZeroDivisionError``); otherwise the quotient.
    """
    if c_info_gain == 0 or c_entropy == 0:
        return 0
    return c_info_gain / c_entropy

def info_gain_train_childTree(datasets, labels):
    """Choose the feature with the largest information gain ratio for one node.

    Every column except the last one is treated as a feature. The gain
    ratio of each feature is printed, followed by the name of the winner.

    Args:
        datasets: non-empty sequence of rows; the last column is the class.
        labels: column names, indexed like the row columns.

    Returns:
        The ``(column_index, gain_ratio)`` tuple of the selected feature.
        On ties the feature with the smallest column index is returned.
    """
    base_entropy = calc_entropy(datasets)
    feature_count = len(datasets[0]) - 1  # the last column holds the class
    ranked = []
    for col in range(feature_count):
        cond = calc_conditional_entropy(datasets, col)
        gain = info_gain(base_entropy, cond)
        # Entropy of the feature column itself, used as the ratio's divisor.
        split_entropy = calc_entropy(datasets, col)
        ratio = info_gain_ratio(gain, split_entropy)
        ranked.append((col, ratio))
        print("特征({})的信息增益比为{:.3f}".format(labels[col], ratio))
    best_feature = max(ranked, key=lambda item: item[1])
    chosen_name = labels[best_feature[0]]
    print("特征({})的信息增益比最大,选择为当前节点特征".format(chosen_name))
    return best_feature

def info_gain_train(datasets, labels):
    """Recursively grow a C4.5 tree, printing every decision as it is made.

    At each node the rows are split on the feature selected by
    ``info_gain_train_childTree`` (largest information gain ratio) and the
    function recurses into every resulting subset. Nothing is returned; the
    tree is only printed.

    Recursion stops when:
      * ``datasets`` is empty (nothing is printed),
      * all rows share one class (that class is printed), or
      * the rows cannot be separated any further, either because there is
        no feature column or because the selected feature has a single
        value on these rows. The majority class is then printed.

    Args:
        datasets: sequence of rows; the last column is the class.
        labels: column names, indexed like the row columns.
    """
    if not datasets:
        # No rows: there is no first row to read the column count from.
        return

    label_count = {}
    for dataset in datasets:
        label_count[dataset[-1]] = label_count.get(dataset[-1], 0) + 1
    if len(label_count) == 1:
        key = next(iter(label_count))
        print("此时类别均为{}".format(key))
        return

    # Fallback leaf used whenever the rows cannot be split any further.
    majority = max(label_count, key=label_count.get)
    majority_message = "样本无法继续划分,此时类别取多数类{}".format(majority)

    if len(datasets[0]) < 2:
        # Only the class column is present, so there is no feature to test.
        print(majority_message)
        return

    best_feature = info_gain_train_childTree(datasets, labels)

    feature_data = {}
    for dataset in datasets:
        feature_data.setdefault(dataset[best_feature[0]], []).append(dataset)

    if len(feature_data) == 1:
        # The split would hand the very same rows to the recursive call,
        # which would never terminate; emit a majority-class leaf instead.
        print(majority_message)
        return

    for value, subset in feature_data.items():
        print("当{}为{}".format(labels[best_feature[0]], value))
        info_gain_train(subset, labels)


# Script entry point: load the sample table and print the C4.5 tree construction.
if __name__ == "__main__":
    datasets, labels = loadData()
    info_gain_train(datasets, labels)


调用sklearn模块(CART算法)生成决策树


"""
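    Decision tree for the textbook example, implemented with scikit-learn.
    Algorithm: CART (sklearn's DecisionTreeClassifier; no pruning is configured)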
"""

from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
import numpy as np
import pandas as pd
import time


def main():
    """Fit a scikit-learn decision tree on the loan table and classify one sample.

    The categorical strings are turned into integers with ``LabelEncoder``
    (one encoder shared by all feature columns, a second one for the class),
    a ``DecisionTreeClassifier`` is fitted, and the prediction for one new
    applicant is printed together with the elapsed time.
    """
    # perf_counter is a monotonic clock intended for measuring intervals.
    start = time.perf_counter()
    # Raw sample data
    features = ["age", "work", "house", "credit"]
    X_train = pd.DataFrame([
        ["青年", "否", "否", "一般"],
        ["青年", "否", "否", "好"],
        ["青年", "是", "否", "好"],
        ["青年", "是", "是", "一般"],
        ["青年", "否", "否", "一般"],
        ["中年", "否", "否", "一般"],
        ["中年", "否", "否", "好"],
        ["中年", "是", "是", "好"],
        ["中年", "否", "是", "非常好"],
        ["中年", "否", "是", "非常好"],
        ["老年", "否", "是", "非常好"],
        ["老年", "否", "是", "好"],
        ["老年", "是", "否", "好"],
        ["老年", "是", "否", "非常好"],
        ["老年", "否", "否", "一般"],
    ])
    y_train = pd.DataFrame(["否", "否", "是", "是", "否", "否", "否", "是", "是", "是", "是", "是", "是", "是", "否"])

    # Preprocessing: fit one encoder on every distinct string of the feature
    # table, then apply it column by column.
    le_x = preprocessing.LabelEncoder()
    le_x.fit(np.unique(X_train))
    X_train = X_train.apply(le_x.transform)
    print(X_train)
    le_y = preprocessing.LabelEncoder()
    le_y.fit(np.unique(y_train))
    y_train = y_train.apply(le_y.transform)

    # Build the model. A fixed random_state makes the choice between equally
    # good splits reproducible from run to run.
    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(X_train, y_train)

    # Predict with the fitted model
    X_new = pd.DataFrame([["青年", "否", "是", "一般"]])
    X = X_new.apply(le_x.transform)
    y_predict = clf.predict(X)

    # Report the result: one {feature name: raw value} dict per feature.
    X_show = [{name: value} for name, value in zip(features, X_new.values[0])]
    print("{0}被分类为:{1}".format(X_show, le_y.inverse_transform(y_predict)))
    print("time:{:.4f}s".format(time.perf_counter() - start))

# Script entry point: run the scikit-learn demo.
if __name__=="__main__":
    main()

参考链接:


https://blog.csdn.net/qq_41562704/article/details/98590728

猜你喜欢

转载自blog.csdn.net/weixin_43646592/article/details/109663358