机器学习之决策树算法(python3.7版本)

之前在博客上看到的决策树算法代码遇到了一些坑,自己一个个填了,并附上 jueceshu.csv 数据文件。

分析的问题用一个图片表示:

代码:

"""Train an ID3-style decision tree (entropy criterion) on jueceshu.csv.

CSV layout: RID column, categorical feature columns, label column (last).
Features are one-hot encoded with DictVectorizer, the label is binarized,
and the fitted tree is exported to Graphviz .dot format.
"""
from sklearn.feature_extraction import DictVectorizer
import csv
from sklearn import preprocessing
from sklearn import tree
# NOTE(review): the original `from sklearn.externals.six import StringIO`
# was unused and raises ImportError on sklearn >= 0.23, so it was removed.

featureList = []  # feature dicts, one per sample
labelList = []    # class labels (last CSV column)

# Use a context manager so the file handle is always closed.
with open('jueceshu.csv', 'rt', newline='') as data:
    reader = csv.reader(data)
    headers = next(reader)  # header row: attribute names
    print(headers)
    for row in reader:
        # The last column is the label.
        labelList.append(row[-1])
        # Columns 1..n-2 are features (column 0 is the RID, skipped);
        # the header row supplies the dict keys.
        rowDict = {headers[i]: row[i] for i in range(1, len(row) - 1)}
        featureList.append(rowDict)

print(featureList)

# One-hot encode the categorical features into a numeric matrix.
vec = DictVectorizer()
dummyX = vec.fit_transform(featureList).toarray()

print('dummyX:' + str(dummyX))
# Meaning of each column of the encoded matrix.
# NOTE(review): get_feature_names() was removed in sklearn 1.2;
# switch to get_feature_names_out() when upgrading.
print(vec.get_feature_names())

print('labelList:' + str(labelList))

# Binarize the labels into a 0/1 column vector.
lb = preprocessing.LabelBinarizer()
dummyY = lb.fit_transform(labelList)
print('dummyY:' + str(dummyY))

# ID3-style classifier: entropy (information gain) as the split criterion.
clf = tree.DecisionTreeClassifier(criterion='entropy')
clf = clf.fit(dummyX, dummyY)
print('clf:' + str(clf))

# Export the fitted tree with the original feature names for readability.
with open('jueceshu.dot', 'w') as f:
    tree.export_graphviz(clf, feature_names=vec.get_feature_names(), out_file=f)

oneRowX = dummyX[0, :].reshape(1, -1)
print('oneRowX:' + str(oneRowX))

# Build a new sample to predict. Copy first: the original code aliased
# dummyX's first row, so the in-place edits below would have silently
# corrupted the training matrix.
newRowX = oneRowX.copy()
newRowX[0][0] = 0
newRowX[0][2] = 1
# (The original also called newRowX.reshape(1, -1) and discarded the
# result -- a no-op, removed.)
print('newRowX:' + str(newRowX))

# Predict the class of the new sample with the trained model.
predictedY = clf.predict(newRowX)
print('predictedY:' + str(predictedY))

如图在excel里面建立表格,保存时候改成.csv格式,并保存在.py文件相同目录下面

运行成功结果:

C:\Users\csj\AppData\Local\Programs\Python\Python37\python.exe C:/Users/csj/PycharmProjects/a/tree.py
C:\Users\csj\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\feature_extraction\text.py:17: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working
  from collections import Mapping, defaultdict
['RID', 'age', 'income', 'student', 'credit_rating', 'buys_computer']
[{'age': 'younth', 'income': 'high', 'student': 'no', 'credit_rating': 'fair'}, {'age': 'younth', 'income': 'high', 'student': 'no', 'credit_rating': 'excellent'}, {'age': 'middle-aged', 'income': 'high', 'student': 'no', 'credit_rating': 'fair'}, {'age': 'senior', 'income': 'medium', 'student': 'no', 'credit_rating': 'fair'}, {'age': 'senior', 'income': 'low', 'student': 'yes', 'credit_rating': 'fair'}, {'age': 'senior', 'income': 'low', 'student': 'yes', 'credit_rating': 'excellent'}, {'age': 'middle-aged', 'income': 'low', 'student': 'yes', 'credit_rating': 'excellent'}, {'age': 'younth', 'income': 'medium', 'student': 'no', 'credit_rating': 'fair'}, {'age': 'younth', 'income': 'low', 'student': 'yes', 'credit_rating': 'fair'}, {'age': 'senior', 'income': 'medium', 'student': 'yes', 'credit_rating': 'fair'}, {'age': 'younth', 'income': 'medium', 'student': 'yes', 'credit_rating': 'excellent'}, {'age': 'middle-aged', 'income': 'medium', 'student': 'no', 'credit_rating': 'excellent'}, {'age': 'middle-aged', 'income': 'high', 'student': 'yes', 'credit_rating': 'fair'}, {'age': 'senior', 'income': 'medium', 'student': 'no', 'credit_rating': 'excellent'}]
dummyX:[[0. 0. 1. 0. 1. 1. 0. 0. 1. 0.]
 [0. 0. 1. 1. 0. 1. 0. 0. 1. 0.]
 [1. 0. 0. 0. 1. 1. 0. 0. 1. 0.]
 [0. 1. 0. 0. 1. 0. 0. 1. 1. 0.]
 [0. 1. 0. 0. 1. 0. 1. 0. 0. 1.]
 [0. 1. 0. 1. 0. 0. 1. 0. 0. 1.]
 [1. 0. 0. 1. 0. 0. 1. 0. 0. 1.]
 [0. 0. 1. 0. 1. 0. 0. 1. 1. 0.]
 [0. 0. 1. 0. 1. 0. 1. 0. 0. 1.]
 [0. 1. 0. 0. 1. 0. 0. 1. 0. 1.]
 [0. 0. 1. 1. 0. 0. 0. 1. 0. 1.]
 [1. 0. 0. 1. 0. 0. 0. 1. 1. 0.]
 [1. 0. 0. 0. 1. 1. 0. 0. 0. 1.]
 [0. 1. 0. 1. 0. 0. 0. 1. 1. 0.]]
['age=middle-aged', 'age=senior', 'age=younth', 'credit_rating=excellent', 'credit_rating=fair', 'income=high', 'income=low', 'income=medium', 'student=no', 'student=yes']
labelList:['no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no']
dummyY:[[0]
 [0]
 [1]
 [1]
 [1]
 [0]
 [1]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]
 [0]]
clf:DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
oneRowX:[[0. 0. 1. 0. 1. 1. 0. 0. 1. 0.]]
newRowX:[[0. 0. 1. 0. 1. 1. 0. 0. 1. 0.]]
predictedY:[0]

Process finished with exit code 0
digraph Tree {
node [shape=box] ;
0 [label="age=middle-aged <= 0.5\nentropy = 0.94\nsamples = 14\nvalue = [5, 9]"] ;
1 [label="student=no <= 0.5\nentropy = 1.0\nsamples = 10\nvalue = [5, 5]"] ;
0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
2 [label="credit_rating=excellent <= 0.5\nentropy = 0.722\nsamples = 5\nvalue = [1, 4]"] ;
1 -> 2 ;
3 [label="entropy = 0.0\nsamples = 3\nvalue = [0, 3]"] ;
2 -> 3 ;
4 [label="age=younth <= 0.5\nentropy = 1.0\nsamples = 2\nvalue = [1, 1]"] ;
2 -> 4 ;
5 [label="entropy = 0.0\nsamples = 1\nvalue = [1, 0]"] ;
4 -> 5 ;
6 [label="entropy = 0.0\nsamples = 1\nvalue = [0, 1]"] ;
4 -> 6 ;
7 [label="age=younth <= 0.5\nentropy = 0.722\nsamples = 5\nvalue = [4, 1]"] ;
1 -> 7 ;
8 [label="credit_rating=fair <= 0.5\nentropy = 1.0\nsamples = 2\nvalue = [1, 1]"] ;
7 -> 8 ;
9 [label="entropy = 0.0\nsamples = 1\nvalue = [1, 0]"] ;
8 -> 9 ;
10 [label="entropy = 0.0\nsamples = 1\nvalue = [0, 1]"] ;
8 -> 10 ;
11 [label="entropy = 0.0\nsamples = 3\nvalue = [3, 0]"] ;
7 -> 11 ;
12 [label="entropy = 0.0\nsamples = 4\nvalue = [0, 4]"] ;
0 -> 12 [labeldistance=2.5, labelangle=-45, headlabel="False"] ;
}

上面为生成的.dot文件

猜你喜欢

转载自blog.csdn.net/qq_41446162/article/details/81331696