之前在博客上看到的决策树算法示例有一些坑,这里逐一修复,并附上 jueceshu.csv 数据文件。
分析的问题用一个图片表示:
代码:
from sklearn.feature_extraction import DictVectorizer
import csv
from sklearn import preprocessing
from sklearn import tree
# NOTE: the original `from sklearn.externals.six import StringIO` import was
# unused and `sklearn.externals.six` no longer exists in modern scikit-learn,
# so it has been removed.

# Read the training data. `with` guarantees the file handle is closed even if
# parsing fails (the original opened the file and never closed it).
with open('jueceshu.csv', 'rt') as data:
    reader = csv.reader(data)
    headers = next(reader)  # first row holds the column names
    print(headers)
    featureList = []  # one {attribute: value} dict per sample
    labelList = []    # class label of each sample
    for row in reader:
        # The last column is the class label.
        labelList.append(row[len(row) - 1])
        # Columns 1..n-2 are the attributes (column 0 is the RID, skipped).
        rowDict = {}
        for i in range(1, len(row) - 1):
            # headers holds the attribute names, used as dict keys.
            rowDict[headers[i]] = row[i]
        featureList.append(rowDict)
print(featureList)

# One-hot encode the categorical attributes into a numeric matrix.
vec = DictVectorizer()
dummyX = vec.fit_transform(featureList).toarray()
print('dummyX:' + str(dummyX))
# Meaning of each column of the encoded matrix.
print(vec.get_feature_names())
print('labelList:' + str(labelList))

# Binarize the class labels into a 0/1 column vector.
lb = preprocessing.LabelBinarizer()
dummyY = lb.fit_transform(labelList)
print('dummyY:' + str(dummyY))

# Build the decision tree with entropy as the split criterion (ID3-style).
clf = tree.DecisionTreeClassifier(criterion='entropy')
clf = clf.fit(dummyX, dummyY)
print('clf:' + str(clf))

# Export the fitted tree in Graphviz .dot format, with readable feature names.
with open('jueceshu.dot', 'w') as f:
    tree.export_graphviz(clf, feature_names=vec.get_feature_names(), out_file=f)

oneRowX = dummyX[0, :].reshape(1, -1)
print('oneRowX:' + str(oneRowX))
# Build a new sample to classify. Copy first: the original aliased dummyX,
# so editing newRowX silently mutated the training matrix as well.
# (The original `newRowX.reshape(1,-1)` discarded its result; dropped as a no-op.)
newRowX = oneRowX.copy()
newRowX[0][0] = 0  # age=middle-aged -> 0
newRowX[0][2] = 1  # age=younth     -> 1
print('newRowX:' + str(newRowX))
# Predict the class of the new sample with the fitted model.
predictedY = clf.predict(newRowX)
print('predictedY:' + str(predictedY))
如图在 Excel 里建立表格,保存时将格式另存为 .csv,并放在与 .py 文件相同的目录下。
运行成功结果:
C:\Users\csj\AppData\Local\Programs\Python\Python37\python.exe C:/Users/csj/PycharmProjects/a/tree.py
C:\Users\csj\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\feature_extraction\text.py:17: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working
from collections import Mapping, defaultdict
['RID', 'age', 'income', 'student', 'credit_rating', 'buys_computer']
[{'age': 'younth', 'income': 'high', 'student': 'no', 'credit_rating': 'fair'}, {'age': 'younth', 'income': 'high', 'student': 'no', 'credit_rating': 'excellent'}, {'age': 'middle-aged', 'income': 'high', 'student': 'no', 'credit_rating': 'fair'}, {'age': 'senior', 'income': 'medium', 'student': 'no', 'credit_rating': 'fair'}, {'age': 'senior', 'income': 'low', 'student': 'yes', 'credit_rating': 'fair'}, {'age': 'senior', 'income': 'low', 'student': 'yes', 'credit_rating': 'excellent'}, {'age': 'middle-aged', 'income': 'low', 'student': 'yes', 'credit_rating': 'excellent'}, {'age': 'younth', 'income': 'medium', 'student': 'no', 'credit_rating': 'fair'}, {'age': 'younth', 'income': 'low', 'student': 'yes', 'credit_rating': 'fair'}, {'age': 'senior', 'income': 'medium', 'student': 'yes', 'credit_rating': 'fair'}, {'age': 'younth', 'income': 'medium', 'student': 'yes', 'credit_rating': 'excellent'}, {'age': 'middle-aged', 'income': 'medium', 'student': 'no', 'credit_rating': 'excellent'}, {'age': 'middle-aged', 'income': 'high', 'student': 'yes', 'credit_rating': 'fair'}, {'age': 'senior', 'income': 'medium', 'student': 'no', 'credit_rating': 'excellent'}]
dummyX:[[0. 0. 1. 0. 1. 1. 0. 0. 1. 0.]
[0. 0. 1. 1. 0. 1. 0. 0. 1. 0.]
[1. 0. 0. 0. 1. 1. 0. 0. 1. 0.]
[0. 1. 0. 0. 1. 0. 0. 1. 1. 0.]
[0. 1. 0. 0. 1. 0. 1. 0. 0. 1.]
[0. 1. 0. 1. 0. 0. 1. 0. 0. 1.]
[1. 0. 0. 1. 0. 0. 1. 0. 0. 1.]
[0. 0. 1. 0. 1. 0. 0. 1. 1. 0.]
[0. 0. 1. 0. 1. 0. 1. 0. 0. 1.]
[0. 1. 0. 0. 1. 0. 0. 1. 0. 1.]
[0. 0. 1. 1. 0. 0. 0. 1. 0. 1.]
[1. 0. 0. 1. 0. 0. 0. 1. 1. 0.]
[1. 0. 0. 0. 1. 1. 0. 0. 0. 1.]
[0. 1. 0. 1. 0. 0. 0. 1. 1. 0.]]
['age=middle-aged', 'age=senior', 'age=younth', 'credit_rating=excellent', 'credit_rating=fair', 'income=high', 'income=low', 'income=medium', 'student=no', 'student=yes']
labelList:['no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no']
dummyY:[[0]
[0]
[1]
[1]
[1]
[0]
[1]
[0]
[1]
[1]
[1]
[1]
[1]
[0]]
clf:DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort=False, random_state=None,
splitter='best')
oneRowX:[[0. 0. 1. 0. 1. 1. 0. 0. 1. 0.]]
newRowX:[[0. 0. 1. 0. 1. 1. 0. 0. 1. 0.]]
predictedY:[0]
Process finished with exit code 0
digraph Tree {
node [shape=box] ;
0 [label="age=middle-aged <= 0.5\nentropy = 0.94\nsamples = 14\nvalue = [5, 9]"] ;
1 [label="student=no <= 0.5\nentropy = 1.0\nsamples = 10\nvalue = [5, 5]"] ;
0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
2 [label="credit_rating=excellent <= 0.5\nentropy = 0.722\nsamples = 5\nvalue = [1, 4]"] ;
1 -> 2 ;
3 [label="entropy = 0.0\nsamples = 3\nvalue = [0, 3]"] ;
2 -> 3 ;
4 [label="age=younth <= 0.5\nentropy = 1.0\nsamples = 2\nvalue = [1, 1]"] ;
2 -> 4 ;
5 [label="entropy = 0.0\nsamples = 1\nvalue = [1, 0]"] ;
4 -> 5 ;
6 [label="entropy = 0.0\nsamples = 1\nvalue = [0, 1]"] ;
4 -> 6 ;
7 [label="age=younth <= 0.5\nentropy = 0.722\nsamples = 5\nvalue = [4, 1]"] ;
1 -> 7 ;
8 [label="credit_rating=fair <= 0.5\nentropy = 1.0\nsamples = 2\nvalue = [1, 1]"] ;
7 -> 8 ;
9 [label="entropy = 0.0\nsamples = 1\nvalue = [1, 0]"] ;
8 -> 9 ;
10 [label="entropy = 0.0\nsamples = 1\nvalue = [0, 1]"] ;
8 -> 10 ;
11 [label="entropy = 0.0\nsamples = 3\nvalue = [3, 0]"] ;
7 -> 11 ;
12 [label="entropy = 0.0\nsamples = 4\nvalue = [0, 4]"] ;
0 -> 12 [labeldistance=2.5, labelangle=-45, headlabel="False"] ;
}
上面为生成的.dot文件