# Decision Tree (ID3) implemented in Python using pandas.
def TreeGenerate(df):
    """
    Recursively build an ID3 decision tree.

    @param df : pandas dataframe of the dataset; the last column is the class label
    @return root : the root Node of the decision tree
    """
    # NOTE(review): Node is defined elsewhere in the file; the other tree
    # consumers (Predict, TreeToGraph) read `.attr_down`, so that name is
    # used here as well — confirm it matches the Node class definition.
    newNode = Node(None, None, {})
    labelArr = df[df.columns[-1]]
    labelCount = NodeLabel(labelArr)
    if labelCount:
        # Majority class becomes this node's label.
        newNode.label = max(labelCount, key=labelCount.get)
        # Stop when the node is pure or there are no samples left.
        if len(labelCount) == 1 or len(labelArr) == 0:
            return newNode
        newNode.attr, divValue = OptArr(df)
        if divValue == 0:
            # Discrete attribute: one child per observed value; the attribute
            # column is removed before recursing (ID3 uses each attribute once).
            valueCount = ValueCount(df[newNode.attr])
            for value in valueCount:
                dfV = df[df[newNode.attr].isin([value])]
                # `drop(col, 1)` (positional axis) was removed in pandas 2.0.
                dfV = dfV.drop(columns=newNode.attr)
                newNode.attr_down[value] = TreeGenerate(dfV)
        else:
            # Continuous attribute: binary split at divValue.
            valueL = "<=%.3f" % divValue
            valueR = ">%.3f" % divValue
            # Fixed typo: the original referenced `divVakue` (NameError).
            dfVL = df[df[newNode.attr] <= divValue]
            dfVR = df[df[newNode.attr] > divValue]
            newNode.attr_down[valueL] = TreeGenerate(dfVL)
            newNode.attr_down[valueR] = TreeGenerate(dfVR)
    return newNode
def NodeLabel(labelArr):
    """
    Count how often each class label occurs.

    @param labelArr: data array of class labels
    @return labelCount: dict mapping each observed label to its count
    """
    labelCount = {}
    for item in labelArr:
        # dict.get with a default folds the membership test into one lookup
        labelCount[item] = labelCount.get(item, 0) + 1
    return labelCount
def OptArr(df):
    """
    Pick the attribute with the highest information gain.

    @param df: pandas dataframe of the dataset (first column assumed to be an
               ID column and the last the class label; both are skipped)
    @return optArr: the optimal attribute for branching (None when no
                    attribute yields positive information gain)
    @return divValue: 0 for a discrete attribute; the bisection threshold t
                      for a continuous attribute
    """
    infoGain = 0
    # Initialize the results so the function cannot raise UnboundLocalError
    # when no attribute improves on zero gain (original bug).
    optArr = None
    divValue = 0
    for attrId in df.columns[1:-1]:
        infoGainTmp, divValueTmp = InfoGain(df, attrId)
        if infoGainTmp > infoGain:
            infoGain = infoGainTmp
            optArr = attrId
            divValue = divValueTmp
    return optArr, divValue
def InfoGain(df, index):
    """
    Information gain of attribute `index` over dataframe `df`.

    @param df : the pandas dataframe of the dataset (last column = class label)
    @param index : the attribute (column) name
    @return infoGain : the information gain of this attribute
    @return divValue : 0 for a discrete attribute; the best bisection
                       threshold for a continuous (numeric) attribute
    """
    infoGain = InfoEnt(df.values[:, -1])  # entropy of the whole node
    divValue = 0
    n = len(df[index])
    # dtype.kind is 'i' (int) or 'f' (float) for numeric columns; the
    # original compared dtype to the tuple (float, int), which never
    # selects the continuous branch correctly.
    if df[index].dtype.kind in "if":
        # Continuous attribute: try every midpoint between consecutive
        # sorted values and keep the one minimizing weighted sub-entropy.
        subInfoEnt = {}
        df = df.sort_values([index], ascending=True)
        df = df.reset_index(drop=True)
        dataArr = df[index]
        labelArr = df[df.columns[-1]]
        for i in range(n - 1):
            div = (dataArr[i] + dataArr[i + 1]) / 2  # fixed typo: was `dataAr`
            # Fixed: `subInfoEnt(div) = ...` was invalid syntax; and the
            # right slice was `[i+1:-1]`, which silently dropped the last row.
            subInfoEnt[div] = ((i + 1) * InfoEnt(labelArr[0:i + 1]) / n
                               + (n - i - 1) * InfoEnt(labelArr[i + 1:]) / n)
        # Fixed typos: `sunInfoEntMax` / `subInfoGainMax` were undefined names.
        divValue, subInfoEntMin = min(subInfoEnt.items(), key=lambda x: x[1])
        infoGain -= subInfoEntMin
    else:
        # Discrete attribute: subtract the weighted entropy of each subset.
        dataArr = df[index]
        labelArr = df[df.columns[-1]]
        valueCount = ValueCount(dataArr)
        for key in valueCount:
            keyLabelArr = labelArr[dataArr == key]
            infoGain -= valueCount[key] * InfoEnt(keyLabelArr) / n
    return infoGain, divValue
def ValueCount(labelArr):
    """
    Tally the occurrences of each attribute value.

    @param labelArr: the attribute data array
    @return valueCount: dict mapping each observed value to its count
    """
    valueCount = {}
    for value in labelArr:
        if value not in valueCount:
            valueCount[value] = 0
        valueCount[value] += 1
    return valueCount
def InfoEnt(labelArr):
    """
    Shannon entropy of a class-label array.

    @param labelArr: data array of class labels
    @return ent: the class information entropy (in bits)
    """
    # math.log2 has been in the standard library since Python 3.3; the
    # original wrapped the import in try/except, printed on failure, and
    # then crashed with NameError anyway — a plain import is correct.
    from math import log2
    ent = 0
    n = len(labelArr)
    labelCount = NodeLabel(labelArr)
    for key in labelCount:
        # Hoist the probability so it is computed once per label.
        p = labelCount[key] / n
        ent -= p * log2(p)
    return ent
def Predict(root, df_sample):
    """
    Walk the decision tree and return the predicted class label for one sample.

    @param root: the tree root Node
    @param df_sample: a one-row pandas dataframe holding the sample
    @return: the predicted class label
    """
    # Fixed: the original wrote `expect ImportError` (SyntaxError) and
    # printed on failure; `re` is standard library, so import it directly.
    import re
    while root.attr is not None:
        # Numeric column -> continuous split node (see InfoGain).
        if df_sample[root.attr].dtype.kind in "if":
            div_value = 0.0
            # Recover the threshold from a child key like "<=0.381".
            for key in list(root.attr_down):
                num = re.findall(r"\d+\.?\d*", key)
                div_value = float(num[0])
                break
            if df_sample[root.attr].values[0] <= div_value:
                key = "<=%.3f" % div_value
            else:
                key = ">%.3f" % div_value
            root = root.attr_down[key]
        else:
            key = df_sample[root.attr].values[0]
            if key in root.attr_down:
                root = root.attr_down[key]
            else:
                # Unseen attribute value: fall back to this node's label.
                break
    return root.label
def DrawPng(root, out_file):
    """
    Render the decision tree to a PNG image via pydotplus/graphviz.

    @param root : the tree root node
    @param out_file: the output path of the PNG file
    """
    try:
        from pydotplus import graphviz
    except ImportError:
        # Fixed: the original printed and fell through, then crashed with
        # NameError on `graphviz.Dot()`; bail out cleanly instead.
        print("module pydotplus.graphviz not found")
        return
    g = graphviz.Dot()
    TreeToGraph(0, g, root)
    g2 = graphviz.graph_from_dot_data(g.to_string())
    g2.write_png(out_file)
def TreeToGraph(i, g, root):
    """
    Recursively add this subtree's nodes and edges to a graphviz Dot object.

    @param i: the number assigned to this node within the tree
    @param g: pydotplus.graphviz.Dot() object
    @param root: the subtree root node
    @return i: the highest node number used after adding this subtree
    @return g_node: the graphviz node id of this subtree's root
    """
    try:
        from pydotplus import graphviz
    except ImportError:
        print("module pydotplus.graphviz not found")
        return i, i  # cannot draw anything without the library
    if root.attr is None:
        # Leaf: show the class label. Fixed typo: `root,label` built a tuple
        # into the format call instead of reading `root.label`.
        g_node_label = "Node:%d\n 好瓜:%s" % (i, root.label)
    else:
        # Internal node: show the split attribute. Fixed: the original passed
        # three arguments to a two-placeholder format string (TypeError).
        g_node_label = "Node:%d\n %s" % (i, root.attr)
    g_node = i
    g.add_node(graphviz.Node(g_node, label=g_node_label))
    for value in list(root.attr_down):
        i, g_child = TreeToGraph(i + 1, g, root.attr_down[value])
        g.add_edge(graphviz.Edge(g_node, g_child, label=value))
    return i, g_node
# The tree-building functions above are complete; use them below to evaluate the data.
# --- Evaluation 1: repeated random sub-sampling (10 rounds, 50/50 split) ---
# NOTE(review): `df` must be a pandas DataFrame loaded earlier in the file
# (not visible in this chunk) whose last column is the class label — confirm.
root = TreeGenerate(df)  # tree on the full data; overwritten inside the loop
from random import sample
accuracy_scores = []
for i in range(10):
    # Draw half of the row positions (without replacement) as the train split.
    train = sample(range(len(df.index)), int(1 * len(df.index) / 2))
    df_train = df.iloc[train]
    # NOTE(review): drop() removes by index *label*; assumes a default
    # RangeIndex so positions and labels coincide — verify upstream loading.
    df_test = df.drop(train)
    root = TreeGenerate(df_train)
    pred_true = 0
    for i in df_test.index:  # NOTE(review): shadows the outer round counter `i`
        label = Predict(root, df[df.index == i])
        if label == df_test[df_test.columns[-1]][i]:
            pred_true += 1
    accuracy = pred_true / len(df_test.index)
    accuracy_scores.append(accuracy)
# --- Evaluation 2: k contiguous folds (k = 5), then print a score summary ---
n = len(df.index)
k = 5
for i in range(k):
    m = int(n / k)  # fold size; the last n % k rows are never used as a test fold
    test = []
    for j in range(i * m, i * m + m):
        test.append(j)
    df_train = df.drop(test)   # train on everything outside the fold
    df_test = df.iloc[test]    # test on the fold's rows
    root = TreeGenerate(df_train)
    pred_true = 0
    for i in df_test.index:  # NOTE(review): shadows the fold counter `i`
        label = Predict(root, df[df.index == i])
        if label == df_test[df_test.columns[-1]][i]:
            pred_true += 1
    accuracy = pred_true / len(df_test.index)
    accuracy_scores.append(accuracy)
accuracy_sum = 0
print("accuracy:", end= "")
# NOTE(review): accuracy_scores already holds 10 entries appended by the
# random-subsampling experiment above, so indices 0..k-1 printed here are
# those earlier scores, NOT this fold loop's results — likely a bug; the
# fold scores sit at indices 10..14. Confirm intent before relying on this.
for i in range(k):
    print("%.3f " % accuracy_scores[i], end="")
    accuracy_sum += accuracy_scores[i]
print("\n average accuracy {}".format(accuracy_sum / k))
# Finally, build a tree on the full dataset and render it to a PNG file.
root = TreeGenerate(df)
DrawPng(root, "decision_tree_ID3.png")