# Decision tree: ID3 algorithm implemented in Python

#!/usr/bin/python
#-*-coding:utf-8 -*-
from math import log
def createDataSET():
    """Return the toy training set and its feature names.

    Each row is [no surfacing, flippers, class label].

    :return: (samples, feature_names)
    """
    samples = [
        [1, 1, "yes"],
        [1, 1, "yes"],
        [1, 0, "no"],
        [0, 1, "no"],
        [0, 1, "no"],
    ]
    feature_names = ["no surfacing", "flippers"]
    return samples, feature_names

# Pick the best feature to split on under the current data (ID3 criterion).
def getBeastFeature(dataSet):
    """Return the index of the feature with the highest information gain.

    Implements ID3: gain(D, A) = H(D) - H(D|A), where H is the entropy of
    the class label (the LAST column of each row).

    :param dataSet: rows of feature values with the class label last
    :return: index of the best feature, or -1 if no feature gives a
             positive gain
    """
    numFeature = len(dataSet[0]) - 1
    # Empirical entropy H(D) of the class labels; it does not depend on the
    # candidate feature.  (The original passed the feature index here,
    # measuring the feature's own distribution instead of the labels'.)
    baseEntropy = getEntropy(dataSet, -1)
    bestFeature = -1
    bestGain = 0.0
    for i in range(numFeature):
        # Conditional entropy H(D|A_i), accumulated over the feature values.
        conEntropy = 0.0
        for value in set(data[i] for data in dataSet):
            subset = splitDataSet(dataSet, i, value)
            prob = float(len(subset)) / len(dataSet)
            # splitDataSet removed column i from the subset, so index i
            # would hit the wrong column; the label is always last (-1).
            conEntropy += prob * getEntropy(subset, -1)
        gain = baseEntropy - conEntropy
        if gain > bestGain:
            bestGain = gain
            bestFeature = i
    return bestFeature

# Empirical (Shannon) entropy of one column of the data set.
def getEntropy(dataSet, i):
    """Return the base-2 Shannon entropy of column i of dataSet.

    :param dataSet: list of example rows
    :param i: index of the column whose value distribution is measured
             (use -1 for the class-label column)
    :return: entropy in bits
    """
    dataLen = len(dataSet)
    entropy = 0.0
    column = [data[i] for data in dataSet]
    # Iterate each distinct value ONCE.  The original looped over the full
    # column, adding every value's -p*log2(p) term `count` times, which
    # inflated the entropy (e.g. a fair 2-row coin gave 2.0 instead of 1.0).
    for value in set(column):
        proportion = float(column.count(value)) / dataLen
        entropy -= proportion * log(proportion, 2)
    return entropy

# Partition the data on feature column i.
def splitDataSet(dataSet, i, values):
    '''
    :param dataSet: the data set to partition
    :param i: index of the feature column used for the split
    :param values: the feature value a row must match to be kept
    :return: the matching rows, each with column i removed
    '''
    remaining = [row[:i] + row[i + 1:] for row in dataSet if row[i] == values]
    print("splitData---->" + str(remaining))
    return remaining
# Majority vote: use the most frequent class Ck as this node's label.
def getMark(dataSet):
    """Return the majority class label (last column) of dataSet.

    Fixes two defects in the original: a set was indexed with [0]
    (TypeError — sets are unsubscriptable), and the running count was reset
    inside the inner loop, so every class appeared to have at most one
    member and the majority was never found.

    :param dataSet: non-empty rows with the class label last
    :return: the most frequent class label
    """
    counts = {}
    for data in dataSet:
        label = data[-1]
        counts[label] = counts.get(label, 0) + 1
    # max over the label keys, keyed by frequency, picks the majority class.
    return max(counts, key=counts.get)


# Tree representation: leaves are class labels, internal nodes are dicts,
# e.g. {"no surfacing": {1: 1, 0: {"flippers": {1: 1, 0: 0}}}}.
# NOTE(review): this module-level dict is never used — createTree builds
# and returns its own nested dict instead.
Tree={}
def createTree(dataSet, labels):
    """Recursively build an ID3 decision tree.

    :param dataSet: rows of feature values with the class label last
    :param labels: feature names aligned with the feature columns.
                   NOTE: the chosen feature's name is deleted from this
                   list in place (original behavior, preserved).
    :return: a nested dict {feature_name: {value: subtree_or_label}},
             or a bare class label for a leaf
    """
    classList = [example[-1] for example in dataSet]
    # All examples share one class: return it as a leaf.
    if classList.count(classList[0]) == len(dataSet):
        return classList[0]
    # Only the label column remains (all features used up): majority vote.
    # (The original compared the first ROW to 1 instead of testing its
    # length, so this branch could never trigger.)
    if len(dataSet[0]) == 1:
        return getMark(dataSet)
    baseFeature = getBeastFeature(dataSet)
    if baseFeature == -1:
        # No feature yields positive gain; avoid indexing labels[-1].
        return getMark(dataSet)
    baseLabel = labels[baseFeature]
    mytree = {baseLabel: {}}
    # Remove the consumed feature name so recursion sees aligned labels.
    del labels[baseFeature]
    for value in set(data[baseFeature] for data in dataSet):
        sublabels = labels[:]
        subset = splitDataSet(dataSet, baseFeature, value)
        mytree[baseLabel][value] = createTree(subset, sublabels)
    return mytree


# Build and print the decision tree for the toy data set.
# (Removed commented-out dead code from an older calling convention.)
dataSet, labels = createDataSET()
mytree = createTree(dataSet, labels)
print(mytree)

# Adapted from: blog.csdn.net/qq_18617299/article/details/78776344