Machine Learning in Action: Decision Tree Algorithm Notes (Python 3)


##### Computing Shannon Entropy

from math import log

def calcShannonEnt(dataset):
    '''Compute the Shannon entropy of a dataset whose last column is the class label.'''
    num = len(dataset)
    labelCounts = {}
    # Count how many times each class label appears
    for featVec in dataset:
        currentlabel = featVec[-1]
        if currentlabel not in labelCounts:
            labelCounts[currentlabel] = 0
        labelCounts[currentlabel] += 1
    shannonEnt = 0.0
    # H = -sum(p_i * log2(p_i)) over all class labels
    for key in labelCounts:
        prob = labelCounts[key] / num   # / already returns a float in Python 3
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt


def creatDataset():
    # Toy dataset from Machine Learning in Action: the first two columns are the
    # features 'no surfacing' and 'flippers', the last column is the class label
    dataset = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    return dataset, labels


myData, labels = creatDataset()
print(calcShannonEnt(myData))   # 0.9709505944546686
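For reference, the Shannon entropy of a dataset D with class proportions p_i is H(D) = -sum_i p_i * log2(p_i). Here there are two 'yes' labels and three 'no' labels, so H = -(2/5)*log2(2/5) - (3/5)*log2(3/5) ≈ 0.9710, which matches the value printed above.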
    


Note the difference between Python's list.append() and list.extend(): append() adds its argument as a single element, while extend() adds each element of an iterable one by one. splitDataSet() below relies on both, as the sketch after this paragraph shows.
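A minimal sketch of the difference:

a = [1, 2, 3]
a.append([4, 5])   # a == [1, 2, 3, [4, 5]]  (the whole list becomes one element)
b = [1, 2, 3]
b.extend([4, 5])   # b == [1, 2, 3, 4, 5]    (the elements are added individually)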




##### Splitting the Dataset and Choosing the Best Feature

# calcShannonEnt() and creatDataset() are the same as in the previous section;
# the code below builds on them.




# Build a new list so the original dataset is left untouched
def splitDataSet(dataset, axis, value):
    '''
    dataset: the dataset to split
    axis:    index of the feature to split on
    value:   keep only the rows where that feature equals this value
    Returns the matching rows with the axis-th column removed.
    '''
    retDataset = []
    for featVec in dataset:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]           # everything before axis
            reducedFeatVec.extend(featVec[axis+1:])   # everything after axis
            retDataset.append(reducedFeatVec)         # add the trimmed row
    return retDataset
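A quick sanity check with the toy dataset from above; the expected rows can be verified by hand against creatDataset():

myData, labels = creatDataset()
print(splitDataSet(myData, 0, 1))   # [[1, 'yes'], [1, 'yes'], [0, 'no']]
print(splitDataSet(myData, 0, 0))   # [[1, 'no'], [1, 'no']]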

def chooseBestFeatureToSplit(dataset):
    numFeature = len(dataset[0]) - 1           # last column is the class label
    print("numberfea is", numFeature)
    baseEntropy = calcShannonEnt(dataset)      # entropy before any split
    bestinfoGain = 0.0
    bestFeature = -1                           # feature indices start at 0
    for i in range(numFeature):
        featlist = [example[i] for example in dataset]
        print("example[i] is", featlist)
        uniqueVals = set(featlist)             # set() removes duplicates automatically
        print("uniqueVals is", uniqueVals)
        newEntropy = 0.0
        # Weighted average of the entropies of the subsets produced by feature i
        for value in uniqueVals:
            subDataset = splitDataSet(dataset, i, value)
            print("subDataset is", subDataset)
            prob = len(subDataset) / len(dataset)
            newEntropy += prob * calcShannonEnt(subDataset)
        infoGain = baseEntropy - newEntropy    # information gain of feature i
        print("infoGain is", infoGain)
        if infoGain > bestinfoGain:
            bestinfoGain = infoGain
            bestFeature = i
    return bestFeature


mydat, labels = creatDataset()
print(chooseBestFeatureToSplit(mydat))
 

Output:

numberfea is 2
example[i] is [1, 1, 1, 0, 0]
uniqueVals is {0, 1}
subDataset is [[1, 'no'], [1, 'no']]
subDataset is [[1, 'yes'], [1, 'yes'], [0, 'no']]
infoGain is 0.4199730940219749
example[i] is [1, 1, 0, 1, 1]
uniqueVals is {0, 1}
subDataset is [[1, 'no']]
subDataset is [[1, 'yes'], [1, 'yes'], [0, 'no'], [0, 'no']]
infoGain is 0.17095059445466854
0
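Reading off the trace: splitting on feature 0 ('no surfacing') yields gain 0.9710 - (2/5 * 0 + 3/5 * 0.9183) ≈ 0.4200, while splitting on feature 1 ('flippers') yields only 0.9710 - (1/5 * 0 + 4/5 * 1.0) ≈ 0.1710, so feature 0 wins and its index, 0, is returned.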
