# 版权声明 (copyright notice): https://www.jianshu.com/u/906a78709f1d https://blog.csdn.net/dongyanwen6036/article/details/82393091
#####计算香农熵
from math import log
def calcShannonEnt(dataset):
    """Return the Shannon entropy (in bits) of the class labels in *dataset*.

    Each record in *dataset* is a sequence whose LAST element is its class
    label; earlier elements are features and are ignored here.

    Args:
        dataset: list of records (each a sequence; label at record[-1]).

    Returns:
        float: H = -sum(p * log2(p)) over label frequencies; 0.0 for an
        empty dataset (both loops are skipped, so no division occurs).
    """
    num = len(dataset)
    labelCounts = {}
    for featVec in dataset:
        currentlabel = featVec[-1]
        # dict.get replaces the non-idiomatic `if k not in d.keys()` + init.
        labelCounts[currentlabel] = labelCounts.get(currentlabel, 0) + 1
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = count / num
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt
def creatDataset():
    """Build the toy two-feature demo dataset.

    Returns:
        tuple: (dataset, labels) — dataset is a list of
        [feature1, feature2, class-label] rows, labels names the features.
    """
    records = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    feature_names = ['no surfacing', 'flippers']
    return records, feature_names
# Demo: build the toy dataset and print its Shannon entropy (~0.9710 bits).
myData,labels=creatDataset()
print(calcShannonEnt(myData) )
#####划分数据集并选择最好的
from math import log
def calcShannonEnt(dataset):
    """Shannon entropy (bits) of the class labels found in *dataset*.

    The class label of every record is taken from its last position.
    Returns 0.0 when the dataset is empty or has a single label class.
    """
    total = len(dataset)
    counts = {}
    for record in dataset:
        label = record[-1]
        if label not in counts:
            counts[label] = 0
        counts[label] += 1
    entropy = 0.0
    for occurrences in counts.values():
        p = occurrences / total
        entropy -= p * log(p, 2)
    return entropy
def creatDataset():
    """Return the small fish-classification demo set and its feature names.

    Returns:
        tuple: (dataset, labels); each dataset row is
        [no-surfacing flag, flippers flag, class-label].
    """
    rows = []
    rows.append([1, 1, 'yes'])
    rows.append([1, 1, 'yes'])
    rows.append([1, 0, 'no'])
    rows.append([0, 1, 'no'])
    rows.append([0, 1, 'no'])
    names = ['no surfacing', 'flippers']
    return rows, names
#为了保留原始数据,新建一个数据列表
def splitDataSet(dataset, axis, value):
    """Select the rows whose feature at *axis* equals *value*, dropping that
    feature column from each selected row.

    The original dataset is never mutated: every returned row is a new list
    built from slices of the source row.

    Args:
        dataset: list of feature rows.
        axis: index of the feature column to filter on and remove.
        value: value a row must have at *axis* to be kept.

    Returns:
        list: new rows with position *axis* removed.
    """
    return [row[:axis] + row[axis + 1:] for row in dataset if row[axis] == value]
def chooseBestFeatureToSplit(dataset):
    """Pick the feature whose split gives the largest information gain.

    For each candidate feature, the dataset is partitioned by that feature's
    distinct values; the gain is the base entropy minus the weighted entropy
    of the partitions (ID3 criterion).

    Args:
        dataset: list of rows; the LAST column of each row is the class
            label, all earlier columns are candidate features.

    Returns:
        int: index of the best feature, or -1 when no split improves on the
        base entropy (e.g. every label is already identical).
    """
    numFeatures = len(dataset[0]) - 1  # last column is the label, not a feature
    baseEntropy = calcShannonEnt(dataset)
    total = len(dataset)  # invariant across features; hoisted out of the loops
    bestInfoGain = 0.0
    bestFeature = -1  # sentinel: "no informative feature found"
    for i in range(numFeatures):
        # Distinct values taken by feature i across the dataset.
        uniqueVals = set(example[i] for example in dataset)
        newEntropy = 0.0
        for value in uniqueVals:
            subDataset = splitDataSet(dataset, i, value)
            prob = len(subDataset) / total
            newEntropy += prob * calcShannonEnt(subDataset)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
# Demo: feature 0 yields the larger information gain (0.42 vs 0.17), so 0 is printed.
mydat,labels=creatDataset()
print(chooseBestFeatureToSplit(mydat))
# Expected output (transcript of a run of the script above):
# numberfea is 2
# example[i] is [1, 1, 1, 0, 0]
# uniqueVals is {0, 1}
# subDataset is [[1, 'no'], [1, 'no']]
# subDataset is [[1, 'yes'], [1, 'yes'], [0, 'no']]
# infoGain is 0.4199730940219749
# example[i] is [1, 1, 0, 1, 1]
# uniqueVals is {0, 1}
# subDataset is [[1, 'no']]
# subDataset is [[1, 'yes'], [1, 'yes'], [0, 'no'], [0, 'no']]
# infoGain is 0.17095059445466854
# 0