# 版权声明:学习交流为主,未经博主同意禁止转载,禁止用于商用。 https://blog.csdn.net/u012965373/article/details/83831914
# Choose the best feature to split the data set on (ID3 criterion).
def choose_best_feature_to_split(self, data_set):
    """Return the index of the feature with the highest information gain.

    Each row of ``data_set`` is a list whose last element is the class
    label; every other column is a candidate feature.

    :param data_set: list of rows, e.g. ``[[1, 1, 'yes'], [0, 1, 'no'], ...]``
    :return: index of the best feature, or -1 if no split improves entropy
    """
    num_features = len(data_set[0]) - 1
    # Entropy of the whole set, H(D), before any split.
    base_entropy = self.calc_shannon_ent(data_set)
    # Bug fix: this initialization was commented out in the original,
    # making best_info_gain / best_feature undefined (NameError) below.
    best_info_gain, best_feature = 0.0, -1
    for i in range(num_features):
        # Distinct values taken by feature i across the data set.
        unique_vals = set(example[i] for example in data_set)
        new_entropy = 0.0
        for value in unique_vals:
            sub_data_set = self.split_data_set(data_set, i, value)
            # Weight P(feature i == value) for the conditional entropy.
            prob = len(sub_data_set) / float(len(data_set))
            new_entropy += prob * self.calc_shannon_ent(sub_data_set)
        # Information gain: H(D) - H(D | feature i).
        info_gain = base_entropy - new_entropy
        print('infoGain=', info_gain, 'bestFeature=', i, base_entropy, new_entropy)
        if info_gain > best_info_gain:
            best_info_gain = info_gain
            best_feature = i
    return best_feature