Python实现DecisionTree决策树 --- 选择切分数据集的最佳特征

版权声明:学习交流为主,未经博主同意禁止转载,禁止用于商用。 https://blog.csdn.net/u012965373/article/details/83831914
# 选择切分数据集的最佳特征
    def choose_best_feature_to_split(self, data_set):
        """Return the index of the feature with the highest information gain.

        Args:
            data_set: list of examples; each example is a list whose last
                element is the class label and the preceding elements are
                feature values.

        Returns:
            int: index of the best feature to split on, or -1 if no split
                yields a positive information gain (or there are no features).
        """
        num_features = len(data_set[0]) - 1
        # Entropy of the whole data set before any split: H(D).
        base_entropy = self.calc_shannon_ent(data_set)
        # BUG FIX: these accumulators were commented out in the original,
        # causing a NameError on the first gain comparison.
        best_info_gain, best_feature = 0.0, -1
        for i in range(num_features):
            # Distinct values taken by feature i across the data set.
            unique_vals = {example[i] for example in data_set}
            new_entropy = 0.0
            for value in unique_vals:
                sub_data_set = self.split_data_set(data_set, i, value)
                # Weight of this branch: P(feature i == value).
                prob = len(sub_data_set) / float(len(data_set))
                # Accumulate the conditional entropy H(D | feature i).
                new_entropy += prob * self.calc_shannon_ent(sub_data_set)
            # Information gain = H(D) - H(D | feature i).
            info_gain = base_entropy - new_entropy
            print('infoGain=', info_gain, 'bestFeature=', i, base_entropy, new_entropy)
            # Keep the feature with the largest gain seen so far.
            if info_gain > best_info_gain:
                best_info_gain = info_gain
                best_feature = i
        return best_feature

猜你喜欢

转载自blog.csdn.net/u012965373/article/details/83831914