Building a Weak Classifier Based on a Decision Stump
A decision stump is the simplest possible decision tree: it makes its decision based on a single feature. Because the tree performs only one split, it is literally just a stump.
Suppose we try to pick a single value along one of the coordinate axes (that is, a line parallel to an axis) to separate all the points of the two classes: for data like the toy set used below, this is clearly impossible. This is the well-known limitation of a single decision stump. By combining several decision stumps, however, we can build a classifier that handles such problems.
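As a quick sanity check (a minimal standalone sketch, not part of the original code; it only tries thresholds at the data values themselves), we can brute-force every axis-parallel split on the toy data set used in this post and confirm that no single stump classifies every sample correctly:
import numpy as np

X = np.array([[1., 2.1], [2., 1.1], [1.3, 1.], [1., 1.], [2., 1.]])
y = np.array([1., 1., -1., -1., 1.])

best = 1.0
for dim in range(X.shape[1]):          # each feature
    for t in np.unique(X[:, dim]):     # each candidate threshold
        for sign in (1., -1.):         # both inequality directions
            pred = np.where(X[:, dim] <= t, -sign, sign)
            best = min(best, float(np.mean(pred != y)))
print(best)  # 0.2: the best single stump still misclassifies one of the five samples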
import numpy as np


def load_simple_data():
    # A tiny 2-D data set with labels +1 / -1
    data_mat = np.matrix([[1., 2.1],
                          [2., 1.1],
                          [1.3, 1.],
                          [1., 1.],
                          [2., 1.]])
    class_labels = [1.0, 1.0, -1.0, -1.0, 1.0]
    return data_mat, class_labels


def stump_classify(data_matrix, dimension, threshold, thresh_ineq):
    # Label everything on one side of the threshold -1 and the other side +1
    return_array = np.ones((np.shape(data_matrix)[0], 1))
    if thresh_ineq == 'lt':
        return_array[data_matrix[:, dimension] <= threshold] = -1.0
    else:
        return_array[data_matrix[:, dimension] > threshold] = -1.0
    return return_array


def build_stump(data_array, class_labels, d_array):
    data_matrix = np.mat(data_array)
    label_mat = np.mat(class_labels).T
    m, n = np.shape(data_matrix)
    step_num = 10
    best_stump = {}
    best_class = np.mat(np.zeros((m, 1)))
    min_error = np.inf
    for i in range(n):  # loop over every feature
        range_min = data_matrix[:, i].min()
        range_max = data_matrix[:, i].max()
        step_size = (range_max - range_min) / step_num
        for j in range(-1, step_num + 1):  # loop over candidate thresholds
            for inequal in ['lt', 'gt']:  # try both inequality directions
                threshold = range_min + float(j) * step_size
                predicted_value = stump_classify(data_matrix, i, threshold, inequal)
                error_array = np.mat(np.ones((m, 1)))
                error_array[predicted_value == label_mat] = 0
                # error weighted by the sample-weight vector D
                weighted_error = float(d_array.T * error_array)
                print("split: dim %d, threshold %.2f, thresh inequal: %s, the weighted error is %.3f" % (
                    i, threshold, inequal, weighted_error))
                if weighted_error < min_error:
                    min_error = weighted_error
                    best_class = predicted_value.copy()
                    best_stump['dim'] = i
                    best_stump['threshold'] = threshold
                    best_stump['ineq'] = inequal
    return best_stump, min_error, best_class


if __name__ == '__main__':
    d_array = np.mat(np.ones((5, 1)) / 5)
    data_mat, class_labels = load_simple_data()
    build_stump(data_mat, class_labels, d_array)
The code above builds a weak classifier. We construct a simple data set and manually set the weight vector D to [0.2, 0.2, 0.2, 0.2, 0.2]; the test output of the decision-stump weak classifier is:
split: dim 0, threshold 0.90, thresh inequal: lt, the weighted error is 0.400
split: dim 0, threshold 0.90, thresh inequal: gt, the weighted error is 0.600
split: dim 0, threshold 1.00, thresh inequal: lt, the weighted error is 0.400
split: dim 0, threshold 1.00, thresh inequal: gt, the weighted error is 0.600
split: dim 0, threshold 1.10, thresh inequal: lt, the weighted error is 0.400
split: dim 0, threshold 1.10, thresh inequal: gt, the weighted error is 0.600
split: dim 0, threshold 1.20, thresh inequal: lt, the weighted error is 0.400
split: dim 0, threshold 1.20, thresh inequal: gt, the weighted error is 0.600
split: dim 0, threshold 1.30, thresh inequal: lt, the weighted error is 0.200
split: dim 0, threshold 1.30, thresh inequal: gt, the weighted error is 0.800
split: dim 0, threshold 1.40, thresh inequal: lt, the weighted error is 0.200
split: dim 0, threshold 1.40, thresh inequal: gt, the weighted error is 0.800
split: dim 0, threshold 1.50, thresh inequal: lt, the weighted error is 0.200
split: dim 0, threshold 1.50, thresh inequal: gt, the weighted error is 0.800
split: dim 0, threshold 1.60, thresh inequal: lt, the weighted error is 0.200
split: dim 0, threshold 1.60, thresh inequal: gt, the weighted error is 0.800
split: dim 0, threshold 1.70, thresh inequal: lt, the weighted error is 0.200
split: dim 0, threshold 1.70, thresh inequal: gt, the weighted error is 0.800
split: dim 0, threshold 1.80, thresh inequal: lt, the weighted error is 0.200
split: dim 0, threshold 1.80, thresh inequal: gt, the weighted error is 0.800
split: dim 0, threshold 1.90, thresh inequal: lt, the weighted error is 0.200
split: dim 0, threshold 1.90, thresh inequal: gt, the weighted error is 0.800
split: dim 0, threshold 2.00, thresh inequal: lt, the weighted error is 0.600
split: dim 0, threshold 2.00, thresh inequal: gt, the weighted error is 0.400
split: dim 1, threshold 0.89, thresh inequal: lt, the weighted error is 0.400
split: dim 1, threshold 0.89, thresh inequal: gt, the weighted error is 0.600
split: dim 1, threshold 1.00, thresh inequal: lt, the weighted error is 0.200
split: dim 1, threshold 1.00, thresh inequal: gt, the weighted error is 0.800
split: dim 1, threshold 1.11, thresh inequal: lt, the weighted error is 0.400
split: dim 1, threshold 1.11, thresh inequal: gt, the weighted error is 0.600
split: dim 1, threshold 1.22, thresh inequal: lt, the weighted error is 0.400
split: dim 1, threshold 1.22, thresh inequal: gt, the weighted error is 0.600
split: dim 1, threshold 1.33, thresh inequal: lt, the weighted error is 0.400
split: dim 1, threshold 1.33, thresh inequal: gt, the weighted error is 0.600
split: dim 1, threshold 1.44, thresh inequal: lt, the weighted error is 0.400
split: dim 1, threshold 1.44, thresh inequal: gt, the weighted error is 0.600
split: dim 1, threshold 1.55, thresh inequal: lt, the weighted error is 0.400
split: dim 1, threshold 1.55, thresh inequal: gt, the weighted error is 0.600
split: dim 1, threshold 1.66, thresh inequal: lt, the weighted error is 0.400
split: dim 1, threshold 1.66, thresh inequal: gt, the weighted error is 0.600
split: dim 1, threshold 1.77, thresh inequal: lt, the weighted error is 0.400
split: dim 1, threshold 1.77, thresh inequal: gt, the weighted error is 0.600
split: dim 1, threshold 1.88, thresh inequal: lt, the weighted error is 0.400
split: dim 1, threshold 1.88, thresh inequal: gt, the weighted error is 0.600
split: dim 1, threshold 1.99, thresh inequal: lt, the weighted error is 0.400
split: dim 1, threshold 1.99, thresh inequal: gt, the weighted error is 0.600
split: dim 1, threshold 2.10, thresh inequal: lt, the weighted error is 0.600
split: dim 1, threshold 2.10, thresh inequal: gt, the weighted error is 0.400
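The smallest weighted error in the scan above is 0.200, first reached at dim 0 with threshold 1.30 and the 'lt' rule, so that is the stump build_stump() keeps (the later dim 1 split at threshold 1.00 ties at 0.200 but does not replace it, because the comparison is strict). The return values can be inspected directly, for example:
best_stump, min_error, best_class = build_stump(data_mat, class_labels, d_array)
print(best_stump, min_error)  # expected: {'dim': 0, 'threshold': 1.3, 'ineq': 'lt'} 0.2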
Building the AdaBoost Algorithm from Multiple Weak Classifiers
The pseudocode for the whole implementation is as follows (the formulas behind the alpha and weight-vector steps are written out after the list):
- For each iteration:
    - Find the best decision stump with the build_stump() function
    - Append the best stump to the array of stumps
    - Compute alpha
    - Compute the new weight vector D
    - Update the aggregate class estimate
    - Exit the loop if the error rate equals 0.0
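Read off the code below: if epsilon is the weighted error of the stump just found, then

alpha = 0.5 * ln((1 - epsilon) / epsilon)

and each sample weight is updated, for true label y_i and stump prediction h(x_i), as

D_i <- D_i * exp(-alpha * y_i * h(x_i)), followed by normalizing D to sum to 1,

so correctly classified samples (y_i * h(x_i) = +1) lose weight while misclassified ones (y_i * h(x_i) = -1) gain weight, forcing the next stump to concentrate on the hard cases.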
import numpy as np


def load_simple_data():
    data_mat = np.matrix([[1., 2.1],
                          [2., 1.1],
                          [1.3, 1.],
                          [1., 1.],
                          [2., 1.]])
    class_labels = [1.0, 1.0, -1.0, -1.0, 1.0]
    return data_mat, class_labels


def stump_classify(data_matrix, dimension, threshold, thresh_ineq):
    return_array = np.ones((np.shape(data_matrix)[0], 1))
    if thresh_ineq == 'lt':
        return_array[data_matrix[:, dimension] <= threshold] = -1.0
    else:
        return_array[data_matrix[:, dimension] > threshold] = -1.0
    return return_array


def build_stump(data_array, class_labels, d_array):
    data_matrix = np.mat(data_array)
    label_mat = np.mat(class_labels).T
    m, n = np.shape(data_matrix)
    step_num = 10
    best_stump = {}
    best_class = np.mat(np.zeros((m, 1)))
    min_error = np.inf
    for i in range(n):
        range_min = data_matrix[:, i].min()
        range_max = data_matrix[:, i].max()
        step_size = (range_max - range_min) / step_num
        for j in range(-1, step_num + 1):
            for inequal in ['lt', 'gt']:
                threshold = range_min + float(j) * step_size
                predicted_value = stump_classify(data_matrix, i, threshold, inequal)
                error_array = np.mat(np.ones((m, 1)))
                error_array[predicted_value == label_mat] = 0
                weighted_error = float(d_array.T * error_array)
                print("split: dim %d, threshold %.2f, thresh inequal: %s, the weighted error is %.3f" % (
                    i, threshold, inequal, weighted_error))
                if weighted_error < min_error:
                    min_error = weighted_error
                    best_class = predicted_value.copy()
                    best_stump['dim'] = i
                    best_stump['threshold'] = threshold
                    best_stump['ineq'] = inequal
    return best_stump, min_error, best_class


def adaboost_train(data_array, class_labels, iter_num=40):
    weak_class_array = []
    m = np.shape(data_array)[0]
    d_array = np.mat(np.ones((m, 1)) / m)  # start with equal weight on every sample
    agg_classes = np.mat(np.zeros((m, 1)))
    for i in range(iter_num):
        best_stump, error, classes = build_stump(data_array, class_labels, d_array)
        print("weight array D: ", d_array.T)
        # alpha is the vote weight of this weak classifier
        alpha = float(0.5 * np.log((1.0 - error) / max(error, 1e-16)))
        best_stump['alpha'] = alpha
        weak_class_array.append(best_stump)
        print("classes: ", classes.T)
        # raise the weights of misclassified samples, lower those of correct ones
        expon = np.multiply(-1 * alpha * np.mat(class_labels).T, classes)
        d_array = np.multiply(d_array, np.exp(expon))
        d_array = d_array / d_array.sum()
        # running sum of the weighted votes of all stumps so far
        agg_classes += alpha * classes
        print("agg_classes: ", agg_classes)
        agg_error = np.multiply(np.sign(agg_classes) != np.mat(class_labels).T, np.ones((m, 1)))
        error_rate = agg_error.sum() / m
        print("total error rate: ", error_rate)
        if error_rate == 0.0:
            break
    return weak_class_array


if __name__ == '__main__':
    data_mat, class_labels = load_simple_data()
    adaboost_train(data_mat, class_labels)
Test output:
...
agg_classes: [[ 1.17568763]
[ 2.56198199]
[-0.77022252]
[-0.77022252]
[ 0.61607184]]
total error rate: 0.0
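Taking the sign of the final agg_classes gives [1, 1, -1, -1, 1], which matches class_labels exactly, so the total error rate reaches 0.0 and training stops after the third round; weak_class_array then holds three stumps, each with its own alpha.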
Testing the Algorithm: Classification with AdaBoost
Complete code:
import numpy as np


def load_simple_data():
    data_mat = np.matrix([[1., 2.1],
                          [2., 1.1],
                          [1.3, 1.],
                          [1., 1.],
                          [2., 1.]])
    class_labels = [1.0, 1.0, -1.0, -1.0, 1.0]
    return data_mat, class_labels


def stump_classify(data_matrix, dimension, threshold, thresh_ineq):
    return_array = np.ones((np.shape(data_matrix)[0], 1))
    if thresh_ineq == 'lt':
        return_array[data_matrix[:, dimension] <= threshold] = -1.0
    else:
        return_array[data_matrix[:, dimension] > threshold] = -1.0
    return return_array


def build_stump(data_array, class_labels, d_array):
    data_matrix = np.mat(data_array)
    label_mat = np.mat(class_labels).T
    m, n = np.shape(data_matrix)
    step_num = 10
    best_stump = {}
    best_class = np.mat(np.zeros((m, 1)))
    min_error = np.inf
    for i in range(n):
        range_min = data_matrix[:, i].min()
        range_max = data_matrix[:, i].max()
        step_size = (range_max - range_min) / step_num
        for j in range(-1, step_num + 1):
            for inequal in ['lt', 'gt']:
                threshold = range_min + float(j) * step_size
                predicted_value = stump_classify(data_matrix, i, threshold, inequal)
                error_array = np.mat(np.ones((m, 1)))
                error_array[predicted_value == label_mat] = 0
                weighted_error = float(d_array.T * error_array)
                # print("split: dim %d, threshold %.2f, thresh inequal: %s, the weighted error is %.3f" % (
                #     i, threshold, inequal, weighted_error))
                if weighted_error < min_error:
                    min_error = weighted_error
                    best_class = predicted_value.copy()
                    best_stump['dim'] = i
                    best_stump['threshold'] = threshold
                    best_stump['ineq'] = inequal
    return best_stump, min_error, best_class


def adaboost_train(data_array, class_labels, iter_num=40):
    weak_class_array = []
    m = np.shape(data_array)[0]
    d_array = np.mat(np.ones((m, 1)) / m)
    agg_classes = np.mat(np.zeros((m, 1)))
    for i in range(iter_num):
        best_stump, error, classes = build_stump(data_array, class_labels, d_array)
        # print("weight array D: ", d_array.T)
        # alpha is the vote weight of this weak classifier
        alpha = float(0.5 * np.log((1.0 - error) / max(error, 1e-16)))
        best_stump['alpha'] = alpha
        weak_class_array.append(best_stump)
        # print("classes: ", classes.T)
        expon = np.multiply(-1 * alpha * np.mat(class_labels).T, classes)
        d_array = np.multiply(d_array, np.exp(expon))
        d_array = d_array / d_array.sum()
        agg_classes += alpha * classes
        # print("agg_classes: ", agg_classes)
        agg_error = np.multiply(np.sign(agg_classes) != np.mat(class_labels).T, np.ones((m, 1)))
        error_rate = agg_error.sum() / m
        # print("total error rate: ", error_rate)
        if error_rate == 0.0:
            break
    return weak_class_array


def adaboost_classify(data, classifier_array):
    data_matrix = np.mat(data)
    m = np.shape(data_matrix)[0]
    agg_classes = np.mat(np.zeros((m, 1)))
    for i in range(len(classifier_array)):
        # apply each trained stump and accumulate its weighted vote
        classes = stump_classify(data_matrix, classifier_array[i]['dim'],
                                 classifier_array[i]['threshold'], classifier_array[i]['ineq'])
        agg_classes += classifier_array[i]['alpha'] * classes
        print("result after classifier %d: %s" % (i, agg_classes))
    result_classes = np.sign(agg_classes)  # the predicted label is the sign of the weighted sum
    print("final result: ", result_classes)
    return result_classes


if __name__ == '__main__':
    data_mat, class_labels = load_simple_data()
    classifier_array = adaboost_train(data_mat, class_labels)
    adaboost_classify([[5, 5], [0, 0]], classifier_array)
Output:
result after classifier 0: [[ 0.69314718]
[-0.69314718]]
result after classifier 1: [[ 1.66610226]
[-1.66610226]]
result after classifier 2: [[ 2.56198199]
[-2.56198199]]
final result: [[ 1.]
[-1.]]
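Both test points come out as expected: [5, 5] is classified as +1 and [0, 0] as -1. Note how the magnitude of agg_classes grows as each weak classifier adds its weighted vote; only the sign of the final sum is taken as the predicted class, while its magnitude can be read as the ensemble's confidence.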