Building a Weak Classifier Based on a Decision Stump
A decision stump is the simplest possible decision tree: it makes its decision based on a single feature. Because the tree performs only one split, it is literally just a stump.
Suppose we try to pick a single value along one of the coordinate axes (that is, a line parallel to an axis) to separate all the points of the two classes: for data like the toy set used below, this is clearly impossible. This is the well-known limitation of a single decision stump. By combining several decision stumps, however, we can build a classifier that handles such problems.
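As a quick sanity check (a minimal standalone sketch, not part of the original code; it only tries thresholds at the data values themselves), we can brute-force every axis-parallel split on the toy data set used in this post and confirm that no single stump classifies every sample correctly:
import numpy as np

X = np.array([[1., 2.1], [2., 1.1], [1.3, 1.], [1., 1.], [2., 1.]])
y = np.array([1., 1., -1., -1., 1.])

best = 1.0
for dim in range(X.shape[1]):          # each feature
    for t in np.unique(X[:, dim]):     # each candidate threshold
        for sign in (1., -1.):         # both inequality directions
            pred = np.where(X[:, dim] <= t, -sign, sign)
            best = min(best, float(np.mean(pred != y)))
print(best)  # 0.2: the best single stump still misclassifies one of the five samples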
import numpy as np


def load_simple_data():
    # A tiny 2-D data set with labels +1 / -1
    data_mat = np.matrix([[1., 2.1],
                          [2., 1.1],
                          [1.3, 1.],
                          [1., 1.],
                          [2., 1.]])
    class_labels = [1.0, 1.0, -1.0, -1.0, 1.0]
    return data_mat, class_labels


def stump_classify(data_matrix, dimension, threshold, thresh_ineq):
    # Label everything on one side of the threshold -1 and the other side +1
    return_array = np.ones((np.shape(data_matrix)[0], 1))
    if thresh_ineq == 'lt':
        return_array[data_matrix[:, dimension] <= threshold] = -1.0
    else:
        return_array[data_matrix[:, dimension] > threshold] = -1.0
    return return_array


def build_stump(data_array, class_labels, d_array):
    data_matrix = np.mat(data_array)
    label_mat = np.mat(class_labels).T
    m, n = np.shape(data_matrix)
    step_num = 10
    best_stump = {}
    best_class = np.mat(np.zeros((m, 1)))
    min_error = np.inf
    for i in range(n):  # loop over every feature
        range_min = data_matrix[:, i].min()
        range_max = data_matrix[:, i].max()
        step_size = (range_max - range_min) / step_num
        for j in range(-1, step_num + 1):  # loop over candidate thresholds
            for inequal in ['lt', 'gt']:  # try both inequality directions
                threshold = range_min + float(j) * step_size
                predicted_value = stump_classify(data_matrix, i, threshold, inequal)
                error_array = np.mat(np.ones((m, 1)))
                error_array[predicted_value == label_mat] = 0
                # error weighted by the sample-weight vector D
                weighted_error = float(d_array.T * error_array)
                print("split: dim %d, threshold %.2f, thresh inequal: %s, the weighted error is %.3f" % (
                    i, threshold, inequal, weighted_error))
                if weighted_error < min_error:
                    min_error = weighted_error
                    best_class = predicted_value.copy()
                    best_stump['dim'] = i
                    best_stump['threshold'] = threshold
                    best_stump['ineq'] = inequal
    return best_stump, min_error, best_class


if __name__ == '__main__':
    d_array = np.mat(np.ones((5, 1)) / 5)
    data_mat, class_labels = load_simple_data()
    build_stump(data_mat, class_labels, d_array)
The code above builds a weak classifier. We construct a simple data set and manually set the weight vector D to [0.2, 0.2, 0.2, 0.2, 0.2]; the test output of the decision-stump weak classifier is:
split: dim 0, threshold 0.90, thresh inequal: lt, the weighted error is 0.400
split: dim 0, threshold 0.90, thresh inequal: gt, the weighted error is 0.600
split: dim 0, threshold 1.00, thresh inequal: lt, the weighted error is 0.400
split: dim 0, threshold 1.00, thresh inequal: gt, the weighted error is 0.600
split: dim 0, threshold 1.10, thresh inequal: lt, the weighted error is 0.400
split: dim 0, threshold 1.10, thresh inequal: gt, the weighted error is 0.600
split: dim 0, threshold 1.20, thresh inequal: lt, the weighted error is 0.400
split: dim 0, threshold 1.20, thresh inequal: gt, the weighted error is 0.600
split: dim 0, threshold 1.30, thresh inequal: lt, the weighted error is 0.200
split: dim 0, threshold 1.30, thresh inequal: gt, the weighted error is 0.800
split: dim 0, threshold 1.40, thresh inequal: lt, the weighted error is 0.200
split: dim 0, threshold 1.40, thresh inequal: gt, the weighted error is 0.800
split: dim 0, threshold 1.50, thresh inequal: lt, the weighted error is 0.200
split: dim 0, threshold 1.50, thresh inequal: gt, the weighted error is 0.800
split: dim 0, threshold 1.60, thresh inequal: lt, the weighted error is 0.200
split: dim 0, threshold 1.60, thresh inequal: gt, the weighted error is 0.800
split: dim 0, threshold 1.70, thresh inequal: lt, the weighted error is 0.200
split: dim 0, threshold 1.70, thresh inequal: gt, the weighted error is 0.800
split: dim 0, threshold 1.80, thresh inequal: lt, the weighted error is 0.200
split: dim 0, threshold 1.80, thresh inequal: gt, the weighted error is 0.800
split: dim 0, threshold 1.90, thresh inequal: lt, the weighted error is 0.200
split: dim 0, threshold 1.90, thresh inequal: gt, the weighted error is 0.800
split: dim 0, threshold 2.00, thresh inequal: lt, the weighted error is 0.600
split: dim 0, threshold 2.00, thresh inequal: gt, the weighted error is 0.400
split: dim 1, threshold 0.89, thresh inequal: lt, the weighted error is 0.400
split: dim 1, threshold 0.89, thresh inequal: gt, the weighted error is 0.600
split: dim 1, threshold 1.00, thresh inequal: lt, the weighted error is 0.200
split: dim 1, threshold 1.00, thresh inequal: gt, the weighted error is 0.800
split: dim 1, threshold 1.11, thresh inequal: lt, the weighted error is 0.400
split: dim 1, threshold 1.11, thresh inequal: gt, the weighted error is 0.600
split: dim 1, threshold 1.22, thresh inequal: lt, the weighted error is 0.400
split: dim 1, threshold 1.22, thresh inequal: gt, the weighted error is 0.600
split: dim 1, threshold 1.33, thresh inequal: lt, the weighted error is 0.400
split: dim 1, threshold 1.33, thresh inequal: gt, the weighted error is 0.600
split: dim 1, threshold 1.44, thresh inequal: lt, the weighted error is 0.400
split: dim 1, threshold 1.44, thresh inequal: gt, the weighted error is 0.600
split: dim 1, threshold 1.55, thresh inequal: lt, the weighted error is 0.400
split: dim 1, threshold 1.55, thresh inequal: gt, the weighted error is 0.600
split: dim 1, threshold 1.66, thresh inequal: lt, the weighted error is 0.400
split: dim 1, threshold 1.66, thresh inequal: gt, the weighted error is 0.600
split: dim 1, threshold 1.77, thresh inequal: lt, the weighted error is 0.400
split: dim 1, threshold 1.77, thresh inequal: gt, the weighted error is 0.600
split: dim 1, threshold 1.88, thresh inequal: lt, the weighted error is 0.400
split: dim 1, threshold 1.88, thresh inequal: gt, the weighted error is 0.600
split: dim 1, threshold 1.99, thresh inequal: lt, the weighted error is 0.400
split: dim 1, threshold 1.99, thresh inequal: gt, the weighted error is 0.600
split: dim 1, threshold 2.10, thresh inequal: lt, the weighted error is 0.600
split: dim 1, threshold 2.10, thresh inequal: gt, the weighted error is 0.400
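The smallest weighted error in the scan above is 0.200, first reached at dim 0 with threshold 1.30 and the 'lt' rule, so that is the stump build_stump() keeps (the later dim 1 split at threshold 1.00 ties at 0.200 but does not replace it, because the comparison is strict). The return values can be inspected directly, for example:
best_stump, min_error, best_class = build_stump(data_mat, class_labels, d_array)
print(best_stump, min_error)  # expected: {'dim': 0, 'threshold': 1.3, 'ineq': 'lt'} 0.2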
Building the AdaBoost Algorithm from Multiple Weak Classifiers
The pseudocode for the whole implementation is as follows (the formulas behind the alpha and weight-vector steps are written out after the list):
- For each iteration:
    - Find the best decision stump with the build_stump() function
    - Append the best stump to the array of stumps
    - Compute alpha
    - Compute the new weight vector D
    - Update the aggregate class estimate
    - Exit the loop if the error rate equals 0.0
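Read off the code below: if epsilon is the weighted error of the stump just found, then

alpha = 0.5 * ln((1 - epsilon) / epsilon)

and each sample weight is updated, for true label y_i and stump prediction h(x_i), as

D_i <- D_i * exp(-alpha * y_i * h(x_i)), followed by normalizing D to sum to 1,

so correctly classified samples (y_i * h(x_i) = +1) lose weight while misclassified ones (y_i * h(x_i) = -1) gain weight, forcing the next stump to concentrate on the hard cases.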
import numpy as np


def load_simple_data():
    data_mat = np.matrix([[1., 2.1],
                          [2., 1.1],
                          [1.3, 1.],
                          [1., 1.],
                          [2., 1.]])
    class_labels = [1.0, 1.0, -1.0, -1.0, 1.0]
    return data_mat, class_labels


def stump_classify(data_matrix, dimension, threshold, thresh_ineq):
    return_array = np.ones((np.shape(data_matrix)[0], 1))
    if thresh_ineq == 'lt':
        return_array[data_matrix[:, dimension] <= threshold] = -1.0
    else:
        return_array[data_matrix[:, dimension] > threshold] = -1.0
    return return_array


def build_stump(data_array, class_labels, d_array):
    data_matrix = np.mat(data_array)
    label_mat = np.mat(class_labels).T
    m, n = np.shape(data_matrix)
    step_num = 10
    best_stump = {}
    best_class = np.mat(np.zeros((m, 1)))
    min_error = np.inf
    for i in range(n):
        range_min = data_matrix[:, i].min()
        range_max = data_matrix[:, i].max()
        step_size = (range_max - range_min) / step_num
        for j in range(-1, step_num + 1):
            for inequal in ['lt', 'gt']:
                threshold = range_min + float(j) * step_size
                predicted_value = stump_classify(data_matrix, i, threshold, inequal)
                error_array = np.mat(np.ones((m, 1)))
                error_array[predicted_value == label_mat] = 0
                weighted_error = float(d_array.T * error_array)
                print("split: dim %d, threshold %.2f, thresh inequal: %s, the weighted error is %.3f" % (
                    i, threshold, inequal, weighted_error))
                if weighted_error < min_error:
                    min_error = weighted_error
                    best_class = predicted_value.copy()
                    best_stump['dim'] = i
                    best_stump['threshold'] = threshold
                    best_stump['ineq'] = inequal
    return best_stump, min_error, best_class


def adaboost_train(data_array, class_labels, iter_num=40):
    weak_class_array = []
    m = np.shape(data_array)[0]
    d_array = np.mat(np.ones((m, 1)) / m)  # start with equal weight on every sample
    agg_classes = np.mat(np.zeros((m, 1)))
    for i in range(iter_num):
        best_stump, error, classes = build_stump(data_array, class_labels, d_array)
        print("weight array D: ", d_array.T)
        # alpha is the vote weight of this weak classifier
        alpha = float(0.5 * np.log((1.0 - error) / max(error, 1e-16)))
        best_stump['alpha'] = alpha
        weak_class_array.append(best_stump)
        print("classes: ", classes.T)
        # raise the weights of misclassified samples, lower those of correct ones
        expon = np.multiply(-1 * alpha * np.mat(class_labels).T, classes)
        d_array = np.multiply(d_array, np.exp(expon))
        d_array = d_array / d_array.sum()
        # running sum of the weighted votes of all stumps so far
        agg_classes += alpha * classes
        print("agg_classes: ", agg_classes)
        agg_error = np.multiply(np.sign(agg_classes) != np.mat(class_labels).T, np.ones((m, 1)))
        error_rate = agg_error.sum() / m
        print("total error rate: ", error_rate)
        if error_rate == 0.0:
            break
    return weak_class_array


if __name__ == '__main__':
    data_mat, class_labels = load_simple_data()
    adaboost_train(data_mat, class_labels)
Test output:
...
agg_classes: [[ 1.17568763]
[ 2.56198199]
[-0.77022252]
[-0.77022252]
[ 0.61607184]]
total error rate: 0.0
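Taking the sign of the final agg_classes gives [1, 1, -1, -1, 1], which matches class_labels exactly, so the total error rate reaches 0.0 and training stops after the third round; weak_class_array then holds three stumps, each with its own alpha.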
Testing the Algorithm: Classification with AdaBoost
Complete code:
import numpy as np


def load_simple_data():
    data_mat = np.matrix([[1., 2.1],
                          [2., 1.1],
                          [1.3, 1.],
                          [1., 1.],
                          [2., 1.]])
    class_labels = [1.0, 1.0, -1.0, -1.0, 1.0]
    return data_mat, class_labels


def stump_classify(data_matrix, dimension, threshold, thresh_ineq):
    return_array = np.ones((np.shape(data_matrix)[0], 1))
    if thresh_ineq == 'lt':
        return_array[data_matrix[:, dimension] <= threshold] = -1.0
    else:
        return_array[data_matrix[:, dimension] > threshold] = -1.0
    return return_array


def build_stump(data_array, class_labels, d_array):
    data_matrix = np.mat(data_array)
    label_mat = np.mat(class_labels).T
    m, n = np.shape(data_matrix)
    step_num = 10
    best_stump = {}
    best_class = np.mat(np.zeros((m, 1)))
    min_error = np.inf
    for i in range(n):
        range_min = data_matrix[:, i].min()
        range_max = data_matrix[:, i].max()
        step_size = (range_max - range_min) / step_num
        for j in range(-1, step_num + 1):
            for inequal in ['lt', 'gt']:
                threshold = range_min + float(j) * step_size
                predicted_value = stump_classify(data_matrix, i, threshold, inequal)
                error_array = np.mat(np.ones((m, 1)))
                error_array[predicted_value == label_mat] = 0
                weighted_error = float(d_array.T * error_array)
                # print("split: dim %d, threshold %.2f, thresh inequal: %s, the weighted error is %.3f" % (
                #     i, threshold, inequal, weighted_error))
                if weighted_error < min_error:
                    min_error = weighted_error
                    best_class = predicted_value.copy()
                    best_stump['dim'] = i
                    best_stump['threshold'] = threshold
                    best_stump['ineq'] = inequal
    return best_stump, min_error, best_class


def adaboost_train(data_array, class_labels, iter_num=40):
    weak_class_array = []
    m = np.shape(data_array)[0]
    d_array = np.mat(np.ones((m, 1)) / m)
    agg_classes = np.mat(np.zeros((m, 1)))
    for i in range(iter_num):
        best_stump, error, classes = build_stump(data_array, class_labels, d_array)
        # print("weight array D: ", d_array.T)
        # alpha is the vote weight of this weak classifier
        alpha = float(0.5 * np.log((1.0 - error) / max(error, 1e-16)))
        best_stump['alpha'] = alpha
        weak_class_array.append(best_stump)
        # print("classes: ", classes.T)
        expon = np.multiply(-1 * alpha * np.mat(class_labels).T, classes)
        d_array = np.multiply(d_array, np.exp(expon))
        d_array = d_array / d_array.sum()
        agg_classes += alpha * classes
        # print("agg_classes: ", agg_classes)
        agg_error = np.multiply(np.sign(agg_classes) != np.mat(class_labels).T, np.ones((m, 1)))
        error_rate = agg_error.sum() / m
        # print("total error rate: ", error_rate)
        if error_rate == 0.0:
            break
    return weak_class_array


def adaboost_classify(data, classifier_array):
    data_matrix = np.mat(data)
    m = np.shape(data_matrix)[0]
    agg_classes = np.mat(np.zeros((m, 1)))
    for i in range(len(classifier_array)):
        # apply each trained stump and accumulate its weighted vote
        classes = stump_classify(data_matrix, classifier_array[i]['dim'],
                                 classifier_array[i]['threshold'], classifier_array[i]['ineq'])
        agg_classes += classifier_array[i]['alpha'] * classes
        print("result after classifier %d: %s" % (i, agg_classes))
    result_classes = np.sign(agg_classes)  # the predicted label is the sign of the weighted sum
    print("final result: ", result_classes)
    return result_classes


if __name__ == '__main__':
    data_mat, class_labels = load_simple_data()
    classifier_array = adaboost_train(data_mat, class_labels)
    adaboost_classify([[5, 5], [0, 0]], classifier_array)
Output:
result after classifier 0: [[ 0.69314718]
[-0.69314718]]
result after classifier 1: [[ 1.66610226]
[-1.66610226]]
result after classifier 2: [[ 2.56198199]
[-2.56198199]]
final result: [[ 1.]
[-1.]]
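Both test points come out as expected: [5, 5] is classified as +1 and [0, 0] as -1. Note how the magnitude of agg_classes grows as each weak classifier adds its weighted vote; only the sign of the final sum is taken as the predicted class, while its magnitude can be read as the ensemble's confidence.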