Data Preparation
This post implements the binary-classification AdaBoost algorithm on the two-class dataset mnist_binary.csv. Since the raw feature values lie in 0-255, the thresholds of AdaBoost's basic classifiers would be spread over a wide range; the data is therefore binarized to 0/1 first, so that each threshold is chosen from just three candidates, [-0.5, 0.5, 1.5]. The binarization is done inside the code, so no separate dataset is generated.
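As a quick sanity check of that claim, here is how the threshold grid comes out for 0/1 features (the grid construction is the same expression used in the AdaBoost class below; the toy array is illustrative):

import numpy as np

X = np.array([[0, 1], [1, 0]])  # features after binarization
thresholds = np.arange(np.min(X) - 0.5, np.max(X) + 0.51, 1)
print(thresholds)  # [-0.5  0.5  1.5]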
The AdaBoost Algorithm
The idea behind AdaBoost is fairly simple: combine many weak classifiers into one strong classifier, where the weak classifiers are learned one after another by adjusting the weight distribution over the training data. This kind of additive-model approach is called a boosting method.
The book gives a clear account of AdaBoost's divide-and-conquer idea and its boosting process.
The steps of AdaBoost are clear-cut and the underlying principle is simple.
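Restated in the book's notation (Algorithm 8.1 of 《统计学习方法》), the procedure for training data $\{(x_i, y_i)\}_{i=1}^{N}$ with $y_i \in \{-1, +1\}$ is:

1. Initialize the weight distribution $D_1 = (w_{11}, \dots, w_{1N})$ with $w_{1i} = 1/N$.
2. For $m = 1, 2, \dots, M$:
   - Fit a basic classifier $G_m(x)$ on the weighted data and compute its weighted error $e_m = \sum_{i=1}^{N} w_{mi} \, I(G_m(x_i) \ne y_i)$.
   - Compute its coefficient $\alpha_m = \frac{1}{2} \ln \frac{1 - e_m}{e_m}$.
   - Update the weights: $w_{m+1,i} = \frac{w_{mi}}{Z_m} \exp(-\alpha_m y_i G_m(x_i))$, where the normalizer $Z_m = \sum_{i=1}^{N} w_{mi} \exp(-\alpha_m y_i G_m(x_i))$ keeps $D_{m+1}$ a distribution.
3. Output the final classifier $G(x) = \mathrm{sign}\big(\sum_{m=1}^{M} \alpha_m G_m(x)\big)$.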
The full code implementation is as follows:
# @Author: phd
# @Date: 2019-11-08
# @Site: github.com/phdsky
# @Description: NULL
import time
import logging
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Binarizer
def log(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        ret = func(*args, **kwargs)
        end_time = time.time()
        logging.debug("%s() cost %s seconds" % (func.__name__, end_time - start_time))
        return ret
    return wrapper
def calc_accuracy(y_pred, y_truth):
assert len(y_pred) == len(y_truth)
n = len(y_pred)
hit_count = 0
for i in range(0, n):
if y_pred[i] == y_truth[i]:
hit_count += 1
print("Accuracy %f\n" % (hit_count / n))
return hit_count / n
def sign(x):
    # Treat 0 as positive so predictions are always in {-1, +1}
    if x >= 0:
        return 1
    else:
        return -1
class AdaBoost(object):
    def __init__(self, X_train, y_train, max_classifiers):
        self.X = X_train
        self.Y = y_train
        self.sample_num = len(X_train)  # number of samples
        self.feature_num = len(X_train[0])  # number of features
        self.D = np.full(self.sample_num, 1. / self.sample_num)  # sample weight distribution
        self.M = max_classifiers  # max number of basic classifiers
        self.axis = np.full(self.M, -1)  # feature axis with the min ei, per classifier
        self.alpha = np.zeros(self.M)  # coefficient of each basic classifier
        self.Gm = np.zeros(self.M)  # threshold of each basic classifier
        self.thresh_array = np.arange(np.min(self.X) - 0.5, np.max(self.X) + 0.51, 1)
        self.direction = np.full(self.M, -1)  # inequality direction of each classifier
def basic_classifier(self, threshold, value, direction):
if direction == 0:
if value < threshold:
return 1
else:
return -1
elif direction == 1:
if value > threshold:
return 1
else:
return -1
        else:
            print("Unknown classifier direction: %d!" % direction)
def train_basic_classifier(self, classifier):
# After binarization, the value is 0 ~ 1, so the
# threshold should be [-0.5, 0.5, 1.5]
# For multi dimensional data, choose the axis which
# has the min ei value to take part in decision
        min_ei = self.sample_num  # upper bound: the weights sum to 1, so any real ei is smaller
selected_axis = -1
threshold = self.thresh_array[-1] + 1
direction_array = [0, 1]
direction = -1
for axis in range(self.feature_num):
for th in self.thresh_array:
axis_vector = self.X[:, axis]
thresh_vector = np.full(self.sample_num, th)
for direct in direction_array:
                    # Use vectorized comparison for speed: {False, True} -> {0, 1} -> {-1, +1}
                    if direct == 0:
                        compare_vector = np.asarray(axis_vector < thresh_vector, dtype=int) * 2 - 1
                    elif direct == 1:
                        compare_vector = np.asarray(axis_vector > thresh_vector, dtype=int) * 2 - 1
                    # Weighted error: sum of D[i] over misclassified samples
                    calc_ei = np.sum((compare_vector != self.Y) * self.D)
# calc_ei = 0.
# for sample in range(self.sample_num):
# calc_ei += self.D[sample]*\
# int(self.basic_classifier(thresh, self.X[sample][axis]) != self.Y[sample])
if calc_ei < min_ei:
min_ei = calc_ei
selected_axis = axis
threshold = th
direction = direct
self.axis[classifier] = selected_axis
self.Gm[classifier] = threshold
self.direction[classifier] = direction
return min_ei
@log
def train(self):
m = 0
while m < self.M:
print("Training %d classifier..." % m)
# Train basic classifier and classify error
ei = self.train_basic_classifier(classifier=m)
            # Calculate alpha value (clamp ei away from 0 to avoid log blowup on a perfect stump)
            ei = max(ei, 1e-16)
            self.alpha[m] = 0.5 * np.log((1 - ei) / ei)
# Validate training
train_label = self.predict(X_test=self.X, classifier_number=(m + 1))
accuracy = calc_accuracy(train_label, self.Y)
if accuracy == 1.:
print("Fitting perfect on training set!")
return m + 1
# Calculate regulator
Zm = 0.
for i in range(self.sample_num):
Zm += self.D[i] * np.exp(-self.alpha[m]*self.Y[i] *
self.basic_classifier(self.Gm[m], self.X[i][self.axis[m]], self.direction[m]))
# Update weight distribution
for i in range(self.sample_num):
self.D[i] = self.D[i] * np.exp(-self.alpha[m]*self.Y[i] *
self.basic_classifier(self.Gm[m], self.X[i][self.axis[m]], self.direction[m])) / Zm
m += 1
return m
# @log
def predict(self, X_test, classifier_number):
n = len(X_test)
predict_label = np.full(n, -1)
for i in range(n):
to_predict = X_test[i]
result = 0.
for m in range(classifier_number):
result += self.alpha[m] * self.basic_classifier(self.Gm[m], to_predict[self.axis[m]], self.direction[m])
predict_label[i] = sign(result)
return predict_label
def example_large():
mnist_data = pd.read_csv("../data/mnist_binary.csv")
mnist_values = mnist_data.values
images = mnist_values[::, 1::]
labels = mnist_values[::, 0]
X_train, X_test, y_train, y_test = train_test_split(
images, labels, test_size=0.33, random_state=42
)
    # Binarize the images so the AdaBoost thresholds stay simple
    # Binarizer is stateless, so fit() is a no-op and one instance serves both splits
    binarizer = Binarizer(threshold=127).fit(X_train)
    X_train_binary = binarizer.transform(X_train)
    X_test_binary = binarizer.transform(X_test)
    adaboost = AdaBoost(X_train=X_train_binary, y_train=y_train, max_classifiers=233)
print("AdaBoost training...")
classifier_trained = adaboost.train()
print("\nTraining done...")
print("\nTraining done with %d classifiers!" % classifier_trained)
print("Testing on %d samples..." % len(X_test))
y_predicted = adaboost.predict(X_test=X_test_binary, classifier_number=classifier_trained)
calc_accuracy(y_pred=y_predicted, y_truth=y_test)
def example_small():
X_train = np.asarray([[0], [1], [2], [3], [4], [5], [6], [7], [8], [9]])
y_train = np.asarray([1, 1, 1, -1, -1, -1, 1, 1, 1, -1])
    adaboost = AdaBoost(X_train=X_train, y_train=y_train, max_classifiers=5)
print("Adaboost training...")
classifier_trained = adaboost.train()
print("\nTraining done with %d classifiers!" % classifier_trained)
if __name__ == "__main__":
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
# example_large()
example_small()
A few points worth noting from the implementation:
- Note that the weak classifier's inequality in the book has a direction. I missed this at first, so after writing the algorithm the accuracy sat at a mediocre level and the expected boosting never appeared. I then reproduced the book's worked example for comparison and found the last step didn't match; a closer look showed the last weak classifier's sign was reversed. After giving each classifier a direction flag and rerunning, the results matched expectations.
- The code involves a lot of comparisons, so they are mostly implemented in vectorized form for speed; otherwise training would be far too slow. See the short sketch after this list.
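A minimal sketch of that vectorized error computation (the variable names here are illustrative, not taken from the code above):

import numpy as np

axis_values = np.array([0., 1., 2., 3.])  # one feature column
y = np.array([1, 1, -1, -1])
D = np.full(4, 0.25)                      # current weight distribution

# direction 0: predict +1 when the value is below the threshold
pred = np.asarray(axis_values < 1.5, dtype=int) * 2 - 1  # {False, True} -> {-1, +1}
ei = np.sum((pred != y) * D)                             # weighted error; 0.0 here

The whole column is classified and scored in a handful of NumPy operations, replacing the per-sample Python loop left commented out in train_basic_classifier.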
Output:
/Users/phd/Softwares/anaconda3/bin/python /Users/phd/Desktop/ML/boosting/adaboost.py
AdaBoost training...
Training 0 classifier...
Accuracy 0.661194
Training 1 classifier...
Accuracy 0.661194
Training 2 classifier...
Accuracy 0.682836
Training 3 classifier...
Accuracy 0.702807
Training 4 classifier...
Accuracy 0.706681
Training 5 classifier...
Accuracy 0.719829
Training 6 classifier...
Accuracy 0.718728
Training 7 classifier...
Accuracy 0.727505
Training 8 classifier...
Accuracy 0.741294
Training 9 classifier...
Accuracy 0.741649
Training 10 classifier...
Accuracy 0.755508
Training 11 classifier...
Accuracy 0.761443
Training 12 classifier...
Accuracy 0.762864
Training 13 classifier...
Accuracy 0.763291
Training 14 classifier...
Accuracy 0.765601
Training 15 classifier...
Accuracy 0.763042
Training 16 classifier...
Accuracy 0.770860
Training 17 classifier...
Accuracy 0.771357
Training 18 classifier...
Accuracy 0.769581
Training 19 classifier...
Accuracy 0.774982
Training 20 classifier...
Accuracy 0.777967
Training 21 classifier...
Accuracy 0.776048
Training 22 classifier...
Accuracy 0.778820
Training 23 classifier...
Accuracy 0.778216
Training 24 classifier...
Accuracy 0.777683
Training 25 classifier...
Accuracy 0.781414
Training 26 classifier...
Accuracy 0.780242
Training 27 classifier...
Accuracy 0.778536
Training 28 classifier...
Accuracy 0.783440
Training 29 classifier...
Accuracy 0.781485
Training 30 classifier...
Accuracy 0.784790
Training 31 classifier...
Accuracy 0.784009
Training 32 classifier...
Accuracy 0.787456
Training 33 classifier...
Accuracy 0.785679
Training 34 classifier...
Accuracy 0.789197
Training 35 classifier...
Accuracy 0.785537
Training 36 classifier...
Accuracy 0.792502
Training 37 classifier...
Accuracy 0.787918
Training 38 classifier...
Accuracy 0.793888
Training 39 classifier...
Accuracy 0.789552
Training 40 classifier...
Accuracy 0.792324
Training 41 classifier...
Accuracy 0.791720
Training 42 classifier...
Accuracy 0.795060
Training 43 classifier...
Accuracy 0.794812
Training 44 classifier...
Accuracy 0.797477
Training 45 classifier...
Accuracy 0.797157
Training 46 classifier...
Accuracy 0.798436
Training 47 classifier...
Accuracy 0.798045
Training 48 classifier...
Accuracy 0.800391
Training 49 classifier...
Accuracy 0.799112
Training done...
Training done with 50 classifiers!
Testing on 13860 samples...
DEBUG:root:train() cost 266.6678547859192 seconds
Accuracy 0.794444
Process finished with exit code 0
The output shows that AdaBoost really does exhibit a "boosting" process, and its speed is acceptable. On the binarized binary-classification task the accuracy is roughly on par with the earlier models, which is passable; note, though, that toward the end training additional weak classifiers improves the result only marginally.
Boosting Trees
Both AdaBoost and boosting trees are essentially instantiations of one scheme: the algorithm is the forward stagewise algorithm and the model is an additive model. Moreover, the basic classifier used by AdaBoost above is just a simple decision tree whose root connects directly to two leaf nodes (a decision stump), so the two training procedures look very much alike.
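In symbols, the additive model and its forward stagewise fitting are:

$$f(x) = \sum_{m=1}^{M} \beta_m \, b(x; \gamma_m)$$

and at step $m$, holding $f_{m-1}$ fixed,

$$(\beta_m, \gamma_m) = \arg\min_{\beta, \gamma} \sum_{i=1}^{N} L\big(y_i, \; f_{m-1}(x_i) + \beta \, b(x_i; \gamma)\big), \qquad f_m(x) = f_{m-1}(x) + \beta_m \, b(x; \gamma_m).$$

With the exponential loss $L(y, f(x)) = \exp(-y f(x))$ and stumps as the basis functions $b(x; \gamma)$, this recovers exactly the AdaBoost updates implemented above.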
Depending on the loss function, what the boosting tree fits at each step differs:
- If the loss is the squared loss (regression problems) or the exponential loss (classification problems), then at each step it suffices to fit the residuals of the current model; the regression boosting tree with squared loss proceeds this way (see the sketch after this list).
- If the loss is a general loss function (general decision problems), the optimization instead takes a gradient view: the negative gradient of the loss at the current model is used to approximate the residual, which gives the gradient boosting tree algorithm.
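As a sketch of the residual-fitting idea, here is a toy one-feature regression booster with stumps; fit_stump, boost, and the split-point grid are all illustrative, not the book's pseudocode. For the squared loss the negative gradient $y - f(x)$ is exactly the residual, which is why the two cases meet there:

import numpy as np

def fit_stump(X, r):
    # Fit a one-split regression stump to the residuals r;
    # returns (threshold, left value, right value)
    best = None
    for th in np.unique(X) + 0.5:  # candidate split points between samples
        left, right = r[X < th], r[X >= th]
        if len(left) == 0 or len(right) == 0:
            continue
        pred = np.where(X < th, left.mean(), right.mean())
        err = np.sum((r - pred) ** 2)  # squared error on the residuals
        if best is None or err < best[0]:
            best = (err, th, left.mean(), right.mean())
    return best[1:]

def boost(X, y, M=6):
    f = np.zeros_like(y, dtype=float)  # f_0(x) = 0
    stumps = []
    for m in range(M):
        r = y - f                      # residual = negative gradient of squared loss
        th, cl, cr = fit_stump(X, r)
        f += np.where(X < th, cl, cr)  # f_m(x) = f_{m-1}(x) + T(x; theta_m)
        stumps.append((th, cl, cr))
    return stumps, f

# Toy data in the style of the book's regression boosting example
stumps, f = boost(np.arange(1., 11.), np.array([5.56, 5.70, 5.91, 6.40, 6.80, 7.05, 8.90, 8.70, 9.00, 9.05]))

For a general loss, only the line computing r changes: it becomes the negative gradient of the loss evaluated at the current model, which the next stump then fits.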
Summary
References
- 李航. 《统计学习方法》 (Statistical Learning Methods).