import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
# 參考《統計學習方法》 P138
class AdaBoost:
    """AdaBoost binary classifier built from single-feature threshold stumps.

    Follows the algorithm in "Statistical Learning Methods", pp. 138-141.
    Labels are expected to be +1 / -1: every weak classifier emits +1 or -1
    and ``predict`` returns the sign of the weighted vote.

    Args:
        n_estimators: Maximum number of boosting rounds (weak classifiers).
        learning_rate: Step between candidate thresholds when a feature is
            scanned for the best stump. Despite the name, it is not a
            shrinkage factor applied to the classifier coefficients.
    """

    # Errors are clipped to [_EPS, 1 - _EPS] before alpha is computed so that
    # a perfect (or perfectly wrong) stump yields a finite coefficient.
    _EPS = 1e-10

    def __init__(self, n_estimators=50, learning_rate=1.0):
        self.clf_num = n_estimators
        self.learning_rate = learning_rate

    def init_args(self, datasets, labels):
        """Store the training data and reset all boosting state.

        Args:
            datasets: 2-D array-like of shape (samples, features).
            labels: 1-D array-like of +1 / -1 labels, one per sample.
        """
        self.X = np.asarray(datasets)  # feature space
        self.Y = np.asarray(labels)  # label space
        self.M, self.N = self.X.shape  # number of samples, number of features
        # Fitted weak classifiers, stored as (axis, threshold, direction).
        self.clf_sets = []
        # Sample weight distribution, initially uniform.
        self.weights = np.full(self.M, 1.0 / self.M)
        # Coefficient alpha of each weak classifier G(x).
        self.alpha = []

    def _G(self, features, labels, weights):
        """Find the threshold stump with the lowest weighted error on one feature.

        Candidate thresholds are ``min + learning_rate * i``; a candidate equal
        to one of the feature values is skipped. For each candidate both
        polarities are evaluated ("positive": x > v -> +1, otherwise the
        reverse) and the one with the smaller weighted error is kept.
        See "Statistical Learning Methods", pp. 140-141.

        Args:
            features: 1-D array-like holding a single feature column.
            labels: 1-D array-like of labels aligned with ``features``.
            weights: 1-D array-like of sample weights aligned with ``features``.

        Returns:
            ``(best_v, direct, error, compare_array)``: the best threshold, its
            direction tag, its weighted error and the labels it predicts for
            the training samples. When no candidate threshold exists this is
            ``(0.0, None, inf, None)``.
        """
        features = np.asarray(features)
        labels = np.asarray(labels)
        weights = np.asarray(weights, dtype=float)

        error = np.inf
        best_v = 0.0
        best_direct, compare_array = None, None

        step = self.learning_rate
        features_min = features.min()
        features_max = features.max()
        n_step = int((features_max - features_min + step) // step)

        for i in range(1, n_step):
            v = features_min + step * i
            if np.any(features == v):
                continue

            # Labels produced by the "positive" stump; the other polarity is
            # simply the element-wise negation.
            positive = np.where(features > v, 1, -1)
            error_positive = weights[positive != labels].sum()
            error_negative = weights[-positive != labels].sum()

            if error_positive < error_negative:
                weight_error, candidate, direct = error_positive, positive, "positive"
            else:
                # "nagetive" (sic) is the tag stored in clf_sets; kept as-is so
                # previously recorded classifiers stay valid.
                weight_error, candidate, direct = error_negative, -positive, "nagetive"

            if weight_error < error:
                error = weight_error
                compare_array = candidate
                best_v = v
                # Record the direction together with the threshold it belongs
                # to, so a later, worse candidate cannot overwrite it.
                best_direct = direct

        return best_v, best_direct, error, compare_array

    def _alpha(self, error):
        """Return the classifier coefficient 0.5 * ln((1 - error) / error).

        ``error`` is clipped to [_EPS, 1 - _EPS] first, so an error of exactly
        0 or 1 gives a large finite value instead of a division by zero.
        """
        error = np.clip(error, self._EPS, 1 - self._EPS)
        return 0.5 * np.log((1 - error) / error)

    def _Z(self, weights, a, clf):
        """Return the normalisation factor Z.

        See "Statistical Learning Methods", p. 139, formula (8.5).

        Args:
            weights: Current sample weights.
            a: Coefficient alpha of the weak classifier.
            clf: Labels the weak classifier predicts for the training samples.
        """
        weights = np.asarray(weights, dtype=float)
        return np.sum(weights * np.exp(-a * np.asarray(self.Y) * np.asarray(clf)))

    def _w(self, a, clf, Z):
        """Update ``self.weights`` for the next round.

        See "Statistical Learning Methods", p. 139, formula (8.4).
        """
        weights = np.asarray(self.weights, dtype=float)
        self.weights = weights * np.exp(-a * np.asarray(self.Y) * np.asarray(clf)) / Z

    def G(self, x, v, direct):
        """Apply one threshold stump to a single feature value.

        Returns +1 or -1; any ``direct`` other than "positive" is treated as
        the reversed polarity.
        """
        if direct == "positive":
            return 1 if x > v else -1
        return -1 if x > v else 1

    def fit(self, X, y):
        """Train up to ``n_estimators`` stumps on ``X`` / ``y``.

        Boosting stops early once a stump classifies every training sample
        correctly: the re-weighting would leave the distribution unchanged, so
        further rounds would only repeat that stump.

        Raises:
            ValueError: If no feature offers a usable candidate threshold for
                the configured ``learning_rate`` step.
        """
        self.init_args(X, y)

        for _ in range(self.clf_num):
            best_clf_error = np.inf
            best_v = final_direct = clf_result = axis = None

            # Pick the feature whose best stump has the lowest weighted error.
            for j in range(self.N):
                v, direct, error, compare_array = self._G(
                    self.X[:, j], self.Y, self.weights
                )
                if error < best_clf_error:
                    best_clf_error = error
                    best_v = v
                    final_direct = direct
                    clf_result = compare_array
                    axis = j
                if best_clf_error == 0:
                    break

            if clf_result is None:
                raise ValueError(
                    "no candidate threshold found for any feature; "
                    "try a smaller learning_rate"
                )

            # Coefficient of this round's classifier.
            a = self._alpha(best_clf_error)
            self.alpha.append(a)
            self.clf_sets.append((axis, best_v, final_direct))

            if best_clf_error == 0:
                break

            Z = self._Z(self.weights, a, clf_result)
            self._w(a, clf_result, Z)

    def predict(self, feature):
        """Classify one sample as +1 or -1 by the weighted vote of all stumps.

        See "Statistical Learning Methods", p. 139, formula (8.6). A vote that
        is not strictly positive (including an empty ensemble) yields -1.
        """
        result = 0.0
        for a, (axis, clf_v, direct) in zip(self.alpha, self.clf_sets):
            result += a * self.G(feature[axis], clf_v, direct)
        return 1 if result > 0 else -1

    def score(self, X_test, y_test):
        """Return the fraction of samples in ``X_test`` predicted correctly.

        Raises:
            ZeroDivisionError: If ``X_test`` is empty.
        """
        right_count = sum(
            1
            for i, feature in enumerate(X_test)
            if self.predict(feature) == y_test[i]
        )
        return right_count / len(X_test)
# 數據集
def create_data():
    """Build a two-feature, two-label training set from the iris data.

    Keeps the first 100 rows and the columns 'sepal length', 'sepal width'
    and 'label', then rewrites every label equal to 0 as -1.

    Returns:
        ``(X, y)``: the two feature columns and the label column of the
        selected rows.
    """
    iris = load_iris()
    frame = pd.DataFrame(iris.data, columns=iris.feature_names)
    frame['label'] = iris.target
    frame.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']

    subset = np.array(frame.iloc[:100, [0, 1, -1]])
    # `targets` is a view into `subset`, so the masked assignment relabels
    # the underlying array in place.
    targets = subset[:, -1]
    targets[targets == 0] = -1
    return subset[:, :2], targets
# Demo run: load the data, split it, train AdaBoost and print the test accuracy.
X, y= create_data()
# No random_state is passed, so the split (and the printed score) can differ
# between runs.
# NOTE(review): train_size=0.33 trains on about a third of the samples and
# evaluates on the rest — confirm that test_size=0.33 was not intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.33)
# Default settings: n_estimators=50, learning_rate=1.0.
clf = AdaBoost()
clf.fit(X_train, y_train)
# Fraction of held-out samples classified correctly.
print(clf.score(X_test, y_test))