迴歸樹
統計學習的部分也差不多該結束了,我希望以當前效果最好的一種統計學習模型,Xgboost的原型GBDT來結尾。
GBDT的基礎是CART決策樹。在CART基學習器上使用boosting,形成更好的集成學習器,就是GBDT的思想。CART在離散特徵上的表現並不特別,也就是把我們之前學過的C4.5樹用基尼係數劃分。但在連續特徵上使用樹算法進行擬合迴歸就並沒有那麼輕鬆,一是劃分標準不容易確定,二是決策樹的本質決定了決策樹容易過擬合。但總之,我們還是先嚐試實現用CART樹來做常見的迴歸預測。
CART迴歸樹預測迴歸連續型數據,假設X與Y分別是輸入和輸出變量,並且Y是連續變量。在訓練數據集所在的輸入空間中,遞歸地將每個區域劃分爲兩個子區域並決定每個子區域上的輸出值,構建二叉決策樹。
考慮對一組數據,尋找一個合適的常數C表徵數據讓MSE最小,則C就是樣本均值。由此,迴歸樹的生成方法就是遍歷所有N個樣本的N-1個劃分點,計算劃分後兩類數據的MSE。找到最優劃分處。反覆迭代劃分就是最小二乘迴歸樹。
import numpy as np
import random
from copy import deepcopy
import matplotlib
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')
首先生成數據用於迴歸
# Generate 1-D training data: y = 10*log(x) sampled on [1, 5].
X_train = np.linspace(1,5,100)
y_train = 10*np.log(X_train)
# Add standard-normal noise to the targets.
y_train += np.random.randn(100)
plt.scatter(X_train,y_train)
# Reshape to (N, 1) so the tree code can treat X as an (N, M) matrix.
X_train = X_train.reshape(-1,1)
class Node:
    """A node of a CART regression tree grown by least-squares splitting."""

    def __init__(self):
        self.dim = None    # feature dimension used for the split
        self.mid = None    # split threshold (midpoint of two consecutive sorted values)
        self.y = None      # mean of the training targets; the leaf prediction
        self.left = None   # child for samples with x[dim] <  mid
        self.right = None  # child for samples with x[dim] >= mid

    def grow(self, X, y, max_depth=5):
        """Recursively fit this subtree on (X, y).

        X : (N, M) feature matrix; y : (N,) targets.
        max_depth : number of levels still allowed to grow below this node.
        """
        # Every node stores the target mean so it can act as a leaf.
        self.y = np.mean(y)
        N, M = X.shape
        # Stop: depth exhausted or too few samples to split.
        if max_depth == 0 or N < 5:
            return
        best = None
        best_err = float("inf")
        # Scan every midpoint between consecutive sorted values in every
        # dimension for the split minimising the summed within-half SSE.
        for dim in range(M):
            order = np.argsort(X[:, dim])
            X_dim = X[order, dim]
            y_sorted = y[order]
            for ii in range(N - 1):
                # Equal neighbouring values yield no threshold strictly
                # between them; such a candidate would not reproduce the
                # partition whose error we are about to measure — skip it.
                if X_dim[ii] == X_dim[ii + 1]:
                    continue
                mid = 0.5 * (X_dim[ii] + X_dim[ii + 1])
                left_mean = np.mean(y_sorted[:ii + 1])
                err = np.sum((y_sorted[:ii + 1] - left_mean) ** 2)
                right_mean = np.mean(y_sorted[ii + 1:])
                err += np.sum((y_sorted[ii + 1:] - right_mean) ** 2)
                if err < best_err:
                    best_err = err
                    best = (dim, mid)
        # All samples identical in every dimension: no valid split exists.
        if best is None:
            return
        self.dim, self.mid = best
        mask = X[:, self.dim] < self.mid
        X_left, y_left = X[mask], y[mask]
        X_right, y_right = X[~mask], y[~mask]
        if len(X_left) == 0 or len(X_right) == 0:
            return
        self.left = Node()
        self.left.grow(X_left, y_left, max_depth - 1)
        self.right = Node()
        self.right.grow(X_right, y_right, max_depth - 1)

    def predict(self, x):
        """Predict one sample x: descend while children exist, else return the mean."""
        if self.left is None:
            return self.y
        if x[self.dim] < self.mid:
            return self.left.predict(x)
        return self.right.predict(x)
# Fit a depth-3 regression tree and plot its piecewise-constant fit.
tree = Node()
tree.grow(X_train,y_train,3)
y_pred = [tree.predict(x) for x in X_train]
X_train = X_train.reshape(-1)  # back to 1-D for plotting
plt.scatter(X_train,y_train,c='y')
plt.plot(X_train,y_pred,c='r')
X_train = X_train.reshape(-1,1)  # restore (N, 1) shape for later reuse
嘗試多維曲面擬合,這裏用3dplot打印
# Build a 2-D toy regression set on a 10x10 grid over [0, 2]^2.
# NOTE(review): the original comment claimed f(x1,x2) = sin(x1)cos(x2),
# but the code below actually fits f(x1,x2) = (x1 + x2)^2.
row = np.linspace(0,2,10)
col = np.linspace(0,2,10)
x1,x2 = np.meshgrid(row,col)
X = np.concatenate((x1.reshape(1,-1),x2.reshape(1,-1)),axis=0).T  # (100, 2)
y = np.zeros(100)
for i in range(len(X)):
    y[i] = (X[i][0]+X[i][1])**2
tree = Node()
tree.grow(X,y,8)
# Plot the surface learned by the tree on a dense 100x100 grid.
from mpl_toolkits.mplot3d.axes3d import Axes3D
from matplotlib import pyplot as plt
xx = np.linspace(0,2,100)
yy = np.linspace(0,2,100)
X,Y = np.meshgrid(xx,yy)
Z = np.zeros(X.shape)
for i in range(len(X)):
    for j in range(len(X[0])):
        x = np.array((X[i][j],Y[i][j]))
        Z[i][j] = tree.predict(x)
fig = plt.figure()
# NOTE(review): Axes3D(fig) is deprecated in recent matplotlib; prefer
# fig.add_subplot(projection='3d') — confirm against the installed version.
axes3d = Axes3D(fig)
axes3d.plot_surface(X,Y,Z,color='grey')
調整深度參數和剪枝可以有效控制過擬合。如果想要更精確,更強大的預測,我們有一種藉助boosting提升的方法,就是下面的GBDT,Gradient Boosting Decision Tree:梯度提升決策樹
GBDT
有了迴歸樹做基學習器,就能用集成方法增強樹模型。Freidman提出了梯度提升算法:利用最速下降的近似方法,即利用損失函數的負梯度在當前模型的值,作爲迴歸問題中提升樹算法的殘差的近似值,擬合一個迴歸樹。(注:鄙人私以爲,與其說負梯度作爲殘差的近似值,不如說殘差是負梯度的一種特例)算法如下(截圖來自《The Elements of Statistical Learning》)
import numpy as np
import random
from copy import deepcopy
import matplotlib
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')
class Node:
    """A node of a CART regression tree grown by least-squares splitting.

    This copy stops at N < 2 (rather than N < 5) so GBDT base trees can
    fit small residual groups.
    """

    def __init__(self):
        self.dim = None    # feature dimension used for the split
        self.mid = None    # split threshold (midpoint of two consecutive sorted values)
        self.y = None      # mean of the training targets; the leaf prediction
        self.left = None   # child for samples with x[dim] <  mid
        self.right = None  # child for samples with x[dim] >= mid

    def grow(self, X, y, max_depth=5):
        """Recursively fit this subtree on (X, y).

        X : (N, M) feature matrix; y : (N,) targets.
        max_depth : number of levels still allowed to grow below this node.
        """
        # Every node stores the target mean so it can act as a leaf.
        self.y = np.mean(y)
        N, M = X.shape
        # Stop: depth exhausted or nothing left to split.
        if max_depth == 0 or N < 2:
            return
        best = None
        best_err = float("inf")
        # Scan every midpoint between consecutive sorted values in every
        # dimension for the split minimising the summed within-half SSE.
        for dim in range(M):
            order = np.argsort(X[:, dim])
            X_dim = X[order, dim]
            y_sorted = y[order]
            for ii in range(N - 1):
                # Equal neighbouring values yield no threshold strictly
                # between them; such a candidate would not reproduce the
                # partition whose error we are about to measure — skip it.
                if X_dim[ii] == X_dim[ii + 1]:
                    continue
                mid = 0.5 * (X_dim[ii] + X_dim[ii + 1])
                left_mean = np.mean(y_sorted[:ii + 1])
                err = np.sum((y_sorted[:ii + 1] - left_mean) ** 2)
                right_mean = np.mean(y_sorted[ii + 1:])
                err += np.sum((y_sorted[ii + 1:] - right_mean) ** 2)
                if err < best_err:
                    best_err = err
                    best = (dim, mid)
        # All samples identical in every dimension: no valid split exists.
        if best is None:
            return
        self.dim, self.mid = best
        mask = X[:, self.dim] < self.mid
        X_left, y_left = X[mask], y[mask]
        X_right, y_right = X[~mask], y[~mask]
        if len(X_left) == 0 or len(X_right) == 0:
            return
        self.left = Node()
        self.left.grow(X_left, y_left, max_depth - 1)
        self.right = Node()
        self.right.grow(X_right, y_right, max_depth - 1)

    def predict(self, x):
        """Predict one sample x: descend while children exist, else return the mean."""
        if self.left is None:
            return self.y
        if x[self.dim] < self.mid:
            return self.left.predict(x)
        return self.right.predict(x)
這裏用均方誤差作爲損失函數,從而梯度就是殘差y-f(x)
class GBDT:
    """Gradient-boosted regression trees with squared-error loss.

    With L(y, f) = (y - f)^2 / 2 the negative gradient is exactly the
    residual y - f(x), so each round fits a new tree to the residuals
    left by the current ensemble.
    """

    def __init__(self, max_num=5, max_depth=6):
        self.num = max_num      # number of boosting rounds (trees)
        self.depth = max_depth  # max depth of each base tree
        self.trees = []

    def fit(self, X, y):
        """Fit self.num trees, each on the residuals of the previous ones."""
        X_train = X.copy()
        # Float copy so residual updates are not truncated on integer targets.
        residual = np.array(y, dtype=float)
        for _ in range(self.num):
            tree = Node()
            tree.grow(X_train, residual, self.depth)
            # Bug fix: subtract the tree's predictions from the residual
            # *targets*; the original subtracted them from the feature
            # matrix (X_train[i] -= ...), corrupting the inputs and never
            # updating the residuals.
            for i in range(len(X_train)):
                residual[i] -= tree.predict(X_train[i])
            self.trees.append(tree)

    def predict(self, x):
        """Predict one sample as the sum of every tree's output."""
        return sum(tree.predict(x) for tree in self.trees)
首先生成數據用於迴歸
# Generate noisy 1-D data: y = log(x) + sin(x) on [1, 5].
X_train = np.linspace(1,5,100)
y_train = np.log(X_train)+np.sin(X_train)
# Add small Gaussian noise.
y_train += np.random.randn(100)*0.1
plt.scatter(X_train,y_train)
X_train = X_train.reshape(-1,1)  # (N, 1) matrix for the tree code
model = GBDT(max_num=20,max_depth=6)
model.fit(X_train,y_train)
# NOTE(review): this plots only the FIRST tree's predictions
# (model.trees[0]); model.predict(x) would show the full ensemble fit.
y_pred = [model.trees[0].predict(x) for x in X_train]
X_train = X_train.reshape(-1)  # flatten for plotting
plt.scatter(X_train,y_train,c='y')
plt.plot(X_train,y_pred,c='r')
X_train = X_train.reshape(-1,1)  # restore (N, 1) shape
可以看出非常強大的擬合能力,而GBDT的潛力遠不止擬合。其具有天然優勢可以發現多種有區分性的特徵以及特徵組合。業界中,Facebook使用其來自動發現有效的特徵、特徵組合,來作爲LR模型中的特徵,以提高 CTR預估(Click-Through Rate Prediction)的準確性(詳見參考文獻);GBDT在淘寶的搜索及預測業務上也發揮了重要作用(詳見參考文獻)。 《Practical Lessons from Predicting Clicks on Ads at Facebook》
另外,GBDT也在分類問題裏發揮作用。只要我們用one-hot編碼類別,然後對每個類別訓練一個GBDT來預測類別的y值,就能近似實現多分類問題。我們用下面這個iris的例子來說明。
# Load the iris dataset and unpack the fields used below.
from sklearn.datasets import load_iris
iris = load_iris()
descr = iris['DESCR']
data = iris['data']                    # feature matrix
feature_names = iris['feature_names']
target = iris['target']                # integer class labels
target_names = iris['target_names']
def one_hot(y):
    """Return the one-hot encoding of integer labels y.

    Output shape is (len(y), max(y) + 1); row i has a 1 in column y[i].
    """
    n_classes = int(np.max(y)) + 1
    encoded = np.zeros((len(y), n_classes))
    for row, label in enumerate(y):
        encoded[row][int(label)] = 1
    return encoded
# One-vs-rest: train one GBDT regressor per class on its one-hot column.
y = one_hot(target)
X = data
y = y.T  # one row of 0/1 targets per class
#對每一個類別(這裏是3個類別)訓練一個GBDT
# (train one GBDT per class -- 3 classes here)
classifiers = []
for y0 in y:
    model = GBDT(max_num=8,max_depth=6)
    model.fit(X,y0)
    classifiers.append(model)
# Predict each sample's class as the argmax over the three per-class
# regressors, then report training accuracy.
# Bug fix: the original wrapped this comprehension in a redundant outer
# `for x in data:` loop, rebuilding the entire prediction list once per
# sample (accidental O(n^2)) and then using only the final copy.
c = [np.argmax(np.array([classifiers[i].predict(x) for i in range(3)])) for x in data]
right = np.sum(np.array(c) == target)
print("Accuracy:", right/len(target))
Accuracy: 0.98
這個結果已經相當不錯了,由此我們可以看出GBDT今天在數據科學領域火爆的幾個原因。一是它的泛用性非常好,既能用於迴歸又能用於分類。再就是訓練快,效果好。並且我們還可以用它篩選特徵。今天大量的互聯網公司和金融公司都在用GBDT做各種各樣的事情。