The GBDT Algorithm Explained, with a Python Implementation

Regression Trees

The statistical learning part of this series is coming to an end, and I want to close with one of the best-performing statistical learning models available today: GBDT, the prototype of XGBoost.
GBDT is built on CART decision trees; applying boosting to CART base learners to form a stronger ensemble is the core idea of GBDT. On discrete features CART is nothing special: it is essentially the C4.5-style tree we studied earlier, except that splits are chosen by the Gini index. Fitting a regression on continuous features with a tree is less straightforward, for two reasons: the split criterion is not obvious, and the very nature of decision trees makes them prone to overfitting. Still, let us first try to use a CART tree for an ordinary regression task.
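For reference, the Gini index used for those discrete splits is the standard impurity measure

$$\mathrm{Gini}(D) = 1 - \sum_{k=1}^{K} p_k^2,$$

where $p_k$ is the fraction of samples in node D that belong to class k.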
A CART regression tree predicts continuous data. Suppose X and Y are the input and output variables and Y is continuous. Over the input space containing the training data, we recursively split each region into two subregions, decide the output value on each subregion, and thereby build a binary decision tree.

Consider a set of values and look for a single constant C that represents them while minimizing the MSE; that C is simply the sample mean. Accordingly, a regression tree is grown by scanning all N-1 candidate split points among the N samples, computing the MSE of the two resulting groups at each, and keeping the best split. Splitting recursively in this way yields the least-squares regression tree.
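Concretely, for a split variable j and split point s defining $R_1(j,s) = \{x \mid x_j \le s\}$ and $R_2(j,s) = \{x \mid x_j > s\}$, the least-squares tree solves

$$\min_{j,\,s}\left[\min_{c_1}\sum_{x_i \in R_1(j,s)}(y_i - c_1)^2 + \min_{c_2}\sum_{x_i \in R_2(j,s)}(y_i - c_2)^2\right],$$

where each inner minimum is attained at the mean of y over the corresponding region.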

import numpy as np
import random
from copy import deepcopy
import matplotlib
from matplotlib import pyplot as plt 
from sklearn.metrics import accuracy_score
import warnings

warnings.filterwarnings('ignore')

First, generate some data for the regression.

X_train = np.linspace(1,5,100)
y_train = 10*np.log(X_train)
#add noise
y_train += np.random.randn(100)

plt.scatter(X_train,y_train)
X_train = X_train.reshape(-1,1)

[Figure: scatter plot of the noisy training data]

class Node:
    def __init__(self):
        self.dim = None   #dimension: feature index used for the split
        self.mid = None   #middle: midpoint value used for the split
        self.y = None     #prediction value returned by this node
        self.left = None
        self.right = None
        
    def grow(self, X, y, max_depth = 5):
        '''
        Choose a split point from the data X and y.
        max_depth is the number of levels this subtree may still grow.
        '''
        self.y = np.mean(y)
        #choosing a split requires scanning every midpoint of every feature
        N,M = X.shape
        if max_depth == 0:
            return
        if N<5:
            return
        
        winner = (0,0)
        win_error = float("inf")
        for dim in range(M):
            args = np.argsort(X[:,dim])
            X = X[args]
            y = y[args]
            X_dim = X[:,dim]
            for ii in range(N-1):
                mid = 0.5*(X_dim[ii]+X_dim[ii+1])
                y_left_mean = np.mean(y[:ii+1])
                y_left_MSE = np.sum((y[:ii+1]-y_left_mean)**2)
                y_right_mean = np.mean(y[ii+1:])
                y_right_MSE = np.sum((y[ii+1:]-y_right_mean)**2)
                err = y_left_MSE+y_right_MSE
                if err<win_error:
                    win_error = err
                    winner = (dim,mid)
                    
        #after the scan, the best (dim, mid) pair is used for the split
        X_left = []
        y_left = []
        X_right = []
        y_right = []
        self.dim,self.mid = winner
        for i in range(N):
            if X[i][self.dim]<self.mid:
                X_left.append(X[i])
                y_left.append(y[i])
            else:
                X_right.append(X[i])
                y_right.append(y[i])
        X_left = np.array(X_left)
        y_left = np.array(y_left)
        X_right = np.array(X_right)
        y_right = np.array(y_right)
        
        if len(X_left)==0 or len(X_right)==0:
            return
        self.left = Node()
        self.left.grow(X_left,y_left,max_depth-1)
        self.right = Node()
        self.right.grow(X_right,y_right,max_depth-1)
        
        
    def predict(self, x):
        '''
        Prediction: descend into a child node if one exists,
        otherwise return self.y.
        '''
        if self.left is None:
            return self.y
        if x[self.dim] < self.mid:
            return self.left.predict(x)
        else:
            return self.right.predict(x)
tree = Node()
tree.grow(X_train,y_train,3)

y_pred = [tree.predict(x) for x in X_train]
X_train = X_train.reshape(-1)
plt.scatter(X_train,y_train,c='y')
plt.plot(X_train,y_pred,c='r')
X_train = X_train.reshape(-1,1)

[Figure: training data (yellow) with the tree's piecewise-constant fit (red)]
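To put a number on this fit, a quick check of the training error (a minimal sketch; mean_squared_error comes from sklearn.metrics):

from sklearn.metrics import mean_squared_error
#training MSE of the depth-3 tree; a low value here can still mean overfitting
print("train MSE:", mean_squared_error(y_train, y_pred))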
Next, let us try fitting a multi-dimensional surface, rendered with a 3D plot.

#we want the model to learn the function f(x1,x2) = (x1+x2)^2
#generate a synthetic dataset
row = np.linspace(0,2,10)
col = np.linspace(0,2,10)
x1,x2 = np.meshgrid(row,col)
X = np.concatenate((x1.reshape(1,-1),x2.reshape(1,-1)),axis=0).T

y = np.zeros(100)
for i in range(len(X)):
    y[i] = (X[i][0]+X[i][1])**2
    
tree = Node()
tree.grow(X,y,8)

#render the surface learned by the model with a 3D plot
from matplotlib import pyplot as plt
xx = np.linspace(0,2,100)
yy = np.linspace(0,2,100)
X,Y = np.meshgrid(xx,yy)

Z = np.zeros(X.shape)
for i in range(len(X)):
    for j in range(len(X[0])):
        x = np.array((X[i][j],Y[i][j]))
        Z[i][j] = tree.predict(x)
        
fig = plt.figure()
ax = fig.add_subplot(projection='3d')  #Axes3D(fig) is deprecated in recent matplotlib
ax.plot_surface(X, Y, Z, color='grey')

[Figure: the piecewise-constant 3D surface predicted by the tree]
Adjusting the depth parameter and pruning are effective ways to control overfitting. For more accurate, more powerful predictions, there is a boosting-based method: GBDT, the Gradient Boosting Decision Tree, described below.
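As a quick illustration (a sketch reusing the 1-D data from above), refitting at several depths shows the trade-off between under- and over-fitting:

#refit the 1-D log data at several depths to compare under- vs over-fitting
for depth in (1, 3, 8):
    t = Node()
    t.grow(X_train, y_train, depth)
    plt.plot(X_train.reshape(-1), [t.predict(x) for x in X_train], label='depth=%d' % depth)
plt.scatter(X_train.reshape(-1), y_train, c='y', s=10)
plt.legend()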

GBDT

With regression trees as base learners, an ensemble method can strengthen the tree model. Friedman proposed the gradient boosting algorithm: using a steepest-descent-style approximation, take the value of the loss function's negative gradient at the current model as an approximation of the residual in the boosted regression tree, and fit a regression tree to it. (Note: in my humble opinion, rather than saying the negative gradient approximates the residual, it is more accurate to say the residual is a special case of the negative gradient.) The algorithm is as follows (after Algorithm 10.3 in The Elements of Statistical Learning):
1. Initialize $f_0(x) = \arg\min_{\gamma} \sum_{i=1}^{N} L(y_i, \gamma)$.
2. For m = 1 to M:
   (a) For i = 1, ..., N compute the pseudo-residuals $r_{im} = -\left[\partial L(y_i, f(x_i)) / \partial f(x_i)\right]_{f = f_{m-1}}$.
   (b) Fit a regression tree to the targets $r_{im}$, giving terminal regions $R_{jm}$, $j = 1, \ldots, J_m$.
   (c) For each region, compute $\gamma_{jm} = \arg\min_{\gamma} \sum_{x_i \in R_{jm}} L(y_i, f_{m-1}(x_i) + \gamma)$.
   (d) Update $f_m(x) = f_{m-1}(x) + \sum_{j=1}^{J_m} \gamma_{jm} I(x \in R_{jm})$.
3. Output $\hat{f}(x) = f_M(x)$.

import numpy as np
import random
from copy import deepcopy
import matplotlib
from matplotlib import pyplot as plt 
from sklearn.metrics import accuracy_score
import warnings

warnings.filterwarnings('ignore')

class Node:
    def __init__(self):
        self.dim = None   #dimension: feature index used for the split
        self.mid = None   #middle: midpoint value used for the split
        self.y = None     #prediction value returned by this node
        self.left = None
        self.right = None
        
    def grow(self, X, y, max_depth = 5):
        '''
        Choose a split point from the data X and y.
        max_depth is the number of levels this subtree may still grow.
        '''
        self.y = np.mean(y)
        #choosing a split requires scanning every midpoint of every feature
        N,M = X.shape
        if max_depth == 0:
            return
        if N<2:
            return
        
        winner = (0,0)
        win_error = float("inf")
        for dim in range(M):
            args = np.argsort(X[:,dim])
            X = X[args]
            y = y[args]
            X_dim = X[:,dim]
            for ii in range(N-1):
                mid = 0.5*(X_dim[ii]+X_dim[ii+1])
                y_left_mean = np.mean(y[:ii+1])
                y_left_MSE = np.sum((y[:ii+1]-y_left_mean)**2)
                y_right_mean = np.mean(y[ii+1:])
                y_right_MSE = np.sum((y[ii+1:]-y_right_mean)**2)
                err = y_left_MSE+y_right_MSE
                if err<win_error:
                    win_error = err
                    winner = (dim,mid)
                    
        #after the scan, the best (dim, mid) pair is used for the split
        X_left = []
        y_left = []
        X_right = []
        y_right = []
        self.dim,self.mid = winner
        for i in range(N):
            if X[i][self.dim]<self.mid:
                X_left.append(X[i])
                y_left.append(y[i])
            else:
                X_right.append(X[i])
                y_right.append(y[i])
        X_left = np.array(X_left)
        y_left = np.array(y_left)
        X_right = np.array(X_right)
        y_right = np.array(y_right)
        
        if len(X_left)==0 or len(X_right)==0:
            return
        self.left = Node()
        self.left.grow(X_left,y_left,max_depth-1)
        self.right = Node()
        self.right.grow(X_right,y_right,max_depth-1)
        
        
    def predict(self, x):
        '''
        Prediction: descend into a child node if one exists,
        otherwise return self.y.
        '''
        if self.left is None:
            return self.y
        if x[self.dim] < self.mid:
            return self.left.predict(x)
        else:
            return self.right.predict(x)

Here we take the squared error as the loss function, so the negative gradient is exactly the residual y - f(x).
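As a quick check, with the conventional factor of 1/2 so the constant cancels:

$$L(y, f) = \tfrac{1}{2}\,(y - f)^2, \qquad -\frac{\partial L}{\partial f} = y - f$$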

class GBDT:
    def __init__(self, max_num = 5, max_depth = 6):
        self.num = max_num
        self.depth = max_depth
        self.trees = []
        
    def fit(self, X, y):
        X_train = X.copy()
        y_train = y.astype(float)  #astype copies; float avoids integer truncation
        
        for _ in range(self.num):
            tree = Node()
            tree.grow(X_train, y_train, self.depth)
            #each new tree fits the current residuals, so subtract this
            #tree's predictions from the targets
            for i in range(len(X_train)):
                y_train[i] -= tree.predict(X_train[i])
            self.trees.append(tree)
            
    def predict(self, x):
        return sum(tree.predict(x) for tree in self.trees)
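
In practice, gradient boosting almost always includes a shrinkage (learning-rate) factor: each update is f_m = f_{m-1} + v * tree_m with v < 1, which slows learning but usually generalizes better. A hypothetical variant of the class above (the GBDTShrink name and lr parameter are illustrative, not part of the original code):

class GBDTShrink(GBDT):
    def __init__(self, max_num=5, max_depth=6, lr=0.1):
        super().__init__(max_num, max_depth)
        self.lr = lr  #shrinkage factor applied to every tree's contribution

    def fit(self, X, y):
        residual = y.astype(float)
        for _ in range(self.num):
            tree = Node()
            tree.grow(X, residual, self.depth)
            #subtract only a fraction of the new tree's prediction
            for i in range(len(X)):
                residual[i] -= self.lr * tree.predict(X[i])
            self.trees.append(tree)

    def predict(self, x):
        #scale contributions by the same factor used during fitting
        return self.lr * sum(tree.predict(x) for tree in self.trees)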

First, generate some data for the regression.

X_train = np.linspace(1,5,100)
y_train = np.log(X_train)+np.sin(X_train)
#add noise
y_train += np.random.randn(100)*0.1

plt.scatter(X_train,y_train)
X_train = X_train.reshape(-1,1)

[Figure: scatter plot of the noisy training data]

model = GBDT(max_num=20,max_depth=6)
model.fit(X_train,y_train)

y_pred = [model.predict(x) for x in X_train]
X_train = X_train.reshape(-1)
plt.scatter(X_train,y_train,c='y')
plt.plot(X_train,y_pred,c='r')
X_train = X_train.reshape(-1,1)

[Figure: GBDT prediction (red) over the training data (yellow)]
The fitting power is clearly very strong, and GBDT's potential goes far beyond curve fitting: it has a natural ability to discover discriminative features and feature combinations. In industry, Facebook uses it to automatically discover effective features and feature combinations to serve as inputs to an LR model, improving the accuracy of CTR prediction (Click-Through Rate Prediction); GBDT also plays an important role in Taobao's search and prediction business (see the reference): "Practical Lessons from Predicting Clicks on Ads at Facebook".

GBDT is also useful for classification. One-hot encode the class labels, then train one GBDT per class to regress that class's indicator value, and we obtain an approximate solution to the multiclass problem. The iris example below illustrates this.

from sklearn.datasets import load_iris

iris = load_iris()
descr = iris['DESCR']
data = iris['data']
feature_names = iris['feature_names']
target = iris['target']
target_names = iris['target_names']



def one_hot(y):
    size = np.max(y)+1
    out = np.zeros((len(y),size))
    for i in range(len(y)):
        out[i][int(y[i])] = 1
    return out


y = one_hot(target)
X = data
y = y.T

#train one GBDT per class (3 classes here)
classifiers = []
for y0 in y:
    model = GBDT(max_num=8,max_depth=6)
    model.fit(X,y0)
    classifiers.append(model)

c = np.array([np.argmax([classifiers[i].predict(x) for i in range(3)]) for x in data])
right = np.sum(c == target)
print("Accuracy:", right/len(target))


Accuracy: 0.98
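
Note that this accuracy is measured on the same data the model was trained on, so it overstates generalization; a more honest check holds out a test set. A sketch using sklearn's train_test_split (variable names here are illustrative):

from sklearn.model_selection import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(data, target, test_size=0.3, random_state=0)
classifiers = []
for y0 in one_hot(y_tr).T:
    model = GBDT(max_num=8, max_depth=6)
    model.fit(X_tr, y0)
    classifiers.append(model)
preds = [np.argmax([clf.predict(x) for clf in classifiers]) for x in X_te]
print("Test accuracy:", np.mean(np.array(preds) == y_te))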

This is a solid result, and it points to several reasons GBDT is so popular in data science today. First, it is remarkably versatile: it handles both regression and classification. Second, it trains quickly and performs well. On top of that, it can be used for feature selection. Today, a large number of internet and finance companies use GBDT for all kinds of tasks.
