迴歸樹
統計學習的部分也差不多該結束了,我希望以當前效果最好的一種統計學習模型,Xgboost的原型GBDT來結尾。
GBDT的基礎是CART決策樹。在CART基學習器上使用boosting,形成更好的集成學習器,就是GBDT的思想。CART在離散特徵上的表現並不特別,也就是把我們之前學過的C4.5樹用基尼係數劃分。但在連續特徵上使用樹算法進行擬合迴歸就並沒有那麼輕鬆,一是劃分標準不容易確定,二是決策樹的本質決定了決策樹容易過擬合。但總之,我們還是先嚐試實現用CART樹來做常見的迴歸預測。
CART迴歸樹預測迴歸連續型數據,假設X與Y分別是輸入和輸出變量,並且Y是連續變量。在訓練數據集所在的輸入空間中,遞歸地將每個區域劃分爲兩個子區域並決定每個子區域上的輸出值,構建二叉決策樹。
考慮對一組數據,尋找一個合適的常數C表徵數據讓MSE最小,則C就是樣本均值。由此,迴歸樹的生成方法就是遍歷所有N個樣本的N-1個劃分點,計算劃分後兩類數據的MSE。找到最優劃分處。反覆迭代劃分就是最小二乘迴歸樹。
import numpy as np
import random
from copy import deepcopy
import matplotlib
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')
首先生成數據用於迴歸
# Generate 1-D training data: y = 10*log(x) sampled on [1, 5].
X_train = np.linspace(1,5,100)
y_train = 10*np.log(X_train)
# Add standard-normal noise to the targets.
y_train += np.random.randn(100)
plt.scatter(X_train,y_train)
# Reshape to (N, 1) so the tree code can treat X as an (N, M) matrix.
X_train = X_train.reshape(-1,1)
class Node:
    """A node of a CART regression tree grown by least-squares splitting."""

    def __init__(self):
        self.dim = None    # feature dimension used for the split
        self.mid = None    # split threshold (midpoint of two consecutive sorted values)
        self.y = None      # mean of the training targets; the leaf prediction
        self.left = None   # child for samples with x[dim] <  mid
        self.right = None  # child for samples with x[dim] >= mid

    def grow(self, X, y, max_depth=5):
        """Recursively fit this subtree on (X, y).

        X : (N, M) feature matrix; y : (N,) targets.
        max_depth : number of levels still allowed to grow below this node.
        """
        # Every node stores the target mean so it can act as a leaf.
        self.y = np.mean(y)
        N, M = X.shape
        # Stop: depth exhausted or too few samples to split.
        if max_depth == 0 or N < 5:
            return
        best = None
        best_err = float("inf")
        # Scan every midpoint between consecutive sorted values in every
        # dimension for the split minimising the summed within-half SSE.
        for dim in range(M):
            order = np.argsort(X[:, dim])
            X_dim = X[order, dim]
            y_sorted = y[order]
            for ii in range(N - 1):
                # Equal neighbouring values yield no threshold strictly
                # between them; such a candidate would not reproduce the
                # partition whose error we are about to measure — skip it.
                if X_dim[ii] == X_dim[ii + 1]:
                    continue
                mid = 0.5 * (X_dim[ii] + X_dim[ii + 1])
                left_mean = np.mean(y_sorted[:ii + 1])
                err = np.sum((y_sorted[:ii + 1] - left_mean) ** 2)
                right_mean = np.mean(y_sorted[ii + 1:])
                err += np.sum((y_sorted[ii + 1:] - right_mean) ** 2)
                if err < best_err:
                    best_err = err
                    best = (dim, mid)
        # All samples identical in every dimension: no valid split exists.
        if best is None:
            return
        self.dim, self.mid = best
        mask = X[:, self.dim] < self.mid
        X_left, y_left = X[mask], y[mask]
        X_right, y_right = X[~mask], y[~mask]
        if len(X_left) == 0 or len(X_right) == 0:
            return
        self.left = Node()
        self.left.grow(X_left, y_left, max_depth - 1)
        self.right = Node()
        self.right.grow(X_right, y_right, max_depth - 1)

    def predict(self, x):
        """Predict one sample x: descend while children exist, else return the mean."""
        if self.left is None:
            return self.y
        if x[self.dim] < self.mid:
            return self.left.predict(x)
        return self.right.predict(x)
# Fit a depth-3 regression tree and plot its piecewise-constant fit.
tree = Node()
tree.grow(X_train,y_train,3)
y_pred = [tree.predict(x) for x in X_train]
X_train = X_train.reshape(-1)  # back to 1-D for plotting
plt.scatter(X_train,y_train,c='y')
plt.plot(X_train,y_pred,c='r')
X_train = X_train.reshape(-1,1)  # restore (N, 1) shape for later reuse
嘗試多維曲面擬合,這裏用3dplot打印
# Build a 2-D toy regression set on a 10x10 grid over [0, 2]^2.
# NOTE(review): the original comment claimed f(x1,x2) = sin(x1)cos(x2),
# but the code below actually fits f(x1,x2) = (x1 + x2)^2.
row = np.linspace(0,2,10)
col = np.linspace(0,2,10)
x1,x2 = np.meshgrid(row,col)
X = np.concatenate((x1.reshape(1,-1),x2.reshape(1,-1)),axis=0).T  # (100, 2)
y = np.zeros(100)
for i in range(len(X)):
    y[i] = (X[i][0]+X[i][1])**2
tree = Node()
tree.grow(X,y,8)
# Plot the surface learned by the tree on a dense 100x100 grid.
from mpl_toolkits.mplot3d.axes3d import Axes3D
from matplotlib import pyplot as plt
xx = np.linspace(0,2,100)
yy = np.linspace(0,2,100)
X,Y = np.meshgrid(xx,yy)
Z = np.zeros(X.shape)
for i in range(len(X)):
    for j in range(len(X[0])):
        x = np.array((X[i][j],Y[i][j]))
        Z[i][j] = tree.predict(x)
fig = plt.figure()
# NOTE(review): Axes3D(fig) is deprecated in recent matplotlib; prefer
# fig.add_subplot(projection='3d') — confirm against the installed version.
axes3d = Axes3D(fig)
axes3d.plot_surface(X,Y,Z,color='grey')
調整深度參數和剪枝可以有效控制過擬合。如果想要更精確,更強大的預測,我們有一種藉助boosting提升的方法,就是下面的GBDT,Gradient Boosting Decision Tree:梯度提升決策樹
GBDT
有了迴歸樹做基學習器,就能用集成方法增強樹模型。Freidman提出了梯度提升算法:利用最速下降的近似方法,即利用損失函數的負梯度在當前模型的值,作爲迴歸問題中提升樹算法的殘差的近似值,擬合一個迴歸樹。(注:鄙人私以爲,與其說負梯度作爲殘差的近似值,不如說殘差是負梯度的一種特例)算法如下(截圖來自《The Elements of Statistical Learning》)
import numpy as np
import random
from copy import deepcopy
import matplotlib
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')
class Node:
    """A node of a CART regression tree grown by least-squares splitting.

    This copy stops at N < 2 (rather than N < 5) so GBDT base trees can
    fit small residual groups.
    """

    def __init__(self):
        self.dim = None    # feature dimension used for the split
        self.mid = None    # split threshold (midpoint of two consecutive sorted values)
        self.y = None      # mean of the training targets; the leaf prediction
        self.left = None   # child for samples with x[dim] <  mid
        self.right = None  # child for samples with x[dim] >= mid

    def grow(self, X, y, max_depth=5):
        """Recursively fit this subtree on (X, y).

        X : (N, M) feature matrix; y : (N,) targets.
        max_depth : number of levels still allowed to grow below this node.
        """
        # Every node stores the target mean so it can act as a leaf.
        self.y = np.mean(y)
        N, M = X.shape
        # Stop: depth exhausted or nothing left to split.
        if max_depth == 0 or N < 2:
            return
        best = None
        best_err = float("inf")
        # Scan every midpoint between consecutive sorted values in every
        # dimension for the split minimising the summed within-half SSE.
        for dim in range(M):
            order = np.argsort(X[:, dim])
            X_dim = X[order, dim]
            y_sorted = y[order]
            for ii in range(N - 1):
                # Equal neighbouring values yield no threshold strictly
                # between them; such a candidate would not reproduce the
                # partition whose error we are about to measure — skip it.
                if X_dim[ii] == X_dim[ii + 1]:
                    continue
                mid = 0.5 * (X_dim[ii] + X_dim[ii + 1])
                left_mean = np.mean(y_sorted[:ii + 1])
                err = np.sum((y_sorted[:ii + 1] - left_mean) ** 2)
                right_mean = np.mean(y_sorted[ii + 1:])
                err += np.sum((y_sorted[ii + 1:] - right_mean) ** 2)
                if err < best_err:
                    best_err = err
                    best = (dim, mid)
        # All samples identical in every dimension: no valid split exists.
        if best is None:
            return
        self.dim, self.mid = best
        mask = X[:, self.dim] < self.mid
        X_left, y_left = X[mask], y[mask]
        X_right, y_right = X[~mask], y[~mask]
        if len(X_left) == 0 or len(X_right) == 0:
            return
        self.left = Node()
        self.left.grow(X_left, y_left, max_depth - 1)
        self.right = Node()
        self.right.grow(X_right, y_right, max_depth - 1)

    def predict(self, x):
        """Predict one sample x: descend while children exist, else return the mean."""
        if self.left is None:
            return self.y
        if x[self.dim] < self.mid:
            return self.left.predict(x)
        return self.right.predict(x)
這裏用均方誤差作爲損失函數,從而梯度就是殘差y-f(x)
class GBDT:
    """Gradient-boosted regression trees with squared-error loss.

    With L(y, f) = (y - f)^2 / 2 the negative gradient is exactly the
    residual y - f(x), so each round fits a new tree to the residuals
    left by the current ensemble.
    """

    def __init__(self, max_num=5, max_depth=6):
        self.num = max_num      # number of boosting rounds (trees)
        self.depth = max_depth  # max depth of each base tree
        self.trees = []

    def fit(self, X, y):
        """Fit self.num trees, each on the residuals of the previous ones."""
        X_train = X.copy()
        # Float copy so residual updates are not truncated on integer targets.
        residual = np.array(y, dtype=float)
        for _ in range(self.num):
            tree = Node()
            tree.grow(X_train, residual, self.depth)
            # Bug fix: subtract the tree's predictions from the residual
            # *targets*; the original subtracted them from the feature
            # matrix (X_train[i] -= ...), corrupting the inputs and never
            # updating the residuals.
            for i in range(len(X_train)):
                residual[i] -= tree.predict(X_train[i])
            self.trees.append(tree)

    def predict(self, x):
        """Predict one sample as the sum of every tree's output."""
        return sum(tree.predict(x) for tree in self.trees)
首先生成數據用於迴歸
# Generate noisy 1-D data: y = log(x) + sin(x) on [1, 5].
X_train = np.linspace(1,5,100)
y_train = np.log(X_train)+np.sin(X_train)
# Add small Gaussian noise.
y_train += np.random.randn(100)*0.1
plt.scatter(X_train,y_train)
X_train = X_train.reshape(-1,1)  # (N, 1) matrix for the tree code
model = GBDT(max_num=20,max_depth=6)
model.fit(X_train,y_train)
# NOTE(review): this plots only the FIRST tree's predictions
# (model.trees[0]); model.predict(x) would show the full ensemble fit.
y_pred = [model.trees[0].predict(x) for x in X_train]
X_train = X_train.reshape(-1)  # flatten for plotting
plt.scatter(X_train,y_train,c='y')
plt.plot(X_train,y_pred,c='r')
X_train = X_train.reshape(-1,1)  # restore (N, 1) shape
可以看出非常強大的擬合能力,而GBDT的潛力遠不止擬合。其具有天然優勢可以發現多種有區分性的特徵以及特徵組合。業界中,Facebook使用其來自動發現有效的特徵、特徵組合,來作爲LR模型中的特徵,以提高 CTR預估(Click-Through Rate Prediction)的準確性(詳見參考文獻);GBDT在淘寶的搜索及預測業務上也發揮了重要作用(詳見參考文獻)。 《Practical Lessons from Predicting Clicks on Ads at Facebook》
另外,GBDT也在分類問題裏發揮作用。只要我們用one-hot編碼類別,然後對每個類別訓練一個GBDT來預測類別的y值,就能近似實現多分類問題。我們用下面這個iris的例子來說明。
# Load the iris dataset and unpack the fields used below.
from sklearn.datasets import load_iris
iris = load_iris()
descr = iris['DESCR']
data = iris['data']                    # feature matrix
feature_names = iris['feature_names']
target = iris['target']                # integer class labels
target_names = iris['target_names']
def one_hot(y):
    """Return the one-hot encoding of integer labels y.

    Output shape is (len(y), max(y) + 1); row i has a 1 in column y[i].
    """
    n_classes = int(np.max(y)) + 1
    encoded = np.zeros((len(y), n_classes))
    for row, label in enumerate(y):
        encoded[row][int(label)] = 1
    return encoded
# One-vs-rest: train one GBDT regressor per class on its one-hot column.
y = one_hot(target)
X = data
y = y.T  # one row of 0/1 targets per class
#對每一個類別(這裏是3個類別)訓練一個GBDT
# (train one GBDT per class -- 3 classes here)
classifiers = []
for y0 in y:
    model = GBDT(max_num=8,max_depth=6)
    model.fit(X,y0)
    classifiers.append(model)
# Predict each sample's class as the argmax over the three per-class
# regressors, then report training accuracy.
# Bug fix: the original wrapped this comprehension in a redundant outer
# `for x in data:` loop, rebuilding the entire prediction list once per
# sample (accidental O(n^2)) and then using only the final copy.
c = [np.argmax(np.array([classifiers[i].predict(x) for i in range(3)])) for x in data]
right = np.sum(np.array(c) == target)
print("Accuracy:", right/len(target))
Accuracy: 0.98
這個結果已經相當不錯了,由此我們可以看出GBDT今天在數據科學領域火爆的幾個原因。一是它的泛用性非常好,既能用於迴歸又能用於分類。再就是訓練快,效果好。並且我們還可以用它篩選特徵。今天大量的互聯網公司和金融公司都在用GBDT做各種各樣的事情。