A First Look at Machine Learning | 5. Gradient Descent

import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

Solving a Quadratic Function with Gradient Descent

y = (x-2.5)^2 - 1

x = np.linspace(-1,6,200)
y = (x - 2.5)**2 - 1

plt.plot(x, y)
plt.show()

(Figure: plot of the curve y = (x - 2.5)^2 - 1 over x in [-1, 6])

def j(theta):
    """Quadratic objective function."""
    try:
        return (theta - 2.5) ** 2 - 1
    except OverflowError:
        # Return infinity when theta has grown too large to evaluate
        return float('inf')

def dj(theta):
    """Derivative of j with respect to theta."""
    return 2 * (theta - 2.5)

def gradient_descent(theta=0.0, eta=0.01, epsilon=1e-8, max_iters=10000):
    """
    theta: initial parameter value
    eta: learning rate
    epsilon: convergence tolerance on the change in j(theta)
    max_iters: maximum number of iterations
    """
    theta_history = [theta]
    while max_iters > 0:
        gradient = dj(theta)
        last_theta = theta
        theta = theta - eta * gradient
        theta_history.append(theta)

        if abs(j(theta) - j(last_theta)) < epsilon:
            break
        max_iters -= 1
    print('theta: ', theta)
    print('min j(theta): ', j(theta))
    print('theta_history length: ', len(theta_history))
    plt.plot(x, y)
    plt.plot(np.array(theta_history), j(np.array(theta_history)), color="r", marker='+')
    plt.show()
    
# Run with the default parameters
gradient_descent()
theta:  2.4995140741236224
min j(theta):  -0.9999997638760426
theta_history length:  424

(Figure: descent path marked with '+' along the curve, eta = 0.01)

# When the learning rate eta is small, each step is tiny and the loop needs many more iterations to reach the minimum. 0.01 is a common default.
gradient_descent(eta=0.001)
theta:  2.4984243400819484
min j(theta):  -0.9999975172958226
theta_history length:  3682

(Figure: descent path with eta = 0.001 -- many small steps along the curve)

# When eta is large, theta overshoots and jumps to the other side of the minimum
gradient_descent(eta=0.8)
theta:  2.500054842376601
min j(theta):  -0.9999999969923137
theta_history length:  22

(Figure: descent path with eta = 0.8 -- theta oscillates across the minimum)

# When eta is very large (e.g. 1.5) the iterates diverge and the loop would never converge, so cap the number of iterations at 500
gradient_descent(eta=1.5, max_iters=500)
theta:  -8.183476519740352e+150
min j(theta):  6.696928794914166e+301
theta_history length:  501

(Figure: with eta = 1.5 the iterates diverge and j(theta) blows up)

Gradient Descent for Linear Regression

Loss function:

J(\theta) = \frac{1}{m} \sum_{i=1}^{m}\left(y^{(i)}-\hat{y}^{(i)}\right)^{2}

\nabla J(\boldsymbol{\theta})=\left(\begin{array}{c} \partial J / \partial \theta_{0} \\ \partial J / \partial \theta_{1} \\ \partial J / \partial \theta_{2} \\ \cdots \\ \partial J / \partial \theta_{n} \end{array}\right)=\frac{2}{m} \cdot\left(\begin{array}{c} \sum_{i=1}^{m}\left(X_{b}^{(i)} \theta-y^{(i)}\right) \\ \sum_{i=1}^{m}\left(X_{b}^{(i)} \theta-y^{(i)}\right) \cdot X_{1}^{(i)} \\ \sum_{i=1}^{m}\left(X_{b}^{(i)} \theta-y^{(i)}\right) \cdot X_{2}^{(i)} \\ \cdots \\ \sum_{i=1}^{m}\left(X_{b}^{(i)} \theta-y^{(i)}\right) \cdot X_{n}^{(i)} \end{array}\right)

where, with X_b and θ defined as below, the predicted value ŷ is:

X_{b}=\left(\begin{array}{ccccc} 1 & X_{1}^{(1)} & X_{2}^{(1)} & \ldots & X_{n}^{(1)} \\ 1 & X_{1}^{(2)} & X_{2}^{(2)} & \ldots & X_{n}^{(2)} \\ \ldots & & & & \ldots \\ 1 & X_{1}^{(m)} & X_{2}^{(m)} & \ldots & X_{n}^{(m)} \end{array}\right) \quad \theta=\left(\begin{array}{c} \theta_{0} \\ \theta_{1} \\ \theta_{2} \\ \ldots \\ \theta_{n} \end{array}\right)

\hat{y}=X_{b} \cdot \theta
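To connect the notation to code, the following is a minimal sketch (using a tiny made-up dataset, not part of the original notebook) of how X_b, ŷ, J(θ), and ∇J(θ) map onto NumPy operations:

# Tiny hypothetical example: 3 samples, 1 feature (values chosen only for illustration)
X_demo = np.array([[1.0], [2.0], [3.0]])
y_demo = np.array([3.0, 5.0, 7.0])
theta_demo = np.array([0.5, 1.5])                           # [theta_0 (intercept), theta_1]

X_b_demo = np.hstack([np.ones((len(X_demo), 1)), X_demo])   # prepend a column of ones
y_hat = X_b_demo.dot(theta_demo)                            # y_hat = X_b . theta
loss = np.sum((y_demo - y_hat) ** 2) / len(X_b_demo)        # J(theta)
grad = X_b_demo.T.dot(y_hat - y_demo) * 2 / len(X_b_demo)   # nabla J(theta), one entry per theta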

num_size = 10000
np.random.seed(100)
# 1. np.random.random(size=num_size) gives uniform values in [0, 1); 2. np.random.normal(size=num_size) gives standard normal values
x_1 = np.random.random(size=num_size)
X_1 = x_1.reshape(-1,1)
# Target relationship y = 10x + 5, with Gaussian noise added
y_1 = 10 * x_1 + 5 + np.random.normal(size=num_size) 

plt.scatter(x_1, y_1, s=0.1)
plt.show()

(Figure: scatter plot of x_1 vs y_1 -- noisy samples around the line y = 10x + 5)

Loop-Based Implementation

class LinearRegression:
    def __init__(self):
        self._thetas = None
        # Intercept
        self.intercept = None
        # Feature coefficients
        self.coefs = None

    def fit(self, x_train, y_train):
        raise NotImplementedError

    def predict(self, x_predict):
        raise NotImplementedError

    @staticmethod
    def score(y, y_predict):
        """R^2 score"""
        return 1 - np.dot(y_predict - y, y_predict - y) / len(y) / np.var(y)
    
class LinearRegressionBGDLoop(LinearRegression):
    """Batch gradient descent (BGD), looping over each dimension."""
    def j(self, x, y, theta):
        """Objective function (mean squared error)."""
        try:
            return np.sum((y - np.dot(x, theta)) ** 2) / len(x)
        except OverflowError:
            # If the values blow up, return infinity
            return float('inf')

    def dj(self, x, y, thetas):
        """Gradient, computed column by column."""
        res = np.empty(len(thetas))
        # The 0th theta is the intercept, whose "feature" is the constant 1
        res[0] = np.sum(np.dot(x, thetas) - y)
        for col in range(1, len(thetas)):
            res[col] = (np.dot(x, thetas) - y).dot(x[:, col])

        return res * 2 / len(x)

    def gradient_descent(self, x, y, initial_thetas, eta, epsilon, max_iters):
        """Gradient descent loop."""
        thetas = initial_thetas
        while max_iters > 0:
            # Gradient at the current thetas
            gradient = self.dj(x, y, thetas)
            last_thetas = thetas
            thetas = thetas - eta * gradient
            if abs(self.j(x, y, thetas) - self.j(x, y, last_thetas)) < epsilon:
                break
            max_iters -= 1

        self._thetas = thetas
        self.intercept = thetas[0]
        self.coefs = thetas[1:]

    def fit(self, x_train, y_train, eta=0.01, epsilon=1e-8, max_iters=1e4):
        """Train the model."""
        # Prepend a column of ones for the intercept
        X_b = np.hstack([np.ones((len(x_train), 1)), x_train])
        initial_thetas = np.zeros(X_b.shape[1])
        self.gradient_descent(X_b, y_train, initial_thetas, eta, epsilon, max_iters)

    def predict(self, x_predict):
        """Predict."""
        X_b = np.hstack([np.ones((len(x_predict), 1)), x_predict])
        return np.dot(X_b, self._thetas)

    # score() is inherited unchanged from LinearRegression

# Fit and evaluate
linear = LinearRegressionBGDLoop()
%time linear.fit(X_1, y_1)
%time predict_y_1 = linear.predict(X_1)
print('coefs: ', linear.coefs)
print('intercept: ', linear.intercept)
print('score: ', linear.score(y_1, predict_y_1))

CPU times: user 2.03 s, sys: 186 ms, total: 2.22 s
Wall time: 1.35 s
CPU times: user 874 µs, sys: 334 µs, total: 1.21 ms
Wall time: 643 µs
coefs:  [9.95834636]
intercept:  5.030511016464405
score:  0.8927947969714475

Vectorized Computation

\nabla J(\boldsymbol{\theta}) =\frac{2}{m} \cdot\left(\begin{array}{c} \sum_{i=1}^{m}\left(X_{b}^{(i)} \theta-y^{(i)}\right) \cdot X_{0}^{(i)} \\ \sum_{i=1}^{m}\left(X_{b}^{(i)} \theta-y^{(i)}\right) \cdot X_{1}^{(i)} \\ \sum_{i=1}^{m}\left(X_{b}^{(i)} \theta-y^{(i)}\right) \cdot X_{2}^{(i)} \\ \cdots \\ \sum_{i=1}^{m}\left(X_{b}^{(i)} \theta-y^{(i)}\right) \cdot X_{n}^{(i)} \end{array}\right) =\frac{2}{m} \cdot\left(X_{b} \theta-y\right)^{T} \cdot X_{b} =\frac{2}{m} \cdot X_{b}^{T} \cdot\left(X_{b} \theta-y\right)

class LinearRegressionBGDVector(LinearRegressionBGDLoop):
    """Batch gradient descent (BGD), vectorized over all dimensions."""
    def dj(self, x, y, thetas):
        """Vectorized gradient."""
        return (x.T).dot(x.dot(thetas) - y) * 2 / len(x)
# Fit and evaluate
lin_reg = LinearRegressionBGDVector()
%time lin_reg.fit(X_1, y_1)
%time predict_y = lin_reg.predict(X_1)
print('coefs: ', lin_reg.coefs)
print('intercept: ', lin_reg.intercept)
print('score: ', lin_reg.score(y_1, predict_y))

CPU times: user 1.22 s, sys: 104 ms, total: 1.32 s
Wall time: 738 ms
CPU times: user 602 µs, sys: 182 µs, total: 784 µs
Wall time: 426 µs
coefs:  [9.95834636]
intercept:  5.030511016464405
score:  0.8927947969714475

The loop and vectorized implementations produce the same values. Here the vectorized version is only slightly faster, because there is just a single coefficient, so there is little to tell them apart. Build a dataset with m features and n samples as below and the difference becomes obvious.

# Build a dataset with m features and n samples
m = 100
n = 10000

x_m = np.random.normal(size=(n, m))
true_thetas = np.random.uniform(0.0, 100.0, size=m+1)
y_m = x_m.dot(true_thetas[1:]) + true_thetas[0] + np.random.normal(0., 10., size=n)
lin_reg1 = LinearRegressionBGDLoop()
%time lin_reg1.fit(x_m, y_m)

lin_reg2 = LinearRegressionBGDVector()
%time lin_reg2.fit(x_m, y_m)
CPU times: user 1min 8s, sys: 2.25 s, total: 1min 11s
Wall time: 42.2 s
CPU times: user 2.92 s, sys: 84.9 ms, total: 3 s
Wall time: 2.22 s

The Gradient Descent Family

  • Batch Gradient Descent (BGD)
  • Stochastic Gradient Descent (SGD)
  • Mini-batch Gradient Descent (MBGD), which blends BGD and SGD

Batch Gradient Descent

The implementations above are batch gradient descent: every update of theta computes the gradient over the entire sample set.

\nabla J(\boldsymbol{\theta}) =\frac{2}{m} \cdot\left(\begin{array}{c} \sum_{i=1}^{m}\left(X_{b}^{(i)} \theta-y^{(i)}\right) \cdot X_{0}^{(i)} \\ \sum_{i=1}^{m}\left(X_{b}^{(i)} \theta-y^{(i)}\right) \cdot X_{1}^{(i)} \\ \sum_{i=1}^{m}\left(X_{b}^{(i)} \theta-y^{(i)}\right) \cdot X_{2}^{(i)} \\ \cdots \\ \sum_{i=1}^{m}\left(X_{b}^{(i)} \theta-y^{(i)}\right) \cdot X_{n}^{(i)} \end{array}\right) =\frac{2}{m} \cdot\left(X_{b} \theta-y\right)^{T} \cdot X_{b} =\frac{2}{m} \cdot X_{b}^{T} \cdot\left(X_{b} \theta-y\right)

Stochastic Gradient Descent

Each update computes the gradient of theta from a single randomly chosen sample:

\nabla J(\boldsymbol{\theta}) =2 \cdot\left(\begin{array}{c} \left(X_{b}^{(i)} \theta-y^{(i)}\right) \cdot X_{0}^{(i)} \\ \left(X_{b}^{(i)} \theta-y^{(i)}\right) \cdot X_{1}^{(i)} \\ \left(X_{b}^{(i)} \theta-y^{(i)}\right) \cdot X_{2}^{(i)} \\ \cdots \\ \left(X_{b}^{(i)} \theta-y^{(i)}\right) \cdot X_{n}^{(i)} \end{array}\right)=2 \cdot\left(X_{b}^{(i)}\right)^{T} \cdot\left(X_{b}^{(i)} \theta-y^{(i)}\right)

The learning rate shrinks as the iterations proceed (empirical values t0 = 5, t1 = 50); this is the classic simulated-annealing idea:

\eta=\frac{t_{0}}{i_{\text{iters}}+t_{1}}
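As a quick illustration (a hypothetical helper, not in the original notebook), the schedule can be written as a small function; with the empirical values it starts at 5/50 = 0.1 and decays toward zero as the iteration count grows:

def learning_rate(i_iters, t0=5, t1=50):
    """Decaying learning rate: eta = t0 / (i_iters + t1)."""
    return t0 / (i_iters + t1)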

class LinearRegressionSGD(LinearRegressionBGDLoop):
    """Stochastic gradient descent (SGD)."""
    def dj(self, x_i, y_i, thetas):
        """Gradient estimated from a single sample."""
        return (x_i.T).dot(x_i.dot(thetas) - y_i) * 2

    def gradient_descent(self, x, y, initial_thetas, eta, epsilon, max_iters, t0, t1):
        """Gradient descent with a decaying learning rate (eta is superseded by the schedule)."""
        thetas = initial_thetas
        i_iters = 0
        while max_iters > 0:
            # Pick one sample at random
            i = np.random.randint(len(x))
            # Gradient estimated from that single sample
            gradient = self.dj(x[i], y[i], thetas)
            last_thetas = thetas
            # Simulated-annealing style schedule: eta = t0 / (i_iters + t1)
            eta = t0 / (i_iters + t1)
            thetas = thetas - eta * gradient
            if abs(self.j(x, y, thetas) - self.j(x, y, last_thetas)) < epsilon:
                break
            max_iters -= 1
            i_iters += 1

        self._thetas = thetas
        self.intercept = thetas[0]
        self.coefs = thetas[1:]

    def fit(self, x_train, y_train, eta=0.01, epsilon=1e-8, max_iters=1e4, t0=5, t1=50):
        """Train the model."""
        # Prepend a column of ones for the intercept
        X_b = np.hstack([np.ones((len(x_train), 1)), x_train])
        initial_thetas = np.zeros(X_b.shape[1])
        self.gradient_descent(X_b, y_train, initial_thetas, eta, epsilon, max_iters, t0, t1)
# Fit and evaluate
lin_reg = LinearRegressionSGD()
%time lin_reg.fit(X_1, y_1)
%time predict_y = lin_reg.predict(X_1)
print('---- SGD ----')
print('coefs: ', lin_reg.coefs)
print('intercept: ', lin_reg.intercept)
print('score: ', lin_reg.score(y_1, predict_y))
CPU times: user 1.99 s, sys: 175 ms, total: 2.17 s
Wall time: 1.49 s
CPU times: user 559 µs, sys: 215 µs, total: 774 µs
Wall time: 997 µs
---- SGD ----
coefs:  [9.90608818]
intercept:  4.81501911723699
score:  0.8927947969714475

Mini-Batch Gradient Descent

TODO: to be filled in.
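In the meantime, here is a minimal sketch of how mini-batch gradient descent could be written in the same style as the classes above (the class name and the batch_size parameter are assumptions, and the code is untuned):

class LinearRegressionMBGD(LinearRegressionBGDLoop):
    """Mini-batch gradient descent (MBGD) -- illustrative sketch only."""
    def gradient_descent(self, x, y, initial_thetas, eta, epsilon, max_iters, batch_size):
        thetas = initial_thetas
        while max_iters > 0:
            # Sample a random mini-batch of rows
            idx = np.random.randint(0, len(x), size=batch_size)
            x_batch, y_batch = x[idx], y[idx]
            # Vectorized gradient over the mini-batch only
            gradient = (x_batch.T).dot(x_batch.dot(thetas) - y_batch) * 2 / batch_size
            last_thetas = thetas
            thetas = thetas - eta * gradient
            # Checking the full loss every step is costly; kept here only for symmetry with BGD
            if abs(self.j(x, y, thetas) - self.j(x, y, last_thetas)) < epsilon:
                break
            max_iters -= 1

        self._thetas = thetas
        self.intercept = thetas[0]
        self.coefs = thetas[1:]

    def fit(self, x_train, y_train, eta=0.01, epsilon=1e-8, max_iters=1e4, batch_size=32):
        # Prepend a column of ones for the intercept
        X_b = np.hstack([np.ones((len(x_train), 1)), x_train])
        initial_thetas = np.zeros(X_b.shape[1])
        self.gradient_descent(X_b, y_train, initial_thetas, eta, epsilon, max_iters, batch_size)

Each update uses a random mini-batch, so a step is cheaper than BGD's full pass over the data while being less noisy than SGD's single-sample estimate.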

Comparing BGD, SGD, and MBGD

print('----- BGD ----- ')
lin_reg1 = LinearRegressionBGDLoop()
%time lin_reg1.fit(x_m, y_m)
%time predict_y_m1 = lin_reg1.predict(x_m)
print('score: ', lin_reg1.score(y_m, predict_y_m1))
      
print('----- SGD ----- ')
lin_reg2 = LinearRegressionSGD()
%time lin_reg2.fit(x_m, y_m)
predict_y_m2 = lin_reg2.predict(x_m)
print('score: ', lin_reg2.score(y_m, predict_y_m2))
----- BGD ----- 
CPU times: user 1min 13s, sys: 2.64 s, total: 1min 15s
Wall time: 48.4 s
CPU times: user 4.07 ms, sys: 934 µs, total: 5 ms
Wall time: 2.38 ms
score:  0.9996949647594553
----- SGD ----- 
CPU times: user 12.9 s, sys: 379 ms, total: 13.3 s
Wall time: 7.37 s
score:  -1.0385235598722087e+28

Linear Regression: Gradient Descent vs. the Normal Equation

  • The normal equation has higher time complexity and scales worse than gradient descent (a sketch of the closed-form solution is given below).
  • The normal equation does not require feature standardization, whereas gradient descent does.
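For reference, here is a minimal sketch (not from the original notebook) of the closed-form normal-equation solution on the same x_m, y_m dataset; np.linalg.pinv would be a more numerically stable choice than inv:

# theta = (X_b^T X_b)^{-1} X_b^T y  -- the normal equation
X_b_m = np.hstack([np.ones((len(x_m), 1)), x_m])
theta_normal = np.linalg.inv(X_b_m.T.dot(X_b_m)).dot(X_b_m.T).dot(y_m)
# theta_normal[0] is the intercept, theta_normal[1:] are the coefficients
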
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor

# Solve with the normal equation
print('---- LinearRegression ----')
lin_reg = LinearRegression() 
%time lin_reg.fit(x_m, y_m)
%time y_predict_m = lin_reg.predict(x_m)
print('score', lin_reg.score(x_m, y_m))

# Solve with gradient descent
print('\n---- SGDRegressor ----')
lin_reg = SGDRegressor(learning_rate="constant") 
%time lin_reg.fit(x_m, y_m)
%time y_predict_m = lin_reg.predict(x_m)
print('score', lin_reg.score(x_m, y_m))

---- LinearRegression ----
CPU times: user 995 ms, sys: 70.5 ms, total: 1.07 s
Wall time: 689 ms
CPU times: user 115 ms, sys: 111 ms, total: 225 ms
Wall time: 253 ms
score 1.0
---- SGDRegressor ----
CPU times: user 715 ms, sys: 14.6 ms, total: 729 ms
Wall time: 627 ms
CPU times: user 12.1 ms, sys: 207 µs, total: 12.3 ms
Wall time: 8.79 ms
score 1.0

Debugging the Gradient

This plain numerical method approximates the gradient directly and can be plugged into gradient descent to recover the desired thetas (its time complexity is high, so it is only suitable for debugging on small batches). It is used to verify the analytically derived gradient formula; a sketch follows the formulas below.

For a single parameter:

\frac{d J}{d \theta}=\frac{J(\theta+\varepsilon)-J(\theta-\varepsilon)}{2 \varepsilon}

For multiple parameters:

\theta=\left(\theta_{0}, \theta_{1}, \theta_{2}, \ldots, \theta_{n}\right) \\ \theta_{0}^{+}=\left(\theta_{0}+\varepsilon, \theta_{1}, \theta_{2}, \ldots, \theta_{n}\right) \\ \theta_{0}^{-}=\left(\theta_{0}-\varepsilon, \theta_{1}, \theta_{2}, \ldots, \theta_{n}\right) \\ \frac{\partial J}{\partial \theta}=\left(\frac{\partial J}{\partial \theta_{0}}, \frac{\partial J}{\partial \theta_{1}}, \frac{\partial J}{\partial \theta_{2}}, \ldots, \frac{\partial J}{\partial \theta_{n}}\right)

which gives:

\frac{\partial J}{\partial \theta_{0}}=\frac{J\left(\theta_{0}^{+}\right)-J\left(\theta_{0}^{-}\right)}{2 \varepsilon}
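Below is a minimal sketch of this numerical check (the helper names j_debug and dj_debug are assumptions, not from the original notebook), using the same mean-squared-error loss as the classes above:

def j_debug(x, y, thetas):
    """Mean squared error, matching LinearRegressionBGDLoop.j."""
    return np.sum((y - x.dot(thetas)) ** 2) / len(x)

def dj_debug(x, y, thetas, epsilon=0.01):
    """Approximate the gradient of J at thetas with central differences."""
    res = np.empty(len(thetas))
    for i in range(len(thetas)):
        theta_plus = thetas.copy()
        theta_plus[i] += epsilon
        theta_minus = thetas.copy()
        theta_minus[i] -= epsilon
        res[i] = (j_debug(x, y, theta_plus) - j_debug(x, y, theta_minus)) / (2 * epsilon)
    return res

# Example check against the analytic gradient on the small dataset
X_b_check = np.hstack([np.ones((len(X_1), 1)), X_1])
thetas_check = np.zeros(X_b_check.shape[1])
analytic = X_b_check.T.dot(X_b_check.dot(thetas_check) - y_1) * 2 / len(X_b_check)
numeric = dj_debug(X_b_check, y_1, thetas_check)
print(np.allclose(analytic, numeric))   # should print True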
