import numpy as np
import matplotlib.pyplot as plt
%matplotlib
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
Using matplotlib backend: MacOSX
梯度下降求解一元二次方程
x = np.linspace(-1,6,200)
y = (x - 2.5)**2 - 1
plt.plot(x, y)
plt.show()
def j(theta):
"""一元二次方程"""
try:
return (theta - 2.5) ** 2 - 1
except:
return float('inf')
def dj(theta):
"""求導"""
return 2 * (theta - 2.5)
def gradient_descent(theta=0.0, eta=0.01, epsilon=1e-8, max_iters=10000):
"""
theta: 參數
eta: 學習率
epsilon: 最小值
max_iters: 最大嘗試次數
"""
theta_history = [theta]
while max_iters>0:
gradient = dj(theta)
last_theta = theta
theta = theta - eta * gradient
theta_history.append(theta)
if(abs(j(theta)-j(last_theta)) < epsilon):
break
max_iters -= 1
print('theta: ', theta)
print('min j(theta): ', j(theta))
print('theta_history length: ', len(theta_history))
plt.plot(x, y)
plt.plot(np.array(theta_history), j(np.array(theta_history)), color="r", marker='+')
plt.show()
# 使用默認參數
gradient_descent()
theta: 2.4995140741236224
min j(theta): -0.9999997638760426
theta_history length: 424
# 當學習率eta比較小時,下降的步子很小。程序需要循環很多次纔會找到最小值。 一般學習率會設置爲0.01
gradient_descent(eta=0.001)
theta: 2.4984243400819484
min j(theta): -0.9999975172958226
theta_history length: 3682
# 當學習率eta較大時候, theta跳到右邊
gradient_descent(eta=0.8)
theta: 2.500054842376601
min j(theta): -0.9999999969923137
theta_history length: 22
# 當特別大時,如1.5. 程序會陷入死循環, 所以限定遞歸次數爲500
gradient_descent(eta=1.5, max_iters=500)
theta: -8.183476519740352e+150
min j(theta): 6.696928794914166e+301
theta_history length: 501
線性迴歸模型中使用梯度下降
損失函數:
其中y的預測值:
num_size = 10000
np.random.seed(100)
# 1. np.random.random(size=num_size) 0-1的隨機數; 2. np.random.normal(size=num_size) 正態分佈
x_1 = np.random.random(size=num_size)
X_1 = x_1.reshape(-1,1)
# 函數 y=10x+5 然後設置一定的噪音
y_1 = 10 * x_1 + 5 + np.random.normal(size=num_size)
plt.scatter(x_1, y_1, s=0.1)
plt.show()
循環方式實現
class LinearRegression:
def __init__(self):
self._thetas = None
# 截距
self.intercept = None
# 參數係數
self.coefs = None
def fit(self, x_train, y_train):
raise NotImplementedError
def predict(self, x_predict):
raise NotImplementedError
@staticmethod
def score(y, y_predict):
"""R評價"""
return 1 - np.dot(y_predict - y, y_predict - y) / len(y) / np.var(y)
class LinearRegressionBGDLoop(LinearRegression):
""" 批量梯度下降(BGD)_循環維度 """
def j(self, x, y, theta):
"""目標函數"""
try:
return np.sum(y - np.dot(x, theta)) ** 2 / len(x)
except:
# 當數據特別大,報錯時,返回無窮大。
return float('inf')
def dj(self, x, y, thetas):
"""求導(梯度)"""
res = np.empty(len(thetas))
# 第0個theta,其實就是截距
res[0] = np.sum(np.dot(x, thetas) - y)
for col in range(1, len(thetas)):
res[col] = np.sum((np.dot(x, thetas) - y).dot(x[:, col]))
return res * 2 / len(x)
def gradient_descent(self, x, y, initial_thetas, eta, epsilon, max_iters):
"""梯度下降"""
thetas = initial_thetas
while max_iters > 0:
# 梯度gradient
gradient = self.dj(x, y, thetas)
last_thetas = thetas
thetas = thetas - eta * gradient
if(abs(self.j(x, y, thetas) - self.j(x, y, last_thetas)) < epsilon):
break
max_iters -= 1
self._thetas = thetas
self.intercept = thetas[0]
self.coefs = thetas[1:]
def fit(self, x_train, y_train, eta=0.01, epsilon=1e-8, max_iters=1e4):
"""訓練"""
# 加上一列全爲1
X_b = np.hstack([np.ones((len(x_train), 1)), x_train])
initial_thetas = np.zeros(X_b.shape[1])
self.gradient_descent(X_b, y_train, initial_thetas, eta, epsilon, max_iters)
def predict(self, x_predict):
"""預測"""
X_b = np.hstack([np.ones((len(x_predict), 1)), x_predict])
return np.dot(X_b, self._thetas)
@staticmethod
def score(y, y_predict):
"""R評價"""
return 1 - np.dot(y_predict - y, y_predict - y) / len(y) / np.var(y)
# 主函數
linear = LinearRegressionBGDLoop()
%time linear.fit(X_1, y_1)
%time predict_y_1 = linear.predict(X_1)
print('coefs: ', linear.coefs)
print('intercept: ', linear.intercept)
print('score: ', linear.score(y_1, predict_y_1))
CPU times: user 2.03 s, sys: 186 ms, total: 2.22 s
Wall time: 1.35 s
CPU times: user 874 µs, sys: 334 µs, total: 1.21 ms
Wall time: 643 µs
coefs: [9.95834636]
intercept: 5.030511016464405
score: 0.8927947969714475
向量化計算
class LinearRegressionBGDVector(LinearRegressionBGDLoop):
"""批量梯度下降(BGD)_向量化維度"""
def dj(self, x, y, thetas):
"""向量實現"""
return (x.T).dot(x.dot(thetas)-y) * 2 / len(x)
# 主函數
lin_reg = LinearRegressionBGDVector()
%time lin_reg.fit(X_1, y_1)
%time predict_y = lin_reg.predict(X_1)
print('coefs: ', lin_reg.coefs)
print('intercept: ', lin_reg.intercept)
print('score: ', lin_reg.score(y_1, predict_y_1))
CPU times: user 1.22 s, sys: 104 ms, total: 1.32 s
Wall time: 738 ms
CPU times: user 602 µs, sys: 182 µs, total: 784 µs
Wall time: 426 µs
coefs: [9.95834636]
intercept: 5.030511016464405
score: 0.8927947969714475
循環實現與向量實現值一樣,向量會快一點,但不是很明顯,是因爲這裏coefs只有一個,沒有區分點。創建以下特徵數爲m,樣本爲n的樣本集. 效果就很明顯。
# 創建樣本集: 特徵數爲m,樣本爲n
m = 100
n = 10000
x_m = np.random.normal(size=(n, m))
true_thetas = np.random.uniform(0.0, 100.0, size=m+1)
y_m = x_m.dot(true_thetas[1:]) + true_thetas[0] + np.random.normal(0., 10., size=n)
lin_reg1 = LinearRegressionBGDLoop()
%time lin_reg1.fit(x_m, y_m)
lin_reg2 = LinearRegressionBGDVector()
%time lin_reg2.fit(x_m, y_m)
CPU times: user 1min 8s, sys: 2.25 s, total: 1min 11s
Wall time: 42.2 s
CPU times: user 2.92 s, sys: 84.9 ms, total: 3 s
Wall time: 2.22 s
梯度下降法大家族
- 批量梯度下降法: Batch Gradient Descent(BGD)
- 隨機梯度下降法: Stochastic Gradient Descen(SGD)
- 小批量梯度下降法: Mini-batch Gradient Descent(MBGD) 融合了BGD與SGD
批量梯度下降法
以上實現的就是批量梯度下降法,求每次theta梯度都需要全量計算一下樣本
隨機梯度下降法
每次只隨機選取一個樣本計算theta梯度
學習率逐漸變小(經驗值 a=5,b=50)。這是個典型的模擬退火思想
class LinearRegressionSGD(LinearRegressionBGDLoop):
"""隨機梯度下降(SDG)"""
def dj(self, x_i, y_i, thetas):
return (x_i.T).dot(x_i.dot(thetas)-y_i) * 2
def gradient_descent(self, x, y, initial_thetas, eta, epsilon, max_iters, t0, t1):
"""梯度下降"""
thetas = initial_thetas
while max_iters > 0:
# 隨機抽取一個樣本
i = np.random.randint(len(x))
# 梯度gradient
gradient = self.dj(x[i], y[i], thetas)
last_thetas = thetas
# 學習率模擬退火
eta = (t0 + eta) / (max_iters + t1)
thetas = thetas - eta * gradient
if(abs(self.j(x, y, thetas) - self.j(x, y, last_thetas)) < epsilon):
break
max_iters -= 1
self._thetas = thetas
self.intercept = thetas[0]
self.coefs = thetas[1:]
def fit(self, x_train, y_train, eta=0.01, epsilon=1e-8, max_iters=1e4, t0=5, t1=50):
"""訓練"""
# 加上一列全爲1
X_b = np.hstack([np.ones((len(x_train), 1)), x_train])
initial_thetas = np.zeros(X_b.shape[1])
self.gradient_descent(X_b, y_train, initial_thetas, eta, epsilon, max_iters, t0, t1)
# 主函數
lin_reg = LinearRegressionSGD()
%time lin_reg.fit(X_1, y_1)
%time predict_y = lin_reg.predict(X_1)
print('---- SGD ----')
print('coefs: ', lin_reg.coefs)
print('intercept: ', lin_reg.intercept)
print('score: ', lin_reg.score(y_1, predict_y_1))
CPU times: user 1.99 s, sys: 175 ms, total: 2.17 s
Wall time: 1.49 s
CPU times: user 559 µs, sys: 215 µs, total: 774 µs
Wall time: 997 µs
---- Vector ----
coefs: [9.90608818]
intercept: 4.81501911723699
score: 0.8927947969714475
小批量梯度下降法
TODO: 待補充
BGD、SGD、MBGD比較
print('----- BGD ----- ')
lin_reg1 = LinearRegressionBGDLoop()
%time lin_reg1.fit(x_m, y_m)
%time predict_y_m1 = lin_reg1.predict(x_m)
lin_reg1.score(y_m, predict_y_m1)
print('score: ', lin_reg1.score(y_m, predict_y_m1))
print('----- SGD ----- ')
lin_reg2 = LinearRegressionSGD()
%time lin_reg2.fit(x_m, y_m)
predict_y_m2 = lin_reg2.predict(x_m)
print('score: ', lin_reg2.score(y_m, predict_y_m2))
----- BGD -----
CPU times: user 1min 13s, sys: 2.64 s, total: 1min 15s
Wall time: 48.4 s
CPU times: user 4.07 ms, sys: 934 µs, total: 5 ms
Wall time: 2.38 ms
score: 0.9996949647594553
----- SGD -----
CPU times: user 12.9 s, sys: 379 ms, total: 13.3 s
Wall time: 7.37 s
score: -1.0385235598722087e+28
線性迴歸 梯度下降 vs 正規方程解
- 正規方程解時間複雜度較大,不如梯度下降。
- 正規方程不需要進行數據標準話,而梯度下降需要。
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
# 正規方程求解
print('---- LinearRegression ----')
lin_reg = LinearRegression()
%time lin_reg.fit(x_m, y_m)
%time y_predict = lin_reg.predict(x)
print('score', lin_reg.score(x, y_predict))
# 梯度下降求解
print('/n---- SGDRegressor ----')
lin_reg = SGDRegressor(learning_rate="constant")
%time lin_reg.fit(x_m, y_m)
%time y_predict_m = lin_reg.predict(x_m)
print('score', lin_reg.score(x_m, y_predict_m))
---- LinearRegression ----
CPU times: user 995 ms, sys: 70.5 ms, total: 1.07 s
Wall time: 689 ms
CPU times: user 115 ms, sys: 111 ms, total: 225 ms
Wall time: 253 ms
score 1.0
/n---- SGDRegressor ----
CPU times: user 715 ms, sys: 14.6 ms, total: 729 ms
Wall time: 627 ms
CPU times: user 12.1 ms, sys: 207 µs, total: 12.3 ms
Wall time: 8.79 ms
score 1.0
調試梯度
用這個樸實的梯度調試方法,求得梯度,帶入目標函數中求得想要的thetas(但這個時間複雜度高,只適合小批量debug使用). 用來驗證推導的梯度求解公式.
對於一元
對於多元
推導出 =>