梯度下降法
- 1.梯度下降法簡介
- 2.簡單線性迴歸中梯度下降法的模擬
- 3.多元線性迴歸中的梯度下降法
- 4.隨機梯度下降法
- 5.波士頓房價預測問題
一、梯度下降法簡介
- 不是一個機器學習算法
- 是一個基於搜索的最優化方法
- 作用:最小化損失函數
- 梯度上升法:最大化一個效用函數
以下是定義了一個損失函數以後,參數theta對應的損失函數J的值對應的示例圖,我們需要找到一個使得損失函數值J取得最小值的對應的theta(選用二維平面,即參數只有一個)
1.η
a.η的介紹
- η稱爲學習率(learning rate)
- η的取值影響獲得最優解的速度
- η取值不合適,甚至得不到最優解
- η是梯度下降法的一個超參數
b.η取值太小,影響收斂學習速度
c.η太大,甚至導致不收斂
2.注意事項
並不是所有的函數都有唯一的極值點
解決方案:
- 多次運行,隨機化初始點
- 梯度下降法的初始點也是一個超參數
二、簡單線性迴歸中梯度下降法的模擬
1.繪製一個簡單的損失函數
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Simulate a simple loss function: a parabola sampled at 144 points on [-1, 6]
plot_x = np.linspace(-1,6,144)
plot_y = (plot_x-2.5)**2-1
plt.plot(plot_x,plot_y)
[<matplotlib.lines.Line2D at 0x176fa5c2dd8>]
2.定義梯度下降函數
def dJ(theta):
    """Return the derivative of the loss ``(theta - 2.5)**2 - 1`` at ``theta``."""
    offset = theta - 2.5
    return 2 * offset
def J(theta):
    """Evaluate the loss ``(theta - 2.5)**2 - 1`` at ``theta``.

    Returns:
        The loss value, or ``float("inf")`` when the arithmetic overflows, so
        a diverging descent sees an infinite loss instead of an exception.
    """
    try:
        return (theta - 2.5) ** 2 - 1
    except (OverflowError, FloatingPointError):
        # Only overflow means "infinitely bad"; any other error (for example a
        # non-numeric theta) propagates instead of being silently hidden.
        return float("inf")
def gradient_descent(initial_theta, eta, n_iters=1e4, epsilon=1e-8):
    """Run gradient descent on ``J`` starting from ``initial_theta``.

    Every theta visited, including the starting point, is appended to the
    module-level list ``theta_history`` so the path can be plotted afterwards.

    Args:
        initial_theta: starting value of theta.
        eta: learning rate.
        n_iters: maximum number of update steps.
        epsilon: stop once the loss changes by less than this between two steps.

    Returns:
        The last theta reached.
    """
    theta = initial_theta
    theta_history.append(initial_theta)  # record the starting point
    i_iters = 0
    # The step cap keeps the loop finite when eta is too large to converge.
    while i_iters < n_iters:
        gradient = dJ(theta)
        last_theta = theta  # keep the previous theta for the stop test
        theta = theta - eta * gradient
        theta_history.append(theta)
        # Converged: the loss barely moved between two consecutive thetas.
        if abs(J(theta) - J(last_theta)) < epsilon:
            break
        i_iters += 1
    return theta
def plot_theta_history():
    """Plot the loss curve and overlay the thetas visited by gradient descent.

    Reads the module-level ``plot_x`` and ``theta_history``, shows the figure
    and prints how many thetas were recorded.
    """
    visited = np.array(theta_history)
    plt.plot(plot_x, J(plot_x))
    plt.plot(visited, J(visited), color="r", marker="+")
    plt.show()
    print("the size of theta_history is %d" % len(theta_history))
3.使用不同的η學習率進行測試
a.η=0.1
# Each step multiplies (theta - 2.5) by 1 - 2*eta = 0.8, so theta approaches 2.5 from one side
eta = 0.1
# Reset the global trace that gradient_descent appends to
theta_history = []
gradient_descent(0.,eta)
plot_theta_history()
the size of theta_history is 46
b.η=0.01
# Smaller learning rate: step factor 1 - 2*eta = 0.98, so many more steps are recorded
eta = 0.01
theta_history = []
gradient_descent(0.,eta)
plot_theta_history()
the size of theta_history is 424
c.η=0.001
# Even smaller learning rate: step factor 1 - 2*eta = 0.998, convergence is slower still
eta = 0.001
theta_history = []
gradient_descent(0.,eta)
plot_theta_history()
the size of theta_history is 3682
d.η=0.8
# Step factor 1 - 2*eta = -0.6: theta jumps across the minimum each step but still closes in
eta = 0.8
theta_history = []
gradient_descent(0.,eta)
plot_theta_history()
the size of theta_history is 22
e.η=1.1
# Step factor 1 - 2*eta = -1.2: every step lands farther from 2.5, so the descent diverges
eta = 1.1
theta_history = []
# Cap the run at 10 steps because this learning rate never converges
gradient_descent(0.,eta,n_iters=10)
plot_theta_history()
the size of theta_history is 11
三、多元線性迴歸中的梯度下降法
1.模擬數據的準備
import numpy as np
import matplotlib.pyplot as plt
# Fixed seed so the synthetic data is reproducible
np.random.seed(666)
# 100 values of x, scaled into [0, 2)
x = 2*np.random.random(size=100)
# Targets follow y = 3x + 4 plus standard-normal noise
y = x*3.+4+np.random.normal(size=100)
# Column-matrix view of x: one row per sample, one feature
X = x.reshape(-1,1)
plt.scatter(X,y)
<matplotlib.collections.PathCollection at 0x176fa87e390>
2.梯度下降法的實現
a.簡單的直接實現
class LinearRegression:
    """Linear regression trained in closed form (one feature) or by gradient descent.

    ``fit``/``predict`` work on a single feature through ``a_`` and ``b_``;
    ``fit_gd`` accepts a feature matrix and stores ``interception_``/``coef_``.
    """

    def __init__(self):
        """Create an unfitted model; every learned attribute starts as None."""
        self.a_ = None
        self.b_ = None
        # Filled in by fit_gd; defined here so they exist before training.
        self.interception_ = None
        self.coef_ = None
        self._theta = None

    def fit(self, x_train, y_train):
        """Fit slope ``a_`` and intercept ``b_`` on 1-D ``x_train``/``y_train``.

        Returns:
            self, so calls can be chained.
        """
        assert x_train.ndim == 1, \
            "Simple Linear Regressor can only solve single feature training data."
        assert len(x_train) == len(y_train), \
            "the size of x_train must be equal to the size of y_train"

        x_mean = np.mean(x_train)
        y_mean = np.mean(y_train)

        # Closed-form least squares, written with dot products instead of loops.
        x_centered = x_train - x_mean
        self.a_ = x_centered.dot(y_train - y_mean) / x_centered.dot(x_centered)
        self.b_ = y_mean - self.a_ * x_mean
        return self

    def predict(self, x_predict):
        """Return the predictions for every value in the 1-D array ``x_predict``."""
        assert x_predict.ndim == 1, \
            "Simple Linear Regressor can only solve single feature training data."
        assert self.a_ is not None and self.b_ is not None, \
            "must fit before predict!"
        return np.array([self._predict(x) for x in x_predict])

    def _predict(self, x_single):
        """Return the prediction ``a_ * x_single + b_`` for one value."""
        return self.a_ * x_single + self.b_

    def __repr__(self):
        return "LinearRegression()"

    def fit_gd(self, X_train, y_train, eta=0.01, n_iters=1e4):
        """Fit the model on ``X_train``/``y_train`` with batch gradient descent.

        Args:
            X_train: feature matrix, one row per sample.
            y_train: target vector with one entry per row of ``X_train``.
            eta: learning rate.
            n_iters: maximum number of update steps.

        Returns:
            self, with ``_theta``, ``interception_`` and ``coef_`` set.
        """
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"

        def J(theta, X_b, y):
            """Mean squared error of ``theta``; infinite if the arithmetic overflows."""
            try:
                return np.sum((y - X_b.dot(theta)) ** 2) / len(X_b)
            except (OverflowError, FloatingPointError):
                return float('inf')

        def dJ(theta, X_b, y):
            """Gradient of J, computed one component at a time."""
            # The residual does not depend on the component, so compute it once.
            residual = X_b.dot(theta) - y
            res = np.empty(len(theta))
            res[0] = np.sum(residual)
            for i in range(1, len(theta)):
                res[i] = residual.dot(X_b[:, i])
            return res * 2 / len(X_b)

        def gradient_descent(X_b, y, initial_theta, eta, n_iters=n_iters, epsilon=1e-8):
            """Iterate ``theta -= eta * gradient`` until the loss stops changing.

            Stops when two consecutive losses differ by less than ``epsilon``
            or after ``n_iters`` steps, whichever comes first.
            """
            theta = initial_theta
            i_iters = 0
            # The step cap keeps the loop finite when eta is too large to converge.
            while i_iters < n_iters:
                gradient = dJ(theta, X_b, y)
                last_theta = theta
                theta = theta - eta * gradient
                if abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon:
                    break
                i_iters += 1
            return theta

        # A leading column of ones lets theta[0] act as the intercept.
        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        initial_theta = np.zeros(X_b.shape[1])
        self._theta = gradient_descent(X_b, y_train, initial_theta, eta, n_iters)
        self.interception_ = self._theta[0]
        self.coef_ = self._theta[1:]
        return self
lin_reg = LinearRegression()
lin_reg.fit_gd(X,y)
# Show the fitted coefficient(s)
print(lin_reg.coef_)
# Show the fitted intercept
print(lin_reg.interception_)
[3.00706277]
4.021457858204859
b.向量化的處理
import numpy as np
from math import sqrt
def accuracy_score(y_true, y_predict):
    """Return the fraction of positions where ``y_true`` equals ``y_predict``.

    Both arguments may be arrays or plain sequences of equal length.
    """
    assert len(y_true) == len(y_predict), \
        "the size of y_true must be equal to the size of y_predict"
    # Convert first: comparing two plain lists with == yields a single bool,
    # which would silently produce a wrong score.
    matches = np.asarray(y_true) == np.asarray(y_predict)
    return np.sum(matches) / len(y_true)
def mean_squared_error(y_true, y_predict):
    """Return the mean of the squared differences between ``y_true`` and ``y_predict``."""
    n_samples = len(y_true)
    assert n_samples == len(y_predict), \
        "the size of y_true must be equal to the size of y_predict"
    errors = y_true - y_predict
    return np.sum(errors ** 2) / n_samples
def root_mean_squared_error(y_true, y_predict):
    """Return the square root of the MSE between ``y_true`` and ``y_predict``."""
    mse = mean_squared_error(y_true, y_predict)
    return sqrt(mse)
def mean_absolute_error(y_true, y_predict):
    """Return the mean of the absolute differences between ``y_true`` and ``y_predict``."""
    n_samples = len(y_true)
    assert n_samples == len(y_predict), \
        "the size of y_true must be equal to the size of y_predict"
    deviations = np.absolute(y_true - y_predict)
    return np.sum(deviations) / n_samples
def r2_score(y_true, y_predict):
    """Return R squared: one minus the MSE divided by the variance of ``y_true``."""
    mse = mean_squared_error(y_true, y_predict)
    return 1 - mse / np.var(y_true)
class LinearRegression:
    """Multi-feature linear regression fitted by normal equation or gradient descent."""

    def __init__(self):
        """Create an unfitted model; every learned attribute starts as None."""
        self.coef_ = None
        self.intercept_ = None
        self._theta = None

    def fit_normal(self, X_train, y_train):
        """Fit the model on ``X_train``/``y_train`` with the normal equation.

        Returns:
            self, with ``_theta``, ``intercept_`` and ``coef_`` set.
        """
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"

        # A leading column of ones lets theta[0] act as the intercept.
        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        # Solve (X_b^T X_b) theta = X_b^T y directly instead of forming the inverse.
        self._theta = np.linalg.solve(X_b.T.dot(X_b), X_b.T.dot(y_train))

        self.intercept_ = self._theta[0]
        self.coef_ = self._theta[1:]
        return self

    def fit_gd(self, X_train, y_train, eta=0.01, n_iters=1e4):
        """Fit the model on ``X_train``/``y_train`` with batch gradient descent.

        Args:
            X_train: feature matrix, one row per sample.
            y_train: target vector with one entry per row of ``X_train``.
            eta: learning rate.
            n_iters: maximum number of update steps.

        Returns:
            self, with ``_theta``, ``intercept_`` and ``coef_`` set.
        """
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"

        def J(theta, X_b, y):
            """Mean squared error of ``theta``; infinite if the arithmetic overflows."""
            try:
                return np.sum((y - X_b.dot(theta)) ** 2) / len(y)
            except (OverflowError, FloatingPointError):
                return float('inf')

        def dJ(theta, X_b, y):
            """Gradient of J as a single matrix-vector product."""
            return X_b.T.dot(X_b.dot(theta) - y) * 2. / len(y)

        def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):
            """Iterate until the loss changes by less than ``epsilon`` or ``n_iters`` is hit."""
            theta = initial_theta
            cur_iter = 0
            while cur_iter < n_iters:
                gradient = dJ(theta, X_b, y)
                last_theta = theta
                theta = theta - eta * gradient
                if abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon:
                    break
                cur_iter += 1
            return theta

        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        initial_theta = np.zeros(X_b.shape[1])
        self._theta = gradient_descent(X_b, y_train, initial_theta, eta, n_iters)

        self.intercept_ = self._theta[0]
        self.coef_ = self._theta[1:]
        return self

    def predict(self, X_predict):
        """Return the prediction vector for the feature matrix ``X_predict``."""
        assert self.intercept_ is not None and self.coef_ is not None, \
            "must fit before predict!"
        assert X_predict.shape[1] == len(self.coef_), \
            "the feature number of X_predict must be equal to X_train"

        X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict])
        return X_b.dot(self._theta)

    @staticmethod
    def accuracy_score(y_true, y_predict):
        """Return the fraction of positions where ``y_true`` equals ``y_predict``.

        Declared static because it takes no ``self``; this keeps
        ``LinearRegression.accuracy_score(a, b)`` working and also allows
        calling it on an instance.
        """
        assert len(y_true) == len(y_predict), \
            "the size of y_true must be equal to the size of y_predict"
        return np.sum(y_true == y_predict) / len(y_true)

    def score(self, X_test, y_test):
        """Return the R squared of the model's predictions on ``X_test`` against ``y_test``."""
        y_predict = self.predict(X_test)
        return r2_score(y_test, y_predict)

    def __repr__(self):
        return "LinearRegression()"
lin_reg = LinearRegression()
lin_reg.fit_gd(X,y)
# Show the fitted coefficient(s), then the fitted intercept
print(lin_reg.coef_)
print(lin_reg.intercept_)
[3.00706277]
4.021457858204859
3.波士頓房價預測
a.準備數據集
from sklearn import datasets
from sklearn.model_selection import train_test_split
# NOTE(review): load_boston is not available in recent scikit-learn releases — confirm the installed version
boston = datasets.load_boston()
x = boston.data
y = boston.target
# Keep only the samples whose target is below 50.0
x = x[y<50.0]
y = y[y<50.0]
# Hold out 20% of the samples for testing; the fixed seed makes the split reproducible
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=666)
b.數據歸一化
由於數據的規模在不同的特徵上不同,所以需要對數據進行歸一化
from sklearn.preprocessing import StandardScaler
standardScaler = StandardScaler()
# Learn the scaling statistics from the training split only
standardScaler.fit(x_train)
# Apply that same training-set scaling to both splits
x_train_standard = standardScaler.transform(x_train)
x_test_standard = standardScaler.transform(x_test)
c.調用梯度下降法
lin_reg = LinearRegression()
# Train by gradient descent on the standardized training features
lin_reg.fit_gd(x_train_standard,y_train)
# Score the model on the standardized test split
lin_reg.score(x_test_standard,y_test)
0.8129873310487505
然而,如果樣本數非常多,那麼即使使用梯度下降法,速度也會非常慢,因爲在梯度下降法中,每一個樣本都需要參與運算,這時候就需要隨機梯度下降法。
四、隨機梯度下降法
1.隨機梯度下降法介紹
a.批量梯度下降法
批量梯度下降法帶來的一個問題是:η的值需要設置得比較小,在樣本數比較多的時候會導致速度特別慢。這時候觀察批量梯度下降法損失函數的求導公式,可以發現,我們對每一個Xb都做了求和操作,又在最外面除以了m,那麼可以考慮將求和與除以m這兩個運算約掉,改爲每次只使用一個隨機的Xb
b.隨機梯度下降法
由於我們使用的是隨機梯度下降法,所以最終結果不會像批量梯度下降法一樣準確地朝着一個方向運算,而是曲線形下降。這時候我們就希望,越到下面,η值相應減小,使運算次數變多,從而精確計算結果
2.模擬數據進行測試
a.數據準備:
import numpy as np
import matplotlib.pyplot as plt
# Number of synthetic samples
m = 100000
x = np.random.normal(size=m)
# Targets follow y = 4x + 3 plus Gaussian noise with standard deviation 3
y = 4.*x+3.+np.random.normal(0,3,size=m)
# Column-matrix view of x: one row per sample, one feature
X = x.reshape(-1,1)
b.批量梯度下降法:
def J(theta, X_b, y):
    """Return the mean squared error of ``X_b.dot(theta)`` against ``y``.

    Returns ``float('inf')`` when the arithmetic overflows, so a diverging
    descent sees an infinite loss instead of an exception.
    """
    try:
        return np.sum((y - X_b.dot(theta)) ** 2) / len(y)
    except (OverflowError, FloatingPointError):
        # Only overflow maps to an infinite loss; other errors (such as a
        # shape mismatch) propagate instead of being silently hidden.
        return float('inf')
def dJ(theta, X_b, y):
    """Return the gradient of the mean squared error with respect to ``theta``."""
    residual = X_b.dot(theta) - y
    scaled = X_b.T.dot(residual) * 2.
    return scaled / len(y)
def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):
    """Minimise ``J`` by batch gradient descent and return the final theta.

    Stops when two consecutive losses differ by less than ``epsilon`` or
    after ``n_iters`` update steps, whichever comes first.
    """
    theta = initial_theta
    steps_done = 0
    while steps_done < n_iters:
        previous = theta
        theta = previous - eta * dJ(previous, X_b, y)
        loss_change = J(theta, X_b, y) - J(previous, X_b, y)
        if abs(loss_change) < epsilon:
            break
        steps_done += 1
    return theta
%%time
# Prepend a column of ones so theta[0] acts as the intercept
X_b = np.hstack([np.ones((len(X),1)),X])
# Start the search from the zero vector
initial_theta = np.zeros(X_b.shape[1])
eta = 0.01
theta = gradient_descent(X_b,y,initial_theta,eta)
Wall time: 1.27 s
print(theta)
[3.00590902 4.00776602]
c.隨機梯度下降法:
def dJ_sgd(theta, X_b_i, y_i):
    """Return the squared-error gradient for the single sample ``X_b_i``/``y_i``."""
    error = X_b_i.dot(theta) - y_i
    return X_b_i.T.dot(error) * 2
def sgd(X_b, y, initial_theta, n_iters, t0=5, t1=50):
    """Run stochastic gradient descent, using one randomly drawn sample per step.

    Args:
        X_b: sample matrix, one row per sample.
        y: targets, indexed like the rows of ``X_b``.
        initial_theta: starting parameter vector.
        n_iters: number of single-sample update steps.
        t0: numerator of the learning-rate schedule (default matches the
            previously hard-coded value).
        t1: offset in the denominator of the learning-rate schedule (default
            matches the previously hard-coded value).

    Returns:
        The parameter vector after ``n_iters`` updates.
    """
    def learning_rate(t):
        # Decays as the step count t grows, so later updates are smaller.
        return t0 / (t + t1)

    theta = initial_theta
    n_samples = len(X_b)
    for cur_iter in range(n_iters):
        rand_i = np.random.randint(n_samples)
        gradient = dJ_sgd(theta, X_b[rand_i], y[rand_i])
        theta = theta - learning_rate(cur_iter) * gradient
    return theta
%%time
X_b = np.hstack([np.ones((len(X), 1)), X])
initial_theta = np.zeros(X_b.shape[1])
# Only m//3 single-sample updates: about a third as many updates as there are samples
theta = sgd(X_b, y, initial_theta, n_iters=m//3)
Wall time: 325 ms
3.對隨機梯度函數進行封裝
class LinearRegression():
    """Linear regression trained with stochastic gradient descent."""

    def fit_sgd(self, X_train, y_train, n_iters=5, t0=5, t1=50):
        """Fit the model on ``X_train``/``y_train`` with stochastic gradient descent.

        Args:
            X_train: feature matrix, one row per sample.
            y_train: target vector with one entry per row of ``X_train``.
            n_iters: number of passes over the whole training set.
            t0: numerator of the learning-rate schedule.
            t1: offset in the denominator of the learning-rate schedule.

        Returns:
            self, with ``_theta``, ``interception_`` and ``coef_`` set.
        """
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"
        assert n_iters >= 1

        def dJ_sgd(theta, X_b_i, y_i):
            """Squared-error gradient for the single sample ``X_b_i``/``y_i``."""
            error = X_b_i.dot(theta) - y_i
            return X_b_i * error * 2.

        def sgd(X_b, y, initial_theta, n_iters, t0=5, t1=50):
            """Make ``n_iters`` shuffled passes over the data, one sample per update."""

            def learning_rate(t):
                # Shrinks as the global step t grows; t1 damps the early decay.
                return t0 / (t + t1)

            theta = initial_theta
            m = len(X_b)
            for epoch in range(n_iters):
                # Visit the samples in a fresh random order on every pass.
                order = np.random.permutation(m)
                shuffled = zip(X_b[order], y[order])
                for position, (x_row, target) in enumerate(shuffled):
                    step = epoch * m + position
                    theta = theta - learning_rate(step) * dJ_sgd(theta, x_row, target)
            return theta

        # A leading column of ones lets theta[0] act as the intercept.
        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        initial_theta = np.random.randn(X_b.shape[1])
        self._theta = sgd(X_b, y_train, initial_theta, n_iters, t0, t1)
        self.interception_ = self._theta[0]
        self.coef_ = self._theta[1:]
        return self
五、波士頓房價預測問題
1.數據集獲取
from sklearn import datasets
# NOTE(review): load_boston is not available in recent scikit-learn releases — confirm the installed version
boston = datasets.load_boston()
x = boston.data
y = boston.target
# Keep only the samples whose target is below 50.0
x = x[y<50.0]
y = y[y<50.0]
2.數據集的劃分
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=666)
3.數據集歸一化處理
from sklearn.preprocessing import StandardScaler
standardScaler = StandardScaler()
# Learn the scaling statistics from the training split only
standardScaler.fit(x_train)
# Apply that same training-set scaling to both splits
x_train_standard = standardScaler.transform(x_train)
x_test_standard = standardScaler.transform(x_test)
4.調用SGD方法
from sklearn.linear_model import SGDRegressor
# NOTE(review): the warning recorded below reports n_iter as deprecated in favour of max_iter and tol
sgd_reg = SGDRegressor(n_iter=100)
%%time
sgd_reg.fit(x_train_standard,y_train)
Wall time: 5.98 ms
D:\software\Anaconda\workplace\lib\site-packages\sklearn\linear_model\stochastic_gradient.py:152: DeprecationWarning: n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.
DeprecationWarning)
SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
eta0=0.01, fit_intercept=True, l1_ratio=0.15,
learning_rate='invscaling', loss='squared_loss', max_iter=None,
n_iter=100, n_iter_no_change=5, penalty='l2', power_t=0.25,
random_state=None, shuffle=True, tol=None, validation_fraction=0.1,
verbose=0, warm_start=False)
sgd_reg.score(x_test_standard,y_test)
0.7997484818787682
# Print the theta parameters (feature coefficients)
print(sgd_reg.coef_)
# Print the intercept
print(sgd_reg.intercept_)
[-0.972382 0.6864189 -0.34498804 -0.0125929 -1.26078593 2.27274023
-0.40157318 -2.30498603 1.96081398 -1.83022108 -1.83893614 0.74687792
-2.81437963]
[21.53228681]
總結
綜合二者的優缺點,出現了小批量梯度下降法.
小批量梯度下降法:
我們每一次不看全部樣本那麼多,也不是只看一個樣本那麼少,每次只看k個樣本。
def fit_lit_sgd(self, X_train, y_train, n_iters=5, t0=5, t1=50, k=10):
    """Fit the model on ``X_train``/``y_train`` with mini-batch gradient descent.

    Args:
        X_train: feature matrix, one row per sample.
        y_train: target vector with one entry per row of ``X_train``.
        n_iters: number of passes over the whole training set.
        t0: numerator of the learning-rate schedule.
        t1: offset in the denominator of the learning-rate schedule.
        k: mini-batch size; the last batch of a pass may be smaller.

    Returns:
        self, with ``_theta``, ``interception_`` and ``coef_`` set.
    """
    assert X_train.shape[0] == y_train.shape[0], \
        "the size of X_train must be equal to the size of y_train"
    assert n_iters >= 1
    # A batch size below 1 would never advance through the data.
    assert k >= 1, "k must be at least 1"

    def dJ_sgd(theta, X_b_k, y_k):
        """Mean squared-error gradient over one mini-batch, as a vector like theta."""
        # X_b_k.T.dot(residual) gives one gradient component per parameter;
        # an element-wise product summed over everything would collapse it
        # to a single number.
        return X_b_k.T.dot(X_b_k.dot(theta) - y_k) * 2. / len(X_b_k)

    def sgd(X_b, y, initial_theta, n_iters, t0=5, t1=50):
        """Make ``n_iters`` shuffled passes over the data, ``k`` samples per update."""

        def learning_rate(t):
            # Shrinks as t grows; t1 damps the early decay.
            return t0 / (t + t1)

        theta = initial_theta
        m = len(X_b)
        for cur_iter in range(n_iters):
            # Reshuffle every pass so each mini-batch is a random subset.
            indexes = np.random.permutation(m)
            X_b_new = X_b[indexes]
            y_new = y[indexes]
            for start in range(0, m, k):
                stop = start + k
                gradient = dJ_sgd(theta, X_b_new[start:stop], y_new[start:stop])
                # The schedule is driven by the number of samples seen so far.
                theta = theta - learning_rate(cur_iter * m + stop) * gradient
        return theta

    # A leading column of ones lets theta[0] act as the intercept.
    X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
    initial_theta = np.random.randn(X_b.shape[1])
    self._theta = sgd(X_b, y_train, initial_theta, n_iters, t0, t1)
    self.interception_ = self._theta[0]
    self.coef_ = self._theta[1:]
    return self