一、什麼是優化器
pytorch的優化器: 管理並更新模型中可學習參數的值,使得模型輸出更接近真實標籤
- 導數: 函數在指定座標軸上的變化率
- 方向導數: 指定方向上的變化率
- 梯度:一個向量,方向爲方向導數取得最大值的方向
二、optimizer的屬性
class Optimizer(object):
    """Abridged excerpt of the optimizer base class, showing its core attributes."""

    def __init__(self, params, defaults):
        """Store the hyper-parameter defaults and set up empty bookkeeping.

        Args:
            params: iterable of parameters (or of parameter-group dicts).
            defaults: dict of default hyper-parameters, e.g. the learning rate.
        """
        self.defaults = defaults
        self.state = defaultdict(dict)  # per-parameter cache (e.g. momentum buffers)
        self.param_groups = []
        # Materialise the iterable so it can be wrapped below; without this
        # line the local name would be read before it is assigned.
        param_groups = list(params)
        ...  # remaining validation is elided in this excerpt
        # A bare list of parameters is wrapped into a single parameter group.
        param_groups = [{'params': param_groups}]
基本屬性:
- defaults: 優化器超參數,如學習率
- state: 參數的緩存,如momentum的緩存
- param_groups:管理的參數組
- _step_count:記錄更新次數,學習率調整中使用
三、optimizer的方法
3.1 zero_grad()
class Optimizer(object):
    """Abridged excerpt of the optimizer base class, showing ``zero_grad``."""

    def zero_grad(self):
        """Reset the gradient of every managed parameter to zero, in place.

        Parameters whose ``grad`` is ``None`` are skipped.
        """
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is not None:
                    # Detach first so the zeroing itself is not tracked by autograd.
                    p.grad.detach_()
                    p.grad.zero_()
功能: 清空所管理參數的梯度
pytorch特性:張量梯度不自動清零
# -*- coding:utf-8 -*-
import os
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
import torch
import torch.optim as optim
from tools.common_tools import set_seed
set_seed(1) # 設置隨機種子
# A 2x2 parameter with a hand-assigned gradient of ones, so that step() has
# something to apply without running a real backward pass.
weight = torch.randn((2, 2), requires_grad=True)
weight.grad = torch.ones((2, 2))

optimizer = optim.SGD([weight], lr=0.1)

# ----------------------------------- zero_grad -----------------------------------
flag = 1  # set to 0 to skip this demo
if flag:
    print("weight before step:{}".format(weight.data))
    optimizer.step()  # try lr=1 versus lr=0.1 and compare the results
    print("weight after step:{}".format(weight.data))

    # The optimizer holds a reference to the very same tensor object, not a copy.
    print("weight in optimizer:{}\nweight in weight:{}\n".format(id(optimizer.param_groups[0]['params'][0]), id(weight)))

    print("weight.grad is {}\n".format(weight.grad))
    # NOTE: on PyTorch versions where zero_grad() defaults to set_to_none=True,
    # the line below prints None instead of a tensor of zeros.
    optimizer.zero_grad()
    print("after optimizer.zero_grad(), weight.grad is\n{}".format(weight.grad))
說明:
- 在優化器中保存的是參數的地址,根據地址尋找參數,減少內存消耗
- 通過zero_grad()方法後,就實現了參數梯度的清零
3.2 step()
功能:執行一步更新
詳細說明:當我們計算得到了loss,然後反向傳播計算各個參數的梯度後,就需要使用step()方法執行一步更新,更新參數,而更新的策略有很多,如隨機梯度下降法,momentum等等
# -*- coding:utf-8 -*-
import os
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
import torch
import torch.optim as optim
from tools.common_tools import set_seed
set_seed(1) # 設置隨機種子
# A 2x2 parameter with a hand-assigned gradient of ones.
weight = torch.randn((2, 2), requires_grad=True)
weight.grad = torch.ones((2, 2))

optimizer = optim.SGD([weight], lr=0.1)

# ----------------------------------- step -----------------------------------
flag = 1  # set to 0 to skip this demo
if flag:
    print("weight before step:{}".format(weight.data))
    # Plain SGD update: weight <- weight - lr * grad.
    # Try lr=1 versus lr=0.1 and compare the results.
    optimizer.step()
    print("weight after step:{}".format(weight.data))
說明:這裏的梯度是1,所以執行一步更新參數,即0.6614-0.1*1=0.5614,其中0.1是學習率
3.3 add_param_group()
class Optimizer(object):
    """Abridged excerpt of the optimizer base class, showing ``add_param_group``."""

    def add_param_group(self, param_group):
        """Append ``param_group`` to the groups managed by this optimizer.

        Args:
            param_group: dict with a ``'params'`` entry plus optional
                group-specific hyper-parameters.
        """
        # Collect every parameter that is already managed.
        param_set = set()
        for group in self.param_groups:
            param_set.update(set(group['params']))
        ...  # elided in this excerpt; presumably where param_set is checked against the new group
        self.param_groups.append(param_group)
功能:添加一組參數到優化器當中
# -*- coding:utf-8 -*-
import os
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
import torch
import torch.optim as optim
from tools.common_tools import set_seed
set_seed(1) # 設置隨機種子
# A 2x2 parameter with a hand-assigned gradient of ones.
weight = torch.randn((2, 2), requires_grad=True)
weight.grad = torch.ones((2, 2))

optimizer = optim.SGD([weight], lr=0.1)

# ----------------------------------- add_param_group -----------------------------------
flag = 1  # set to 0 to skip this demo
if flag:
    print("optimizer.param_groups is\n{}".format(optimizer.param_groups))

    # Register a second parameter group with its own, much smaller learning rate.
    w2 = torch.randn((3, 3), requires_grad=True)
    optimizer.add_param_group({"params": w2, 'lr': 0.0001})

    print("optimizer.param_groups is\n{}".format(optimizer.param_groups))
3.4 state_dict()
class Optimizer(object):
    """Abridged excerpt of the optimizer base class: state (de)serialisation."""

    def state_dict(self):
        """Return the optimizer state as a dict with ``'state'`` and ``'param_groups'``."""
        ...  # construction of packed_state and param_groups is elided in this excerpt
        return {'state': packed_state, 'param_groups': param_groups,}

    def load_state_dict(self, state_dict):
        """Load a state dict into this optimizer (body elided in this excerpt)."""
        ...
功能: 獲取優化器當前狀態信息字典
# -*- coding:utf-8 -*-
import os
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
import torch
import torch.optim as optim
from tools.common_tools import set_seed
set_seed(1) # 設置隨機種子
# A 2x2 parameter with a hand-assigned gradient of ones.
weight = torch.randn((2, 2), requires_grad=True)
weight.grad = torch.ones((2, 2))

optimizer = optim.SGD([weight], lr=0.1)

# ----------------------------------- state_dict -----------------------------------
flag = 1  # set to 0 to skip this demo
if flag:
    # Re-create the optimizer with momentum so that step() fills in per-parameter state.
    optimizer = optim.SGD([weight], lr=0.1, momentum=0.9)
    opt_state_dict = optimizer.state_dict()

    print("state_dict before step:\n", opt_state_dict)

    for _ in range(10):
        optimizer.step()

    print("state_dict after step:\n", optimizer.state_dict())

    # Persist the state next to this script so a later run can resume from it.
    torch.save(optimizer.state_dict(), os.path.join(BASE_DIR, "optimizer_state_dict.pkl"))
說明:當訓練到某個階段或時刻,通過獲取優化器當前狀態信息字典 ,然後將其保存下來,後面再使用的時候,就可以通過加載狀態信息字典,來繼續之前的訓練
3.5 load_state_dict()
class Optimizer(object):
    """Abridged excerpt of the optimizer base class: state (de)serialisation."""

    def state_dict(self):
        """Return the optimizer state as a dict with ``'state'`` and ``'param_groups'``."""
        ...  # construction of packed_state and param_groups is elided in this excerpt
        return {'state': packed_state, 'param_groups': param_groups,}

    def load_state_dict(self, state_dict):
        """Load a state dict into this optimizer (body elided in this excerpt)."""
        ...
功能 : 加載狀態信息字典
說明:用於模型的續訓練
# -*- coding:utf-8 -*-
import os
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
import torch
import torch.optim as optim
from tools.common_tools import set_seed
set_seed(1) # 設置隨機種子
# A 2x2 parameter with a hand-assigned gradient of ones.
weight = torch.randn((2, 2), requires_grad=True)
weight.grad = torch.ones((2, 2))

optimizer = optim.SGD([weight], lr=0.1)

# ----------------------------------- load state_dict -----------------------------------
flag = 0  # set to 1 to run this demo (needs the file written by the state_dict demo)
if flag:
    optimizer = optim.SGD([weight], lr=0.1, momentum=0.9)
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    state_dict = torch.load(os.path.join(BASE_DIR, "optimizer_state_dict.pkl"))

    print("state_dict before load state:\n", optimizer.state_dict())
    optimizer.load_state_dict(state_dict)
    print("state_dict after load state:\n", optimizer.state_dict())
四、learning rate學習率
4.1 學習率的概念
學習率(learning rate): 控制更新的步伐
梯度下降: $w_{i+1} = w_i - g(w_i)$
應用學習率後: $w_{i+1} = w_i - LR \cdot g(w_i)$,其中 $g(w_i)$ 爲 $w_i$ 處的梯度
4.2 學習率的設置演示
# -*- coding:utf-8 -*-
import torch
import numpy as np
import matplotlib.pyplot as plt
torch.manual_seed(1)
def func(x_t):
    """Evaluate the toy objective y = (2x)^2 = 4*x^2 element-wise.

    Its derivative is dy/dx = 8x.

    Args:
        x_t: input tensor.

    Returns:
        Tensor holding (2 * x_t) ** 2.
    """
    return torch.pow(2*x_t, 2)
# init: starting point for the gradient-descent demo further down
x = torch.tensor([2.], requires_grad=True)

# ------------------------------ plot data ------------------------------
flag = 0  # set to 1 to plot the objective on [-3, 3]
if flag:
    x_t = torch.linspace(-3, 3, 100)
    y = func(x_t)
    plt.plot(x_t.numpy(), y.numpy(), label="y = 4*x^2")
    plt.grid()
    plt.xlabel("x")
    plt.ylabel("y")
    plt.legend()
    plt.show()
# ------------------------------ gradient descent ------------------------------
flag = 0  # set to 1 to run this demo
if flag:
    iter_rec, loss_rec, x_rec = [], [], []

    lr = 0.1  # other values to try: 1., .5, .2, .1, .125
    max_iteration = 4  # other values to try: 4, 20, 200

    for i in range(max_iteration):
        y = func(x)
        y.backward()

        print("Iter:{}, X:{:8}, X.grad:{:8}, loss:{:10}".format(
            i, x.detach().numpy()[0], x.grad.detach().numpy()[0], y.item()))

        x_rec.append(x.item())

        # Manual update, mathematically x = x - lr * x.grad, done on .data so
        # that autograd does not record it.
        x.data.sub_(lr * x.grad)
        x.grad.zero_()

        iter_rec.append(i)
        # Store a plain float: a tensor still attached to the graph cannot be
        # converted to a NumPy array for plotting.
        loss_rec.append(y.item())

    # Left panel: loss per iteration.
    plt.subplot(121).plot(iter_rec, loss_rec, '-ro')
    plt.xlabel("Iteration")
    plt.ylabel("Loss value")

    # Right panel: the objective curve with the visited points on top.
    x_t = torch.linspace(-3, 3, 100)
    y = func(x_t)
    plt.subplot(122).plot(x_t.numpy(), y.numpy(), label="y = 4*x^2")
    plt.grid()
    y_rec = [func(torch.tensor(i)).item() for i in x_rec]
    plt.subplot(122).plot(x_rec, y_rec, '-ro')
    plt.legend()
    plt.show()
# ------------------------------ multi learning rate ------------------------------
flag = 1  # set to 0 to skip this demo
if flag:
    iteration = 100
    num_lr = 10
    lr_min, lr_max = 0.01, 0.2  # other upper bounds to try: .5, .3, .2

    lr_list = np.linspace(lr_min, lr_max, num=num_lr).tolist()
    loss_rec = [[] for _ in lr_list]  # one loss curve per learning rate

    for i, lr in enumerate(lr_list):
        # Restart from the same point for every learning rate.
        x = torch.tensor([2.], requires_grad=True)
        for _ in range(iteration):
            y = func(x)
            y.backward()
            x.data.sub_(lr * x.grad)  # x = x - lr * x.grad
            x.grad.zero_()

            loss_rec[i].append(y.item())

    for lr, loss_r in zip(lr_list, loss_rec):
        plt.plot(range(len(loss_r)), loss_r, label="LR: {}".format(lr))
    plt.legend()
    plt.xlabel('Iterations')
    plt.ylabel('Loss value')
    plt.show()
學習率=1,迭代次數=4
由圖可知,學習率過大,從而出現loss激增
學習率=0.1,迭代次數=4
由上圖可知,學習率適當的情況,loss下降至收斂
通過設置多學習率的演示可知,不同的學習率的收斂速度不同
五、momentum動量
5.1 momentum動量概念
Momentum(動量,衝量) : 結合當前梯度與上一次更新信息, 用於當前更新
使用學習率的梯度下降法,其中學習率是固定的,即更新的步長是固定的,就如同圖上滑雪,每次更新,向下一個梯度方向更新步長固定,而Momentum動量會結合當前梯度方向和上次更新信息,更新的步長不固定,如圖中,在上一次的基礎上會滑更長的距離
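應用momentum後的更新公式: $v_i = m \cdot v_{i-1} + g(w_i)$,$w_{i+1} = w_i - lr \cdot v_i$,其中 $m$ 爲momentum係數,$v_i$ 爲第i次的更新量,$g(w_i)$ 爲 $w_i$ 處的梯度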
5.2 指數加權平均
基本思想: 當我們要求取當前時刻的平均值,距離當前時刻越近的那些參數值越具有參考性,所佔的權重越大,這個權重會隨着時間間隔的增大呈指數下降
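具體例子: $v_t = \beta v_{t-1} + (1-\beta)\theta_t$
- 圖中橫軸是天數,縱軸是溫度
- $v_t$ 是第t天溫度指數加權平均值
- $\theta_t$ 是第t天的溫度值
- $\beta$ 爲超參,值小於1,用以控制記憶週期
展開後(取 $v_0 = 0$): $v_t = \sum_{i=0}^{t-1} (1-\beta)\beta^{i}\theta_{t-i}$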
由最後一項可知,距離當前時刻越遠的那些溫度的權重是越小的,由於β是小於1的,所以距離當前時刻越遠的那些溫度的權重是呈指數下降的
代碼演示:
# -*- coding:utf-8 -*-
import torch
import numpy as np
import torch.optim as optim
import matplotlib.pyplot as plt
torch.manual_seed(1)
def exp_w_func(beta, time_list):
    """Return the exponential-averaging weight (1 - beta) * beta**t for each t.

    Args:
        beta: decay factor of the exponentially weighted average.
        time_list: iterable of exponents (time offsets).

    Returns:
        List with one weight per entry of ``time_list``, in the same order.
    """
    return [(1 - beta) * np.power(beta, exp) for exp in time_list]
beta = 0.9
num_point = 100
time_list = np.arange(num_point).tolist()  # time offsets 0 .. num_point - 1

# ------------------------------ exponential weight ------------------------------
flag = 0  # set to 1 to run this demo
if flag:
    weights = exp_w_func(beta, time_list)

    plt.plot(time_list, weights, '-ro', label="Beta: {}\ny = B^t * (1-B)".format(beta))
    plt.xlabel("time")
    plt.ylabel("weight")
    plt.legend()
    plt.title("exponentially weighted average")
    plt.show()

    # Sum of the weights over the plotted window.
    print(np.sum(weights))
# ------------------------------ multi weights ------------------------------
flag = 0  # set to 1 to run this demo
if flag:
    beta_list = [0.98, 0.95, 0.9, 0.8]
    # One weight curve per beta, all over the same time axis.
    w_list = [exp_w_func(beta, time_list) for beta in beta_list]
    for i, w in enumerate(w_list):
        plt.plot(time_list, w, label="Beta: {}".format(beta_list[i]))
        plt.xlabel("time")
        plt.ylabel("weight")
    plt.legend()
    plt.show()
# ------------------------------ SGD momentum ------------------------------
flag = 1  # set to 0 to skip this demo
if flag:
    def func(x):
        """Toy objective y = (2x)^2 = 4*x^2, with dy/dx = 8x."""
        return torch.pow(2*x, 2)

    iteration = 100
    m = 0.63  # other values to try: .9, .63

    lr_list = [0.01, 0.03]

    momentum_list = list()
    loss_rec = [[] for _ in lr_list]  # one loss curve per learning rate

    for i, lr in enumerate(lr_list):
        x = torch.tensor([2.], requires_grad=True)

        # Only the smaller learning rate gets momentum; lr=0.03 runs as plain SGD.
        momentum = 0. if lr == 0.03 else m
        momentum_list.append(momentum)

        optimizer = optim.SGD([x], lr=lr, momentum=momentum)

        for _ in range(iteration):
            y = func(x)
            y.backward()

            optimizer.step()
            optimizer.zero_grad()

            loss_rec[i].append(y.item())

    for i, loss_r in enumerate(loss_rec):
        plt.plot(range(len(loss_r)), loss_r, label="LR: {} M:{}".format(lr_list[i], momentum_list[i]))
    plt.legend()
    plt.xlabel('Iterations')
    plt.ylabel('Loss value')
    plt.show()
由圖可知,距離當前時刻越遠,其權重呈指數下降趨勢,而權重越小說明該時刻的溫度對當前時刻溫度的指數加權平均的貢獻越小
由圖可知
- 設置不同的β值,權重下降的趨勢不同
- β值可理解爲記憶週期,越小記憶週期越短
- β通常設置爲0.9,爲了更加關注當前10天左右的數據
由圖所示,在確定學習率的基礎下,適當的momentum係數(β值),能加速收斂
六、torch.optim.SGD
optim.SGD(params,
lr=<required parameter>,
momentum=0,
dampening=0,
weight_decay=0,
nesterov=False)
功能:隨機梯度下降法優化器
主要參數:
- params:管理的參數組
- lr:初始學習率
- momentum:動量係數,貝塔
- weight_decay: L2正則化係數
- nesterov:是否採用NAG,通常是不使用
NAG參考文獻: 《On the importance of initialization and momentum in deep learning》
七、Pytorch的十種優化器
1 optim.SGD:隨機梯度下降法
2 optim.Adagrad:自適應學習率梯度下降法
3 optim.RMSprop: Adagrad的改進
4 optim.Adadelta : Adagrad的改進
5 optim.Adam : RMSprop結合Momentum
6 optim.Adamax: Adam增加學習率上限
7 optim.SparseAdam:稀疏版的Adam
8 optim.ASGD:隨機平均梯度下降
9 optim.Rprop :彈性反向傳播
10 optim.LBFGS: BFGS的改進