PyTorch: Learning Rate Scheduling Strategies

1. Why Adjust the Learning Rate

The learning rate controls how fast the weights are updated from the gradients. Early in training the learning rate is usually larger, so the update steps are larger; later in training it is smaller, so the update steps are smaller.

Gradient descent: $w_{i+1}=w_{i}-g\left(w_{i}\right)$. Gradient descent with a learning rate: $w_{i+1}=w_{i}-LR \cdot g\left(w_{i}\right)$. The learning rate's role is to control the size of the update step.
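
To make the two formulas concrete, here is a minimal sketch (the toy parameter, target and learning rate values are made up for illustration) comparing one update step without and with a learning rate:

import torch

w = torch.tensor(0.0, requires_grad=True)     # a single toy parameter
loss = (w - 5) ** 2                           # minimize (w - 5)^2
loss.backward()                               # gradient g(w) = 2 * (w - 5) = -10

with torch.no_grad():
    w_plain = w - w.grad          # w_{i+1} = w_i - g(w_i)       -> 10.0 (overshoots the target 5)
    w_lr    = w - 0.1 * w.grad    # w_{i+1} = w_i - LR * g(w_i)  -> 1.0  (a small, controlled step)

print(w_plain.item(), w_lr.item())  # 10.0 1.0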

PyTorch provides a base class for adjusting the learning rate: _LRScheduler.

Main parameters

  • optimizer: the optimizer whose learning rate is being scheduled;
  • last_epoch: records the current epoch count;
  • base_lrs: records the initial learning rates;
class _LRScheduler(object):
    def __init__(self, optimizer, last_epoch=-1):
        ...

    def get_lr(self):
        raise NotImplementedError

Main methods

  • step(): updates the learning rate for the next epoch;
  • get_lr(): a virtual method that computes the learning rate for the next epoch; the typical call pattern is sketched right after this list.
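
Before the full training script, here is a minimal, self-contained sketch of that call pattern (the toy model and data are made up for illustration): optimizer.step() runs inside the batch loop, while scheduler.step() runs once per epoch to move the schedule forward.

import torch
import torch.nn as nn
import torch.optim as optim

model = nn.Linear(10, 2)                      # toy model
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)

for epoch in range(5):
    for _ in range(4):                        # stand-in for the DataLoader loop
        inputs = torch.randn(16, 10)
        labels = torch.randint(0, 2, (16,))
        optimizer.zero_grad()
        loss = criterion(model(inputs), labels)
        loss.backward()
        optimizer.step()                      # weight update uses the current lr
    scheduler.step()                          # update the lr for the next epoch
    print(epoch, optimizer.param_groups[0]['lr'])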

The following code shows how a learning rate scheduler is used in a full training script:

import os
import random
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
import torch.optim as optim
from PIL import Image
from matplotlib import pyplot as plt
from model.lenet import LeNet
from toolss.my_dataset import RMBDataset

import torchvision

def transform_invert(img_, transform_train):
    """
    Invert the transforms applied to the data (undo Normalize / ToTensor)
    :param img_: tensor
    :param transform_train: torchvision.transforms
    :return: PIL image
    """
    if 'Normalize' in str(transform_train):
        norm_transform = list(filter(lambda x: isinstance(x, transforms.Normalize), transform_train.transforms))
        mean = torch.tensor(norm_transform[0].mean, dtype=img_.dtype, device=img_.device)
        std = torch.tensor(norm_transform[0].std, dtype=img_.dtype, device=img_.device)
        img_.mul_(std[:, None, None]).add_(mean[:, None, None])

    img_ = img_.transpose(0, 2).transpose(0, 1)  # C*H*W --> H*W*C
    if 'ToTensor' in str(transform_train):
        img_ = np.array(img_) * 255

    if img_.shape[2] == 3:
        img_ = Image.fromarray(img_.astype('uint8')).convert('RGB')
    elif img_.shape[2] == 1:
        img_ = Image.fromarray(img_.astype('uint8').squeeze())
    else:
        raise Exception("Invalid img shape, expected 1 or 3 in axis 2, but got {}!".format(img_.shape[2]) )

    return img_


def set_seed(seed=1):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)


set_seed()  # set the random seed
rmb_label = {"1": 0, "100": 1}

# hyper-parameter settings
MAX_EPOCH = 10
BATCH_SIZE = 16
LR = 0.01
log_interval = 10
val_interval = 1

# ============================ step 1/5 data ============================

split_dir = os.path.join("F:/Pytorch框架班/Pytorch-Camp-master/代碼合集/rmb_split")
train_dir = os.path.join(split_dir, "train")
valid_dir = os.path.join(split_dir, "valid")

norm_mean = [0.485, 0.456, 0.406]
norm_std = [0.229, 0.224, 0.225]

train_transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.RandomCrop(32, padding=4),
    transforms.RandomGrayscale(p=0.8),
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
])

valid_transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
])

# build the RMBDataset instances
train_data = RMBDataset(data_dir=train_dir, transform=train_transform)
valid_data = RMBDataset(data_dir=valid_dir, transform=valid_transform)

# build the DataLoaders
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(dataset=valid_data, batch_size=BATCH_SIZE)

# ============================ step 2/5 model ============================

net = LeNet(classes=2)
net.initialize_weights()

# ============================ step 3/5 loss function ============================
criterion = nn.CrossEntropyLoss()                                                   # choose the loss function

# ============================ step 4/5 optimizer ============================
optimizer = optim.SGD(net.parameters(), lr=LR, momentum=0.9)                        # choose the optimizer
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)     # set the learning rate decay policy

# ============================ step 5/5 training ============================
train_curve = list()
valid_curve = list()

for epoch in range(MAX_EPOCH):

    loss_mean = 0.
    correct = 0.
    total = 0.

    net.train()
    for i, data in enumerate(train_loader):

        # forward
        inputs, labels = data
        outputs = net(inputs)

        # backward
        optimizer.zero_grad()
        loss = criterion(outputs, labels)
        loss.backward()

        # update weights
        optimizer.step()

# classification statistics
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).squeeze().sum().numpy()

# print training information
        loss_mean += loss.item()
        train_curve.append(loss.item())
        if (i+1) % log_interval == 0:
            loss_mean = loss_mean / log_interval
            print("Training:Epoch[{:0>3}/{:0>3}] Iteration[{:0>3}/{:0>3}] Loss: {:.4f} Acc:{:.2%}".format(
                epoch, MAX_EPOCH, i+1, len(train_loader), loss_mean, correct / total))
            loss_mean = 0.

    scheduler.step()  # update the learning rate

    # validate the model
    if (epoch+1) % val_interval == 0:

        correct_val = 0.
        total_val = 0.
        loss_val = 0.
        net.eval()
        with torch.no_grad():
            for j, data in enumerate(valid_loader):
                inputs, labels = data
                outputs = net(inputs)
                loss = criterion(outputs, labels)

                _, predicted = torch.max(outputs.data, 1)
                total_val += labels.size(0)
                correct_val += (predicted == labels).squeeze().sum().numpy()

                loss_val += loss.item()

            valid_curve.append(loss_val)
            print("Valid:\t Epoch[{:0>3}/{:0>3}] Iteration[{:0>3}/{:0>3}] Loss: {:.4f} Acc:{:.2%}".format(
                epoch, MAX_EPOCH, j+1, len(valid_loader), loss_val, correct / total))


train_x = range(len(train_curve))
train_y = train_curve

train_iters = len(train_loader)
valid_x = np.arange(1, len(valid_curve)+1) * train_iters*val_interval  # valid_curve records one loss per epoch, so convert the record points to iterations
valid_y = valid_curve

plt.plot(train_x, train_y, label='Train')
plt.plot(valid_x, valid_y, label='Valid')

plt.legend(loc='upper right')
plt.ylabel('loss value')
plt.xlabel('Iteration')
plt.show()

# ============================ inference ============================

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
test_dir = os.path.join(BASE_DIR, "test_data")

test_data = RMBDataset(data_dir=test_dir, transform=valid_transform)
valid_loader = DataLoader(dataset=test_data, batch_size=1)

for i, data in enumerate(valid_loader):
    # forward
    inputs, labels = data
    outputs = net(inputs)
    _, predicted = torch.max(outputs.data, 1)

    rmb = 1 if predicted.numpy()[0] == 0 else 100

    img_tensor = inputs[0, ...]  # C H W
    img = transform_invert(img_tensor, train_transform)
    plt.imshow(img)
    plt.title("LeNet got {} Yuan".format(rmb))
    plt.show()
    plt.pause(0.5)
    plt.close()

The learning rate scheduling line in the code above is:

scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)     # set the learning rate decay policy

By setting a breakpoint and stepping into this call, we can inspect its implementation:

class StepLR(_LRScheduler):

    def __init__(self, optimizer, step_size, gamma=0.1, last_epoch=-1):
        self.step_size = step_size
        self.gamma = gamma
        super(StepLR, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        return [base_lr * self.gamma ** (self.last_epoch // self.step_size)
                for base_lr in self.base_lrs]

The debugger lands in the StepLR class, which inherits from _LRScheduler, the scheduler base class described above. Its __init__() stores its own parameters and then initializes the parent class. Let's look at that call:

super(StepLR, self).__init__(optimizer, last_epoch)

Stepping into StepLR's parent class:

class _LRScheduler(object):
    def __init__(self, optimizer, last_epoch=-1):
        if not isinstance(optimizer, Optimizer):
            raise TypeError('{} is not an Optimizer'.format(
                type(optimizer).__name__))
        self.optimizer = optimizer
        if last_epoch == -1:
            for group in optimizer.param_groups:
                group.setdefault('initial_lr', group['lr'])
            last_epoch = 0
        else:
            for i, group in enumerate(optimizer.param_groups):
                if 'initial_lr' not in group:
                    raise KeyError("param 'initial_lr' is not specified "
                                   "in param_groups[{}] when resuming an optimizer".format(i))
        self.base_lrs = list(map(lambda group: group['initial_lr'], optimizer.param_groups))
        self.last_epoch = last_epoch

        # Following https://github.com/pytorch/pytorch/issues/20124
        # We would like to ensure that `lr_scheduler.step()` is called after
        # `optimizer.step()`
        def with_counter(func, opt):
            @wraps(func)
            def wrapper(*args, **kwargs):
                opt._step_count += 1
                return func(*args, **kwargs)
            wrapper._with_counter = True
            return wrapper

        self.optimizer.step = with_counter(self.optimizer.step, self.optimizer)
        self.optimizer._step_count = 0
        self._step_count = 0
        self.step(last_epoch)

    def state_dict(self):
        """Returns the state of the scheduler as a :class:`dict`.

        It contains an entry for every variable in self.__dict__ which
        is not the optimizer.
        """
        return {key: value for key, value in self.__dict__.items() if key != 'optimizer'}

    def load_state_dict(self, state_dict):
        """Loads the schedulers state.

        Arguments:
            state_dict (dict): scheduler state. Should be an object returned
                from a call to :meth:`state_dict`.
        """
        self.__dict__.update(state_dict)

    def get_lr(self):
        raise NotImplementedError

    def step(self, epoch=None):
        # Raise a warning if old pattern is detected
        # https://github.com/pytorch/pytorch/issues/20124
        if self._step_count == 1:
            if not hasattr(self.optimizer.step, "_with_counter"):
                warnings.warn("Seems like `optimizer.step()` has been overridden after learning rate scheduler "
                              "initialization. Please, make sure to call `optimizer.step()` before "
                              "`lr_scheduler.step()`. See more details at "
                              "https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate", UserWarning)

            # Just check if there were two first lr_scheduler.step() calls before optimizer.step()
            elif self.optimizer._step_count < 1:
                warnings.warn("Detected call of `lr_scheduler.step()` before `optimizer.step()`. "
                              "In PyTorch 1.1.0 and later, you should call them in the opposite order: "
                              "`optimizer.step()` before `lr_scheduler.step()`.  Failure to do this "
                              "will result in PyTorch skipping the first value of the learning rate schedule."
                              "See more details at "
                              "https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate", UserWarning)
        self._step_count += 1

        if epoch is None:
            epoch = self.last_epoch + 1
        self.last_epoch = epoch
        for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
            param_group['lr'] = lr

Focus on the following lines:

self.optimizer = optimizer  # the optimizer to associate with the scheduler
self.base_lrs = list(map(lambda group: group['initial_lr'], optimizer.param_groups))
self.last_epoch = last_epoch

base_lrs is a list because the optimizer may hold several learning rates, one per parameter group. The code uses map, which applies the lambda to every element of optimizer.param_groups. optimizer.param_groups is the list of parameter groups; each element of that list is a dict, so the lambda simply pulls the 'initial_lr' value out of each dict, i.e. the initial learning rate of each parameter group.

So self.base_lrs = list(map(lambda group: group['initial_lr'], optimizer.param_groups)) extracts the initial learning rate of every parameter group and stores them as a list in self.base_lrs.

self.last_epoch is used when updating the learning rate. With these attributes in place, a basic scheduler has been constructed.
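
To see concretely what this mapping does, here is a small sketch (the toy tensors and learning rates are made up) that prints the base_lrs a scheduler would collect from an optimizer with two parameter groups:

import torch
import torch.optim as optim

w1 = torch.randn(3, 3, requires_grad=True)
w2 = torch.randn(3, 3, requires_grad=True)

# two parameter groups; param_groups is a list and each group is a dict
optimizer = optim.SGD([{'params': [w1]},
                       {'params': [w2], 'lr': 0.01}], lr=0.1, momentum=0.9)

# mimic what _LRScheduler.__init__ does when last_epoch == -1
for group in optimizer.param_groups:
    group.setdefault('initial_lr', group['lr'])

base_lrs = list(map(lambda group: group['initial_lr'], optimizer.param_groups))
print(base_lrs)   # [0.1, 0.01] -- one initial learning rate per parameter group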

Once a scheduler has been set up, the RMB binary-classification task updates the learning rate after every training epoch with the following line:

scheduler.step()  # update the learning rate

Let's step into step() with the debugger to see how it works; the code is as follows:

    def step(self, epoch=None):
        # Raise a warning if old pattern is detected
        # https://github.com/pytorch/pytorch/issues/20124
        if self._step_count == 1:
            if not hasattr(self.optimizer.step, "_with_counter"):
                warnings.warn("Seems like `optimizer.step()` has been overridden after learning rate scheduler "
                              "initialization. Please, make sure to call `optimizer.step()` before "
                              "`lr_scheduler.step()`. See more details at "
                              "https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate", UserWarning)

            # Just check if there were two first lr_scheduler.step() calls before optimizer.step()
            elif self.optimizer._step_count < 1:
                warnings.warn("Detected call of `lr_scheduler.step()` before `optimizer.step()`. "
                              "In PyTorch 1.1.0 and later, you should call them in the opposite order: "
                              "`optimizer.step()` before `lr_scheduler.step()`.  Failure to do this "
                              "will result in PyTorch skipping the first value of the learning rate schedule."
                              "See more details at "
                              "https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate", UserWarning)
        self._step_count += 1

        if epoch is None:
            epoch = self.last_epoch + 1
        self.last_epoch = epoch
        for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
            param_group['lr'] = lr

The key lines in the code above are:

if epoch is None:
    epoch = self.last_epoch + 1
self.last_epoch = epoch
for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
    param_group['lr'] = lr

The for loop at the end is what actually adjusts the learning rate. self.optimizer.param_groups is a list of parameter groups; each element is a dict, so param_group is a dict and param_group['lr'] is that group's learning rate. The assignment param_group['lr'] = lr writes in the new value. The new values come from self.get_lr(), which computes the learning rate for the next epoch. Let's look at how StepLR's get_lr() does this:

def get_lr(self):
    return [base_lr * self.gamma ** (self.last_epoch // self.step_size)
            for base_lr in self.base_lrs]

get_lr() is the method that step() calls to obtain the new learning rates; each concrete scheduler overrides it with its own decay rule.
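
Putting step() and get_lr() together, the sketch below defines a hypothetical custom scheduler on top of the same base class. The decay rule (halve the learning rate every epoch) is made up purely to show that a subclass only needs to override get_lr():

import torch
import torch.optim as optim
from torch.optim.lr_scheduler import _LRScheduler

class HalveLR(_LRScheduler):
    """Hypothetical scheduler: lr = base_lr * 0.5 ** last_epoch."""
    def get_lr(self):
        return [base_lr * 0.5 ** self.last_epoch for base_lr in self.base_lrs]

w = torch.randn(2, 2, requires_grad=True)
optimizer = optim.SGD([w], lr=0.1)
scheduler = HalveLR(optimizer)

for epoch in range(4):
    optimizer.step()                                  # weight update (no loss here, just the call order)
    scheduler.step()                                  # calls get_lr() and writes the new lr into param_groups
    print(epoch, optimizer.param_groups[0]['lr'])     # 0.05, 0.025, 0.0125, 0.00625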

2. The Six Learning Rate Scheduling Strategies in PyTorch

2.1 StepLR

Function: adjusts the learning rate at equal intervals.
Main parameters

  • step_size: the number of epochs between adjustments;
  • gamma: the decay factor;
    Update rule: lr = lr * gamma
torch.optim.lr_scheduler.StepLR(optimizer, step_size, gamma=0.1, last_epoch=-1)

Let's look at StepLR in action:

import torch
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
torch.manual_seed(1)

LR = 0.1  # initial learning rate
iteration = 10
max_epoch = 200  
# ------------------------------ fake data and optimizer  ------------------------------

weights = torch.randn((1), requires_grad=True)  # the parameter to be updated
target = torch.zeros((1))

optimizer = optim.SGD([weights], lr=LR, momentum=0.9)  # build a dummy optimizer

# ------------------------------ 1 Step LR ------------------------------
# flag = 0
flag = 1
if flag:

    scheduler_lr = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.1)  # set the learning rate decay policy

    lr_list, epoch_list = list(), list()
    for epoch in range(max_epoch):

        lr_list.append(scheduler_lr.get_lr())
        epoch_list.append(epoch)

        for i in range(iteration):

            loss = torch.pow((weights - target), 2)
            loss.backward()

            optimizer.step()  # weight update
            optimizer.zero_grad()  # zero the gradients

        scheduler_lr.step()

    plt.plot(epoch_list, lr_list, label="Step LR Scheduler")
    plt.xlabel("Epoch")
    plt.ylabel("Learning rate")
    plt.legend()
    plt.show()

The output is:

[Figure: learning rate vs. epoch for the StepLR schedule (step_size=50, gamma=0.1)]

As the plot shows, every 50 epochs the learning rate drops by 90%, i.e. it is multiplied by gamma = 0.1.
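
The same values can be reproduced from the closed form in StepLR.get_lr() shown earlier; a quick check, assuming base_lr = 0.1:

base_lr, gamma, step_size = 0.1, 0.1, 50
for epoch in (0, 49, 50, 99, 100, 150):
    print(epoch, round(base_lr * gamma ** (epoch // step_size), 6))
# epochs 0-49 -> 0.1, epochs 50-99 -> 0.01, epochs 100-149 -> 0.001, epoch 150 -> 0.0001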

2.2 MultiStepLR

Function: adjusts the learning rate at user-specified milestones.
Main parameters

  • milestones: the epochs at which to adjust the learning rate;
  • gamma: the decay factor;
    Update rule: lr = lr * gamma
lr_scheduler.MultiStepLR(optimizer, milestones, gamma=0.1, last_epoch=-1)

Unlike StepLR, MultiStepLR lets you choose the adjustment points yourself through the milestones argument. The code below shows how:

    milestones = [50, 125, 160]
    scheduler_lr = optim.lr_scheduler.MultiStepLR(optimizer, milestones=milestones, gamma=0.1)

    lr_list, epoch_list = list(), list()
    for epoch in range(max_epoch):

        lr_list.append(scheduler_lr.get_lr())
        epoch_list.append(epoch)

        for i in range(iteration):

            loss = torch.pow((weights - target), 2)
            loss.backward()

            optimizer.step()
            optimizer.zero_grad()

        scheduler_lr.step()

    plt.plot(epoch_list, lr_list, label="Multi Step LR Scheduler\nmilestones:{}".format(milestones))
    plt.xlabel("Epoch")
    plt.ylabel("Learning rate")
    plt.legend()
    plt.show()

The output is:

[Figure: learning rate vs. epoch for the MultiStepLR schedule, with drops at the milestones 50, 125 and 160]

2.3 ExponentialLR

Function: decays the learning rate exponentially.
Main parameters

  • gamma: the base of the exponential;
    Update rule: lr = base_lr * gamma ** epoch
lr_scheduler.ExponentialLR(optimizer, gamma, last_epoch=-1)

Let's look at the method in code:

    gamma = 0.95
    scheduler_lr = optim.lr_scheduler.ExponentialLR(optimizer, gamma=gamma)

    lr_list, epoch_list = list(), list()
    for epoch in range(max_epoch):

        lr_list.append(scheduler_lr.get_lr())
        epoch_list.append(epoch)

        for i in range(iteration):

            loss = torch.pow((weights - target), 2)
            loss.backward()

            optimizer.step()
            optimizer.zero_grad()

        scheduler_lr.step()

    plt.plot(epoch_list, lr_list, label="Exponential LR Scheduler\ngamma:{}".format(gamma))
    plt.xlabel("Epoch")
    plt.ylabel("Learning rate")
    plt.legend()
    plt.show()

The output is:

[Figure: learning rate vs. epoch for the ExponentialLR schedule with gamma = 0.95]

2.4 CosineAnnealingLR

Function: adjusts the learning rate along a cosine cycle.
Main parameters

  • T_max: the descent period, i.e. the number of epochs from the maximum to the minimum learning rate;
  • eta_min: the minimum learning rate;
    Update rule: $\eta_{t}=\eta_{\min }+\frac{1}{2}\left(\eta_{\max }-\eta_{\min }\right)\left(1+\cos \left(\frac{T_{cur}}{T_{\max }} \pi\right)\right)$ (evaluated numerically in the sketch after this list)
lr_scheduler.CosineAnnealingLR(optimizer, T_max, eta_min=0, last_epoch=-1)
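
Before plugging it into a training loop, the formula can be evaluated directly; a small sketch assuming eta_max = 0.1 (the initial learning rate), eta_min = 0 and T_max = 50:

import math

eta_max, eta_min, t_max = 0.1, 0.0, 50

def cosine_lr(t_cur):
    # eta_t = eta_min + 0.5 * (eta_max - eta_min) * (1 + cos(pi * t_cur / t_max))
    return eta_min + 0.5 * (eta_max - eta_min) * (1 + math.cos(math.pi * t_cur / t_max))

for t in (0, 25, 50, 75, 100):
    print(t, round(cosine_lr(t), 4))
# 0 -> 0.1 (maximum), 25 -> 0.05, 50 -> 0.0 (minimum), 75 -> 0.05, 100 -> 0.1 (rises back up)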

Let's look at the method in code:

    t_max = 50
    scheduler_lr = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=t_max, eta_min=0.)

    lr_list, epoch_list = list(), list()
    for epoch in range(max_epoch):

        lr_list.append(scheduler_lr.get_lr())
        epoch_list.append(epoch)

        for i in range(iteration):

            loss = torch.pow((weights - target), 2)
            loss.backward()

            optimizer.step()
            optimizer.zero_grad()

        scheduler_lr.step()

    plt.plot(epoch_list, lr_list, label="CosineAnnealingLR Scheduler\nT_max:{}".format(t_max))
    plt.xlabel("Epoch")
    plt.ylabel("Learning rate")
    plt.legend()
    plt.show()

The output is:

[Figure: learning rate vs. epoch for the CosineAnnealingLR schedule with T_max = 50]

2.5 ReduceLROnPlateau

Function: monitors a chosen metric and reduces the learning rate when that metric stops improving, e.g. when the loss stops decreasing, or when the classification accuracy stops increasing.
Main parameters

  • mode: 'min' or 'max'; in 'min' mode the metric is expected to decrease and the learning rate is reduced when it stops decreasing, in 'max' mode the metric is expected to increase and the learning rate is reduced when it stops increasing;
  • factor: the decay factor, playing the role of gamma in the schedulers above;
  • patience: how many epochs with no improvement are tolerated before the learning rate is reduced;
  • cooldown: how many epochs monitoring is suspended after a reduction, before it resumes;
  • verbose: whether to print a message when the learning rate is reduced;
  • min_lr: the lower bound on the learning rate;
  • eps: the minimum amount by which the learning rate is decayed;
lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, verbose=False, threshold=0.0001, threshold_mode='rel', cooldown=0, min_lr=0, eps=1e-08)

Let's look at the method in code:

    loss_value = 0.5
    accuracy = 0.9

    factor = 0.1
    mode = "min"
    patience = 10
    cooldown = 10
    min_lr = 1e-4
    verbose = True

    scheduler_lr = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=factor, mode=mode, patience=patience,
                                                        cooldown=cooldown, min_lr=min_lr, verbose=verbose)

    for epoch in range(max_epoch):
        for i in range(iteration):

            # train(...)

            optimizer.step()
            optimizer.zero_grad()

        if epoch == 5:
            loss_value = 0.4

        scheduler_lr.step(loss_value)

The corresponding output is:

Epoch    16: reducing learning rate of group 0 to 1.0000e-02.
Epoch    37: reducing learning rate of group 0 to 1.0000e-03.
Epoch    58: reducing learning rate of group 0 to 1.0000e-04.

The loss last improves at epoch 5 (it drops to 0.4). After patience = 10 epochs without further improvement, the learning rate is multiplied by factor = 0.1 at epoch 16; each later reduction follows the 10-epoch cooldown plus another 10 epochs of patience, giving epochs 37 and 58.

2.6 LambdaLR

Function: a fully custom adjustment policy; different parameter groups can follow different learning rate rules.
Main parameters

  • lr_lambda: a function or a list of functions; if a list, each element must be a function (one per parameter group);
lr_scheduler.LambdaLR(optimizer, lr_lambda, last_epoch=-1)

The code below shows how it works:

    lr_init = 0.1

    weights_1 = torch.randn((6, 3, 5, 5))
    weights_2 = torch.ones((5, 5))

    optimizer = optim.SGD([
        {'params': [weights_1]},
        {'params': [weights_2]}], lr=lr_init)

    lambda1 = lambda epoch: 0.1 ** (epoch // 20)
    lambda2 = lambda epoch: 0.95 ** epoch

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=[lambda1, lambda2])

    lr_list, epoch_list = list(), list()
    for epoch in range(max_epoch):
        for i in range(iteration):

            # train(...)

            optimizer.step()
            optimizer.zero_grad()

        scheduler.step()

        lr_list.append(scheduler.get_lr())
        epoch_list.append(epoch)

        print('epoch:{:5d}, lr:{}'.format(epoch, scheduler.get_lr()))

    plt.plot(epoch_list, [i[0] for i in lr_list], label="lambda 1")
    plt.plot(epoch_list, [i[1] for i in lr_list], label="lambda 2")
    plt.xlabel("Epoch")
    plt.ylabel("Learning Rate")
    plt.title("LambdaLR")
    plt.legend()
    plt.show()

The output is:

[Figure: learning rate vs. epoch for the two parameter groups under LambdaLR; lambda1 steps down by a factor of 10 every 20 epochs, lambda2 decays by 0.95 per epoch]

3. Summary of Learning Rate Scheduling

  1. Ordered schedules: StepLR, MultiStepLR, ExponentialLR and CosineAnnealingLR;
  2. Adaptive schedules: ReduceLROnPlateau, which adjusts the learning rate when the monitored metric stops decreasing (or stops increasing);
  3. Custom schedules: LambdaLR, useful when fine-tuning a model or when different parameter groups should follow different learning rate rules;

Learning rate initialization

  1. Start with a small value: 0.01 / 0.001 / 0.0001
  2. Search for the largest usable learning rate: see "Cyclical Learning Rates for Training Neural Networks" (a sketch of that LR range test follows this list).
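
The idea behind that paper's learning rate range test can be sketched as follows: raise the learning rate a little every iteration, record the loss, and pick a value somewhat below the point where the loss starts to blow up. Everything here (the toy model, the random data, the growth factor of 1.2) is an illustrative assumption, not the paper's code:

import torch
import torch.nn as nn
import torch.optim as optim

model = nn.Linear(10, 2)                        # toy model, illustration only
criterion = nn.CrossEntropyLoss()
lr = 1e-5
optimizer = optim.SGD(model.parameters(), lr=lr)

lrs, losses = [], []
for it in range(100):
    inputs = torch.randn(32, 10)
    labels = torch.randint(0, 2, (32,))
    optimizer.zero_grad()
    loss = criterion(model(inputs), labels)
    loss.backward()
    optimizer.step()

    lrs.append(lr)
    losses.append(loss.item())

    lr *= 1.2                                   # exponentially increase the lr each iteration
    for group in optimizer.param_groups:
        group['lr'] = lr

# plot lrs against losses (e.g. plt.plot(lrs, losses)) and choose an lr below the blow-up point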