1、批量歸一化(BatchNormalization)
1.1、對輸入的標準化(漸層模型)
處理後的任意一個特徵在數據集中所有樣本上的均值爲0、標準差爲1。
標準化處理輸入數據使各個特徵的分佈相近
1.2、批量歸一化(深度模型)
利用小批量上的均值和標準差,不斷調整神經網絡中間輸出,從而使整個神經網絡在各層的中間輸出的數值更穩定。
1.3、對全連接層做批量歸一化
位置:全連接層中的仿射變換和激活函數之間.
全連接是對batch上對應的單個樣本值進行標準化。所以輸出的維度爲下樣本.shape
1.4、卷積層做批量歸一化
位置:卷積計算之後、應⽤激活函數之前。
如果卷積計算輸出多個通道,我們需要對這些通道的輸出分別做批量歸一化,且每個通道都擁有獨立的拉伸和偏移參數。
計算:對單通道,batchsize=m,卷積計算輸出=pxq 對該通道中m×p×q個元素同時做批量歸一化,使用相同的均值和方差。輸出爲mx1
1.5、實現代碼
import time
import torch
from torch import nn, optim
import torch.nn.functional as F
import torchvision
import sys
sys.path.append("/home/kesci/input/")
import d2lzh1981 as d2l
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def batch_norm(is_training, X, gamma, beta, moving_mean, moving_var, eps, momentum):
# 判斷當前模式是訓練模式還是預測模式
if not is_training:
# 如果是在預測模式下,直接使用傳入的移動平均所得的均值和方差
X_hat = (X - moving_mean) / torch.sqrt(moving_var + eps)
else:
assert len(X.shape) in (2, 4)
if len(X.shape) == 2:
# 使用全連接層的情況,計算特徵維上的均值和方差
mean = X.mean(dim=0)
var = ((X - mean) ** 2).mean(dim=0)
else:
# 使用二維卷積層的情況,計算通道維上(axis=1)的均值和方差。這裏我們需要保持
# X的形狀以便後面可以做廣播運算
mean = X.mean(dim=0, keepdim=True).mean(dim=2, keepdim=True).mean(dim=3, keepdim=True)
var = ((X - mean) ** 2).mean(dim=0, keepdim=True).mean(dim=2, keepdim=True).mean(dim=3, keepdim=True)
# 訓練模式下用當前的均值和方差做標準化
X_hat = (X - mean) / torch.sqrt(var + eps)
# 更新移動平均的均值和方差
moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
moving_var = momentum * moving_var + (1.0 - momentum) * var
Y = gamma * X_hat + beta # 拉伸和偏移
return Y, moving_mean, moving_var
class BatchNorm(nn.Module):
def __init__(self, num_features, num_dims):
super(BatchNorm, self).__init__()
if num_dims == 2:
shape = (1, num_features) #全連接層輸出神經元
else:
shape = (1, num_features, 1, 1) #通道數
# 參與求梯度和迭代的拉伸和偏移參數,分別初始化成0和1
self.gamma = nn.Parameter(torch.ones(shape))
self.beta = nn.Parameter(torch.zeros(shape))
# 不參與求梯度和迭代的變量,全在內存上初始化成0
self.moving_mean = torch.zeros(shape)
self.moving_var = torch.zeros(shape)
def forward(self, X):
# 如果X不在內存上,將moving_mean和moving_var複製到X所在顯存上
if self.moving_mean.device != X.device:
self.moving_mean = self.moving_mean.to(X.device)
self.moving_var = self.moving_var.to(X.device)
# 保存更新過的moving_mean和moving_var, Module實例的traning屬性默認爲true, 調用.eval()後設成false
Y, self.moving_mean, self.moving_var = batch_norm(self.training,
X, self.gamma, self.beta, self.moving_mean,
self.moving_var, eps=1e-5, momentum=0.9)
return Y
2、殘差網絡(ResNet)
深度學習的問題:深度CNN網絡達到一定深度後再一味地增加層數並不能帶來進一步地分類性能提高,反而會招致網絡收斂變得更慢,準確率也變得更差。
2.1、殘差塊(Residual Block)
恆等映射:
左邊:f(x)=x
右邊:f(x)-x=0 (易於捕捉恆等映射的細微波動)
在殘差塊中,輸⼊可通過跨層的數據線路更快 地向前傳播。
class Residual(nn.Module): # 本類已保存在d2lzh_pytorch包中方便以後使用
#可以設定輸出通道數、是否使用額外的1x1卷積層來修改通道數以及卷積層的步幅。
def __init__(self, in_channels, out_channels, use_1x1conv=False, stride=1):
super(Residual, self).__init__()
self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, stride=stride)
self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
if use_1x1conv: # 使用1x1卷積核改變通道數
self.conv3 = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride)
else:
self.conv3 = None
self.bn1 = nn.BatchNorm2d(out_channels)
self.bn2 = nn.BatchNorm2d(out_channels)
def forward(self, X):
Y = F.relu(self.bn1(self.conv1(X)))
Y = self.bn2(self.conv2(Y))
if self.conv3:
X = self.conv3(X)
return F.relu(Y + X)
3、ResNet模型
卷積(64,7x7,3)
批量一體化
最大池化(3x3,2)
殘差塊x4 (通過步幅爲2的殘差塊在每個模塊之間減小高和寬)
全局平均池化
全連接
#定義網絡結構
net = nn.Sequential(
nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
nn.BatchNorm2d(64),
nn.ReLU(),
nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
def resnet_block(in_channels, out_channels, num_residuals, first_block=False):
if first_block:
assert in_channels == out_channels # 第一個模塊的通道數同輸入通道數一致
blk = []
for i in range(num_residuals):
if i == 0 and not first_block:
blk.append(Residual(in_channels, out_channels, use_1x1conv=True, stride=2))
else:
blk.append(Residual(out_channels, out_channels))
return nn.Sequential(*blk)
net.add_module("resnet_block1", resnet_block(64, 64, 2, first_block=True))
net.add_module("resnet_block2", resnet_block(64, 128, 2))
net.add_module("resnet_block3", resnet_block(128, 256, 2))
net.add_module("resnet_block4", resnet_block(256, 512, 2))
net.add_module("global_avg_pool", d2l.GlobalAvgPool2d()) # GlobalAvgPool2d的輸出: (Batch, 512, 1, 1)
net.add_module("fc", nn.Sequential(d2l.FlattenLayer(), nn.Linear(512, 10)))
#網絡結構檢驗
X = torch.rand((1, 1, 224, 224))
for name, layer in net.named_children():
X = layer(X)
print(name, ' output shape:\t', X.shape)
4、稠密連接網絡(DenseNet)
主要構建模塊:
稠密塊(dense block): 定義了輸入和輸出是如何連結的。
過渡層(transition layer):用來控制通道數,使之不過大。
4.1、稠密塊
def conv_block(in_channels, out_channels):
blk = nn.Sequential(nn.BatchNorm2d(in_channels),
nn.ReLU(),
nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1))
return blk
class DenseBlock(nn.Module):
def __init__(self, num_convs, in_channels, out_channels):
super(DenseBlock, self).__init__()
net = []
for i in range(num_convs):
in_c = in_channels + i * out_channels
net.append(conv_block(in_c, out_channels))
self.net = nn.ModuleList(net)
self.out_channels = in_channels + num_convs * out_channels # 計算輸出通道數
def forward(self, X):
for blk in self.net:
Y = blk(X)
X = torch.cat((X, Y), dim=1) # 在通道維上將輸入和輸出連結
return X
blk = DenseBlock(2, 3, 10)
X = torch.rand(4, 3, 8, 8)
Y = blk(X)
Y.shape # torch.Size([4, 23, 8, 8])
4.2、過渡層
1×1 卷積層:來減小通道數
步幅爲2的平均池化層:減半高和寬
def transition_block(in_channels, out_channels):
blk = nn.Sequential(
nn.BatchNorm2d(in_channels),
nn.ReLU(),
nn.Conv2d(in_channels, out_channels, kernel_size=1),
nn.AvgPool2d(kernel_size=2, stride=2))
return blk
blk = transition_block(23, 10)
blk(Y).shape # torch.Size([4, 10, 4, 4])
4.3、DenseNet模型
net = nn.Sequential(
nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
nn.BatchNorm2d(64),
nn.ReLU(),
nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
num_channels, growth_rate = 64, 32 # num_channels爲當前的通道數
num_convs_in_dense_blocks = [4, 4, 4, 4]
for i, num_convs in enumerate(num_convs_in_dense_blocks):
DB = DenseBlock(num_convs, num_channels, growth_rate)
net.add_module("DenseBlosk_%d" % i, DB)
# 上一個稠密塊的輸出通道數
num_channels = DB.out_channels
# 在稠密塊之間加入通道數減半的過渡層
if i != len(num_convs_in_dense_blocks) - 1:
net.add_module("transition_block_%d" % i, transition_block(num_channels, num_channels // 2))
num_channels = num_channels // 2
net.add_module("BN", nn.BatchNorm2d(num_channels))
net.add_module("relu", nn.ReLU())
net.add_module("global_avg_pool", d2l.GlobalAvgPool2d()) # GlobalAvgPool2d的輸出: (Batch, num_channels, 1, 1)
net.add_module("fc", nn.Sequential(d2l.FlattenLayer(), nn.Linear(num_channels, 10)))
X = torch.rand((1, 1, 96, 96))
for name, layer in net.named_children():
X = layer(X)
print(name, ' output shape:\t', X.shape)