前言
本節繼續學習深度卷積神經網絡
- 批量歸一化
- ResNet
- DenseNet
1、批量歸一化
- 對深層神經網絡來說,即使輸入數據已做標準化,訓練中模型參數的更新依然很容易造成靠近輸出層輸出的劇烈變化
- 批量歸一化利用小批量上的均值和標準差,不斷調整神經網絡中間輸出,從而使整個神經網絡在各層的中間輸出的數值更穩定
import d2lzh as d2l
from mxnet import autograd, gluon, init, nd
from mxnet.gluon import nn
"""實現批量歸一化"""
def batch_norm(X, gamma, beta, moving_mean, moving_var, eps, momentum):
# 通過autograd來判斷當前模式是訓練模式還是預測模式
if not autograd.is_training():
# 如果是在預測模式下,直接使用傳入的移動平均所得的均值和方差
X_hat = (X - moving_mean) / nd.sqrt(moving_var + eps)
else:
assert len(X.shape) in (2, 4)
if len(X.shape) == 2:
# 使用全連接層的情況,計算特徵維上的均值和方差
mean = X.mean(axis=0)
var = ((X - mean) ** 2).mean(axis=0)
else:
# 使用二維卷積層的情況,計算通道維上(axis=1)的均值和方差
# 這裏保持X的形狀以便後面可以做廣播運算
mean = X.mean(axis=(0, 2, 3), keepdims=True)
var = ((X - mean) ** 2).mean(axis=(0, 2, 3), keepdims=True)
# 訓練模式下用當前的均值和方差做標準化
X_hat = (X - mean) / nd.sqrt(var + eps)
# 更新移動平均的均值和方差
moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
moving_var = momentum * moving_var + (1.0 - momentum) * var
Y = gamma * X_hat + beta # 拉伸和偏移
return Y, moving_mean, moving_var
# The BatchNorm layer holds the learnable scale (gamma) and shift (beta)
# parameters, and also maintains the running mean/variance that are used
# when the model is in prediction mode.
class BatchNorm(nn.Block):
    """Batch-normalization layer built on the scratch `batch_norm` op."""

    def __init__(self, num_features, num_dims, **kwargs):
        super(BatchNorm, self).__init__(**kwargs)
        # Parameter shape: (1, C) for dense inputs, (1, C, 1, 1) for conv.
        shape = (1, num_features) if num_dims == 2 else (1, num_features, 1, 1)
        # Learnable affine parameters, initialized to the identity transform.
        self.gamma = self.params.get('gamma', shape=shape, init=init.One())
        self.beta = self.params.get('beta', shape=shape, init=init.Zero())
        # Running statistics: plain NDArrays on CPU memory, not trained.
        self.moving_mean = nd.zeros(shape)
        self.moving_var = nd.zeros(shape)

    def forward(self, X):
        # If X lives on another device (e.g. GPU), move the running
        # buffers to the same context first.
        if self.moving_mean.context != X.context:
            self.moving_mean = self.moving_mean.copyto(X.context)
            self.moving_var = self.moving_var.copyto(X.context)
        # batch_norm returns the updated running stats; store them back.
        Y, self.moving_mean, self.moving_var = batch_norm(
            X, self.gamma.data(), self.beta.data(), self.moving_mean,
            self.moving_var, eps=1e-5, momentum=0.9)
        return Y
2、ResNet
殘差網絡
基礎模塊是殘差塊
殘差塊
如圖所示
- 有2個有相同輸出通道數的3 * 3卷積層
- 每個卷積層後接一個批量歸一化層和ReLU激活函數
- 如果想改變通道數,就需要引入一個額外的1*1卷積層來將輸入變換成需要的形狀後再做相加運算
import d2lzh as d2l
from mxnet import gluon, init, nd
from mxnet.gluon import nn
# Residual block
class Residual(nn.Block):  # This class is saved in the d2lzh package for later use
    """ResNet residual block: two 3x3 convs with BN/ReLU plus a skip path.

    When `use_1x1conv` is True, a strided 1x1 convolution transforms the
    input so its shape matches the main branch before the addition.
    """

    def __init__(self, num_channels, use_1x1conv=False, strides=1, **kwargs):
        super(Residual, self).__init__(**kwargs)
        self.conv1 = nn.Conv2D(num_channels, kernel_size=3, padding=1,
                               strides=strides)
        self.conv2 = nn.Conv2D(num_channels, kernel_size=3, padding=1)
        # Optional 1x1 conv on the shortcut to match channels/stride.
        self.conv3 = (nn.Conv2D(num_channels, kernel_size=1, strides=strides)
                      if use_1x1conv else None)
        self.bn1 = nn.BatchNorm()
        self.bn2 = nn.BatchNorm()

    def forward(self, X):
        out = nd.relu(self.bn1(self.conv1(X)))
        out = self.bn2(self.conv2(out))
        shortcut = self.conv3(X) if self.conv3 else X
        return nd.relu(out + shortcut)
# Check the case where input and output shapes stay the same.
blk = Residual(3)
blk.initialize()
X = nd.random.uniform(shape=(4, 3, 6, 6))
blk(X).shape
# Increase the number of output channels while halving height and width.
blk = Residual(6, use_1x1conv=True, strides=2)
blk.initialize()
blk(X).shape
ResNet模型
- 前兩層在輸出通道數爲64、步幅爲2的7 * 7卷積層後接步幅爲2的3 * 3的最大池化層,每個卷積層後是批量歸一化層
- 後面是4個由殘差塊組成的模塊,第一個模塊的通道數同輸入通道數一致,之後的每個模塊在第一個殘差塊裏將上一個模塊的通道數翻倍,並將高和寬減半
import d2lzh as d2l
from mxnet import gluon, init, nd
from mxnet.gluon import nn
"""實現ResNet"""
# ResNet model
net = nn.Sequential()
# First two layers: a 64-channel stride-2 7x7 conv with batch norm and
# ReLU, followed by a stride-2 3x3 max pooling layer.
net.add(nn.Conv2D(64, kernel_size=7, strides=2, padding=3),
        nn.BatchNorm(), nn.Activation('relu'),
        nn.MaxPool2D(pool_size=3, strides=2, padding=1))
# The subsequent modules
def resnet_block(num_channels, num_residuals, first_block=False):
    """Stack `num_residuals` residual blocks with `num_channels` channels.

    Except when this is the network's first module, the leading block
    halves height/width and changes channels via a strided 1x1 shortcut.
    """
    seq = nn.Sequential()
    for idx in range(num_residuals):
        if idx == 0 and not first_block:
            seq.add(Residual(num_channels, use_1x1conv=True, strides=2))
        else:
            seq.add(Residual(num_channels))
    return seq
# Four modules of two residual blocks each; channels double and the
# spatial size halves at the start of modules 2-4.
net.add(resnet_block(64, 2, first_block=True),
        resnet_block(128, 2),
        resnet_block(256, 2),
        resnet_block(512, 2))
# Global average pooling followed by the fully connected output layer.
net.add(nn.GlobalAvgPool2D(), nn.Dense(10))
# Inspect how the shape changes through each layer.
X = nd.random.uniform(shape=(1, 1, 224, 224))
net.initialize()
for layer in net:
    X = layer(X)
    print(layer.name, 'output shape:\t', X.shape)
# Data and training
lr, num_epochs, batch_size, ctx = 0.05, 5, 256, d2l.try_gpu()
net.initialize(force_reinit=True, ctx=ctx, init=init.Xavier())
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=96)
d2l.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx, num_epochs)
3、DenseNet
稠密連接網絡
- 是ResNet的延伸
- 主要區別如圖所示,模塊不是相加,而是在通道維上連結
稠密塊
- 是DenseNet的基礎模塊
- 也是網絡名稱的由來
import d2lzh as d2l
from mxnet import gluon, init, nd
from mxnet.gluon import nn
# The modified "batch norm -> activation -> convolution" structure
def conv_block(num_channels):
    """DenseNet conv unit: BN -> ReLU -> 3x3 conv (pre-activation order)."""
    unit = nn.Sequential()
    unit.add(nn.BatchNorm(),
             nn.Activation('relu'),
             nn.Conv2D(num_channels, kernel_size=3, padding=1))
    return unit
# Dense block
class DenseBlock(nn.Block):
    """Dense block: each conv_block's output is concatenated to its input.

    After `num_convs` blocks the channel count has grown by
    num_convs * num_channels.
    """

    def __init__(self, num_convs, num_channels, **kwargs):
        super(DenseBlock, self).__init__(**kwargs)
        # A dense block is just a chain of conv_blocks.
        self.net = nn.Sequential()
        for _ in range(num_convs):
            self.net.add(conv_block(num_channels))

    def forward(self, X):
        for layer in self.net:
            out = layer(X)
            # Concatenate input and output along the channel dimension.
            X = nd.concat(X, out, dim=1)
        return X
# Sanity check: a dense block with 2 conv blocks of 10 output channels each.
# The conv block's channel count controls how much the output grows relative
# to the input, so it is called the growth rate.
blk = DenseBlock(2, 10)
blk.initialize()
X = nd.random.uniform(shape=(4, 3, 8, 8))
Y = blk(X)
print(Y.shape)  # channels: 3 + 2 * 10 = 23
過渡層
- 控制模型複雜度
- 通過1 * 1卷積層來減小通道數
- 使用步幅爲2的平均池化層減半高和寬,從而進一步降低模型複雜度
def transition_block(num_channels):
    """Transition layer: 1x1 conv shrinks channels, 2x2 avg-pool halves H/W."""
    trans = nn.Sequential()
    trans.add(nn.BatchNorm(),
              nn.Activation('relu'),
              nn.Conv2D(num_channels, kernel_size=1),  # reduce channel count
              nn.AvgPool2D(pool_size=2, strides=2))    # halve height and width
    return trans
# Apply a 10-channel transition block to the previous dense block's output.
blk = transition_block(10)
blk.initialize()
print(blk(Y).shape)
DenseNet模型
- 同ResNet一樣的單卷積層和最大池化層
- 多個稠密塊和過渡層,控制卷積層數和通道數
- 最後接上全局池化層和全連接層
# DenseNet model
net = nn.Sequential()
# Same stem as ResNet: a single conv layer plus max pooling.
net.add(nn.Conv2D(64, kernel_size=7, strides=2, padding=3),
        nn.BatchNorm(), nn.Activation('relu'),
        nn.MaxPool2D(pool_size=3, strides=2, padding=1))
# Four dense blocks
num_channels, growth_rate = 64, 32  # num_channels tracks the current channel count
num_convs_in_dense_blocks = [4, 4, 4, 4]  # conv layers per dense block, 4 each here
for i, num_convs in enumerate(num_convs_in_dense_blocks):
    net.add(DenseBlock(num_convs, growth_rate))
    # Output channel count of the dense block just added
    num_channels += num_convs * growth_rate
    # Between dense blocks, insert a transition layer that halves the
    # height/width and halves the channel count.
    if i != len(num_convs_in_dense_blocks) - 1:
        num_channels //= 2
        net.add(transition_block(num_channels))
# As in ResNet, finish with global pooling and a fully connected output layer.
net.add(nn.BatchNorm(), nn.Activation('relu'), nn.GlobalAvgPool2D(),
        nn.Dense(10))
# Data and training
lr, num_epochs, batch_size, ctx = 0.1, 5, 256, d2l.try_gpu()
net.initialize(ctx=ctx, init=init.Xavier())
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=96)
d2l.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx, num_epochs)
結語
簡單瞭解了穩定模型數值的批量歸一化和延伸出來的ResNet、DenseNet