前言
本節學習幾種深度卷積神經網絡
- AlexNet
- VGG
- NiN
- GoogLeNet
1、AlexNet
- 2012年由Alex Krizhevsky提出
- 以很大的優勢贏得了ImageNet 2012圖像識別挑戰賽
- 有5層卷積和2層全連接隱藏層,以及1個全連接輸出層
卷積層
- 第一層中的卷積窗口形狀是11 * 11
- 第二層中的卷積窗口形狀減小到5 * 5
- 之後全採用3 * 3
- 第一、第二和第五個卷積層之後都使用了窗口形狀爲3 * 3、步幅爲2的最大池化層。
全連接層
- 兩個輸出個數爲4096
激活函數用ReLU
一個簡單實現如下
import d2lzh as d2l
from mxnet import gluon, init, nd
from mxnet.gluon import data as gdata, nn
import os
import sys
"""簡單實現alexnet"""
# AlexNet model, assembled stage by stage.
net = nn.Sequential()

# Stage 1: a large 11 x 11 window to capture objects, with stride 4 to shrink
# the output height and width sharply. The number of output channels is also
# much larger than in LeNet.
net.add(nn.Conv2D(96, kernel_size=11, strides=4, activation='relu'))
net.add(nn.MaxPool2D(pool_size=3, strides=2))

# Stage 2: a smaller window; padding of 2 keeps the height and width of the
# output equal to those of the input, while the channel count grows.
net.add(nn.Conv2D(256, kernel_size=5, padding=2, activation='relu'))
net.add(nn.MaxPool2D(pool_size=3, strides=2))

# Stage 3: three consecutive convolutions with an even smaller window. All but
# the last one increase the channel count further. No pooling follows the
# first two, so height and width are only reduced after the third.
net.add(nn.Conv2D(384, kernel_size=3, padding=1, activation='relu'))
net.add(nn.Conv2D(384, kernel_size=3, padding=1, activation='relu'))
net.add(nn.Conv2D(256, kernel_size=3, padding=1, activation='relu'))
net.add(nn.MaxPool2D(pool_size=3, strides=2))

# Classifier: the dense layers are several times wider than in LeNet, and
# dropout is used to mitigate overfitting.
net.add(nn.Dense(4096, activation="relu"))
net.add(nn.Dropout(0.5))
net.add(nn.Dense(4096, activation="relu"))
net.add(nn.Dropout(0.5))

# Output layer: 10 classes because Fashion-MNIST is used here, rather than the
# 1000 classes of the paper.
net.add(nn.Dense(10))

# Push one single-channel 224 x 224 sample through the network and report the
# output shape after every layer.
X = nd.random.uniform(shape=(1, 1, 224, 224))
net.initialize()
for layer in net:
    X = layer(X)
    print(layer.name, 'output shape:\t', X.shape)
# 讀取數據
def load_data_fashion_mnist(batch_size, resize=None,
                            root=os.path.join('~', '.mxnet', 'datasets',
                                              'fashion-mnist')):
    """Create the training and test data loaders for Fashion-MNIST.

    Args:
        batch_size: Number of samples per mini-batch for both loaders.
        resize: Optional target size handed to the ``Resize`` transform. When
            falsy, images are not resized (e.g. pass 224 for AlexNet input).
        root: Dataset directory; a leading ``~`` is expanded to the user's
            home directory.

    Returns:
        A ``(train_iter, test_iter)`` tuple of ``DataLoader`` objects. Only the
        training loader shuffles its samples.
    """
    dataset_dir = os.path.expanduser(root)

    # Optional resize first, then conversion to tensor.
    transforms = gdata.vision.transforms
    steps = [transforms.Resize(resize)] if resize else []
    steps.append(transforms.ToTensor())
    pipeline = transforms.Compose(steps)

    train_set = gdata.vision.FashionMNIST(root=dataset_dir, train=True)
    test_set = gdata.vision.FashionMNIST(root=dataset_dir, train=False)

    # No extra worker processes on win32; four everywhere else.
    if sys.platform.startswith('win32'):
        num_workers = 0
    else:
        num_workers = 4

    train_iter = gdata.DataLoader(train_set.transform_first(pipeline),
                                  batch_size, shuffle=True,
                                  num_workers=num_workers)
    test_iter = gdata.DataLoader(test_set.transform_first(pipeline),
                                 batch_size, shuffle=False,
                                 num_workers=num_workers)
    return train_iter, test_iter
# If an "out of memory" error appears, reduce batch_size or resize.
batch_size = 128
train_iter, test_iter = load_data_fashion_mnist(batch_size, resize=224)

# Training: re-initialise the parameters with Xavier on the chosen device and
# optimise with plain SGD.
lr = 0.01
num_epochs = 5
ctx = d2l.try_gpu()
net.initialize(force_reinit=True, ctx=ctx, init=init.Xavier())
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
d2l.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx,
              num_epochs)
2、VGG
- Visual Geometry Group實驗室提出
- 通過重複使用簡單的基礎塊來構建深度模型
- 連續使用數個相同的填充爲1、窗口形狀爲3 * 3的卷積層後接上一個步幅爲2、窗口形狀爲2 * 2的最大池化層
import d2lzh as d2l
from mxnet import gluon, init, nd
from mxnet.gluon import nn
"""實現VGG"""
# 基礎模塊
def vgg_block(num_convs, num_channels):
    """Build one VGG block.

    The block stacks ``num_convs`` identical 3 x 3 convolutions (padding 1,
    ReLU activation) and finishes with a 2 x 2 max-pooling layer of stride 2.

    Args:
        num_convs: Number of convolutional layers in the block.
        num_channels: Output channels of every convolution in the block.

    Returns:
        An ``nn.Sequential`` holding the convolutions followed by the pooling
        layer.
    """
    blk = nn.Sequential()
    conv_layers = [
        nn.Conv2D(num_channels, kernel_size=3, padding=1, activation='relu')
        for _ in range(num_convs)
    ]
    blk.add(*conv_layers, nn.MaxPool2D(pool_size=2, strides=2))
    return blk
# VGG-11。
def vgg(conv_arch, num_classes=10):
    """Assemble a VGG network from a block specification.

    Args:
        conv_arch: Iterable of ``(num_convs, num_channels)`` pairs; one VGG
            block is created per pair, in order.
        num_classes: Number of units in the output layer. Defaults to 10,
            the value that used to be hard-coded, so existing callers are
            unaffected.

    Returns:
        An ``nn.Sequential`` consisting of the VGG blocks followed by two
        4096-unit ReLU dense layers (each with dropout 0.5) and the output
        dense layer.
    """
    net = nn.Sequential()
    # Convolutional part.
    for num_convs, num_channels in conv_arch:
        net.add(vgg_block(num_convs, num_channels))
    # Fully connected part.
    net.add(nn.Dense(4096, activation='relu'), nn.Dropout(0.5),
            nn.Dense(4096, activation='relu'), nn.Dropout(0.5),
            nn.Dense(num_classes))
    return net
# (num_convs, num_channels) for each of the five VGG blocks.
conv_arch = ((1, 64), (1, 128), (2, 256), (2, 512), (2, 512))
net = vgg(conv_arch)

# Push one single-channel 224 x 224 sample through the network and report the
# output shape after every top-level block.
net.initialize()
X = nd.random.uniform(shape=(1, 1, 224, 224))
for blk in net:
    X = blk(X)
    print(blk.name, 'output shape:\t', X.shape)

# Data and training: build a narrower network whose channel counts are the
# originals integer-divided by `ratio`, then train that one.
ratio = 4
small_conv_arch = [(num_convs, num_channels // ratio)
                   for num_convs, num_channels in conv_arch]
net = vgg(small_conv_arch)

lr = 0.05
num_epochs = 5
batch_size = 128
ctx = d2l.try_gpu()
net.initialize(ctx=ctx, init=init.Xavier())
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=224)
d2l.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx,
              num_epochs)
3、NiN
網絡中的網絡
- 串聯多個由卷積層和“全連接”層構成的小網絡來構建一個深層網絡
與AlexNet還有個不同
- NiN去掉了AlexNet最後的3個全連接層
- 取而代之地,NiN使用了輸出通道數等於標籤類別數的NiN塊
- 然後使用全局平均池化層對每個通道中所有元素求平均並直接用於分類
import d2lzh as d2l
from mxnet import gluon, init, nd
from mxnet.gluon import nn
"""實現NiN"""
#基礎模塊
def nin_block(num_channels, kernel_size, strides, padding):
    """Build one NiN block.

    The block is a convolution with the given window, stride and padding,
    followed by two 1 x 1 convolutions. All three layers use ReLU and produce
    ``num_channels`` output channels.

    Args:
        num_channels: Output channels of every convolution in the block.
        kernel_size: Window size of the first convolution.
        strides: Stride of the first convolution.
        padding: Padding of the first convolution.

    Returns:
        An ``nn.Sequential`` containing the three convolutional layers.
    """
    blk = nn.Sequential()
    blk.add(nn.Conv2D(num_channels, kernel_size=kernel_size, strides=strides,
                      padding=padding, activation='relu'))
    # Two 1 x 1 convolutions follow the main convolution.
    blk.add(nn.Conv2D(num_channels, kernel_size=1, activation='relu'))
    blk.add(nn.Conv2D(num_channels, kernel_size=1, activation='relu'))
    return blk
# NiN model: NiN blocks alternating with max pooling.
net = nn.Sequential()
net.add(nin_block(96, kernel_size=11, strides=4, padding=0))
net.add(nn.MaxPool2D(pool_size=3, strides=2))
net.add(nin_block(256, kernel_size=5, strides=1, padding=2))
net.add(nn.MaxPool2D(pool_size=3, strides=2))
net.add(nin_block(384, kernel_size=3, strides=1, padding=1))
net.add(nn.MaxPool2D(pool_size=3, strides=2))
net.add(nn.Dropout(0.5))
# The number of label classes is 10.
net.add(nin_block(10, kernel_size=3, strides=1, padding=1))
# Global average pooling sets the window to the input's height and width.
net.add(nn.GlobalAvgPool2D())
# Turn the 4-D output into a 2-D output of shape (batch size, 10).
net.add(nn.Flatten())

# Push one sample through the network and report the output shape after every
# layer.
X = nd.random.uniform(shape=(1, 1, 224, 224))
net.initialize()
for layer in net:
    X = layer(X)
    print(layer.name, 'output shape:\t', X.shape)

# Data and training.
lr = 0.1
num_epochs = 5
batch_size = 128
ctx = d2l.try_gpu()
net.initialize(force_reinit=True, ctx=ctx, init=init.Xavier())
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=224)
d2l.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx,
              num_epochs)
4、GoogLeNet
2014年的ImageNet圖像識別挑戰賽中出現
基礎卷積塊Inception塊
- 相當於一個有4條線路的子網絡
- 通過不同窗口形狀的卷積層和最大池化層來並行抽取信息
- 使用1 * 1卷積層減少通道數從而降低模型複雜度。
- 將多個設計精細的Inception塊和其他層串聯起來
- Inception塊的通道數分配之比是在ImageNet數據集上通過大量的實驗得來的
import d2lzh as d2l
from mxnet import gluon, init, nd
from mxnet.gluon import nn
"""實現googlenet"""
# 基礎塊Inception
class Inception(nn.Block):
    """Inception block: four parallel paths joined along the channel axis.

    Args:
        c1: Output channels of path 1 (a single 1 x 1 convolution).
        c2: Pair ``(1 x 1 channels, 3 x 3 channels)`` for path 2.
        c3: Pair ``(1 x 1 channels, 5 x 5 channels)`` for path 3.
        c4: Output channels of the 1 x 1 convolution that ends path 4.
        **kwargs: Forwarded unchanged to ``nn.Block``.
    """

    def __init__(self, c1, c2, c3, c4, **kwargs):
        super().__init__(**kwargs)
        reduce_3x3, out_3x3 = c2[0], c2[1]
        reduce_5x5, out_5x5 = c3[0], c3[1]

        # Path 1: a single 1 x 1 convolution.
        self.p1_1 = nn.Conv2D(c1, kernel_size=1, activation='relu')

        # Path 2: 1 x 1 convolution followed by a 3 x 3 convolution.
        self.p2_1 = nn.Conv2D(reduce_3x3, kernel_size=1, activation='relu')
        self.p2_2 = nn.Conv2D(out_3x3, kernel_size=3, padding=1,
                              activation='relu')

        # Path 3: 1 x 1 convolution followed by a 5 x 5 convolution.
        self.p3_1 = nn.Conv2D(reduce_5x5, kernel_size=1, activation='relu')
        self.p3_2 = nn.Conv2D(out_5x5, kernel_size=5, padding=2,
                              activation='relu')

        # Path 4: 3 x 3 max pooling followed by a 1 x 1 convolution.
        self.p4_1 = nn.MaxPool2D(pool_size=3, strides=1, padding=1)
        self.p4_2 = nn.Conv2D(c4, kernel_size=1, activation='relu')

    def forward(self, x):
        """Run the four paths on ``x`` and concatenate them on dim 1."""
        branches = (
            self.p1_1(x),
            self.p2_2(self.p2_1(x)),
            self.p3_2(self.p3_1(x)),
            self.p4_2(self.p4_1(x)),
        )
        # Concatenate the outputs along the channel dimension.
        return nd.concat(*branches, dim=1)
# GoogLeNet model, built from five modules plus an output layer.

# Module 1: a 7 x 7 convolution with 64 channels, then max pooling.
b1 = nn.Sequential()
b1.add(nn.Conv2D(64, kernel_size=7, strides=2, padding=3, activation='relu'))
b1.add(nn.MaxPool2D(pool_size=3, strides=2, padding=1))

# Module 2: two convolutions - a 1 x 1 convolution with 64 channels, then a
# 3 x 3 convolution that triples the channel count. This corresponds to the
# second path of an Inception block.
b2 = nn.Sequential()
b2.add(nn.Conv2D(64, kernel_size=1, activation='relu'))
b2.add(nn.Conv2D(192, kernel_size=3, padding=1, activation='relu'))
b2.add(nn.MaxPool2D(pool_size=3, strides=2, padding=1))

# Module 3: two complete Inception blocks in series.
# The first outputs 64+128+32+32=256 channels,
# the second raises that to 128+192+96+64=480.
b3 = nn.Sequential()
b3.add(Inception(64, (96, 128), (16, 32), 32))
b3.add(Inception(128, (128, 192), (32, 96), 64))
b3.add(nn.MaxPool2D(pool_size=3, strides=2, padding=1))

# Module 4: five Inception blocks in series, with output channel counts
# 192+208+48+64=512, 160+224+64+64=512, 128+256+64+64=512,
# 112+288+64+64=528 and 256+320+128+128=832.
b4 = nn.Sequential()
b4.add(Inception(192, (96, 208), (16, 48), 64))
b4.add(Inception(160, (112, 224), (24, 64), 64))
b4.add(Inception(128, (128, 256), (24, 64), 64))
b4.add(Inception(112, (144, 288), (32, 64), 64))
b4.add(Inception(256, (160, 320), (32, 128), 128))
b4.add(nn.MaxPool2D(pool_size=3, strides=2, padding=1))

# Module 5: two Inception blocks with 256+320+128+128=832 and
# 384+384+128+128=1024 output channels. It is followed by the output layer:
# like NiN, global average pooling reduces each channel's height and width to
# 1, and the result is fed to a dense layer with one output per label class.
b5 = nn.Sequential()
b5.add(Inception(256, (160, 320), (32, 128), 128))
b5.add(Inception(384, (192, 384), (48, 128), 128))
b5.add(nn.GlobalAvgPool2D())

net = nn.Sequential()
net.add(b1, b2, b3, b4, b5, nn.Dense(10))

# Push one single-channel 96 x 96 sample through the network and report the
# output shape after every top-level module.
X = nd.random.uniform(shape=(1, 1, 96, 96))
net.initialize()
for layer in net:
    X = layer(X)
    print(layer.name, 'output shape:\t', X.shape)

# Data and training.
lr = 0.1
num_epochs = 5
batch_size = 128
ctx = d2l.try_gpu()
net.initialize(force_reinit=True, ctx=ctx, init=init.Xavier())
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=96)
d2l.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx,
              num_epochs)
結語
簡單瞭解了下這幾種深度卷積神經網絡
對於其設計思路以及一些延伸還有待後續學習