ResNet的跨層鏈接思想影響了後面的模型發展,本章我們介紹DenseNet。下圖主要展示了這兩個區別(左圖:ResNet,右圖:DenseNet):
ResNet和DenseNet的主要區別是將“加”改爲了“融合”。DenseNet的好處是底層特徵並沒有丟失,而是完完全全被保留了下來:
根據相關原理,我們來定義一下block:
import mxnet.ndarray as nd
import mxnet.autograd as ag
import mxnet.gluon as gn
import mxnet as mx
import matplotlib.pyplot as plt
from mxnet import init
def conv_block(channels):
    """One DenseNet conv unit in ResNet v2 pre-activation order:
    BN -> ReLU -> 3x3 Conv producing `channels` output channels."""
    blk = gn.nn.Sequential()
    blk.add(gn.nn.BatchNorm())
    blk.add(gn.nn.Activation("relu"))
    blk.add(gn.nn.Conv2D(channels=channels, kernel_size=3, padding=1))
    return blk
# 稠密塊由多個conv_block組成,每塊使用相同的輸出通道數。
# 構造dense block(稠密塊)
class dense_block(gn.nn.Block):
    """Dense block: `num_conv_block` conv_blocks whose outputs are
    concatenated with their inputs along the channel dimension.

    Fix: the constructor parameter was named `num_layers`, but every call
    site in this file passes `num_conv_block=...` (which raised a
    TypeError); the parameter is renamed to match the callers and the
    other copies of this class later in the file.
    """

    def __init__(self, num_conv_block, channels, **kwargs):
        super(dense_block, self).__init__(**kwargs)
        self.net = gn.nn.Sequential()
        for _ in range(num_conv_block):
            self.net.add(conv_block(channels=channels))

    def forward(self, x):
        for layer in self.net:
            out = layer(x)
            # Concatenate input and output on the channel axis ("fusion"):
            # unlike ResNet, which adds them, earlier features are fully kept.
            x = nd.concat(x, out, dim=1)
        return x
測試一下:
# Sanity check: one dense block with 2 conv_blocks of 10 channels on a
# 3-channel input should give 3 + 2*10 = 23 output channels.
dlk=dense_block(num_conv_block=2,channels=10)
dlk.initialize()
X=nd.random_normal(shape=(1,3,8,8)) # NCHW
print(dlk(X).shape)
結果:
可以看到,除了通道數變成了23以外,其它的都沒變,爲什麼呢?這是因爲這個dense block裏有2個conv_block,每個conv_block的輸出通道數爲10,逐層融合起來就是20,最後,別忘了最開始的輸入通道數3,總共就是2×10+3=23。
那麼,這也會浮現一個問題,如果我的dense block數比較多,比如有4個,輸出通道數爲128,那麼,整個模型的複雜度就會異常高,這明顯不對,因此,引入一個過渡塊,這個過渡塊裏其實沒有什麼高大上的東西,不外乎就是1×1的卷積+池化,用來縮小通道數和圖像高、寬。下面是相關代碼:
def trans_block(channels):
    """Transition block: BN -> ReLU -> 1x1 Conv -> 2x2 AvgPool.

    Shrinks the channel count to `channels` and halves height/width,
    keeping model size under control between dense blocks."""
    blk = gn.nn.Sequential()
    blk.add(gn.nn.BatchNorm())
    blk.add(gn.nn.Activation("relu"))
    blk.add(gn.nn.Conv2D(channels=channels, kernel_size=1))
    blk.add(gn.nn.AvgPool2D(pool_size=2, strides=2))
    return blk
接下來測試一下:
# Transition-block demo: 23 channels -> 10 channels, H/W halved.
tlk=trans_block(10)
tlk.initialize()
print(tlk(dlk(X)).shape)
結果:
可以看到,通道數從23變成了10,同時,寬高也減半了。
DenseNet的主體就是將稠密塊和過渡塊反覆堆疊,下面實現一個121層的DenseNet:
start_channel=64 # channels produced by the stem convolution
growth_channel=32 # growth rate: channels each conv_block adds
block_layer_num=[6,12,24,16] # number of conv_blocks inside each dense block
def DenseNet():
    """Build a DenseNet-121-style network (without the 1x1 bottlenecks).

    Structure: 7x7 stem conv -> 4 dense blocks separated by transition
    blocks (each halving channels and spatial size) -> BN/ReLU/global
    average pool -> Dense(10) classifier.

    Fixes vs. the original: the running channel count was computed as
    `channels += channels + growth_channel*num_layers`, double-counting
    the existing channels, and the halving done by the transition block
    was not written back into `channels`, so later transitions received
    the wrong target width. Both now match the corrected accounting.
    """
    net = gn.nn.Sequential()
    with net.name_scope():
        # Stem block: conv + BN + ReLU + max-pool.
        net.add(gn.nn.Conv2D(channels=start_channel, kernel_size=7,
                             padding=3, strides=2),
                gn.nn.BatchNorm(),
                gn.nn.Activation("relu"),
                gn.nn.MaxPool2D(pool_size=2, strides=2, padding=1))
        # Dense blocks with transition blocks in between.
        channels = start_channel
        for i, num_layers in enumerate(block_layer_num):
            net.add(dense_block(num_conv_block=num_layers,
                                channels=growth_channel))
            # Each conv_block appends growth_channel channels.
            channels += growth_channel * num_layers
            # After every dense block except the last, halve the channel
            # count and the spatial size with a transition block.
            if i != len(block_layer_num) - 1:
                channels //= 2
                net.add(trans_block(channels=channels))
        # Classifier head.
        net.add(gn.nn.BatchNorm(),
                gn.nn.Activation("relu"),
                gn.nn.GlobalAvgPool2D(),
                gn.nn.Dense(10))
    return net
下面放上所有代碼:
import mxnet.ndarray as nd
import mxnet.autograd as ag
import mxnet.gluon as gn
import mxnet as mx
import matplotlib.pyplot as plt
from mxnet import init
def conv_block(channels):
    """Pre-activation (ResNet v2 style) unit: BN -> ReLU -> 3x3 conv
    emitting `channels` feature maps."""
    unit = gn.nn.Sequential()
    unit.add(gn.nn.BatchNorm())
    unit.add(gn.nn.Activation("relu"))
    unit.add(gn.nn.Conv2D(channels=channels, kernel_size=3, padding=1))
    return unit
# 稠密塊由多個conv_block組成,每塊使用相同的輸出通道數。
# 構造dense block(稠密塊)
class dense_block(gn.nn.Block):
    """A DenseNet dense block: stacks `num_conv_block` conv_blocks and
    concatenates every unit's output with its input along the channel
    axis, so no earlier features are lost."""

    def __init__(self, num_conv_block, channels, **kwargs):
        super(dense_block, self).__init__(**kwargs)
        self.net = gn.nn.Sequential()
        for _ in range(num_conv_block):
            self.net.add(conv_block(channels=channels))

    def forward(self, x):
        # Fuse by concatenation (ResNet would add instead).
        for unit in self.net:
            x = nd.concat(x, unit(x), dim=1)
        return x
# Sanity checks for stacked dense blocks (kept for reference, disabled):
# dlk1=dense_block(num_conv_block=6,channels=32)
# dlk1.initialize()
# X=nd.random_normal(shape=(1,64,32,32)) # NCHW
# dlk2=dense_block(num_conv_block=12,channels=32)
# dlk2.initialize()
# dlk3=dense_block(num_conv_block=24,channels=32)
# dlk3.initialize()
# print(dlk1(X).shape)
# print(dlk2(dlk1(X)).shape)
# print(dlk3(dlk2(dlk1(X))).shape)
def trans_block(channels):
    """Transition between dense blocks: BN -> ReLU -> 1x1 conv (channel
    reduction to `channels`) -> 2x2 average pool (spatial halving)."""
    unit = gn.nn.Sequential()
    unit.add(gn.nn.BatchNorm(),
             gn.nn.Activation("relu"))
    unit.add(gn.nn.Conv2D(channels=channels, kernel_size=1))
    unit.add(gn.nn.AvgPool2D(pool_size=2, strides=2))
    return unit
# Demo transition block projecting down to 10 channels.
tlk=trans_block(10)
tlk.initialize()
# print(tlk(dlk(X)).shape)
start_channel=64 # channels produced by the stem convolution
growth_channel=32 # growth rate: channels each conv_block adds
block_layer_num=[6,12,24,16] # number of conv_blocks inside each dense block
def DenseNet():
    """Assemble the full network: stem conv, four dense blocks joined by
    channel/size-halving transition blocks, then a
    BN/ReLU/global-average-pool/Dense(10) classifier head."""
    net = gn.nn.Sequential()
    with net.name_scope():
        # Stem block.
        net.add(gn.nn.Conv2D(channels=start_channel, kernel_size=7,
                             padding=3, strides=2))
        net.add(gn.nn.BatchNorm())
        net.add(gn.nn.Activation("relu"))
        net.add(gn.nn.MaxPool2D(pool_size=2, strides=2, padding=1))
        # Body: dense blocks with transitions between them.
        channels = start_channel
        last = len(block_layer_num) - 1
        for idx, n_conv in enumerate(block_layer_num):
            net.add(dense_block(num_conv_block=n_conv,
                                channels=growth_channel))
            channels += growth_channel * n_conv  # running channel count
            if idx != last:
                channels //= 2  # transition halves channels and H/W
                net.add(trans_block(channels=channels))
        # Classifier head.
        net.add(gn.nn.BatchNorm())
        net.add(gn.nn.Activation("relu"))
        net.add(gn.nn.GlobalAvgPool2D())
        net.add(gn.nn.Dense(10))
    return net
ctx=mx.gpu() # NOTE(review): requires a CUDA GPU; use mx.cpu() if none is available
net=DenseNet()
net.initialize(init=init.Xavier(),ctx=ctx) # Xavier init suits deep conv nets
# Per-layer output-shape debugging (disabled):
# for layer in net:
# X=X.as_in_context(ctx)
# X=layer(X)
# print(layer.name, 'output shape:\t', X.shape)
'''---讀取數據和預處理---'''
def load_data_fashion_mnist(batch_size, resize=None):
    """Return (train_iter, test_iter) DataLoaders over FashionMNIST.

    If `resize` is given, images are resized first, then ToTensor scales
    pixels to [0, 1] and moves channels to the front (NCHW)."""
    steps = [gn.data.vision.transforms.Resize(resize)] if resize else []
    steps.append(gn.data.vision.transforms.ToTensor())
    transformer = gn.data.vision.transforms.Compose(steps)
    train_set = gn.data.vision.FashionMNIST(train=True)
    test_set = gn.data.vision.FashionMNIST(train=False)
    train_iter = gn.data.DataLoader(train_set.transform_first(transformer),
                                    batch_size, shuffle=True)
    test_iter = gn.data.DataLoader(test_set.transform_first(transformer),
                                   batch_size, shuffle=False)
    return train_iter, test_iter
batch_size=128
train_iter,test_iter=load_data_fashion_mnist(batch_size,resize=32) # 32x32: larger images train slowly and can exhaust GPU memory
# Accuracy metric.
def accuracy(output,label):
    """Fraction of rows whose argmax over classes equals `label`."""
    preds = output.argmax(axis=1)
    return nd.mean(preds==label).asscalar()
def evaluate_accuracy(data_iter,net):
    """Average per-batch accuracy of `net` over `data_iter`, run on `ctx`."""
    total = 0.0
    for data, label in data_iter:
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx).astype('float32')
        total += accuracy(net(data), label)
    return total / len(data_iter)
# Applying softmax and cross-entropy separately can be numerically
# unstable, so use the fused loss.
cross_loss=gn.loss.SoftmaxCrossEntropyLoss()
# SGD optimizer; BN lets us use a fairly large learning rate
train_step=gn.Trainer(net.collect_params(),'sgd',{"learning_rate":0.2})
# Training loop. The effective learning rate is the 0.2 configured on the
# Trainer above; the misleading unused `lr=0.1` and the unused per-epoch
# counter `n` from the original have been removed.
epochs=20
for epoch in range(epochs):
    train_loss = 0.0
    train_acc = 0.0
    for image, y in train_iter:
        # Move the batch to the training device; labels as float32 to
        # match accuracy()'s comparison dtype.
        image, y = image.as_in_context(ctx), y.as_in_context(ctx)
        y = y.astype('float32')
        with ag.record():  # record graph for autograd
            output = net(image)
            loss = cross_loss(output, y)
        loss.backward()
        train_step.step(batch_size)  # normalizes gradients by batch size
        train_loss += nd.mean(loss).asscalar()
        train_acc += accuracy(output, y)
    test_acc = evaluate_accuracy(test_iter, net)
    print("Epoch %d, Loss:%f, Train acc:%f, Test acc:%f"
          % (epoch, train_loss / len(train_iter), train_acc / len(train_iter), test_acc))
訓練結果:
當然,原論文中還包含了1×1卷積,本文沒有實現,說不上真正的121層。真正的121層DenseNet如下:
import mxnet.ndarray as nd
import mxnet.autograd as ag
import mxnet.gluon as gn
import mxnet as mx
from mxnet import init
def conv_block(channels):
    """DenseNet-B conv unit in pre-activation order: BN -> ReLU -> 1x1
    conv bottleneck (4*channels), then BN -> ReLU -> 3x3 conv down to
    `channels` output maps."""
    blk = gn.nn.Sequential()
    for layer in (gn.nn.BatchNorm(),
                  gn.nn.Activation("relu"),
                  gn.nn.Conv2D(channels=4 * channels, kernel_size=1, padding=0),
                  gn.nn.BatchNorm(),
                  gn.nn.Activation("relu"),
                  gn.nn.Conv2D(channels=channels, kernel_size=3, padding=1)):
        blk.add(layer)
    return blk
# 稠密塊由多個conv_block組成,每塊使用相同的輸出通道數。
# 構造dense block(稠密塊)
class dense_block(gn.nn.Block):
    """Dense block of `num_conv_block` bottleneck conv_blocks; each unit's
    output is concatenated onto the running feature map along the channel
    axis, preserving all earlier features."""

    def __init__(self, num_conv_block, channels, **kwargs):
        super(dense_block, self).__init__(**kwargs)
        self.net = gn.nn.Sequential()
        for _ in range(num_conv_block):
            self.net.add(conv_block(channels=channels))

    def forward(self, x):
        # DenseNet fuses by concatenation (ResNet adds instead).
        for unit in self.net:
            x = nd.concat(x, unit(x), dim=1)
        return x
# Sanity check: one bottleneck conv_block (3 output channels) on a
# 1-channel input -> 1 + 3 = 4 output channels.
dlk1=dense_block(num_conv_block=1,channels=3)
dlk1.initialize()
X=nd.random_normal(shape=(1,1,32,32)) # NCHW
print(dlk1(X).shape)
def trans_block(channels):
    """Transition block (BN -> ReLU -> 1x1 conv -> 2x2 avg-pool): shrinks
    the channel count to `channels` and halves the spatial dimensions."""
    unit = gn.nn.Sequential()
    unit.add(gn.nn.BatchNorm())
    unit.add(gn.nn.Activation("relu"))
    unit.add(gn.nn.Conv2D(channels=channels, kernel_size=1))
    unit.add(gn.nn.AvgPool2D(pool_size=2, strides=2))
    return unit
# Demo transition block projecting down to 10 channels.
tlk=trans_block(10)
tlk.initialize()
# print(tlk(dlk(X)).shape)
start_channel=64 # channels produced by the stem convolution
growth_channel=32 # growth rate: channels each conv_block adds
block_layer_num=[6,12,24,16] # number of conv_blocks inside each dense block
def DenseNet():
    """Full DenseNet-121: stem conv, four dense blocks of bottleneck
    conv_blocks joined by transition blocks, then a
    BN/ReLU/global-average-pool/Dense(10) head."""
    net = gn.nn.Sequential()
    with net.name_scope():
        # Stem: 7x7 conv, BN, ReLU, 2x2 max-pool.
        net.add(gn.nn.Conv2D(channels=start_channel, kernel_size=7,
                             padding=3, strides=2))
        net.add(gn.nn.BatchNorm())
        net.add(gn.nn.Activation("relu"))
        net.add(gn.nn.MaxPool2D(pool_size=2, strides=2, padding=1))
        # Body: dense blocks with transitions between them.
        total_channels = start_channel
        for block_idx, conv_count in enumerate(block_layer_num):
            net.add(dense_block(num_conv_block=conv_count,
                                channels=growth_channel))
            total_channels += growth_channel * conv_count
            if block_idx != len(block_layer_num) - 1:
                total_channels //= 2  # transition halves channels and H/W
                net.add(trans_block(channels=total_channels))
        # Classifier head.
        net.add(gn.nn.BatchNorm())
        net.add(gn.nn.Activation("relu"))
        net.add(gn.nn.GlobalAvgPool2D())
        net.add(gn.nn.Dense(10))
    return net
ctx=mx.gpu() # NOTE(review): requires a CUDA GPU; use mx.cpu() if none is available
net=DenseNet()
net.initialize(init=init.Xavier(),ctx=ctx) # Xavier init suits deep conv nets
# Per-layer output-shape debugging (disabled):
# for layer in net:
# X=X.as_in_context(ctx)
# X=layer(X)
# print(layer.name, 'output shape:\t', X.shape)
'''---讀取數據和預處理---'''
def load_data_fashion_mnist(batch_size, resize=None):
    """Return (train_iter, test_iter) DataLoaders over FashionMNIST.

    Optional `resize` is applied before ToTensor, which scales pixels to
    [0, 1] and reorders to channels-first (NCHW)."""
    steps = [gn.data.vision.transforms.Resize(resize)] if resize else []
    steps.append(gn.data.vision.transforms.ToTensor())
    transformer = gn.data.vision.transforms.Compose(steps)
    train_set = gn.data.vision.FashionMNIST(train=True)
    test_set = gn.data.vision.FashionMNIST(train=False)
    train_iter = gn.data.DataLoader(train_set.transform_first(transformer),
                                    batch_size, shuffle=True)
    test_iter = gn.data.DataLoader(test_set.transform_first(transformer),
                                   batch_size, shuffle=False)
    return train_iter, test_iter
batch_size=128
train_iter,test_iter=load_data_fashion_mnist(batch_size,resize=32) # 32x32: larger images train slowly and can exhaust GPU memory
# Accuracy metric.
def accuracy(output,label):
    """Fraction of rows whose argmax over classes equals `label`."""
    preds = output.argmax(axis=1)
    return nd.mean(preds==label).asscalar()
def evaluate_accuracy(data_iter,net):
    """Average per-batch accuracy of `net` over `data_iter`, run on `ctx`."""
    total = 0.0
    for data, label in data_iter:
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx).astype('float32')
        total += accuracy(net(data), label)
    return total / len(data_iter)
# Applying softmax and cross-entropy separately can be numerically
# unstable, so use the fused loss.
cross_loss=gn.loss.SoftmaxCrossEntropyLoss()
# SGD optimizer; BN lets us use a fairly large learning rate
train_step=gn.Trainer(net.collect_params(),'sgd',{"learning_rate":0.2})
# Training loop. The effective learning rate is the 0.2 configured on the
# Trainer above; the misleading unused `lr=0.1` and the unused per-epoch
# counter `n` from the original have been removed.
epochs=20
for epoch in range(epochs):
    train_loss = 0.0
    train_acc = 0.0
    for image, y in train_iter:
        # Move the batch to the training device; labels as float32 to
        # match accuracy()'s comparison dtype.
        image, y = image.as_in_context(ctx), y.as_in_context(ctx)
        y = y.astype('float32')
        with ag.record():  # record graph for autograd
            output = net(image)
            loss = cross_loss(output, y)
        loss.backward()
        train_step.step(batch_size)  # normalizes gradients by batch size
        train_loss += nd.mean(loss).asscalar()
        train_acc += accuracy(output, y)
    test_acc = evaluate_accuracy(test_iter, net)
    print("Epoch %d, Loss:%f, Train acc:%f, Test acc:%f"
          % (epoch, train_loss / len(train_iter), train_acc / len(train_iter), test_acc))
訓練結果: