MXNET深度學習框架-24-使用gluon的DenseNet

        ResNet的跨層鏈接思想影響了後面的模型發展,本章我們介紹DenseNet。下圖主要展示了這兩個區別(左圖:ResNet,右圖:DenseNet):
                        在這裏插入圖片描述
        ResNet和DenseNet的主要區別是將“加”改爲了“融合”。DenseNet的好處是底層特徵並沒有丟失,而是完完全全被保留了下來:
在這裏插入圖片描述
        根據相關原理,我們來定義一下block:

import mxnet.ndarray as nd
import mxnet.autograd as ag
import mxnet.gluon as gn
import mxnet as mx
import matplotlib.pyplot as plt
from mxnet import init

def conv_block(channels):
    """Pre-activation conv unit (ResNet v2 order): BN -> ReLU -> 3x3 conv.

    Padding 1 keeps the spatial size unchanged; `channels` is the number
    of output feature maps the unit produces.
    """
    blk = gn.nn.Sequential()
    blk.add(gn.nn.BatchNorm())
    blk.add(gn.nn.Activation("relu"))
    blk.add(gn.nn.Conv2D(channels=channels, kernel_size=3, padding=1))
    return blk
# 稠密塊由多個conv_block組成,每塊使用相同的輸出通道數。
# 構造dense block(稠密塊)
# A dense block stacks several conv_blocks with a shared output channel
# count; each unit's output is concatenated with its input along the
# channel axis, so earlier features are preserved ("fusion") instead of
# being summed as in ResNet.
class dense_block(gn.nn.Block):
    def __init__(self,num_conv_block,channels,**kwargs):
        """Stack `num_conv_block` conv_blocks, each emitting `channels` maps.

        The parameter is named `num_conv_block` to match the call sites
        (the original `num_layers` made `dense_block(num_conv_block=...)`
        raise a TypeError) and to agree with the later definitions in
        this file.
        """
        super(dense_block, self).__init__(**kwargs)
        self.net=gn.nn.Sequential()
        for i in range(num_conv_block):
            self.net.add(conv_block(channels=channels))
    def forward(self, x):
        for layer in self.net:
            out=layer(x)
            # Concatenate input and output on the channel dim — DenseNet's
            # "fusion", as opposed to ResNet's elementwise addition.
            x=nd.concat(x,out,dim=1)
        return x

        測試一下:

# Sanity check: one forward pass to confirm the output shape.
# NOTE(review): this keyword call requires the dense_block __init__
# parameter to be named `num_conv_block` (the definition above uses
# `num_layers`) — confirm the class signature matches.
dlk=dense_block(num_conv_block=2,channels=10)
dlk.initialize()
X=nd.random_normal(shape=(1,3,8,8)) # NCHW: batch=1, 3 channels, 8x8
print(dlk(X).shape)

結果:
在這裏插入圖片描述
        可以看到,除了通道數變成了23以外,其它的都沒變,爲什麼呢?這是因爲這個dense block裏有2個conv_block,每個conv_block的輸出通道數爲10,兩次融合共增加20個通道,最後,別忘了最開始輸入的3個通道,總共就是2×10+3=23。
        那麼,這也會浮現一個問題,如果我的dense block數比較多,比如有4個,輸出通道數爲128,那麼,整個模型的複雜度就會異常高,這明顯不對,因此,引入一個過渡塊,這個過渡塊裏其實沒有什麼高大上的東西,不外乎就是1×1的卷積+池化,用來縮小通道數和圖像高、寬。下面是相關代碼:

def trans_block(channels):
    """Transition block: BN -> ReLU -> 1x1 conv -> 2x2 average pool.

    The 1x1 convolution reduces the channel count to `channels`; the
    stride-2 average pool halves both spatial dimensions.
    """
    blk = gn.nn.Sequential()
    layers = (gn.nn.BatchNorm(),
              gn.nn.Activation("relu"),
              gn.nn.Conv2D(channels=channels, kernel_size=1),
              gn.nn.AvgPool2D(pool_size=2, strides=2))
    for layer in layers:
        blk.add(layer)
    return blk

接下來測試一下:

# Transition-block sanity check: compress dlk's 23-channel output down
# to 10 channels and halve the 8x8 spatial size.
tlk=trans_block(10)
tlk.initialize()
print(tlk(dlk(X)).shape) # expect (1, 10, 4, 4)

結果:
在這裏插入圖片描述
可以看到,通道數從23變成了10,同時,寬高也減半了。
        DenseNet的主體就是將稠密塊和過渡塊反覆堆疊,下面實現一個121層的DenseNet:

start_channel=64               # channels produced by the stem convolution
growth_channel=32              # growth rate: channels added by each conv_block
block_layer_num=[6,12,24,16]   # conv_blocks per dense block (DenseNet-121 layout)
def DenseNet():
    """Build the DenseNet body for 10-class classification.

    Layout: 7x7 stride-2 stem conv + BN/ReLU/max-pool, then four dense
    blocks with a transition block between consecutive pairs (each
    transition halves both the channel count and the spatial size),
    then BN/ReLU, global average pooling and a 10-way dense head.
    """
    net=gn.nn.Sequential()
    with net.name_scope():
        # Stem.
        net.add(gn.nn.Conv2D(channels=start_channel,kernel_size=7,padding=3,strides=2),
                gn.nn.BatchNorm(),
                gn.nn.Activation("relu"),
                gn.nn.MaxPool2D(pool_size=2,strides=2,padding=1)
                )
        # Dense blocks; `channels` tracks the running channel count.
        channels=start_channel
        for i,num_layers in enumerate(block_layer_num):
            net.add(dense_block(num_conv_block=num_layers,channels=growth_channel))
            # Each conv_block adds growth_channel channels. (The original
            # `channels+=channels+growth_channel*num_layers` double-counted
            # the running total, so every transition got the wrong width.)
            channels+=growth_channel*num_layers
            # Transition after every dense block except the last: halve
            # the channel count and the spatial size.
            if i!=len(block_layer_num)-1:
                channels//=2
                net.add(trans_block(channels=channels))
        # Classification head.
        net.add(gn.nn.BatchNorm(),
                gn.nn.Activation("relu"),
                gn.nn.GlobalAvgPool2D(),
                gn.nn.Dense(10)
                )
    return net

下面放上所有代碼:

import mxnet.ndarray as nd
import mxnet.autograd as ag
import mxnet.gluon as gn
import mxnet as mx
import matplotlib.pyplot as plt
from mxnet import init

def conv_block(channels):
    """ResNet-v2-style pre-activation unit: BN, ReLU, then a 3x3
    same-padding convolution with `channels` output maps."""
    unit = gn.nn.Sequential()
    unit.add(
        gn.nn.BatchNorm(),
        gn.nn.Activation("relu"),
        gn.nn.Conv2D(channels=channels, kernel_size=3, padding=1),
    )
    return unit
# 稠密塊由多個conv_block組成,每塊使用相同的輸出通道數。
# 構造dense block(稠密塊)
class dense_block(gn.nn.Block):
    """Dense block: `num_conv_block` conv_blocks that all emit `channels`
    feature maps; each output is concatenated channel-wise with its input,
    so every earlier feature map is kept (DenseNet's "fusion", in contrast
    to ResNet's elementwise addition)."""
    def __init__(self,num_conv_block,channels,**kwargs):
        super(dense_block, self).__init__(**kwargs)
        self.net=gn.nn.Sequential()
        for _ in range(num_conv_block):
            self.net.add(conv_block(channels=channels))
    def forward(self, x):
        features = x
        for blk in self.net:
            # Grow the feature stack: input || output on the channel dim.
            features = nd.concat(features, blk(features), dim=1)
        return features

# 測試一個實例看看結果是否符合預期
# dlk1=dense_block(num_conv_block=6,channels=32)
# dlk1.initialize()
# X=nd.random_normal(shape=(1,64,32,32)) # NCHW
# dlk2=dense_block(num_conv_block=12,channels=32)
# dlk2.initialize()
# dlk3=dense_block(num_conv_block=24,channels=32)
# dlk3.initialize()
# print(dlk1(X).shape)
# print(dlk2(dlk1(X)).shape)
# print(dlk3(dlk2(dlk1(X))).shape)

def trans_block(channels):
    """Transition block between dense blocks: BN -> ReLU -> 1x1 conv
    (reduce to `channels` maps) -> stride-2 average pool (halve H and W)."""
    blk = gn.nn.Sequential()
    blk.add(gn.nn.BatchNorm())
    blk.add(gn.nn.Activation("relu"))
    blk.add(gn.nn.Conv2D(channels=channels, kernel_size=1))
    blk.add(gn.nn.AvgPool2D(pool_size=2, strides=2))
    return blk

# Instantiate a 10-channel transition block (shape probe left disabled).
tlk=trans_block(10)
tlk.initialize()
# print(tlk(dlk(X)).shape)


start_channel=64               # stem convolution output channels
growth_channel=32              # growth rate per conv_block
block_layer_num=[6,12,24,16]   # conv_blocks inside each of the 4 dense blocks
def DenseNet():
    """Assemble the DenseNet: stem, four dense blocks with transitions
    in between (each halving channels and spatial size), then a
    BN/ReLU/global-average-pool/Dense(10) head."""
    net=gn.nn.Sequential()
    with net.name_scope():
        # Stem: 7x7 stride-2 conv, BN, ReLU, stride-2 max pool.
        net.add(gn.nn.Conv2D(channels=start_channel,kernel_size=7,padding=3,strides=2))
        net.add(gn.nn.BatchNorm())
        net.add(gn.nn.Activation("relu"))
        net.add(gn.nn.MaxPool2D(pool_size=2,strides=2,padding=1))
        # Dense blocks, tracking the running channel count.
        channels=start_channel
        last=len(block_layer_num)-1
        for idx,n_conv in enumerate(block_layer_num):
            net.add(dense_block(num_conv_block=n_conv,channels=growth_channel))
            channels+=growth_channel*n_conv       # each conv_block adds growth_channel
            if idx!=last:
                channels//=2                       # transition halves channels and H/W
                net.add(trans_block(channels=channels))
        # Head.
        net.add(gn.nn.BatchNorm())
        net.add(gn.nn.Activation("relu"))
        net.add(gn.nn.GlobalAvgPool2D())
        net.add(gn.nn.Dense(10))
    return net
ctx=mx.gpu()  # train on GPU — assumes a CUDA device is available; TODO confirm
net=DenseNet()
net.initialize(init=init.Xavier(),ctx=ctx)
# Layer-by-layer output-shape probe (left disabled):
# for layer in net:
#     X=X.as_in_context(ctx)
#     X=layer(X)
#     print(layer.name, 'output shape:\t', X.shape)
'''---讀取數據和預處理---'''
def load_data_fashion_mnist(batch_size, resize=None):
    """Return (train_iter, test_iter) DataLoaders over Fashion-MNIST.

    When `resize` is given, images are resized to that size before the
    ToTensor conversion (HWC uint8 -> CHW float). The training loader
    shuffles; the test loader does not.
    """
    steps = [gn.data.vision.transforms.Resize(resize)] if resize else []
    steps.append(gn.data.vision.transforms.ToTensor())
    pipeline = gn.data.vision.transforms.Compose(steps)
    train_set = gn.data.vision.FashionMNIST(train=True)
    test_set = gn.data.vision.FashionMNIST(train=False)
    train_iter = gn.data.DataLoader(
        train_set.transform_first(pipeline), batch_size, shuffle=True)
    test_iter = gn.data.DataLoader(
        test_set.transform_first(pipeline), batch_size, shuffle=False)
    return train_iter, test_iter
batch_size=128
train_iter,test_iter=load_data_fashion_mnist(batch_size,resize=32) # resize to 32: larger images train slowly and can exhaust GPU memory


# 定義準確率
def accuracy(output,label):
    return nd.mean(output.argmax(axis=1)==label).asscalar()

def evaluate_accuracy(data_iter,net):
    """Mean per-batch accuracy of `net` over `data_iter`, evaluated on ctx."""
    batch_accs = []
    for data, label in data_iter:
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx).astype('float32')
        batch_accs.append(accuracy(net(data), label))
    return sum(batch_accs) / len(data_iter)

# Fused softmax + cross-entropy: numerically more stable than computing
# the two separately.
cross_loss=gn.loss.SoftmaxCrossEntropyLoss()
# SGD optimizer; BatchNorm tolerates a relatively large learning rate.
train_step=gn.Trainer(net.collect_params(),'sgd',{"learning_rate":0.2})

# Training loop. (The original defined `lr=0.1` and `n=0` but never used
# them — the effective learning rate is the Trainer's 0.2; both unused
# variables are removed here.)
epochs=20
for epoch in range(epochs):
    train_loss=0
    train_acc=0
    for image,y in train_iter:
        image, y = image.as_in_context(ctx), y.as_in_context(ctx)
        y = y.astype('float32')

        with ag.record():
            output = net(image)
            loss = cross_loss(output, y)
        loss.backward()
        train_step.step(batch_size)
        # Accumulate per-batch mean loss and accuracy for epoch averages.
        train_loss += nd.mean(loss).asscalar()
        train_acc += accuracy(output, y)

    test_acc = evaluate_accuracy(test_iter, net)
    print("Epoch %d, Loss:%f, Train acc:%f, Test acc:%f"
          %(epoch,train_loss/len(train_iter),train_acc/len(train_iter),test_acc))

訓練結果:
在這裏插入圖片描述

當然,原論文中還包含了1×1卷積,本文沒有實現,說不上真正的121層。真正的121層DenseNet如下:

import mxnet.ndarray as nd
import mxnet.autograd as ag
import mxnet.gluon as gn
import mxnet as mx
from mxnet import init

def conv_block(channels):
    """DenseNet-B bottleneck unit, ResNet-v2 pre-activation order:
    BN -> ReLU -> 1x1 conv (4*channels) then BN -> ReLU -> 3x3 conv
    (`channels` output maps, same padding)."""
    unit = gn.nn.Sequential()
    # 1x1 bottleneck widening to 4x the growth rate.
    unit.add(gn.nn.BatchNorm(),
             gn.nn.Activation("relu"),
             gn.nn.Conv2D(channels=4*channels, kernel_size=1, padding=0))
    # 3x3 convolution back down to the growth rate.
    unit.add(gn.nn.BatchNorm(),
             gn.nn.Activation("relu"),
             gn.nn.Conv2D(channels=channels, kernel_size=3, padding=1))
    return unit
# 稠密塊由多個conv_block組成,每塊使用相同的輸出通道數。
# 構造dense block(稠密塊)
class dense_block(gn.nn.Block):
    """Concatenative block of `num_conv_block` bottleneck conv_blocks,
    each adding `channels` feature maps to the running stack (channel-wise
    "fusion" rather than ResNet's addition)."""
    def __init__(self,num_conv_block,channels,**kwargs):
        super(dense_block, self).__init__(**kwargs)
        self.net=gn.nn.Sequential()
        self.net.add(*[conv_block(channels=channels) for _ in range(num_conv_block)])
    def forward(self, x):
        for unit in self.net:
            x = nd.concat(x, unit(x), dim=1)  # keep the input, append new maps
        return x

# Sanity check: one bottleneck conv_block on a 1-channel 32x32 input.
dlk1=dense_block(num_conv_block=1,channels=3)
dlk1.initialize()
X=nd.random_normal(shape=(1,1,32,32)) # NCHW
print(dlk1(X).shape) # expect (1, 4, 32, 32): 1 input channel + 3 new maps


def trans_block(channels):
    """Transition block: BN -> ReLU -> 1x1 conv down to `channels`
    feature maps -> 2x2 stride-2 average pool (halves H and W)."""
    blk = gn.nn.Sequential()
    for layer in (gn.nn.BatchNorm(),
                  gn.nn.Activation("relu"),
                  gn.nn.Conv2D(channels=channels, kernel_size=1),
                  gn.nn.AvgPool2D(pool_size=2, strides=2)):
        blk.add(layer)
    return blk

# Instantiate a 10-channel transition block (shape probe left disabled).
tlk=trans_block(10)
tlk.initialize()
# print(tlk(dlk(X)).shape)


start_channel=64               # channels after the stem convolution
growth_channel=32              # growth rate: maps added per conv_block
block_layer_num=[6,12,24,16]   # DenseNet-121: conv_blocks per dense block
def DenseNet():
    """Build DenseNet-121 (with bottleneck units) for 10 classes:
    stem -> 4 dense blocks separated by transitions (each transition
    halves channels and spatial size) -> BN/ReLU/global-pool/Dense(10)."""
    net=gn.nn.Sequential()
    with net.name_scope():
        # Stem.
        net.add(gn.nn.Conv2D(channels=start_channel,kernel_size=7,padding=3,strides=2))
        net.add(gn.nn.BatchNorm())
        net.add(gn.nn.Activation("relu"))
        net.add(gn.nn.MaxPool2D(pool_size=2,strides=2,padding=1))
        # Dense blocks with interleaved transitions.
        channels=start_channel
        final=len(block_layer_num)-1
        for idx,n_conv in enumerate(block_layer_num):
            net.add(dense_block(num_conv_block=n_conv,channels=growth_channel))
            channels+=growth_channel*n_conv       # running channel count
            if idx!=final:
                channels//=2                       # transition compresses by half
                net.add(trans_block(channels=channels))
        # Classification head.
        net.add(gn.nn.BatchNorm())
        net.add(gn.nn.Activation("relu"))
        net.add(gn.nn.GlobalAvgPool2D())
        net.add(gn.nn.Dense(10))
    return net
ctx=mx.gpu()  # train on GPU — assumes a CUDA device is available; TODO confirm
net=DenseNet()
net.initialize(init=init.Xavier(),ctx=ctx)
# Layer-by-layer output-shape probe (left disabled):
# for layer in net:
#     X=X.as_in_context(ctx)
#     X=layer(X)
#     print(layer.name, 'output shape:\t', X.shape)
'''---讀取數據和預處理---'''
def load_data_fashion_mnist(batch_size, resize=None):
    """Build shuffled train and in-order test DataLoaders for Fashion-MNIST.

    An optional Resize step runs before ToTensor (HWC uint8 -> CHW float).
    Returns the (train_iter, test_iter) pair.
    """
    ops = []
    if resize:
        ops.append(gn.data.vision.transforms.Resize(resize))
    ops.append(gn.data.vision.transforms.ToTensor())
    pipeline = gn.data.vision.transforms.Compose(ops)
    loaders = []
    for training in (True, False):
        dataset = gn.data.vision.FashionMNIST(train=training)
        loaders.append(gn.data.DataLoader(
            dataset.transform_first(pipeline), batch_size, shuffle=training))
    return loaders[0], loaders[1]
batch_size=128
train_iter,test_iter=load_data_fashion_mnist(batch_size,resize=32) # resize to 32: larger images train slowly and can exhaust GPU memory


# 定義準確率
def accuracy(output,label):
    return nd.mean(output.argmax(axis=1)==label).asscalar()

def evaluate_accuracy(data_iter,net):
    """Average of per-batch accuracies of `net` across `data_iter` (on ctx)."""
    total = 0.0
    for data, label in data_iter:
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx).astype('float32')
        total += accuracy(net(data), label)
    return total / len(data_iter)

# Fused softmax + cross-entropy: numerically more stable than computing
# the two separately.
cross_loss=gn.loss.SoftmaxCrossEntropyLoss()
# SGD optimizer; BatchNorm tolerates a relatively large learning rate.
train_step=gn.Trainer(net.collect_params(),'sgd',{"learning_rate":0.2})

# Training loop. (The original defined `lr=0.1` and `n=0` but never used
# them — the effective learning rate is the Trainer's 0.2; both unused
# variables are removed here.)
epochs=20
for epoch in range(epochs):
    train_loss=0
    train_acc=0
    for image,y in train_iter:
        image, y = image.as_in_context(ctx), y.as_in_context(ctx)
        y = y.astype('float32')

        with ag.record():
            output = net(image)
            loss = cross_loss(output, y)
        loss.backward()
        train_step.step(batch_size)
        # Accumulate per-batch mean loss and accuracy for epoch averages.
        train_loss += nd.mean(loss).asscalar()
        train_acc += accuracy(output, y)

    test_acc = evaluate_accuracy(test_iter, net)
    print("Epoch %d, Loss:%f, Train acc:%f, Test acc:%f"
          %(epoch,train_loss/len(train_iter),train_acc/len(train_iter),test_acc))

訓練結果:
在這裏插入圖片描述

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章