深度學習系列(十) 計算機視覺之目標檢測(object detection)2020.6.29

前言

本節學習目標檢測
圖像裏有多個我們感興趣的目標,我們不僅想知道它們的類別,還想得到它們在圖像中的具體位置

  • 邊界框
  • 錨框
  • 多尺度
  • SSD

1、邊界框

通常使用邊界框(bounding box)來描述目標位置
可以由矩形左上角的x和y軸座標與右下角的x和y軸座標確定
舉個例子
在這裏插入圖片描述

2、錨框

目標檢測算法通常會在輸入圖像中採樣大量的區域,然後判斷這些區域中是否包含我們感興趣的
目標,並調整區域邊緣從而更準確地預測目標的真實邊界框(ground-truth bounding box)

錨框

  • 以每個像素爲中心生成多個大小和寬高比(aspect ratio)不同的邊界框
  • 用Jaccard係數(Jaccard index)衡量錨框和真實邊界框的相似度
    在這裏插入圖片描述
import d2lzh as d2l
from mxnet import contrib, gluon, image, nd
import numpy as np
"""實現錨框"""
np.set_printoptions(2)
# Data: demo image; h and w are its pixel height and width.
img = image.imread('../img/catdog.jpg').asnumpy()
h, w = img.shape[0:2]
X = nd.random.uniform(shape=(1, 3, h, w))  # dummy NCHW input matching the image size
Y = contrib.nd.MultiBoxPrior(X, sizes=[0.75, 0.5, 0.25], ratios=[1, 2, 0.5])  # anchors from an input, a set of sizes and a set of aspect ratios
print(Y.shape)  # (batch size, number of anchors, 4); 5 anchors per pixel here (sizes + ratios - 1)
# Drawing helper
def show_bboxes(axes, bboxes, labels=None, colors=None):
    """Draw bounding boxes on matplotlib axes, optionally labelling each box."""
    def _as_list(value, fallback=None):
        # Normalize: None -> fallback, bare scalar -> one-element list.
        if value is None:
            return fallback
        if not isinstance(value, (list, tuple)):
            return [value]
        return value

    labels = _as_list(labels)
    colors = _as_list(colors, ['b', 'g', 'r', 'm', 'c'])
    for idx, bbox in enumerate(bboxes):
        color = colors[idx % len(colors)]
        rect = d2l.bbox_to_rect(bbox.asnumpy(), color)
        axes.add_patch(rect)
        if labels and len(labels) > idx:
            # White text on colored boxes, black text on white boxes.
            text_color = 'k' if color == 'w' else 'w'
            axes.text(rect.xy[0], rect.xy[1], labels[idx],
                      va='center', ha='center', fontsize=9, color=text_color,
                      bbox=dict(facecolor=color, lw=0))
# Plot all anchors centered at pixel (250, 250).
d2l.set_figsize()
# Fix: `boxes` was used below without ever being defined. Y has shape
# (1, h*w*5, 4); reshaping to (h, w, 5, 4) lets us index anchors by
# their center pixel.
boxes = Y.reshape((h, w, 5, 4))
bbox_scale = nd.array((w, h, w, h))  # anchor coords are normalized; scale back to pixels
fig = d2l.plt.imshow(img)
show_bboxes(fig.axes, boxes[250, 250, :, :] * bbox_scale,
            ['s=0.75, r=1', 's=0.5, r=1', 's=0.25, r=1', 's=0.75, r=2',
             's=0.75, r=0.5'])

在這裏插入圖片描述

3、多尺度目標檢測

  • 在輸入圖像中均勻採樣一小部分像素,並以採樣的像素爲中心生成錨框
  • 在不同尺度下,可以生成不同數量和不同大小的錨框
import d2lzh as d2l
from mxnet import contrib, image, nd
"""實現多尺度目標檢測"""
# Data
img = image.imread('../img/catdog.jpg')
h, w = img.shape[0:2]
d2l.set_figsize()
# Multi-scale: draw anchors centered on every unit of a small feature map.
def display_anchors(fmap_w, fmap_h, s):
    # NCHW layout puts height before width. The original passed
    # (fmap_w, fmap_h), which transposes the center grid whenever
    # fmap_w != fmap_h; the calls below use square maps so output
    # is unchanged, but this form is correct in general.
    fmap = nd.zeros((1, 10, fmap_h, fmap_w))  # batch/channel dims don't affect the anchors
    anchors = contrib.nd.MultiBoxPrior(fmap, sizes=s, ratios=[1, 2, 0.5])
    bbox_scale = nd.array((w, h, w, h))  # scale normalized coords back to pixels
    d2l.show_bboxes(d2l.plt.imshow(img.asnumpy()).axes,
                    anchors[0] * bbox_scale)
display_anchors(fmap_w=4, fmap_h=4, s=[0.15])  # small objects, dense centers
display_anchors(fmap_w=1, fmap_h=1, s=[0.8])   # large object, single center

小目標
在這裏插入圖片描述
放大
在這裏插入圖片描述

4、單發多框檢測(SSD)

  • 由一個基礎網絡塊和若干個多尺度特徵塊串聯而成
  • 基礎網絡塊用來從原始圖像中抽取特徵,因此一般會選擇常用的深度卷積神經網絡
  • 每個多尺度特徵塊將上一層提供的特徵圖的高和寬縮小(如減半),並使特徵圖中每個單元在輸入圖像上的感受野變得更廣闊

在這裏插入圖片描述
一個實現

import d2lzh as d2l
from mxnet import autograd, contrib, gluon, image, init, nd
from mxnet.gluon import loss as gloss, nn
import time
"""實現SSD"""
# Class prediction layer
def cls_predictor(num_anchors, num_classes):
    """3x3 conv emitting (num_classes + 1) scores per anchor (+1 for background)."""
    channels = num_anchors * (num_classes + 1)
    return nn.Conv2D(channels, kernel_size=3, padding=1)
# Bounding-box prediction layer
def bbox_predictor(num_anchors):
    """3x3 conv emitting 4 offset values per anchor."""
    return nn.Conv2D(num_anchors * 4, kernel_size=3, padding=1)
# Concatenating multi-scale predictions
def forward(x, block):
    """Lazily initialize `block`'s parameters, then run one forward pass on x."""
    block.initialize()
    out = block(x)
    return out
def flatten_pred(pred):
    """Move channels to the last axis and flatten to (batch, -1) so different scales can be concatenated."""
    channels_last = pred.transpose((0, 2, 3, 1))
    return channels_last.flatten()
def concat_preds(preds):
    """Flatten each per-scale prediction and concatenate them along dim 1."""
    flat = [flatten_pred(p) for p in preds]
    return nd.concat(*flat, dim=1)
# Height/width halving block
def down_sample_blk(num_channels):
    """Two conv-BN-ReLU layers followed by 2x2 max pooling, halving height and width."""
    blk = nn.Sequential()
    for _ in (0, 1):
        blk.add(nn.Conv2D(num_channels, kernel_size=3, padding=1))
        blk.add(nn.BatchNorm(in_channels=num_channels))
        blk.add(nn.Activation('relu'))
    blk.add(nn.MaxPool2D(2))
    return blk
# Base network block
def base_net():
    """Feature extractor: three halving blocks with 16, 32 and 64 channels."""
    net = nn.Sequential()
    for channels in (16, 32, 64):
        net.add(down_sample_blk(channels))
    return net
# Complete model
def get_blk(i):
    """Return the i-th stage: base net (0), global pooling (4), or a halving block (1-3)."""
    if i == 0:
        return base_net()
    if i == 4:
        return nn.GlobalMaxPool2D()
    return down_sample_blk(128)
# Forward computation of one stage
def blk_forward(X, blk, size, ratio, cls_predictor, bbox_predictor):
    """Run one stage: return its feature map, the anchors on it, and class/offset predictions."""
    Y = blk(X)
    anchors = contrib.ndarray.MultiBoxPrior(Y, sizes=size, ratios=ratio)
    return (Y, anchors, cls_predictor(Y), bbox_predictor(Y))
# Anchor sizes per scale (two sizes each) and aspect ratios (same set at every scale).
sizes = [[0.2, 0.272], [0.37, 0.447], [0.54, 0.619], [0.71, 0.79],
         [0.88, 0.961]]
ratios = [[1, 2, 0.5]] * 5
# Anchors generated per feature-map unit: sizes + ratios - 1 (= 4 here).
num_anchors = len(sizes[0]) + len(ratios[0]) - 1
# TinySSD
class TinySSD(nn.Block):
    """Tiny single-shot detector: 5 stages, each producing anchors plus class/offset predictions."""

    def __init__(self, num_classes, **kwargs):
        super(TinySSD, self).__init__(**kwargs)
        self.num_classes = num_classes
        # Register blk_i / cls_i / bbox_i as attributes so Gluon
        # collects their parameters automatically.
        for stage in range(5):
            setattr(self, 'blk_%d' % stage, get_blk(stage))
            setattr(self, 'cls_%d' % stage,
                    cls_predictor(num_anchors, num_classes))
            setattr(self, 'bbox_%d' % stage, bbox_predictor(num_anchors))

    def forward(self, X):
        anchors, cls_preds, bbox_preds = [], [], []
        for stage in range(5):
            # getattr(self, 'blk_%d' % stage) reads back self.blk_<stage>.
            X, a, c, b = blk_forward(
                X, getattr(self, 'blk_%d' % stage), sizes[stage],
                ratios[stage], getattr(self, 'cls_%d' % stage),
                getattr(self, 'bbox_%d' % stage))
            anchors.append(a)
            cls_preds.append(c)
            bbox_preds.append(b)
        # In MXNet reshape, a 0 keeps that dimension (the batch size) unchanged.
        return (nd.concat(*anchors, dim=1),
                concat_preds(cls_preds).reshape(
                    (0, -1, self.num_classes + 1)),
                concat_preds(bbox_preds))
"""訓練"""
# 數據
batch_size = 32
train_iter, _ = d2l.load_data_pikachu(batch_size)
# 初始化
ctx, net = d2l.try_gpu(), TinySSD(num_classes=1)
net.initialize(init=init.Xavier(), ctx=ctx)
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.2, 'wd': 5e-4})
# 損失函數
cls_loss = gloss.SoftmaxCrossEntropyLoss()
bbox_loss = gloss.L1Loss() #掩碼變量
def calc_loss(cls_preds, cls_labels, bbox_preds, bbox_labels, bbox_masks):
    """Per-example total loss: class cross-entropy plus mask-weighted L1 offset loss."""
    class_l = cls_loss(cls_preds, cls_labels)
    offset_l = bbox_loss(bbox_preds * bbox_masks, bbox_labels * bbox_masks)
    return class_l + offset_l
# Accuracy and mean absolute error metrics
def cls_eval(cls_preds, cls_labels):
    """Count anchors whose predicted class matches the label."""
    # Class scores sit in the last dimension, so argmax over axis -1.
    predicted = cls_preds.argmax(axis=-1)
    return (predicted == cls_labels).sum().asscalar()
def bbox_eval(bbox_preds, bbox_labels, bbox_masks):
    """Sum of absolute offset errors, with negative anchors masked out."""
    masked_err = (bbox_labels - bbox_preds) * bbox_masks
    return masked_err.abs().sum().asscalar()
# Training loop: 20 epochs; report error/MAE every 5 epochs.
for epoch in range(20):
    acc_sum, mae_sum, n, m = 0.0, 0.0, 0, 0
    train_iter.reset()  # re-read the dataset from the start each epoch
    start = time.time()
    for batch in train_iter:
        X = batch.data[0].as_in_context(ctx)
        Y = batch.label[0].as_in_context(ctx)
        with autograd.record():
            # Generate multi-scale anchors; predict class and offset per anchor
            anchors, cls_preds, bbox_preds = net(X)
            # Label the class and offset of each anchor against ground truth
            bbox_labels, bbox_masks, cls_labels = contrib.nd.MultiBoxTarget(anchors, Y, cls_preds.transpose((0, 2, 1)))
            # Loss from the predicted vs. labelled classes and offsets
            l = calc_loss(cls_preds, cls_labels, bbox_preds, bbox_labels, bbox_masks)
        l.backward()
        trainer.step(batch_size)
        acc_sum += cls_eval(cls_preds, cls_labels)
        n += cls_labels.size
        mae_sum += bbox_eval(bbox_preds, bbox_labels, bbox_masks)
        m += bbox_labels.size
    if (epoch + 1) % 5 == 0:
        print('epoch %2d, class err %.2e, bbox mae %.2e, time %.1f sec' % (
            epoch + 1, 1 - acc_sum / n, mae_sum / m, time.time() - start))
# Prediction: load a test image, resize to 256x256, convert to an NCHW batch of 1
img = image.imread('../img/pikachu.jpg')
feature = image.imresize(img, 256, 256).astype('float32')
X = feature.transpose((2, 0, 1)).expand_dims(axis=0)
def predict(X):
    """Run detection on batch X; return rows (class, score, 4 coords), dropping class -1 (removed by NMS/background)."""
    anchors, cls_preds, bbox_preds = net(X.as_in_context(ctx))
    cls_probs = cls_preds.softmax().transpose((0, 2, 1))
    output = contrib.nd.MultiBoxDetection(cls_probs, bbox_preds, anchors)
    keep = [i for i, row in enumerate(output[0]) if row[0].asscalar() != -1]
    return output[0, keep]
output = predict(X)
# Keep only boxes with confidence >= 0.3 as the final output (filtered in display below)
d2l.set_figsize((5, 5))
def display(img, output, threshold):
    """Draw every detection whose confidence is at least `threshold` on the image."""
    fig = d2l.plt.imshow(img.asnumpy())
    for row in output:
        score = row[1].asscalar()
        if score >= threshold:
            h, w = img.shape[0:2]
            # Coordinates are normalized; scale back to pixel units.
            bbox = [row[2:6] * nd.array((w, h, w, h), ctx=row.context)]
            d2l.show_bboxes(fig.axes, bbox, '%.2f' % score, 'w')
display(img, output, threshold=0.3)

在這裏插入圖片描述

結語

大概瞭解了目標檢測的原理
嘗試了SSD的實現
裏面還有好些細節沒有搞明白
留待後面回頭再看

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章