SSD代碼解讀（二）——Data Augmentation

本部分代碼是PyTorch版本的實現，而不是官方的Caffe實現；在解讀代碼的同時，會與Caffe實現進行比較。先貼代碼：

import torch
from torchvision import transforms
import cv2
import numpy as np
import random
import math
from utils.box_utils import matrix_iou

def _crop(image, boxes, labels):
    height, width, _ = image.shape

    if len(boxes)== 0:
        return image, boxes, labels

    while True: # caffe中的min_iou多了個1.0
        mode = random.choice((
            None,
            (0.1, None),
            (0.3, None),
            (0.5, None),
            (0.7, None),
            (0.9, None),
            (None, None),
        ))

        if mode is None: #隨到None，直接返回，1/6概率
            return image, boxes, labels

        min_iou, max_iou = mode
        if min_iou is None:
            min_iou = float('-inf')
        if max_iou is None:
            max_iou = float('inf')

        for _ in range(50): #最大重複裁剪50次，直到某次裁剪合格
            # 面積比是scale^2, aspect_ratio是長寬比，從而獲得img_n的w和h
            scale = random.uniform(0.3,1.)
            min_ratio = max(0.5, scale*scale)
            max_ratio = min(2, 1. / scale / scale)
            ratio = math.sqrt(random.uniform(min_ratio, max_ratio))
            w = int(scale * ratio * width)
            h = int((scale / ratio) * height)

            # 隨機生成img_n的左上角點座標，進而獲得img_n的位置，就是roi
            l = random.randrange(width - w)
            t = random.randrange(height - h)
            roi = np.array((l, t, l + w, t + h))

            iou = matrix_iou(boxes, roi[np.newaxis])
            
            # 若不存在任何一個GT與roi的iou大於之前隨機的iou_min，則重新裁剪
            if not (min_iou <= iou.min() and iou.max() <= max_iou):
                continue
            
            # 獲取img_n的像素信息，注意height是第一維
            image_t = image[roi[1]:roi[3], roi[0]:roi[2]]
            
            # 僅保留GT中心在img_n的img_n，若沒有，則重新裁剪
            centers = (boxes[:, :2] + boxes[:, 2:]) / 2
            mask = np.logical_and(roi[:2] < centers, centers < roi[2:]) \
                     .all(axis=1)
            boxes_t = boxes[mask].copy()
            labels_t = labels[mask].copy()
            if len(boxes_t) == 0:
                continue
            # 對GT的座標重新限定，主要是因爲邊界問題
            boxes_t[:, :2] = np.maximum(boxes_t[:, :2], roi[:2])
            boxes_t[:, :2] -= roi[:2]
            boxes_t[:, 2:] = np.minimum(boxes_t[:, 2:], roi[2:])
            boxes_t[:, 2:] -= roi[:2]
            # 返回裁剪後的img，box和label信息
            return image_t, boxes_t,labels_t

# Photometric distortion: brightness and contrast are adjusted on the BGR
# image, hue and saturation in HSV space; each step is applied with
# probability 0.5.
def _distort(image):
    """Return a randomly colour-jittered copy of ``image``.

    The input is copied first and never modified. It is converted with
    cv2.COLOR_BGR2HSV and back, so it is presumably an 8-bit BGR image as
    loaded by cv2 -- confirm against the caller.
    """
    def _scale_shift(arr, alpha=1, beta=0):
        # Compute in float, clamp to [0, 255], write back in place so that
        # channel views passed in are updated too.
        arr[:] = np.clip(arr.astype(float) * alpha + beta, 0, 255)

    out = image.copy()

    # Brightness: additive offset.
    if random.randrange(2):
        _scale_shift(out, beta=random.uniform(-32, 32))

    # Contrast: multiplicative gain.
    if random.randrange(2):
        _scale_shift(out, alpha=random.uniform(0.5, 1.5))

    hsv = cv2.cvtColor(out, cv2.COLOR_BGR2HSV)

    # Hue: integer shift wrapped modulo 180.
    if random.randrange(2):
        shifted_hue = hsv[:, :, 0].astype(int) + random.randint(-18, 18)
        hsv[:, :, 0] = shifted_hue % 180

    # Saturation: multiplicative gain on channel 1.
    if random.randrange(2):
        _scale_shift(hsv[:, :, 1], alpha=random.uniform(0.5, 1.5))

    return cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)


# 擴展圖片，以p的概率，caffe中p=0.5，pytorch中p=0.6
def _expand(image, boxes,fill, p):
    if random.random() > p:
        return image, boxes

    height, width, depth = image.shape
    for _ in range(50): # 最大重複實驗50次
        scale = random.uniform(1,4)

        min_ratio = max(0.5, 1./scale/scale)
        max_ratio = min(2, scale*scale)
        ratio = math.sqrt(random.uniform(min_ratio, max_ratio))
        ws = scale*ratio
        hs = scale/ratio
        if ws < 1 or hs < 1: # 擴展後的長和寬必須都要大於1
            continue
        w = int(ws * width)
        h = int(hs * height)
        
        # 隨機生成左上角的點的座標
        left = random.randint(0, w - width)
        top = random.randint(0, h - height)
        
        # 對GT的座標的調整
        boxes_t = boxes.copy()
        boxes_t[:, :2] += (left, top)
        boxes_t[:, 2:] += (left, top)

        # 擴展後的圖像，和原圖重疊部分原像素填充；其他部分填充均值，因爲後續需要減去均值，所以等價於0填充，即爲黑邊
        expand_image = np.empty(
            (h, w, depth),
            dtype=image.dtype)
        expand_image[:, :] = fill
        expand_image[top:top + height, left:left + width] = image
        image = expand_image

        return image, boxes_t

# 以0.5的概率水平翻轉,返回處理後的圖片和GT信息
def _mirror(image, boxes):
    _, width, _ = image.shape
    if random.randrange(2):
        image = image[:, ::-1]
        boxes = boxes.copy()
        boxes[:, 0::2] = width - boxes[:, 2::-2]
    return image, boxes

# Resize with a randomly chosen interpolation method, subtract the mean and
# move the channel axis to the front.
def preproc_for_test(image, insize, mean):
    """Resize ``image`` to ``insize`` x ``insize`` and return it as float32 CHW.

    Args:
        image: image array accepted by cv2.resize, with three axes.
        insize: side length of the square output.
        mean: value subtracted in place from the float32 image.

    Returns:
        The mean-subtracted image with axes reordered by transpose(2, 0, 1).
    """
    candidates = (
        cv2.INTER_LINEAR,
        cv2.INTER_CUBIC,
        cv2.INTER_AREA,
        cv2.INTER_NEAREST,
        cv2.INTER_LANCZOS4,
    )
    chosen = candidates[random.randrange(5)]
    resized = cv2.resize(image, (insize, insize), interpolation=chosen)
    resized = resized.astype(np.float32)
    # In-place subtraction keeps the array float32 whatever dtype mean has.
    resized -= mean
    return resized.transpose(2, 0, 1)

# Training-time data augmentation pipeline.
class preproc(object):
    """Callable that augments one training sample.

    Args:
        resize: side length handed to ``preproc_for_test``.
        rgb_means: mean subtracted from the image; also used as the fill
            value when expanding.
        p: probability handed to ``_expand``.
    """

    def __init__(self, resize, rgb_means, p):
        self.means = rgb_means
        self.resize = resize
        self.p = p

    def __call__(self, image, targets):
        """Augment ``image`` and its ``targets``.

        ``targets`` has shape (n, 5): one row (x1, y1, x2, y2, label) per
        object, in pixels of the original image. Returns the image as a
        torch tensor together with an array of targets whose box coordinates
        are divided by the image width/height.
        """
        boxes = targets[:, :-1].copy()
        labels = targets[:, -1].copy()

        # No ground truth: only resize and subtract the mean, and return a
        # single all-zero target row.
        if len(boxes) == 0:
            placeholder = np.zeros((1, 5))
            resized = preproc_for_test(image, self.resize, self.means)
            return torch.from_numpy(resized), placeholder

        # Backup of the untouched sample with normalised boxes, used when
        # augmentation leaves no usable box.
        image_o = image.copy()
        targets_o = targets.copy()
        height_o, width_o, _ = image_o.shape
        boxes_o = targets_o[:, :-1]
        boxes_o[:, 0::2] /= width_o
        boxes_o[:, 1::2] /= height_o
        targets_o = np.hstack((boxes_o, targets_o[:, -1][:, np.newaxis]))

        # Augmentation chain: crop, colour distortion, expand, mirror.
        image_t, boxes, labels = _crop(image, boxes, labels)
        image_t = _distort(image_t)
        image_t, boxes = _expand(image_t, boxes, self.means, self.p)
        image_t, boxes = _mirror(image_t, boxes)

        # Record the augmented size before resizing: boxes are not resized,
        # they are expressed as fractions of the augmented image instead.
        height, width, _ = image_t.shape
        image_t = preproc_for_test(image_t, self.resize, self.means)

        boxes = boxes.copy()
        boxes[:, 0::2] /= width
        boxes[:, 1::2] /= height

        # Discard boxes whose shorter side is at most 1% of the image.
        box_w = boxes[:, 2] - boxes[:, 0]
        box_h = boxes[:, 3] - boxes[:, 1]
        keep = np.minimum(box_w, box_h) > 0.01
        boxes_t = boxes[keep]
        labels_t = labels[keep].copy()

        # Nothing usable survived: fall back to the backed-up original.
        if len(boxes_t) == 0:
            image = preproc_for_test(image_o, self.resize, self.means)
            return torch.from_numpy(image), targets_o

        targets_t = np.hstack((boxes_t, labels_t[:, np.newaxis]))
        return torch.from_numpy(image_t), targets_t



class BaseTransform(object):
    """Deterministic preprocessing applied to test/validation images.

    Pipeline: resize -> float32 -> mean subtraction -> axis reordering ->
    tensor.

    Arguments:
        resize (int): side length of the square network input.
        rgb_means ((int,int,int)): per-channel mean subtracted from the
            image, e.g. (104,117,123).
        swap ((int,int,int)): axis order passed to ``transpose``; the
            default (2, 0, 1) moves channels first.
    Returns:
        transform (transform): callable to be applied to test/val data.
    """
    def __init__(self, resize, rgb_means, swap=(2, 0, 1)):
        self.means = rgb_means
        self.resize = resize
        self.swap = swap

    # The input is assumed to be a cv2 image for now; it is passed through
    # np.array() first.
    def __call__(self, img):
        target_size = (self.resize, self.resize)
        # Linear interpolation is always used here.
        resized = cv2.resize(np.array(img), target_size,
                             interpolation=cv2.INTER_LINEAR)
        resized = resized.astype(np.float32)
        resized -= self.means
        return torch.from_numpy(resized.transpose(self.swap))

整個Aug的流程是：crop（隨機裁剪）→ distort（亮度、對比度、色相、飽和度擾動）→ expand（隨機擴展）→ mirror（水平翻轉）→ resize並減去均值。

crop的操作最爲複雜：

1. 首先隨機選取iou_min;

2. 隨機選擇scale(<1)，進行面積上的縮放，再隨機出aspect_ratio，進行長寬比縮放，進一步隨機出crop區域的左上角座標，從而確定crop區域；

3. 判斷所有GT與crop區域的iou是否都不小於iou_min（代碼中比較的是iou.min()），若不滿足則重新採樣，最多重複實驗50次；

4. 滿足3後，再篩選出中心點落在crop區域內的GT；若一個都沒有，則重新採樣，最多重複實驗50次；

5. 針對crop區域，修改GT信息，主要是邊界的調整。

expand的操作如下：

1. 以一定的概率進行擴展；

2. 隨機生成scale(>1)和aspect_ratio, 判斷擴展後的width和height是否都大於未擴展前的，若不滿足，最大重複實驗50次；

3. 隨機生成擴展圖像的左上角座標，並修改GT信息，主要是進行一個平移；

4. 擴展後的圖像像素填充。未擴展前圖像的那部分原像素填充，其他部分則均值填充，因爲後續還需要減去均值，所以等價於0值填充。

SSD代碼解讀（二）——Data Augmentation

SSD代碼解讀（一）——Prior Box Layer

SSD代碼解讀（二）——Data Augmentation

VALSE2019小記

記錄C++刷Leetcode

SSD代碼解讀（三）——MultiboxLoss

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結