Faster RCNN源碼學習六

anchor_target_layer.py

    rpn類在訓練的時候主要有兩個功能,第一個是get_rpn_cls_loss計算的rpn網絡分類loss,第二個是get_rpn_bbox_loss計算的rpn網絡的anchor邊界迴歸loss。那麼,要計算兩個loss,最難的地方是如何去獲得ground truth。這個ground truth的獲得是通過anchor_target_layer函數實現的。源碼如下:

# -*- coding: utf-8 -*-
"""
Created on Sun Jan  1 16:11:17 2017
@author: Kevin Liang (modifications)
Anchor Target Layer: Creates all the anchors in the final convolutional feature
map, assigns anchors to ground truth boxes, and applies labels of "objectness"
Adapted from the official Faster R-CNN repo: 
https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/rpn/anchor_target_layer.py
"""
 
# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick and Sean Bell
# --------------------------------------------------------
 
import sys
sys.path.append('../')
 
import numpy as np
import numpy.random as npr
import tensorflow as tf
 
from Lib.bbox_overlaps import bbox_overlaps
from Lib.bbox_transform import bbox_transform
from Lib.faster_rcnn_config import cfg
from Lib.generate_anchors import generate_anchors
 
#該函數計算每個anchor對應的ground truth(前景/背景,座標偏移值)
def anchor_target_layer(rpn_cls_score, gt_boxes, im_dims, _feat_stride, anchor_scales):
    '''
    TensorFlow wrapper around the NumPy routine _anchor_target_layer_py.

    The five arguments are forwarded unchanged through tf.py_func, which
    is declared to yield four float32 outputs. The labels are then cast to
    int32 and every output is given a stable tensor name.

    Returns:
        (rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights,
        rpn_bbox_outside_weights) as tensors.
    '''
    py_inputs = [rpn_cls_score, gt_boxes, im_dims, _feat_stride, anchor_scales]
    py_output_types = [tf.float32] * 4

    # Run the pure-Python target assignment as a graph op.
    raw_labels, raw_targets, raw_inside, raw_outside = tf.py_func(
        _anchor_target_layer_py, py_inputs, py_output_types)

    # Labels come back as float32; downstream code receives them as int32.
    int_labels = tf.cast(raw_labels, tf.int32)
    rpn_labels = tf.convert_to_tensor(int_labels, name='rpn_labels')
    rpn_bbox_targets = tf.convert_to_tensor(raw_targets, name='rpn_bbox_targets')
    rpn_bbox_inside_weights = tf.convert_to_tensor(
        raw_inside, name='rpn_bbox_inside_weights')
    rpn_bbox_outside_weights = tf.convert_to_tensor(
        raw_outside, name='rpn_bbox_outside_weights')

    return rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights
 
 
def _anchor_target_layer_py(rpn_cls_score, gt_boxes, im_dims, _feat_stride, anchor_scales):
    """
    Assign anchors to ground-truth targets (NumPy implementation).

    Produces per-anchor classification labels and bounding-box regression
    targets for training the RPN.

    Algorithm:
      1. tile the base anchors over every (H, W) cell of the feature map
      2. drop anchors that cross the image boundary
      3. measure the overlap of the remaining anchors with the GT boxes
      4. label anchors fg (1) / bg (0) / ignored (-1) and subsample them
      5. compute regression targets and loss weights, then scatter the
         results back onto the full anchor set

    Args:
        rpn_cls_score: array whose axis 0 is the batch (must be 1) and whose
            axes 1 and 2 are read as the feature-map height and width.
        gt_boxes: (G, 5) ground-truth array; the first four columns are the
            box corners used for the regression targets.
        im_dims: array whose first row is the image (height, width).
        _feat_stride: factor converting feature-map cell indices into image
            pixel offsets.
        anchor_scales: scales forwarded to generate_anchors.

    Returns:
        Tuple of float32 arrays, with A anchors per cell:
        rpn_labels               -- shape (1, 1, A * H, W)
        rpn_bbox_targets         -- shape (1, A * 4, H, W)
        rpn_bbox_inside_weights  -- shape (1, A * 4, H, W)
        rpn_bbox_outside_weights -- shape (1, A * 4, H, W)
    """
    im_dims = im_dims[0]  # (height, width) of the input image
    _anchors = generate_anchors(scales=np.array(anchor_scales))  # base anchors, shape (A, 4)
    _num_anchors = _anchors.shape[0]

    # allow boxes to sit over the edge by a small amount (0 = not at all)
    _allowed_border = 0

    # Only minibatch of 1 supported
    assert rpn_cls_score.shape[0] == 1, \
        'Only single item batches are supported'

    # Feature-map size; the total number of anchors is H * W * A.
    height, width = rpn_cls_score.shape[1:3]

    # 1. Build the grid of anchor offsets in image coordinates.
    shift_x = np.arange(0, width) * _feat_stride  # shape (width,)
    shift_y = np.arange(0, height) * _feat_stride  # shape (height,)
    shift_x, shift_y = np.meshgrid(shift_x, shift_y)  # both (height, width)
    shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
                        shift_x.ravel(), shift_y.ravel())).transpose()  # (height * width, 4)

    # add A anchors (1, A, 4) to
    # cell K shifts (K, 1, 4) to get
    # shift anchors (K, A, 4)
    # reshape to (K*A, 4) shifted anchors
    A = _num_anchors
    K = shifts.shape[0]  # number of feature-map cells
    all_anchors = (_anchors.reshape((1, A, 4)) +
                   shifts.reshape((1, K, 4)).transpose((1, 0, 2)))
    all_anchors = all_anchors.reshape((K * A, 4))
    total_anchors = int(K * A)

    # Indices of the anchors that lie completely inside the image.
    inds_inside = np.where(
        (all_anchors[:, 0] >= -_allowed_border) &
        (all_anchors[:, 1] >= -_allowed_border) &
        (all_anchors[:, 2] < im_dims[1] + _allowed_border) &  # width
        (all_anchors[:, 3] < im_dims[0] + _allowed_border)    # height
    )[0]

    # keep only inside anchors; everything below works on this subset
    anchors = all_anchors[inds_inside, :]

    # label: 1 is positive, 0 is negative, -1 is dont care
    labels = np.empty((len(inds_inside), ), dtype=np.float32)
    labels.fill(-1)

    # overlaps between the anchors and the gt boxes
    # overlaps (ex, gt), shape (len(anchors), len(gt_boxes))
    # np.float64 replaces the np.float alias, which NumPy 1.24 removed.
    overlaps = bbox_overlaps(
        np.ascontiguousarray(anchors, dtype=np.float64),
        np.ascontiguousarray(gt_boxes, dtype=np.float64))
    # For every anchor: index of, and overlap with, its best-matching GT box.
    argmax_overlaps = overlaps.argmax(axis=1)
    max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps]
    # For every GT box: the best overlap achieved by any anchor.
    gt_argmax_overlaps = overlaps.argmax(axis=0)
    gt_max_overlaps = overlaps[gt_argmax_overlaps,
                               np.arange(overlaps.shape[1])]
    # Re-derive the anchor indices so that ties are all kept; the result
    # can therefore hold more than one anchor per GT box.
    # NOTE(review): if a GT box overlaps no inside anchor its column maximum
    # is 0, so every anchor with zero overlap to it matches here -- confirm
    # whether that case needs a guard.
    gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]

    if not cfg.TRAIN.RPN_CLOBBER_POSITIVES:
        # assign bg labels first so that positive labels can clobber them
        labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0

    # fg label: for each gt, anchor with highest overlap
    labels[gt_argmax_overlaps] = 1

    # fg label: above threshold IOU
    labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1

    if cfg.TRAIN.RPN_CLOBBER_POSITIVES:
        # assign bg labels last so that negative labels can clobber positives
        labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0

    # subsample positive labels if we have too many
    num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE)
    fg_inds = np.where(labels == 1)[0]
    if len(fg_inds) > num_fg:
        disable_inds = npr.choice(
            fg_inds, size=(len(fg_inds) - num_fg), replace=False)
        labels[disable_inds] = -1  # surplus positives become "dont care"

    # subsample negative labels if we have too many
    num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1)
    bg_inds = np.where(labels == 0)[0]
    if len(bg_inds) > num_bg:
        disable_inds = npr.choice(
            bg_inds, size=(len(bg_inds) - num_bg), replace=False)
        labels[disable_inds] = -1  # surplus negatives become "dont care"

    # bbox_targets: The deltas (relative to anchors) that Faster R-CNN should
    # try to predict at each anchor, computed against each anchor's
    # best-overlapping GT box.
    # TODO: This "weights" business might be deprecated. Requires investigation
    bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :])

    # Inside weights are non-zero only for foreground anchors.
    bbox_inside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
    bbox_inside_weights[labels == 1, :] = np.array(cfg.TRAIN.RPN_BBOX_INSIDE_WEIGHTS)

    bbox_outside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
    if cfg.TRAIN.RPN_POSITIVE_WEIGHT < 0:
        # uniform weighting of examples (given non-uniform sampling)
        num_examples = np.sum(labels >= 0)
        positive_weights = np.ones((1, 4)) * 1.0 / num_examples
        negative_weights = np.ones((1, 4)) * 1.0 / num_examples
    else:
        # Explicit split: positives share RPN_POSITIVE_WEIGHT, negatives
        # share the remainder.
        assert ((cfg.TRAIN.RPN_POSITIVE_WEIGHT > 0) &
                (cfg.TRAIN.RPN_POSITIVE_WEIGHT < 1))
        positive_weights = (cfg.TRAIN.RPN_POSITIVE_WEIGHT /
                            np.sum(labels == 1))
        negative_weights = ((1.0 - cfg.TRAIN.RPN_POSITIVE_WEIGHT) /
                            np.sum(labels == 0))
    bbox_outside_weights[labels == 1, :] = positive_weights
    bbox_outside_weights[labels == 0, :] = negative_weights

    # map up to original set of anchors; anchors outside the image get
    # label -1 and zero targets/weights
    labels = _unmap(labels, total_anchors, inds_inside, fill=-1)
    bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0)
    bbox_inside_weights = _unmap(bbox_inside_weights, total_anchors, inds_inside, fill=0)
    bbox_outside_weights = _unmap(bbox_outside_weights, total_anchors, inds_inside, fill=0)

    # labels: (1, H, W, A) -> (1, A, H, W) -> (1, 1, A * H, W)
    labels = labels.reshape((1, height, width, A)).transpose(0, 3, 1, 2)
    labels = labels.reshape((1, 1, A * height, width))
    rpn_labels = labels

    # bbox_targets: (1, H, W, A * 4) -> (1, A * 4, H, W)
    rpn_bbox_targets = bbox_targets.reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2)

    # bbox_inside_weights: same layout as the targets
    rpn_bbox_inside_weights = bbox_inside_weights.reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2)

    # bbox_outside_weights: same layout as the targets
    rpn_bbox_outside_weights = bbox_outside_weights.reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2)

    return rpn_labels,rpn_bbox_targets,rpn_bbox_inside_weights,rpn_bbox_outside_weights
    
 
def _unmap(data, count, inds, fill=0):
    """Scatter a subset of rows back into the full-size array.

    Builds a float32 array with ``count`` rows (trailing dimensions copied
    from ``data``), pre-filled with ``fill``, and writes ``data`` into the
    rows selected by ``inds``.
    """
    is_vector = len(data.shape) == 1
    full_shape = (count, ) if is_vector else (count, ) + data.shape[1:]
    ret = np.full(full_shape, fill, dtype=np.float32)
    if is_vector:
        ret[inds] = data
    else:
        ret[inds, :] = data
    return ret
 
def _compute_targets(ex_rois, gt_rois):
    """Return float32 regression targets mapping each anchor to its GT box.

    ``ex_rois`` must have 4 columns and ``gt_rois`` 5 columns, with one
    row per anchor in both; only the first four GT columns are used.
    """
    # Sanity checks on the pairing and the column layout.
    assert ex_rois.shape[0] == gt_rois.shape[0]
    assert ex_rois.shape[1] == 4
    assert gt_rois.shape[1] == 5

    gt_corners = gt_rois[:, :4]
    deltas = bbox_transform(ex_rois, gt_corners)
    return deltas.astype(np.float32, copy=False)

      anchor_target_layer函數主要還是調用了_anchor_target_layer_py函數,然後將輸出轉化爲tensor。下面,我們就來仔細分析一下_anchor_target_layer_py函數。在該函數中,首先通過generate_anchors函數生成了9個候選框,然後按照在共享特徵上每滑動一次對應到原圖的位置生成候選框,即all_anchors。緊接着,排除了所有超出圖像邊界的候選框(只要有任何一條邊越界就會被排除),得到anchors,之後的操作都是針對完全位於圖像內部的anchors。然後,通過bbox_overlaps函數計算了所有邊界內anchor與每個ground truth box之間的IoU值。接着,把最大IoU小於0.3的anchor標爲背景,把最大IoU不小於0.7的anchor以及與每個ground truth box的IoU最大的anchor標爲前景,其餘的anchor被排除(通過將labels對應的值保持爲-1),並且爲訓練安排了合適數量的前景anchor和背景anchor。然後,通過_compute_targets函數計算出了每個anchor對應的座標變換值(tx,ty,tw,th),存在bbox_targets數組裏面。再計算了bbox_inside_weights和bbox_outside_weights,這兩個數組在訓練anchor邊框修正時有重大作用。最後,通過_unmap函數將所有圖像邊框內部的anchor映射回所有的anchor。 

       anchor_target_layer主要就是爲了得到兩個東西,第一個東西是對應的一張圖像生成的anchor的類別,在訓練時需要賦予一定數量的正樣本(前景)和一定數量的負樣本(背景),其餘的需要全部置成-1,表示訓練的時候會忽略掉。第二個東西是對於每一個anchor的邊框修正,在進行邊框修正loss的計算時,只有前景anchor會起作用,可以看到這是bbox_inside_weights和bbox_outside_weights在實現:只有前景anchor的bbox_inside_weights不爲0,背景anchor雖然有bbox_outside_weights,但其bbox_inside_weights爲0。既不是前景也不是背景的anchor(label爲-1)對應的bbox_inside_weights和bbox_outside_weights都爲0。    

      在anchor_target_layer函數中,有幾個比較重要的函數,第一個函數就是generate_anchors,這個函數的主要作用是生成9個anchor,包含3種長寬比和3種面積。源代碼及註釋如下:

# -*- coding: utf-8 -*-
"""
Created on Sun Jan  1 16:11:17 2017
@author: Kevin Liang (modifications)
generate_anchors and supporting functions: generate reference windows (anchors)
for Faster R-CNN. Specifically, it creates a set of k (default of 9) relative 
coordinates. These references will be added on to all positions of the final
convolutional feature maps.
Adapted from the official Faster R-CNN repo: 
https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/rpn/generate_anchors.py
Note: the produced anchors have indices off by 1 of what the comments claim. 
Probably due to MATLAB being 1-indexed, while Python is 0-indexed.
"""
 
# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick and Sean Bell
# --------------------------------------------------------
 
import numpy as np
 
# Verify that we compute the same anchors as Shaoqing's matlab implementation:
#
#    >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat
#    >> anchors
#
#    anchors =
#
#       -83   -39   100    56
#      -175   -87   192   104
#      -359  -183   376   200
#       -55   -55    72    72
#      -119  -119   136   136
#      -247  -247   264   264
#       -35   -79    52    96
#       -79  -167    96   184
#      -167  -343   184   360
 
#array([[ -83.,  -39.,  100.,   56.],
#       [-175.,  -87.,  192.,  104.],
#       [-359., -183.,  376.,  200.],
#       [ -55.,  -55.,   72.,   72.],
#       [-119., -119.,  136.,  136.],
#       [-247., -247.,  264.,  264.],
#       [ -35.,  -79.,   52.,   96.],
#       [ -79., -167.,   96.,  184.],
#       [-167., -343.,  184.,  360.]])
 
def generate_anchors(base_size=16, ratios=[0.5, 1, 2],
                     scales=2**np.arange(3, 6)):
    """
    Build the reference anchors: every aspect ratio crossed with every
    scale, relative to a (0, 0, base_size - 1, base_size - 1) window.

    Anchors are expressed as corner coordinates (x1, y1, x2, y2). The
    result stacks, for each ratio in turn, one row per scale, giving
    len(ratios) * len(scales) rows in total (9 with the defaults).
    """
    # Reference window in corner form, e.g. [0, 0, 15, 15] for base_size=16.
    reference_window = np.array([1, 1, base_size, base_size]) - 1
    # One anchor per aspect ratio.
    per_ratio = _ratio_enum(reference_window, ratios)
    # Expand each ratio anchor across all scales, then stack the groups.
    scaled_groups = []
    for ratio_anchor in per_ratio:
        scaled_groups.append(_scale_enum(ratio_anchor, scales))
    return np.vstack(scaled_groups)
 
def _whctrs(anchor):
    """
    Convert a corner-form anchor (x1, y1, x2, y2) into
    (width, height, x center, y center).
    """
    x_min, y_min = anchor[0], anchor[1]
    x_max, y_max = anchor[2], anchor[3]

    # Corner coordinates are inclusive, hence the +1.
    width = x_max - x_min + 1
    height = y_max - y_min + 1
    center_x = x_min + 0.5 * (width - 1)
    center_y = y_min + 0.5 * (height - 1)
    return width, height, center_x, center_y
 
def _mkanchors(ws, hs, x_ctr, y_ctr):
    """
    Turn vectors of widths (ws) and heights (hs) around a shared center
    (x_ctr, y_ctr) into corner-form anchors, one row (x1, y1, x2, y2) per
    width/height pair.
    """
    # Column vectors holding the distance from the center to each edge.
    half_w = 0.5 * (ws[:, None] - 1)
    half_h = 0.5 * (hs[:, None] - 1)

    corners = (x_ctr - half_w,
               y_ctr - half_h,
               x_ctr + half_w,
               y_ctr + half_h)
    return np.concatenate(corners, axis=1)
 
def _ratio_enum(anchor, ratios):
    """
    Produce one anchor per aspect ratio, each centered on ``anchor`` and
    covering roughly the same area as it. Returns corner-form rows.
    """
    base_w, base_h, ctr_x, ctr_y = _whctrs(anchor)

    # Area divided by each ratio; its square root is the new width.
    # For a 16x16 window and ratios (0.5, 1, 2): widths 23, 16, 11.
    area_per_ratio = (base_w * base_h) / ratios
    widths = np.round(np.sqrt(area_per_ratio))
    # Height follows from width * ratio: 12, 16, 22 in the example above.
    heights = np.round(widths * ratios)

    return _mkanchors(widths, heights, ctr_x, ctr_y)
 
def _scale_enum(anchor, scales):
    """
    Produce one anchor per scale by multiplying the width and height of
    ``anchor`` by each scale while keeping its center. Returns corner-form
    rows.
    """
    base_w, base_h, ctr_x, ctr_y = _whctrs(anchor)
    scaled_widths = base_w * scales
    scaled_heights = base_h * scales
    return _mkanchors(scaled_widths, scaled_heights, ctr_x, ctr_y)
 
if __name__ == '__main__':
    # Ad-hoc check when run as a script: time a single generate_anchors()
    # call with default arguments, then print the elapsed seconds and the
    # resulting anchors.
    import time
    t = time.time()
    a = generate_anchors()
    print(time.time() - t)
    print(a)
    # Drop into an interactive IPython shell (requires IPython).
    from IPython import embed; embed()

      在上面的代碼中,主要的原理就是最開始生成一個基準anchor。然後,通過這個基準anchor生成三個不同長寬比,面積一樣的anchor。最後,對每個長寬比anchor生成三個不同面積尺度的anchor,最終生成9個anchor,詳情請見代碼註釋。

      第二個重要的函數,是bbox_overlaps函數,這個函數對於每一個anchor,和所有的ground truth box計算IoU值,代碼如下: 

# -*- coding: utf-8 -*-
"""
Created on Sun Jan  1 20:25:19 2017
@author: Kevin Liang (modification)
Calculates bounding box overlaps (IoU) between N boxes and K query boxes and returns an (N, K) matrix of overlap ratios. In anchor_target_layer the N boxes are the anchors and the K query boxes are the ground-truth boxes.
Written in Cython for optimization.
"""
# --------------------------------------------------------
# Fast R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Sergey Karayev
# --------------------------------------------------------
 
cimport cython
import numpy as np
cimport numpy as np
 
# np.float was only an alias of the builtin float (a 64-bit double) and was
# removed in NumPy 1.24; name the 64-bit type explicitly instead.
DTYPE = np.float64
ctypedef np.float64_t DTYPE_t

def bbox_overlaps(
        np.ndarray[DTYPE_t, ndim=2] boxes,
        np.ndarray[DTYPE_t, ndim=2] query_boxes):
    """
    Compute the overlap (intersection area / union area) of every box with
    every query box. Only columns 0-3 (x1, y1, x2, y2) of each input are
    read; corner coordinates are treated as inclusive.

    Parameters
    ----------
    boxes: (N, 4) ndarray of float
    query_boxes: (K, 4) ndarray of float
    Returns
    -------
    overlaps: (N, K) ndarray of overlap between boxes and query_boxes
    """
    cdef unsigned int N = boxes.shape[0]
    cdef unsigned int K = query_boxes.shape[0]
    cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE)
    # iw, ih: width and height of the intersection; ua: union area
    cdef DTYPE_t iw, ih, box_area
    cdef DTYPE_t ua
    cdef unsigned int k, n
    for k in range(K):
        # Area of the k-th query box, computed once per column.
        box_area = (
            (query_boxes[k, 2] - query_boxes[k, 0] + 1) *
            (query_boxes[k, 3] - query_boxes[k, 1] + 1)
        )
        for n in range(N):
            iw = (
                min(boxes[n, 2], query_boxes[k, 2]) -
                max(boxes[n, 0], query_boxes[k, 0]) + 1
            )
            if iw > 0:
                ih = (
                    min(boxes[n, 3], query_boxes[k, 3]) -
                    max(boxes[n, 1], query_boxes[k, 1]) + 1
                )
                if ih > 0:
                    # union = area(box) + area(query box) - intersection
                    ua = float(
                        (boxes[n, 2] - boxes[n, 0] + 1) *
                        (boxes[n, 3] - boxes[n, 1] + 1) +
                        box_area - iw * ih
                    )
                    overlaps[n, k] = iw * ih / ua
    # Pairs that do not intersect keep the initial value 0.
    return overlaps

第三個重要的部分是,在計算anchor的座標變換值的時候,使用到了bbox_transform函數,請注意在計算座標變換的時候是將anchor的表示形式變成中心座標與長寬。該函數代碼及註釋如下所示:

# -*- coding: utf-8 -*-
"""
Created on Sun Jan  1 21:18:58 2017
@author: Kevin Liang (modifications)
bbox_transform and its inverse operation
"""
 
# --------------------------------------------------------
# Fast R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick
# --------------------------------------------------------
 
import numpy as np
 
def bbox_transform(ex_rois, gt_rois):
    '''
    Compute the regression deltas that map each box in ex_rois onto the
    box in the same row of gt_rois.

    Both inputs hold corner-form boxes (x1, y1, x2, y2), one per row. The
    boxes are converted to center/size form and the result has one row
    per box with columns (dx, dy, dw, dh), i.e. shape (num_boxes, 4).
    '''
    def _size_and_center(rois):
        # Inclusive corners -> widths, heights and center coordinates.
        widths = rois[:, 2] - rois[:, 0] + 1.0
        heights = rois[:, 3] - rois[:, 1] + 1.0
        ctr_x = rois[:, 0] + 0.5 * widths
        ctr_y = rois[:, 1] + 0.5 * heights
        return widths, heights, ctr_x, ctr_y

    ex_w, ex_h, ex_cx, ex_cy = _size_and_center(ex_rois)
    gt_w, gt_h, gt_cx, gt_cy = _size_and_center(gt_rois)

    # Center offsets are normalised by the anchor size; size changes are
    # expressed as log ratios.
    deltas = (
        (gt_cx - ex_cx) / ex_w,
        (gt_cy - ex_cy) / ex_h,
        np.log(gt_w / ex_w),
        np.log(gt_h / ex_h),
    )
    # Stacked as (4, num_boxes), then transposed to (num_boxes, 4).
    return np.vstack(deltas).T

        到這裏,anchor_target_layer解析就完成了。這是rpn源碼中最重要的函數之一,因爲會返回所有anchor對應的類別和對應的邊框修正值,方便在計算loss時計算。 

RPN代碼中比較巧妙的部分筆者認爲有如下兩個:

(1)如何生成H×W×9個anchor:做法是先生成9個不同長寬比不同面積anchor,然後在圖上各個滑動區域上都生成這9個anchor。

(2)如何計算每個anchor的類別(前景背景)和邊框變換值。做法是首先爲每個anchor計算與ground truth box對應的IoU值,排除IoU爲0.3~0.7的anchor。0.3以下的爲背景anchor,0.7以上的爲前景anchor;此外,與每個ground truth box的IoU最大的anchor也會被標爲前景。對於邊框變化值,是計算的anchor與IoU重合最大的ground truth box對應的tx,ty,tw,th四個值。

 

轉自:https://blog.csdn.net/jiongnima/article/details/79781792 講的非常好,感謝。

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章