Code walkthrough of proposal_layer.py and proposal_target_layer.py

proposal_layer.py

 

# --------------------------------------------------------
# Faster R-CNN
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick and Xinlei Chen
# --------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
import numpy as np
from model.config import cfg
from model.bbox_transform import bbox_transform_inv, clip_boxes, bbox_transform_inv_tf, clip_boxes_tf
from model.nms_wrapper import nms
'''
What is actually used is proposal_layer_tf, which is called from the network code. Every function whose name ends in _layer is invoked from within the network.
'''
def proposal_layer(rpn_cls_prob, rpn_bbox_pred, im_info, cfg_key, _feat_stride, anchors, num_anchors):
  """A simplified version compared to fast/er RCNN
     For details please see the technical report
  """
  #Fetch the RPN parameters preset in the config:
  #the cap on proposals before NMS (12000), the cap after NMS (300), and the NMS threshold
  if type(cfg_key) == bytes:
      cfg_key = cfg_key.decode('utf-8')
#Number of top scoring boxes to keep before apply NMS to RPN proposals 12000
  pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N
#Number of top scoring boxes to keep after applying NMS to RPN proposals 300
  post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
# NMS threshold used on RPN proposals 0.7
  nms_thresh = cfg[cfg_key].RPN_NMS_THRESH

  # Get the scores and bounding boxes
  #From the inputs, take the classification probabilities and the bbox predictions
  scores = rpn_cls_prob[:, :, :, num_anchors:]
  rpn_bbox_pred = rpn_bbox_pred.reshape((-1, 4))
  scores = scores.reshape((-1, 1))
  proposals = bbox_transform_inv(anchors, rpn_bbox_pred)
  proposals = clip_boxes(proposals, im_info[:2])

  # Pick the top region proposals
  #Sort the scores in descending order; `order` holds the pre-sort indices of the sorted entries
  order = scores.ravel().argsort()[::-1]
  #Keep only the top RPN_PRE_NMS_TOP_N proposals, as configured
  if pre_nms_topN > 0:
    order = order[:pre_nms_topN]
  proposals = proposals[order, :]
  #Slice the corresponding scores out of `scores` using the `order` indices
  scores = scores[order]

  # Non-maximal suppression
  #Apply non-maximum suppression
  keep = nms(np.hstack((proposals, scores)), nms_thresh)

  # Pick the top region proposals after NMS
  #Keep only the first RPN_POST_NMS_TOP_N proposals after NMS,
  #as set by the post-NMS cap parameter RPN_POST_NMS_TOP_N
  if post_nms_topN > 0:
    keep = keep[:post_nms_topN]
  proposals = proposals[keep, :]
  scores = scores[keep]

  # Only support single image as input
  #Create an all-zero array of batch indices
  batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32)
  #Build the blob for the single image: [batch_ind, x1, y1, x2, y2]
  blob = np.hstack((batch_inds, proposals.astype(np.float32, copy=False)))

  return blob, scores
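
# --- Illustrative sketch (not part of the original file) ---
# bbox_transform_inv above applies the standard Faster R-CNN box decoding:
# each predicted delta (dx, dy, dw, dh) shifts an anchor's centre and rescales
# its width/height. A minimal numpy sketch of that decoding, assuming the usual
# parameterization (the repo's own implementation may differ in the +1
# width/height convention):
def _decode_boxes_sketch(anchors, deltas):
  widths = anchors[:, 2] - anchors[:, 0] + 1.0
  heights = anchors[:, 3] - anchors[:, 1] + 1.0
  ctr_x = anchors[:, 0] + 0.5 * widths
  ctr_y = anchors[:, 1] + 0.5 * heights
  dx, dy, dw, dh = deltas[:, 0], deltas[:, 1], deltas[:, 2], deltas[:, 3]
  pred_ctr_x = dx * widths + ctr_x          # shift the centre
  pred_ctr_y = dy * heights + ctr_y
  pred_w = np.exp(dw) * widths              # rescale the size
  pred_h = np.exp(dh) * heights
  return np.stack([pred_ctr_x - 0.5 * pred_w,   # back to (x1, y1, x2, y2)
                   pred_ctr_y - 0.5 * pred_h,
                   pred_ctr_x + 0.5 * pred_w,
                   pred_ctr_y + 0.5 * pred_h], axis=1)
# clip_boxes then clamps x-coordinates to [0, im_width - 1] and y-coordinates
# to [0, im_height - 1] so that every proposal lies inside the image.
# --- end sketch ---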


def proposal_layer_tf(rpn_cls_prob, rpn_bbox_pred, im_info, cfg_key, _feat_stride, anchors, num_anchors):
#This is the version that is actually called. It is slightly leaner than the one above; part of the functionality is handled elsewhere.
  if type(cfg_key) == bytes:
    cfg_key = cfg_key.decode('utf-8')
  pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N
  post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
  nms_thresh = cfg[cfg_key].RPN_NMS_THRESH

  # Get the scores and bounding boxes
  scores = rpn_cls_prob[:, :, :, num_anchors:]
  scores = tf.reshape(scores, shape=(-1,))
  rpn_bbox_pred = tf.reshape(rpn_bbox_pred, shape=(-1, 4))

  proposals = bbox_transform_inv_tf(anchors, rpn_bbox_pred)
  proposals = clip_boxes_tf(proposals, im_info[:2])

  # Non-maximal suppression
  indices = tf.image.non_max_suppression(proposals, scores, max_output_size=post_nms_topN, iou_threshold=nms_thresh)

  boxes = tf.gather(proposals, indices)
  boxes = tf.to_float(boxes)
  scores = tf.gather(scores, indices)
  scores = tf.reshape(scores, shape=(-1, 1))

  # Only support single image as input
  batch_inds = tf.zeros((tf.shape(indices)[0], 1), dtype=tf.float32)
  blob = tf.concat([batch_inds, boxes], 1)

  return blob, scores
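
For reference, the TF path above leans entirely on tf.image.non_max_suppression: it takes the decoded boxes and their scores, greedily keeps the highest-scoring boxes while discarding any box whose IoU with an already-kept box exceeds the threshold, and returns the indices of the survivors, which proposal_layer_tf then gathers. The explicit pre-NMS top-N truncation of the numpy version is dropped; max_output_size plays the role of post_nms_topN. The snippet below is a standalone TF1-style illustration with made-up boxes and scores, not code from the repository.

import tensorflow as tf

# Two heavily overlapping boxes plus one distant box, with descending scores.
boxes = tf.constant([[0., 0., 10., 10.],
                     [1., 1., 11., 11.],
                     [20., 20., 30., 30.]])
scores = tf.constant([0.9, 0.8, 0.7])

# Keep at most 2 boxes; suppress anything with IoU > 0.5 against a kept box.
keep = tf.image.non_max_suppression(boxes, scores,
                                    max_output_size=2, iou_threshold=0.5)
kept_boxes = tf.gather(boxes, keep)

with tf.Session() as sess:
  print(sess.run(keep))        # [0 2]: the second box is suppressed by the first
  print(sess.run(kept_boxes))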


proposal_target_layer.py

 

# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick, Sean Bell and Xinlei Chen
# --------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import numpy.random as npr
from model.config import cfg
from model.bbox_transform import bbox_transform
from utils.cython_bbox import bbox_overlaps

'''

'''


def proposal_target_layer(rpn_rois, rpn_scores, gt_boxes, _num_classes):
  """
  Assign object detection proposals to ground-truth targets. Produces proposal
  classification labels and bounding-box regression targets.
  Given the ground truth, assign classification labels to the RPN proposals and compute the regression offsets.
  """

  # Proposal ROIs (0, x1, y1, x2, y2) coming from RPN
  # (i.e., rpn.proposal_layer.ProposalLayer), or any other source
  all_rois = rpn_rois
  all_scores = rpn_scores

  # Include ground-truth boxes in the set of candidate rois
  if cfg.TRAIN.USE_GT:  #In the config this flag is False, so this block should not execute,
  #but it is still worth reading to see what it would do.
    #Create a zero column with the same number of rows as the GT set
    zeros = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype)
    #vstack stacks arrays vertically (row-wise); hstack stacks them horizontally (column-wise).
    #First prepend the zero column to the GT coordinates, then stack the result under the RoIs.
    all_rois = np.vstack(
      (all_rois, np.hstack((zeros, gt_boxes[:, :-1])))
    )
    # not sure if it is a wise appending, but anyway i am not using it
    #The scores are extended in the same way.
    all_scores = np.vstack((all_scores, zeros))
  #TRAIN.BATCH_SIZE is the number of regions of interest in a minibatch.
  #rois_per_image is the per-image RoI budget.
  #The same kind of cap appears elsewhere under different names; it is simply a limit parameter.
  num_images = 1
  rois_per_image = cfg.TRAIN.BATCH_SIZE / num_images
  #cfg.TRAIN.FG_FRACTION determines how many RoIs in each per-image batch may be foreground;
  #for example, with rois_per_image=100 at most 100*0.25=25 foreground RoIs are selected.
  fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image)

  # Sample rois with classification labels and bounding box regression
  # targets
  #_sample_rois randomly samples each image's batch according to the config settings
  labels, rois, roi_scores, bbox_targets, bbox_inside_weights = _sample_rois(
    all_rois, all_scores, gt_boxes, fg_rois_per_image,
    rois_per_image, _num_classes)
  #Reshape for the operations that follow.
  #Weights: every RoI contributes to the loss, and all RoIs are weighted equally.
  rois = rois.reshape(-1, 5)
  roi_scores = roi_scores.reshape(-1)
  labels = labels.reshape(-1, 1)
  bbox_targets = bbox_targets.reshape(-1, _num_classes * 4)
  bbox_inside_weights = bbox_inside_weights.reshape(-1, _num_classes * 4)
  bbox_outside_weights = np.array(bbox_inside_weights > 0).astype(np.float32)

  return rois, roi_scores, labels, bbox_targets, bbox_inside_weights, bbox_outside_weights
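
# --- Illustrative sketch (not part of the original file) ---
# The sampling quotas above, worked through with assumed config values
# (the actual defaults live in model/config.py):
_batch_size_example = 128     # assumed value for cfg.TRAIN.BATCH_SIZE
_fg_fraction_example = 0.25   # assumed value for cfg.TRAIN.FG_FRACTION
_rois_example = _batch_size_example / 1                        # 128 RoIs sampled per image
_fg_example = np.round(_fg_fraction_example * _rois_example)   # at most 32 may be foreground
# bbox_outside_weights is simply a 0/1 mask over the 4*K regression slots:
# it is 1 exactly where bbox_inside_weights is positive, so only the slot that
# belongs to each sampled RoI's own class contributes to the regression loss.
# --- end sketch ---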


def _get_bbox_regression_labels(bbox_target_data, num_classes):
  """Bounding-box regression targets (bbox_target_data) are stored in a
  compact form N x (class, tx, ty, tw, th)

  This function expands those targets into the 4-of-4*K representation used
  by the network (i.e. only one class has non-zero targets).

  Returns:
      bbox_target (ndarray): N x 4K blob of regression targets
      bbox_inside_weights (ndarray): N x 4K blob of loss weights
	  
  bbox_target_data is N x (class, tx, ty, tw, th).
  An all-zero array is allocated for bbox_targets, along with an all-zero
  weight array; the loop below fills in the slot that belongs to each box's class.
  """
  
  clss = bbox_target_data[:, 0]
  bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32)
  bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32)
  #Find the boxes whose class label is > 0, i.e. the foreground boxes
  inds = np.where(clss > 0)[0]
  #For each foreground box
  for ind in inds:
    cls = clss[ind]  #extract the class label
    start = int(4 * cls)  #start = 4 * class label; e.g. with cls=1, start=4 and end=8
    end = start + 4
    #With cls=2 the slot is 8:12. Each row is a 4*K vector: class 1's four targets
    #occupy columns 4:8, class 2's occupy 8:12, and class 0 (background) would be 0:4.
    #e.g. bbox_targets[ind, 4:8] = bbox_target_data[ind, 1:]
    bbox_targets[ind, start:end] = bbox_target_data[ind, 1:]
    bbox_inside_weights[ind, start:end] = cfg.TRAIN.BBOX_INSIDE_WEIGHTS
    #In other words, each box's four offsets are written into a class-specific slot of
    #a single 4*K vector; where the slot sits is determined by the box's class.
  return bbox_targets, bbox_inside_weights  #the weights are returned as well
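
# --- Illustrative sketch (not part of the original file) ---
# A tiny worked example of the expansion above, with num_classes = 3
# (background + 2 foreground classes) and made-up target values.
# The row [2, 0.1, 0.2, 0.3, 0.4] belongs to class 2, so its four targets land
# in columns 4*2 : 4*2+4 = 8:12 of a length-12 row, and the inside weights are
# set only on that slot.
_example_row = np.array([[2, 0.1, 0.2, 0.3, 0.4]], dtype=np.float32)
_t, _w = _get_bbox_regression_labels(_example_row, num_classes=3)
# _t -> [[0 0 0 0  0 0 0 0  0.1 0.2 0.3 0.4]]
# _w -> zeros everywhere except columns 8:12, which hold cfg.TRAIN.BBOX_INSIDE_WEIGHTS
# --- end sketch ---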


def _compute_targets(ex_rois, gt_rois, labels):
  """Compute bounding-box regression targets for an image."""
  #Compute the bbox regression offsets
  assert ex_rois.shape[0] == gt_rois.shape[0]
  assert ex_rois.shape[1] == 4
  assert gt_rois.shape[1] == 4
  #bbox_transform compares the four GT coordinates with the four extracted RoI coordinates and converts them into offsets
  targets = bbox_transform(ex_rois, gt_rois)
  #The flag below switches on target normalization: the targets are shifted and scaled
  #by precomputed means and stds (BBOX_NORMALIZE_MEANS / BBOX_NORMALIZE_STDS).
  #These are fixed constants defined directly in the config rather than statistics
  #estimated from the current data, so the normalization relies on preset empirical
  #values and may not be exact for a given dataset.
  if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
    # Optionally normalize targets by a precomputed mean and stdev
    targets = ((targets - np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS))
               / np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS))
  return np.hstack(
    (labels[:, np.newaxis], targets)).astype(np.float32, copy=False)
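
# --- Illustrative sketch (not part of the original file) ---
# bbox_transform uses the standard Faster R-CNN target parameterization: the
# offsets express the GT box relative to the RoI's centre and size. A minimal
# numpy sketch, assuming the usual formulation (the repo's implementation may
# differ in the +1 width/height convention):
def _encode_boxes_sketch(ex_rois, gt_rois):
  ex_w = ex_rois[:, 2] - ex_rois[:, 0] + 1.0
  ex_h = ex_rois[:, 3] - ex_rois[:, 1] + 1.0
  ex_cx = ex_rois[:, 0] + 0.5 * ex_w
  ex_cy = ex_rois[:, 1] + 0.5 * ex_h
  gt_w = gt_rois[:, 2] - gt_rois[:, 0] + 1.0
  gt_h = gt_rois[:, 3] - gt_rois[:, 1] + 1.0
  gt_cx = gt_rois[:, 0] + 0.5 * gt_w
  gt_cy = gt_rois[:, 1] + 0.5 * gt_h
  tx = (gt_cx - ex_cx) / ex_w          # centre shift, normalized by RoI size
  ty = (gt_cy - ex_cy) / ex_h
  tw = np.log(gt_w / ex_w)             # log scale change
  th = np.log(gt_h / ex_h)
  return np.stack([tx, ty, tw, th], axis=1)
# --- end sketch ---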


def _sample_rois(all_rois, all_scores, gt_boxes, fg_rois_per_image, rois_per_image, num_classes):
  """
  Sample RoIs: pick a fixed-size mix of foreground and background proposals.
  """
  # overlaps: (rois x gt_boxes)
  #bbox_overlaps computes the IoU between every RoI and every GT box
  overlaps = bbox_overlaps(
    np.ascontiguousarray(all_rois[:, 1:5], dtype=np.float),
    np.ascontiguousarray(gt_boxes[:, :4], dtype=np.float))
  gt_assignment = overlaps.argmax(axis=1)  #argmax along axis=1: for each RoI, the index of the GT box with the largest overlap
  max_overlaps = overlaps.max(axis=1)  #same, but returns the value itself, i.e. the overlap ratio
  labels = gt_boxes[gt_assignment, 4]  #each RoI takes the class label of its assigned GT box;
  #the label sits in column 4 because columns 0-3 hold the coordinates

  # Select foreground RoIs as those with >= FG_THRESH overlap
  #Define foreground
  fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0]
  # Guard against the case when an image has fewer than fg_rois_per_image
  # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
  bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) &
                     (max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
  #Define background as an overlap range, configured as [0.1, 0.5)

  # Small modification to the original version where we ensure a fixed number of regions are sampled
  #A small change from the original version: it guarantees a fixed number of regions is sampled
  if fg_inds.size > 0 and bg_inds.size > 0:
  #If there are more foreground samples than batch * FG_FRACTION allows, subsample them randomly;
  #the background quota is then the batch size minus the foreground count.
  #rpn_target_layer.py does something similar.
    fg_rois_per_image = min(fg_rois_per_image, fg_inds.size)
    fg_inds = npr.choice(fg_inds, size=int(fg_rois_per_image), replace=False)
    bg_rois_per_image = rois_per_image - fg_rois_per_image
    to_replace = bg_inds.size < bg_rois_per_image
    bg_inds = npr.choice(bg_inds, size=int(bg_rois_per_image), replace=to_replace)
  elif fg_inds.size > 0:
    to_replace = fg_inds.size < rois_per_image
    fg_inds = npr.choice(fg_inds, size=int(rois_per_image), replace=to_replace)
    fg_rois_per_image = rois_per_image
  elif bg_inds.size > 0:
    to_replace = bg_inds.size < rois_per_image
    bg_inds = npr.choice(bg_inds, size=int(rois_per_image), replace=to_replace)
    fg_rois_per_image = 0
  else:
    import pdb
    pdb.set_trace()

  # The indices that we're selecting (both fg and bg)
  #The finally selected foreground and background indices
  keep_inds = np.append(fg_inds, bg_inds)
  # Select sampled values from various arrays:
  #Pull out the corresponding labels
  labels = labels[keep_inds]
  # Clamp labels for the background RoIs to 0
  #Clamp once more: everything after the first fg_rois_per_image entries is set to background (label 0)
  labels[int(fg_rois_per_image):] = 0
  rois = all_rois[keep_inds]
  #Pull out the corresponding RoIs and their scores
  roi_scores = all_scores[keep_inds]
  #_compute_targets (defined above) computes the bbox offsets
  bbox_target_data = _compute_targets(
    rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4], labels)
  #From the offsets, build the class-specific regression labels; the computation is split into two steps, and the weights only appear in this second step.
  bbox_targets, bbox_inside_weights = \
    _get_bbox_regression_labels(bbox_target_data, num_classes)

  return labels, rois, roi_scores, bbox_targets, bbox_inside_weights
#Returns the foreground/background labels of the boxes, the RoI regions and their scores, the bbox regression targets, and the inside weights.
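
For completeness, bbox_overlaps (from utils.cython_bbox) produces the N x K IoU matrix between RoIs and GT boxes that _sample_rois starts from. Below is a pure-numpy sketch of the same quantity; the Cython version is essentially a faster loop, and the +1 in the width/height follows the original Pascal VOC pixel convention, which may differ from other codebases.

import numpy as np

def bbox_overlaps_sketch(boxes, gt_boxes):
  """IoU between every box (N x 4) and every GT box (K x 4), as an N x K matrix."""
  area_b = (boxes[:, 2] - boxes[:, 0] + 1) * (boxes[:, 3] - boxes[:, 1] + 1)
  area_g = (gt_boxes[:, 2] - gt_boxes[:, 0] + 1) * (gt_boxes[:, 3] - gt_boxes[:, 1] + 1)
  # Pairwise intersection widths/heights via broadcasting (N x K).
  iw = np.minimum(boxes[:, None, 2], gt_boxes[None, :, 2]) - \
       np.maximum(boxes[:, None, 0], gt_boxes[None, :, 0]) + 1
  ih = np.minimum(boxes[:, None, 3], gt_boxes[None, :, 3]) - \
       np.maximum(boxes[:, None, 1], gt_boxes[None, :, 1]) + 1
  inter = np.clip(iw, 0, None) * np.clip(ih, 0, None)
  return inter / (area_b[:, None] + area_g[None, :] - inter)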

Compared with anchor_target_layer there is some overlap, but anchor_target_layer works on anchors, i.e. at the RPN input stage, whereas this layer post-processes the RPN's output. The inputs rpn_rois, rpn_scores, gt_boxes and _num_classes make this clear: although foreground and background are still distinguished here, _num_classes comes into play, so the labels now carry concrete foreground class IDs and can be used for the training stage that follows the RPN.

 
