1. self.roidb返回一個列表,列表元素爲字典,每張圖片對應一個字典(包括flipped的圖像)
2. prepare_roidb 函數
def prepare_roidb(imdb):
    """Attach derived, training-time fields to every entry of ``imdb.roidb``.

    Put simply, the imdb is the image database and the roidb is the
    database of Regions Of Interest found in those images (one dict per
    image).  Each roidb entry is modified in place and gains these keys:

    * ``'image'``        -- the path returned by ``imdb.image_path_at(i)``
    * ``'width'``        -- first element of the PIL image size
    * ``'height'``       -- second element of the PIL image size
    * ``'max_classes'``  -- per-ROI column index (class) of the largest
      ``gt_overlaps`` value
    * ``'max_overlaps'`` -- per-ROI largest ``gt_overlaps`` value

    Nothing is returned.
    """
    # Read every image size up front, before the roidb itself is touched.
    image_sizes = [PIL.Image.open(imdb.image_path_at(idx)).size
                   for idx in range(imdb.num_images)]

    # NOTE: during 'Stage 2 Fast R-CNN, init from stage 2 RPN R-CNN model'
    # the roidb is produced by rpn_roidb(), so the boxes of each image are
    # not only the ground-truth boxes but also the boxes from the rpn_file.
    roidb = imdb.roidb

    # NOTE (from the original author's debugging on VOC2007):
    # len(imdb.image_index) printed 10022 and image_index[0] and
    # image_index[5011] were both '000005', i.e. the index list is the
    # original list repeated twice, which matches
    # `self._image_index = self._image_index * 2` in
    # append_flipped_images().
    for idx in range(len(imdb.image_index)):
        entry = roidb[idx]
        img_width, img_height = image_sizes[idx]
        entry['image'] = imdb.image_path_at(idx)
        entry['width'] = img_width
        entry['height'] = img_height

        # gt_overlaps is stored sparse; max/argmax need a dense array.
        dense_overlaps = entry['gt_overlaps'].toarray()
        # Reduce over the class axis (columns): best overlap per ROI ...
        best_overlap = dense_overlaps.max(axis=1)
        # ... and the column where it occurs, which is also the class id.
        best_class = dense_overlaps.argmax(axis=1)
        entry['max_classes'] = best_class
        entry['max_overlaps'] = best_overlap

        # Sanity checks.
        # An ROI whose best overlap is 0 must be labelled class 0
        # (background) ...
        assert all(best_class[best_overlap == 0] == 0)
        # ... and an ROI with a positive overlap must be a foreground
        # (non-zero) class.
        assert all(best_class[best_overlap > 0] != 0)
print '~~~~roidb[3]["gt_overlaps"]: {}'.format(roidb[3]['gt_overlaps'].toarray()) 等測試代碼在 stage 1 時的輸出如下:
{'boxes': array([[262, 210, 323, 338],
[164, 263, 252, 371],
[240, 193, 294, 298]], dtype=uint16), 'gt_classes': array([9, 9, 9], dtype=int32), 'gt_overlaps': <3x21 sparse matrix of type '<type 'numpy.float32'>'
with 3 stored elements in Compressed Sparse Row format>, 'seg_areas': array([ 7998., 9701., 5830.], dtype=float32), 'flipped': False}
## 'gt_overlaps': <3x21 sparse matrix of type '<type 'numpy.float32'>' with 3 stored elements in Compressed Sparse Row format> 其中 3 代表該圖像有 3 個 box(每個 box 一行),21 代表 21 個類別(每個類別一列,第 0 類爲背景),
# "with 3 stored elements in Compressed Sparse Row format" 說明該 sparse matrix 裏面只存儲了 3 個非 0 值
#roidb[0]
{'gt_classes': array([9, 9, 9], dtype=int32), 'max_classes': array([9, 9, 9]), 'image': '/home/sam/WORKSPACE/py-faster-rcnn/data/VOCdevkit2007/VOC2007/JPEGImages/000005.jpg', 'flipped': False, 'width': 500,
'boxes': array([[262, 210, 323, 338],
[164, 263, 252, 371],
[240, 193, 294, 298]], dtype=uint16), 'max_overlaps': array([ 1., 1., 1.], dtype=float32), 'height': 375, 'seg_areas': array([ 7998., 9701., 5830.], dtype=float32), 'gt_overlaps': <3x21 sparse matrix of type '<type 'numpy.float32'>'
with 3 stored elements in Compressed Sparse Row format>}
#roidb[1]
{'gt_classes': array([7], dtype=int32), 'max_classes': array([7]), 'image': '/home/sam/WORKSPACE/py-faster-rcnn/data/VOCdevkit2007/VOC2007/JPEGImages/000007.jpg', 'flipped': False, 'width': 500,
'boxes': array([[140, 49, 499, 329]], dtype=uint16), 'max_overlaps': array([ 1.], dtype=float32), 'height': 333, 'seg_areas': array([ 101160.], dtype=float32), 'gt_overlaps': <1x21 sparse matrix of type '<type 'numpy.float32'>'
with 1 stored elements in Compressed Sparse Row format>}
#roidb[2]
{'gt_classes': array([13, 15, 15, 15], dtype=int32), 'max_classes': array([13, 15, 15, 15]), 'image': '/home/sam/WORKSPACE/py-faster-rcnn/data/VOCdevkit2007/VOC2007/JPEGImages/000009.jpg', 'flipped': False, 'width': 500,
'boxes': array([[ 68, 171, 269, 329],
[149, 140, 228, 283],
[284, 200, 326, 330],
[257, 197, 296, 328]], dtype=uint16), 'max_overlaps': array([ 1., 1., 1., 1.], dtype=float32), 'height': 375, 'seg_areas': array([ 32118., 11520., 5633., 5280.], dtype=float32), 'gt_overlaps': <4x21 sparse matrix of type '<type 'numpy.float32'>'
with 4 stored elements in Compressed Sparse Row format>}
#roidb[3]
{'gt_classes': array([7], dtype=int32), 'max_classes': array([7]), 'image': '/home/sam/WORKSPACE/py-faster-rcnn/data/VOCdevkit2007/VOC2007/JPEGImages/000012.jpg', 'flipped': False, 'width': 500,
'boxes': array([[155, 96, 350, 269]], dtype=uint16), 'max_overlaps': array([ 1.], dtype=float32), 'height': 333, 'seg_areas': array([ 34104.], dtype=float32), 'gt_overlaps': <1x21 sparse matrix of type '<type 'numpy.float32'>'
with 1 stored elements in Compressed Sparse Row format>}
#roidb[0]["gt_overlaps"]:
[[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
#roidb[1]["gt_overlaps"]:
[[ 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
#roidb[2]["gt_overlaps"]:
[[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]]
#roidb[3]["gt_overlaps"]:
[[ 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
add_bbox_regression_targets(roidb) 函數:添加 bbox 的迴歸目標(給 roidb 裏面的每個字典添加 'bbox_targets' 屬性),同時根據 cfg 配置決定 bbox_targets 的均值(means)和標準差(stds)是使用預先設定的值還是由數據統計得到,並視配置對 bbox_targets 做歸一化。
def add_bbox_regression_targets(roidb):
    """Add the information needed to train bounding-box regressors.

    Every entry of ``roidb`` receives a ``'bbox_targets'`` array produced
    by ``_compute_targets``.  Column 0 of that array is read here as the
    class label and columns 1: as the regression offsets.  Per-class means
    and standard deviations of the offsets are then either taken from the
    configuration or measured from the data, printed, and (when
    ``cfg.TRAIN.BBOX_NORMALIZE_TARGETS`` is set) used to normalize the
    stored targets in place.

    Parameters
    ----------
    roidb : non-empty list of dicts on which ``prepare_roidb`` has already
        been run (``'max_classes'`` must be present).

    Returns
    -------
    (means, stds) : the per-class statistics, each flattened with
        ``ravel()``.  They are needed at prediction time to un-normalize
        and un-center the regressor outputs.
    """
    assert len(roidb) > 0
    assert 'max_classes' in roidb[0], 'Did you call prepare_roidb first?'

    # The number of classes is the number of columns of gt_overlaps.
    num_classes = roidb[0]['gt_overlaps'].shape[1]

    # NOTE: during 'Stage 2 Fast R-CNN, init from stage 2 RPN R-CNN model'
    # the roidb is produced by rpn_roidb(), so 'boxes' holds the boxes from
    # the rpn_file in addition to the ground-truth boxes.
    for entry in roidb:
        entry['bbox_targets'] = _compute_targets(
            entry['boxes'], entry['max_overlaps'], entry['max_classes'])

    if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
        # Fixed / precomputed "means" and "stds" replace the empirical
        # values: the configured vector is repeated once per class.
        per_class = (num_classes, 1)
        means = np.tile(np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS), per_class)
        stds = np.tile(np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS), per_class)
    else:
        # Accumulate what is needed for var(x) = E(x^2) - E(x)^2.
        # Statistics are kept per class because class-specific regressors
        # are trained.  Counts start at cfg.EPS (presumably a tiny positive
        # constant) so classes without samples do not divide by zero.
        counts = np.zeros((num_classes, 1)) + cfg.EPS
        totals = np.zeros((num_classes, 4))
        squared_totals = np.zeros((num_classes, 4))
        for entry in roidb:
            targets = entry['bbox_targets']
            # Class 0 (background) is skipped.
            for cls in range(1, num_classes):
                rows = np.flatnonzero(targets[:, 0] == cls)
                if rows.size == 0:
                    continue
                offsets = targets[rows, 1:]
                counts[cls] += rows.size
                totals[cls, :] += offsets.sum(axis=0)
                squared_totals[cls, :] += (offsets ** 2).sum(axis=0)

        means = totals / counts
        stds = np.sqrt(squared_totals / counts - means ** 2)

    print('bbox target means:')
    print(means)  # one row per class: mean of each bbox_targets offset
    print(means[1:, :].mean(axis=0))  # ignore bg class
    print('bbox target stdevs:')
    print(stds)  # one row per class: std of each bbox_targets offset
    print(stds[1:, :].mean(axis=0))  # ignore bg class

    # Normalize targets
    if cfg.TRAIN.BBOX_NORMALIZE_TARGETS:
        print("Normalizing targets")
        for entry in roidb:
            targets = entry['bbox_targets']
            for cls in range(1, num_classes):
                rows = np.flatnonzero(targets[:, 0] == cls)
                # Center, then scale, the offsets of this class in place.
                targets[rows, 1:] -= means[cls, :]
                targets[rows, 1:] /= stds[cls, :]
    else:
        print("NOT normalizing targets")

    # These values will be needed for making predictions
    # (the predictions will need to be unnormalized and uncentered).
    return means.ravel(), stds.ravel()
_compute_targets(rois, overlaps, labels) 函數
roidb 中鍵 'boxes' 所對應的值爲每個 box 左上、右下兩點的座標,其形式爲 (x1, y1, x2, y2),即原文所寫的 (w1, h1, w2, h2):先是寬度方向(橫向)座標,後是高度方向(縱向)座標。
def _compute_targets(rois, overlaps, labels):
    """Compute bounding-box regression targets for an image.

    Parameters
    ----------
    rois : 2-D array of boxes, one row per ROI (the ``'boxes'`` value of a
        roidb entry).
    overlaps : per-ROI maximum overlap with ground truth; a value of
        exactly 1 marks the ROI as a ground-truth box.
    labels : per-ROI class label.

    Returns
    -------
    targets : float32 array of shape ``(rois.shape[0], 5)``.  Column 0 is
        the class label and columns 1: are the output of
        ``bbox_transform``.  Rows for ROIs whose overlap is below
        ``cfg.TRAIN.BBOX_THRESH`` are left as zeros, and the whole array
        is zeros when the image has no ground-truth ROI.
    """
    # Indices of ground-truth ROIs
    gt_inds = np.where(overlaps == 1)[0]
    if len(gt_inds) == 0:
        # Bail if the image has no ground-truth ROIs
        return np.zeros((rois.shape[0], 5), dtype=np.float32)
    # Indices of examples for which we try to make predictions
    ex_inds = np.where(overlaps >= cfg.TRAIN.BBOX_THRESH)[0]

    # Get IoU overlap between each ex ROI and gt ROI.
    # bbox_overlaps comes from bbox.pyx; the IoU overlap is the area of the
    # intersection of a proposal and a gt box divided by the area of their
    # union.
    # np.float64 is used instead of the `np.float` alias: the alias was
    # deprecated in NumPy 1.20 and removed in 1.24, and it resolved to this
    # same 64-bit float type, so the arrays passed on are unchanged.
    ex_gt_overlaps = bbox_overlaps(
        np.ascontiguousarray(rois[ex_inds, :], dtype=np.float64),
        np.ascontiguousarray(rois[gt_inds, :], dtype=np.float64))

    # Find which gt ROI each ex ROI has max overlap with:
    # this will be the ex ROI's gt target
    gt_assignment = ex_gt_overlaps.argmax(axis=1)
    # gt_assignment has one entry per example ROI (so it can be longer than
    # gt_inds); indexing gt_inds with it yields a gt box for every example
    # ROI, which makes gt_rois the same length as ex_rois.
    gt_rois = rois[gt_inds[gt_assignment], :]
    ex_rois = rois[ex_inds, :]

    # targets has one row for every box of the image: the proposals chosen
    # as (P, G) pairs are filled in, while proposals that do not satisfy
    # overlaps >= cfg.TRAIN.BBOX_THRESH keep all-zero rows.
    # bbox_transform lives in bbox_transform.py; per the original notes its
    # computation follows the R-CNN paper.
    targets = np.zeros((rois.shape[0], 5), dtype=np.float32)
    targets[ex_inds, 0] = labels[ex_inds]
    targets[ex_inds, 1:] = bbox_transform(ex_rois, gt_rois)
    return targets
bbox.pyx文件中的 def bbox_overlaps(np.ndarray[DTYPE_t, ndim=2] boxes,np.ndarray[DTYPE_t, ndim=2] query_boxes):
返回 : overlaps: (N, K) ndarray of overlap between boxes and query_boxes
def bbox_overlaps(
        np.ndarray[DTYPE_t, ndim=2] boxes,
        np.ndarray[DTYPE_t, ndim=2] query_boxes):
    """
    Compute the pairwise overlap (intersection area divided by union area)
    between every box and every query box.

    Parameters
    ----------
    boxes: (N, 4) ndarray of float
    query_boxes: (K, 4) ndarray of float

    Returns
    -------
    overlaps: (N, K) ndarray of overlap between boxes and query_boxes;
        entries for pairs that do not intersect stay 0
    """
    cdef unsigned int num_boxes = boxes.shape[0]
    cdef unsigned int num_queries = query_boxes.shape[0]
    cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros(
        (num_boxes, num_queries), dtype=DTYPE)
    cdef DTYPE_t inter_w, inter_h, query_area
    cdef DTYPE_t union_area
    cdef unsigned int q, b

    for q in range(num_queries):
        # Area of the query box, computed once per query.  The "+ 1"
        # presumably reflects inclusive pixel coordinates -- confirm
        # against the dataset convention.
        query_area = (
            (query_boxes[q, 2] - query_boxes[q, 0] + 1) *
            (query_boxes[q, 3] - query_boxes[q, 1] + 1)
        )
        for b in range(num_boxes):
            # Width of the intersection along columns 0 / 2.
            inter_w = (
                min(boxes[b, 2], query_boxes[q, 2]) -
                max(boxes[b, 0], query_boxes[q, 0]) + 1
            )
            if not (inter_w > 0):
                continue
            # Height of the intersection along columns 1 / 3.
            inter_h = (
                min(boxes[b, 3], query_boxes[q, 3]) -
                max(boxes[b, 1], query_boxes[q, 1]) + 1
            )
            if not (inter_h > 0):
                continue
            # Union = area(box) + area(query) - intersection.
            union_area = float(
                (boxes[b, 2] - boxes[b, 0] + 1) *
                (boxes[b, 3] - boxes[b, 1] + 1) +
                query_area - inter_w * inter_h
            )
            overlaps[b, q] = inter_w * inter_h / union_area
    return overlaps