Mask-RCNN之PyramidROIAlign代码赏析

原創
2019-06-22 15:38

class PyramidROIAlign(KE.Layer):
    """Implements ROI Pooling on multiple levels of the feature pyramid.

    Params:
    - pool_shape: [pool_height, pool_width] of the output pooled regions. Usually [7, 7]

    Inputs:
    - boxes: [batch, num_boxes, (y1, x1, y2, x2)] in normalized coordinates. Possibly padded with zeros if not enough
             boxes to fill the array.
    - image_meta: [batch, (meta data)] Image details. See compose_image_meta()
    - feature_maps: List of feature maps from different levels of the pyramid. Each is [batch, height, width, channels].
                    Only the first three maps are read; they are treated as pyramid levels 2, 3 and 4.

    Output:
    Pooled regions in the shape: [batch, num_boxes, pool_height, pool_width, channels].
    The width and height are those specified in the pool_shape in the layer constructor.
    """

    def __init__(self, pool_shape, **kwargs):
        super(PyramidROIAlign, self).__init__(**kwargs)
        # Stored as a tuple so it can be concatenated with other shape tuples.
        self.pool_shape = tuple(pool_shape)

    def call(self, inputs):
        """Pool every ROI from the pyramid level that matches its size.

        `inputs` is the list [boxes, image_meta, feature_map_0, feature_map_1, ...]
        described in the class docstring. Returns the pooled features in the
        same (batch, box) order as `boxes`.
        """
        # Crop boxes [batch, num_boxes, (y1, x1, y2, x2)] in normalized coords
        boxes = inputs[0]

        # Image meta
        # Holds details about the image. See compose_image_meta()
        image_meta = inputs[1]

        # Feature Maps. List of feature maps from different level of the
        # feature pyramid. Each is [batch, height, width, channels]
        feature_maps = inputs[2:]

        # Assign each ROI to a level in the pyramid based on the ROI area.
        y1, x1, y2, x2 = tf.split(boxes, 4, axis=2)  # each [batch, num_boxes, 1]
        h = y2 - y1
        w = x2 - x1
        # Use shape of first image. Images in a batch must have the same size.
        # Indexing with [0] drops the batch dimension of 'image_shape'.
        image_shape = data_utils.parse_image_meta_graph(image_meta)['image_shape'][0]
        # Equation 1 in the Feature Pyramid Networks paper. Account for
        # the fact that our coordinates are normalized here.
        # e.g. a 56*56 ROI (in pixels) maps to P3
        image_area = tf.cast(image_shape[0] * image_shape[1], tf.float32)  # scalar
        roi_level = comm_utils.log2_graph(tf.sqrt(h * w) / (56.0 / tf.sqrt(image_area)))  # [batch, num_boxes, 1]
        # Clamp to the available levels, 2..4.
        roi_level = tf.minimum(4, tf.maximum(2, 3 + tf.cast(tf.round(roi_level), tf.int32)))
        # [batch, num_boxes]: the pyramid level assigned to each box.
        roi_level = tf.squeeze(roi_level, 2)

        # Loop through levels and apply ROI pooling to each. P2 to P4.
        pooled = []  # per-level crops, each [boxes_at_level, pool_h, pool_w, channels]
        box_to_level = []  # per-level (batch, box) indices of the boxes pooled at that level
        for i, level in enumerate(range(2, 5)):
            # (batch, box) index pairs of all boxes assigned to this level: [?, 2]
            ix = tf.where(tf.equal(roi_level, level))
            # Coordinates of those boxes: [?, 4]. They may come from different images.
            level_boxes = tf.gather_nd(boxes, ix)

            # Box indices for crop_and_resize: for each selected box, the
            # index of the image in the batch it belongs to. Shape [?].
            box_indices = tf.cast(ix[:, 0], tf.int32)

            # Keep track of which box is mapped to which level
            box_to_level.append(ix)

            # Stop gradient propagation to ROI proposals
            level_boxes = tf.stop_gradient(level_boxes)
            box_indices = tf.stop_gradient(box_indices)

            # Crop and Resize
            # From Mask R-CNN paper: "We sample four regular locations, so
            # that we can evaluate either max or average pooling. In fact,
            # interpolating only a single value at each bin center (without
            # pooling) is nearly as effective."
            #
            # Here we use the simplified approach of a single value per bin,
            # which is how it's done in tf.crop_and_resize()
            # Result: [boxes_at_level, pool_height, pool_width, channels]
            pooled.append(tf.image.crop_and_resize(feature_maps[i],  # i-th pyramid feature map
                                                   level_boxes,
                                                   box_indices,
                                                   self.pool_shape,
                                                   method="bilinear"))

        # Pack pooled features into one tensor
        pooled = tf.concat(pooled, axis=0)  # [batch * num_boxes, pool_height, pool_width, channels]

        # Pack box_to_level mapping into one array and add another column
        # representing the order of pooled boxes
        box_to_level = tf.concat(box_to_level, axis=0)  # [batch * num_boxes, 2]
        box_range = tf.expand_dims(tf.range(tf.shape(box_to_level)[0]), 1)  # [batch * num_boxes, 1]
        # Columns: (batch index, box index, position in `pooled`)
        box_to_level = tf.concat([tf.cast(box_to_level, tf.int32), box_range], axis=1)

        # Rearrange pooled features to match the order of the original boxes
        # Sort box_to_level by batch then box index
        # TF doesn't have a way to sort by two columns, so merge them and sort.
        # Scaling the batch index by num_boxes gives every (batch, box) pair a
        # distinct key for any number of boxes, unlike a fixed multiplier.
        num_boxes = tf.shape(boxes)[1]
        sorting_tensor = box_to_level[:, 0] * num_boxes + box_to_level[:, 1]
        # top_k returns descending order; reverse it to get ascending order.
        ix = tf.nn.top_k(sorting_tensor, k=tf.shape(box_to_level)[0]).indices[::-1]

        # Translate the sorted order into positions inside `pooled`, then
        # gather so the pooled features follow the (batch, box) order of `boxes`.
        ix = tf.gather(box_to_level[:, 2], ix)
        pooled = tf.gather(pooled, ix)

        # Re-add the batch dimension:
        # [batch, num_boxes] + [pool_height, pool_width, channels]
        shape = tf.concat([tf.shape(boxes)[:2], tf.shape(pooled)[1:]], axis=0)
        pooled = tf.reshape(pooled, shape)
        return pooled

    def compute_output_shape(self, input_shape):
        """Return (batch, num_boxes, pool_height, pool_width, channels).

        `input_shape` is the list of input shapes in the same order as the
        inputs of call(): index 0 is the boxes shape and index 2 is the shape
        of the first feature map, whose last entry is the channel count.
        """
        return tuple(input_shape[0])[:2] + self.pool_shape + (tuple(input_shape[2])[-1],)

    def get_config(self):
        """Return the layer config, including `pool_shape`, for serialization."""
        config = super(PyramidROIAlign, self).get_config()
        config['pool_shape'] = self.pool_shape
        return config
發表評論
所有評論
還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
Mask-RCNN之PyramidROIAlign代码赏析

Keras--動態調整學習率

【C++學習】1.Kdevelop環境配置

TTFNET實踐記錄

Ubuntu opencv3.4.1 編譯之編譯錯誤: 'cuda_compile_generated_gpu_mat.cu.o'

【C++學習】2.CMakeLists

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結