目標檢測算法中ROI提取方法比較+源碼分析

本文主要介紹ROI提取結構在目標檢測框架中的作用,並結合源碼,理解它的實現方式。包含的算法有:ROI-pooling,ROI-align,Deformable-psroi-pooling。
目前,主流的目標檢測算法大致分爲2種,one-stage和two-stage方法。

  • one-stage:典型代表爲SSD,相當於two-stage中的rpn結構,先通過基本的特徵提取網絡如resnet或vggnet得到特徵圖,再通過5層的卷積得到目標位置和類別。這種方法計算速度快,但是精度較two-stage方法差一些
  • two-stage:典型代表爲Faster-rcnn。其結構分爲RPN(Region Proposal Network)和RCNN(Region Convolution Neural Network)兩個部分。RPN的特徵通過ROI-Pooling層傳遞到RCNN中。

本文介紹的方法,僅出現在two-stage的方法中。顧名思義,ROI提取層的作用就是根據RPN提取出的位置,從特徵圖中截取對應的特徵,用於進一步的分類和定位。


ROI-Pooling

Roi-pooling是Faster-rcnn原版使用的特徵提取方式,論文在此。這裏用動圖來說明roi-pooling的過程(動圖來源)
在這裏插入圖片描述
從上圖看出,ROI-Pooling層的輸入有兩個:RPN層得到的位置和特徵提取網絡得到的特徵。參數有pooling結果的寬高。
好了,下面結合圖片來理解ROI-pooling代碼,代碼來源是pytorch-fasterrcnn項目,由於是cuda代碼,所以如果你看了另外支持自定義operation的框架(如caffe,mxnet等)的roipooling實現方式,就會發現它們是完全一致的。
在這裏插入圖片描述
代碼的註釋中,增加了上圖實例中各個變量的實際值,以便於讀者理解。

/**
 * Forward pass of ROI max pooling.
 *
 * CUDA_KERNEL_LOOP hands this thread values of `index` below `nthreads`.
 * Each index is one output element, decoded as (n, c, ph, pw) = (ROI number,
 * channel, pooled row, pooled column). The ROI is scaled onto the feature
 * map, split into pooled_height x pooled_width integer-aligned bins, and the
 * maximum of the bin selected by (ph, pw) is written to top_data[index].
 *
 * @param nthreads       loop bound passed to CUDA_KERNEL_LOOP (presumably the
 *                       number of output elements; confirm with the launcher)
 * @param bottom_data    input features, read at
 *                       ((batch * channels + c) * height + h) * width + w
 * @param spatial_scale  factor applied to the ROI coordinates before rounding
 * @param height, width  spatial size of the feature map
 * @param channels       channel count of the feature map
 * @param pooled_height, pooled_width  size of the pooled output per ROI
 * @param bottom_rois    5 floats per ROI: batch index, x1, y1, x2, y2
 * @param top_data       output, one pooled maximum per index
 * @param argmax_data    optional (may be NULL); flat offset into bottom_data of
 *                       the selected maximum, or -1 when nothing was pooled
 */
__global__ void ROIPoolForward(const int nthreads, const float* bottom_data,
    const float spatial_scale, const int height, const int width,
    const int channels, const int pooled_height, const int pooled_width,
    const float* bottom_rois, float* top_data, int* argmax_data)
{
    CUDA_KERNEL_LOOP(index, nthreads)
    {
        // Decode the flat output index; pw varies fastest, n slowest.
        int pw = index % pooled_width;                    // pooled_width = 2 in the article's figure
        int ph = (index / pooled_width) % pooled_height;  // pooled_height = 2 in the article's figure
        int c  = (index / pooled_width / pooled_height) % channels;
        int n  = index / pooled_width / pooled_height / channels;

        // ROI record n: [batch index, x1, y1, x2, y2], scaled and rounded to
        // integer feature-map coordinates.
        int roi_batch_ind = bottom_rois[n * 5 + 0];
        int roi_start_w = roundf(bottom_rois[n * 5 + 1] * spatial_scale);  // 0 in the figure (left)
        int roi_start_h = roundf(bottom_rois[n * 5 + 2] * spatial_scale);  // 3 in the figure (top)
        int roi_end_w = roundf(bottom_rois[n * 5 + 3] * spatial_scale);    // 7 in the figure (right)
        int roi_end_h = roundf(bottom_rois[n * 5 + 4] * spatial_scale);    // 8 in the figure (bottom)

        // Force malformed ROIs to be 1x1
        int roi_width = fmaxf(roi_end_w - roi_start_w + 1, 1);
        int roi_height = fmaxf(roi_end_h - roi_start_h + 1, 1);
        float bin_size_h = (float)(roi_height) / (float)(pooled_height);
        float bin_size_w = (float)(roi_width) / (float)(pooled_width);

        // Bin (ph, pw) relative to the ROI origin. floor/ceil widen the bin to
        // whole pixels, so neighbouring bins may overlap and differ in size.
        int hstart = (int)(floorf((float)(ph) * bin_size_h));
        int wstart = (int)(floorf((float)(pw) * bin_size_w));
        int hend = (int)(ceilf((float)(ph + 1) * bin_size_h));
        int wend = (int)(ceilf((float)(pw + 1) * bin_size_w));

        // Add roi offsets and clip to input boundaries
        hstart = fminf(fmaxf(hstart + roi_start_h, 0), height);
        hend = fminf(fmaxf(hend + roi_start_h, 0), height);
        wstart = fminf(fmaxf(wstart + roi_start_w, 0), width);
        wend = fminf(fmaxf(wend + roi_start_w, 0), width);
        // Before clipping, end > start always holds because the bin size is
        // positive; a bin can only become empty through the clipping above.
        bool is_empty = (hend <= hstart) || (wend <= wstart);

        // Define an empty pooling region to be zero
        float maxval = is_empty ? 0 : -FLT_MAX;
        // If nothing is pooled, argmax = -1 causes nothing to be backprop'd
        int maxidx = -1;

        // Offset of channel c of image roi_batch_ind inside bottom_data.
        int bottom_data_batch_offset = roi_batch_ind * channels * height * width;
        int bottom_data_offset = bottom_data_batch_offset + c * height * width;

        // Max-pool over the bin; each index scans its own [hstart, hend) x [wstart, wend).
        for (int h = hstart; h < hend; ++h) {
            for (int w = wstart; w < wend; ++w) {
                int bottom_index = h * width + w;
                if (bottom_data[bottom_data_offset + bottom_index] > maxval) {
                    maxval = bottom_data[bottom_data_offset + bottom_index];
                    maxidx = bottom_data_offset + bottom_index;
                }
            }
        }
        top_data[index] = maxval;
        if (argmax_data != NULL)
            argmax_data[index] = maxidx;
    }
}


ROI-align

從上面ROI-pooling的實現過程不難看出,由於取整的影響,各個index方塊中對應的寬高是不同的,有些是2,有些是3。而ROI-align做了一個小改動,使h, w可以是小數,並通過雙線性內插取得各個像素值,消除了取整帶來的誤差。源碼位置
此時,特徵圖上的pooling方框變成了下面這樣
在這裏插入圖片描述

    /**
     * Forward pass of ROI-align sampling.
     *
     * Each index produced by CUDA_1D_KERNEL_LOOP is one output element, decoded
     * as (n, c, ph, pw). The ROI is scaled onto the feature map without
     * rounding, an aligned_height x aligned_width grid of sample points is laid
     * over it, and the feature value at point (ph, pw) is obtained by bilinear
     * interpolation of the four surrounding pixels. No pooling happens here.
     *
     * Precondition: height >= 2 and width >= 2. The 2x2 neighbourhood is
     * anchored at min(floor(coord), size - 2), which is negative for smaller maps.
     *
     * @param nthreads       loop bound passed to CUDA_1D_KERNEL_LOOP
     * @param bottom_data    input features, read at
     *                       ((batch * channels + c) * height + h) * width + w
     * @param spatial_scale  factor applied to the ROI coordinates
     * @param height, width  spatial size of the feature map
     * @param channels       channel count of the feature map
     * @param aligned_height, aligned_width  sample grid size per ROI
     * @param bottom_rois    5 floats per ROI: batch index, x1, y1, x2, y2
     * @param top_data       output, one interpolated value per index
     */
    __global__ void ROIAlignForward(const int nthreads, const float* bottom_data, const float spatial_scale, const int height, const int width,
                                    const int channels, const int aligned_height, const int aligned_width, const float* bottom_rois, float* top_data) {
        CUDA_1D_KERNEL_LOOP(index, nthreads) {

            // Decode the flat output index; pw varies fastest, n slowest.
            int pw = index % aligned_width;
            int ph = (index / aligned_width) % aligned_height;
            int c  = (index / aligned_width / aligned_height) % channels;
            int n  = index / aligned_width / aligned_height / channels;

            // The batch index is stored as a float in the ROI record; convert it
            // to int right away so the image offset below is computed in exact
            // integer arithmetic, not in float.
            int roi_batch_ind = (int)bottom_rois[n * 5 + 0];
            float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale;
            float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale;
            float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale;
            float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale;

            // Clamp the extent of malformed ROIs (end before start) to zero.
            float roi_width = fmaxf(roi_end_w - roi_start_w + 1.0f, 0.0f);
            float roi_height = fmaxf(roi_end_h - roi_start_h + 1.0f, 0.0f);
            // Spacing between sample points: the grid spans the ROI with
            // (aligned - 1) intervals. A one-point grid has no interval, so use
            // a spacing of 0 (sample at the ROI start) and avoid dividing by zero.
            float bin_size_h = aligned_height > 1 ? roi_height / (float)(aligned_height - 1) : 0.0f;
            float bin_size_w = aligned_width > 1 ? roi_width / (float)(aligned_width - 1) : 0.0f;

            // h and w stay floating point so no rounding error is introduced.
            float h = (float)(ph) * bin_size_h + roi_start_h;
            float w = (float)(pw) * bin_size_w + roi_start_w;

            // Integer top-left corner of the 2x2 neighbourhood (hstart, wstart),
            // kept so the fractional offset of (h, w) from it can be computed.
            // Capped at size - 2 so the +1 neighbours stay inside the map.
            int hstart = fminf(floorf(h), (float)(height - 2));
            int wstart = fminf(floorf(w), (float)(width - 2));

            int img_start = roi_batch_ind * channels * height * width;

            // bilinear interpolation
            if (h < 0 || h >= height || w < 0 || w >= width) {
                // Sample points outside the feature map contribute zero.
                top_data[index] = 0.0f;
            } else {
                // Fractional offset of the sample from the top-left pixel.
                float h_ratio = h - (float)(hstart);
                float w_ratio = w - (float)(wstart);
                int upleft = img_start + (c * height + hstart) * width + wstart;
                int upright = upleft + 1;
                int downleft = upleft + width;
                int downright = downleft + 1;

                // Weighted sum of the four neighbours.
                top_data[index] = bottom_data[upleft] * (1.0f - h_ratio) * (1.0f - w_ratio)
                    + bottom_data[upright] * (1.0f - h_ratio) * w_ratio
                    + bottom_data[downleft] * h_ratio * (1.0f - w_ratio)
                    + bottom_data[downright] * h_ratio * w_ratio;
            }
        }
    }

有些細心的同學可能發現了上面的代碼沒有進行pooling操作,只有內插。那是因爲caffe實現roi-align時,在後面又接了一個avg_pooling或max_pooling,以實現不同的roipooling操作,並且避免了重複開發。
代碼位置


Deformable Roi pooling

可變形roi提取方法來源於論文Deformable Convolutional Networks,文章介紹了形變卷積的方法(增加offset)和所帶來的好處。其中,Deformable Roi pooling就是一種由此衍生而來的思路。代碼來源
由於要實現可變形,所以代碼中加入了offset變量。

    /**
     * Forward pass of deformable position-sensitive ROI pooling.
     *
     * Each index produced by CUDA_KERNEL_LOOP is one output element, decoded as
     * (n, ctop, ph, pw). The bin (ph, pw) of ROI n is shifted by a learned
     * offset read from bottom_trans (unless no_trans is set). It is then sampled
     * on a sample_per_part x sample_per_part grid with bilinear_interp, and the
     * mean of the samples that fall inside the feature map is written out.
     *
     * @param count            loop bound passed to CUDA_KERNEL_LOOP
     * @param bottom_data      input features; channel c of image b starts at
     *                         (b * channels + c) * height * width
     * @param spatial_scale    factor applied to the rounded ROI coordinates
     * @param channels         channel count of bottom_data
     * @param height, width    spatial size of the feature map
     * @param pooled_height, pooled_width  output size per ROI
     * @param bottom_rois      5 values per ROI: batch index, x1, y1, x2, y2
     * @param bottom_trans     offsets, indexed as
     *                         [n][class_id][0 = x, 1 = y][part_h][part_w]
     * @param no_trans         when true, bottom_trans is not read and the offset is 0
     * @param trans_std        scale applied to the raw offsets
     * @param sample_per_part  samples per bin along each axis
     * @param output_dim       number of output channels (range of ctop)
     * @param group_size       size of the position-sensitive group grid
     * @param part_size        size of the offset grid per ROI and class
     * @param num_classes      number of classes in bottom_trans
     * @param channels_each_class  output channels that share one offset class
     * @param top_data         output, mean of the valid samples (0 if none)
     * @param top_count        output, number of valid samples per element
     */
    template <typename DType>
    __global__ void DeformablePSROIPoolForwardKernel(
      const int count,
      const DType* bottom_data,
      const DType spatial_scale,
      const int channels,
      const int height, const int width,
      const int pooled_height, const int pooled_width,
      const DType* bottom_rois, const DType* bottom_trans,
      const bool no_trans,
      const DType trans_std,
      const int sample_per_part,
      const int output_dim,
      const int group_size,
      const int part_size,
      const int num_classes,
      const int channels_each_class,
      DType* top_data,
      DType* top_count) {
      CUDA_KERNEL_LOOP(index, count) {
        // The output is in order (n, ctop, ph, pw)
        int pw = index % pooled_width;
        int ph = (index / pooled_width) % pooled_height;
        int ctop = (index / pooled_width / pooled_height) % output_dim;
        int n = index / pooled_width / pooled_height / output_dim;

        // [start, end) interval for spatial sampling
        const DType* offset_bottom_rois = bottom_rois + n * 5;
        int roi_batch_ind = offset_bottom_rois[0];
        DType roi_start_w = static_cast<DType>(round(offset_bottom_rois[1])) * spatial_scale - 0.5;
        DType roi_start_h = static_cast<DType>(round(offset_bottom_rois[2])) * spatial_scale - 0.5;
        DType roi_end_w = static_cast<DType>(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5;
        DType roi_end_h = static_cast<DType>(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5;

        // Keep the ROI extent at least 0.1 so the bin sizes below are never zero.
        DType roi_width = max(roi_end_w - roi_start_w, 0.1);
        DType roi_height = max(roi_end_h - roi_start_h, 0.1);

        // Compute w and h at bottom
        DType bin_size_h = roi_height / static_cast<DType>(pooled_height);
        DType bin_size_w = roi_width / static_cast<DType>(pooled_width);

        // Spacing between consecutive samples inside one bin.
        DType sub_bin_size_h = bin_size_h / static_cast<DType>(sample_per_part);
        DType sub_bin_size_w = bin_size_w / static_cast<DType>(sample_per_part);

        // Cell of the part_size x part_size offset grid that covers this bin.
        int part_h = floor(static_cast<DType>(ph) / pooled_height*part_size);
        int part_w = floor(static_cast<DType>(pw) / pooled_width*part_size);
        int class_id = ctop / channels_each_class;

        // Offset of this bin, read from bottom_trans and scaled by trans_std;
        // the x plane precedes the y plane for each (n, class_id).
        DType trans_x = no_trans ? static_cast<DType>(0) :
          bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h)*part_size + part_w] * trans_std;
        DType trans_y = no_trans ? static_cast<DType>(0) :
          bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h)*part_size + part_w] * trans_std;

        // Bin origin, shifted by the offset expressed as a fraction of the ROI size.
        DType wstart = static_cast<DType>(pw)* bin_size_w
          + roi_start_w;
        wstart += trans_x * roi_width;
        DType hstart = static_cast<DType>(ph) * bin_size_h
          + roi_start_h;
        hstart += trans_y * roi_height;

        DType sum = 0;
        // Named sample_count so it does not shadow the kernel parameter `count`
        // that the enclosing loop macro uses as its bound.
        int sample_count = 0;
        int gw = floor(static_cast<DType>(pw) * group_size / pooled_width);
        int gh = floor(static_cast<DType>(ph)* group_size / pooled_height);
        gw = min(max(gw, 0), group_size - 1);
        gh = min(max(gh, 0), group_size - 1);

        // Position-sensitive input channel for this output element; it does not
        // depend on the sample position, so it is computed once.
        int c = (ctop*group_size + gh)*group_size + gw;

        // Sample the bin with bilinear interpolation and accumulate the values.
        const DType* offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width;
        for (int ih = 0; ih < sample_per_part; ih++) {
          for (int iw = 0; iw < sample_per_part; iw++) {
            DType w = wstart + iw*sub_bin_size_w;
            DType h = hstart + ih*sub_bin_size_h;
            // Skip samples more than half a pixel outside the feature map.
            if (w<-0.5 || w>width - 0.5 || h<-0.5 || h>height - 0.5) {
              continue;
            }
            // Clamp the remaining samples onto the pixel grid.
            w = min(max(w, 0.), width - 1.);
            h = min(max(h, 0.), height - 1.);
            DType val = bilinear_interp(offset_bottom_data + c*height*width, w, h, width, height);
            sum += val;
            sample_count++;
          }
        }
        top_data[index] = sample_count == 0 ? static_cast<DType>(0) : sum / sample_count;
        top_count[index] = sample_count;
      }
    }


本文描述了各個ROI特徵提取技術的流程,每一個新的技術都是對上一代方法缺點進行了改進。
最後,祝您身體健康,再見!


https://blog.csdn.net/jiongnima/article/details/80016683
https://towardsdatascience.com/review-dcn-deformable-convolutional-networks-2nd-runner-up-in-2017-coco-detection-object-14e488efce44

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章