A Brief Analysis of Faster R-CNN's roi_pooling_layer

After reading through the R-CNN series of papers, I started working through the source code. This post briefly records my understanding of the roi_pooling_layer source.
The author first adds the layer's parameter definition to caffe.proto; it consists of three fields:
    optional ROIPoolingParameter roi_pooling_param = 43;

    message ROIPoolingParameter {
      // Pad, kernel size, and stride are all given as a single value for equal
      // dimensions in height and width or as Y, X pairs.
      optional uint32 pooled_h = 1 [default = 0]; // The pooled output height
      optional uint32 pooled_w = 2 [default = 0]; // The pooled output width
      // Multiplicative spatial scale factor to translate ROI coords from their
      // input scale to the scale used when pooling
      optional float spatial_scale = 3 [default = 1];
    }
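
For reference, this is roughly how the layer is configured in a Fast R-CNN-style prototxt (a sketch; layer and blob names are illustrative, and spatial_scale = 1/16 corresponds to the accumulated stride of a VGG16 conv5 feature map):

    layer {
      name: "roi_pool5"
      type: "ROIPooling"
      bottom: "conv5_3"   # conv feature map
      bottom: "rois"      # proposals: [batch_index x1 y1 x2 y2]
      top: "pool5"
      roi_pooling_param {
        pooled_w: 7
        pooled_h: 7
        spatial_scale: 0.0625  # 1/16
      }
    }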

LayerSetUp

    template <typename Dtype>
    void ROIPoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
        const vector<Blob<Dtype>*>& top) {
      ROIPoolingParameter roi_pool_param = this->layer_param_.roi_pooling_param();
      CHECK_GT(roi_pool_param.pooled_h(), 0)
          << "pooled_h must be > 0";
      CHECK_GT(roi_pool_param.pooled_w(), 0)
          << "pooled_w must be > 0";
      pooled_height_ = roi_pool_param.pooled_h();
      pooled_width_ = roi_pool_param.pooled_w();
      spatial_scale_ = roi_pool_param.spatial_scale();
      LOG(INFO) << "Spatial scale: " << spatial_scale_;
    }
This simply validates and assigns the parameters defined above.

Reshape

    template <typename Dtype>
    void ROIPoolingLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
        const vector<Blob<Dtype>*>& top) {
      channels_ = bottom[0]->channels();
      height_ = bottom[0]->height();
      width_ = bottom[0]->width();
      // top[0] has the same number of channels as bottom[0]; after all,
      // this is just a pooling operation
      // top[0]'s num equals the number of ROIs: each ROI is mapped onto conv5
      top[0]->Reshape(bottom[1]->num(), channels_, pooled_height_,  // num of rois
          pooled_width_);
      max_idx_.Reshape(bottom[1]->num(), channels_, pooled_height_,
          pooled_width_);
    }

In other words (illustrative numbers, not from the source): with 128 proposals, a 512-channel conv5 blob, and pooled_h = pooled_w = 7, both top[0] and max_idx_ become 128×512×7×7, one fixed-size pooled feature per ROI.

Forward_cpu

    template <typename Dtype>
    void ROIPoolingLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
        const vector<Blob<Dtype>*>& top) {
      const Dtype* bottom_data = bottom[0]->cpu_data();
      // bottom_data is the full image after the forward pass through the
      // conv layers, i.e. the conv5 feature map
      const Dtype* bottom_rois = bottom[1]->cpu_data();
      // bottom_rois holds the ROI records: a batch index plus the
      // coordinates of two corner points
      // Number of ROIs
      int num_rois = bottom[1]->num();    // number of ROIs
      int batch_size = bottom[0]->num();  // number of conv5 feature maps in the batch
      int top_count = top[0]->count();
      Dtype* top_data = top[0]->mutable_cpu_data();
      caffe_set(top_count, Dtype(-FLT_MAX), top_data);
      // initialize all of top_data to the smallest value (count, value, pointer)
      int* argmax_data = max_idx_.mutable_cpu_data();
      caffe_set(top_count, -1, argmax_data);

      // For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R
      for (int n = 0; n < num_rois; ++n) {
        int roi_batch_ind = bottom_rois[0];  // batch index
        // scale the ROI coordinates onto the feature map
        int roi_start_w = round(bottom_rois[1] * spatial_scale_);
        int roi_start_h = round(bottom_rois[2] * spatial_scale_);
        int roi_end_w = round(bottom_rois[3] * spatial_scale_);
        int roi_end_h = round(bottom_rois[4] * spatial_scale_);
        CHECK_GE(roi_batch_ind, 0);
        CHECK_LT(roi_batch_ind, batch_size);

        int roi_height = max(roi_end_h - roi_start_h + 1, 1);
        int roi_width = max(roi_end_w - roi_start_w + 1, 1);
        const Dtype bin_size_h = static_cast<Dtype>(roi_height)
            / static_cast<Dtype>(pooled_height_);
        // ratio of the ROI size to the pooled output size: one pooled pixel
        // covers bin_size_h x bin_size_w ROI pixels
        const Dtype bin_size_w = static_cast<Dtype>(roi_width)
            / static_cast<Dtype>(pooled_width_);

        // locate the start of this ROI's conv5 feature map
        const Dtype* batch_data = bottom_data + bottom[0]->offset(roi_batch_ind);
        for (int c = 0; c < channels_; ++c) {
          for (int ph = 0; ph < pooled_height_; ++ph) {
            for (int pw = 0; pw < pooled_width_; ++pw) {
              // Compute pooling region for this output unit:
              //  start (included) = floor(ph * roi_height / pooled_height_)
              //  end (excluded) = ceil((ph + 1) * roi_height / pooled_height_)
              int hstart = static_cast<int>(floor(static_cast<Dtype>(ph)
                  * bin_size_h));
              int wstart = static_cast<int>(floor(static_cast<Dtype>(pw)
                  * bin_size_w));
              int hend = static_cast<int>(ceil(static_cast<Dtype>(ph + 1)
                  * bin_size_h));
              int wend = static_cast<int>(ceil(static_cast<Dtype>(pw + 1)
                  * bin_size_w));

              // clip to the feature map; height_ and width_ are the conv5 size
              hstart = min(max(hstart + roi_start_h, 0), height_);
              hend = min(max(hend + roi_start_h, 0), height_);
              wstart = min(max(wstart + roi_start_w, 0), width_);
              wend = min(max(wend + roi_start_w, 0), width_);
              // why add roi_start_h? because the ROI comes from the source
              // image, so its top-left corner does not start at (0, 0)
              bool is_empty = (hend <= hstart) || (wend <= wstart);

              const int pool_index = ph * pooled_width_ + pw;
              if (is_empty) {
                top_data[pool_index] = 0;
                argmax_data[pool_index] = -1;
              }

              for (int h = hstart; h < hend; ++h) {
                for (int w = wstart; w < wend; ++w) {
                  const int index = h * width_ + w;
                  if (batch_data[index] > top_data[pool_index]) {
                    // the conv5 pixel is carried over into the output
                    top_data[pool_index] = batch_data[index];
                    argmax_data[pool_index] = index;
                  }
                }
              }
            }
          }
          // Increment all data pointers by one channel
          batch_data += bottom[0]->offset(0, 1);
          top_data += top[0]->offset(0, 1);
          argmax_data += max_idx_.offset(0, 1);
        }
        // Increment ROI data pointer
        bottom_rois += bottom[1]->offset(1);
      }
    }

template <typename Dtype>
void ROIPoolingLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
        NOT_IMPLEMENTED;
}

To summarize, the forward pass works as follows:
it first maps each ROI onto the feature map by multiplying the original coordinates by spatial_scale (equal to 1 over the product of all strides up to this layer), then computes each output unit independently. Each output point stands for a region of the feature map of size bin_size_h = roi_height / pooled_height_ by bin_size_w = roi_width / pooled_width_; for every top point the code traverses the region it maps back to, takes the maximum value, and records where that maximum came from. (The CPU backward pass above is left NOT_IMPLEMENTED; the author only provides a GPU implementation.)
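
To make the bin arithmetic concrete, here is a minimal standalone C++ sketch of the same computation on a toy 8x8 single-channel feature map with one ROI pooled to 2x2 (all sizes and values are illustrative, not from the source):

    #include <algorithm>
    #include <cfloat>
    #include <cmath>
    #include <cstdio>

    int main() {
      const int H = 8, W = 8;                // feature map size
      const int pooled_h = 2, pooled_w = 2;  // output size per ROI
      float feat[H * W];
      for (int i = 0; i < H * W; ++i) feat[i] = static_cast<float>(i);

      // ROI already scaled to feature-map coordinates: [x1 y1 x2 y2]
      const int x1 = 1, y1 = 1, x2 = 6, y2 = 6;
      const float bin_h = static_cast<float>(std::max(y2 - y1 + 1, 1)) / pooled_h;
      const float bin_w = static_cast<float>(std::max(x2 - x1 + 1, 1)) / pooled_w;

      for (int ph = 0; ph < pooled_h; ++ph) {
        for (int pw = 0; pw < pooled_w; ++pw) {
          // bin boundaries, shifted by the ROI origin and clipped,
          // exactly as in Forward_cpu above
          int hstart = std::min(std::max(static_cast<int>(std::floor(ph * bin_h)) + y1, 0), H);
          int hend = std::min(std::max(static_cast<int>(std::ceil((ph + 1) * bin_h)) + y1, 0), H);
          int wstart = std::min(std::max(static_cast<int>(std::floor(pw * bin_w)) + x1, 0), W);
          int wend = std::min(std::max(static_cast<int>(std::ceil((pw + 1) * bin_w)) + x1, 0), W);
          float maxval = -FLT_MAX;
          int argmax = -1;
          for (int h = hstart; h < hend; ++h) {
            for (int w = wstart; w < wend; ++w) {
              if (feat[h * W + w] > maxval) {
                maxval = feat[h * W + w];
                argmax = h * W + w;  // recorded for the backward pass
              }
            }
          }
          std::printf("bin (%d,%d): max=%.0f at index %d\n", ph, pw, maxval, argmax);
        }
      }
      return 0;
    }

Each of the pooled_h x pooled_w bins is computed independently from its own sub-window, which is what lets an arbitrarily sized ROI produce a fixed-size output.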

Backward_gpu

The author implements the backward pass on the GPU:

    template <typename Dtype>
    __global__ void ROIPoolBackward(const int nthreads, const Dtype* top_diff,
        const int* argmax_data, const int num_rois, const Dtype spatial_scale,
        const int channels, const int height, const int width,
        const int pooled_height, const int pooled_width, Dtype* bottom_diff,
        const Dtype* bottom_rois) {
      CUDA_KERNEL_LOOP(index, nthreads) {
        // (n, c, h, w) coords in bottom data
        // iterate over bottom[0], i.e. the conv5 feature map
        int w = index % width;
        int h = (index / width) % height;
        int c = (index / width / height) % channels;
        int n = index / width / height / channels;
        Dtype gradient = 0;
        // Accumulate gradient over all ROIs that pooled this element
        for (int roi_n = 0; roi_n < num_rois; ++roi_n) {
          const Dtype* offset_bottom_rois = bottom_rois + roi_n * 5;
          int roi_batch_ind = offset_bottom_rois[0];
          // Skip if ROI's batch index doesn't match n: the ROI must belong
          // to the same conv5 feature map
          if (n != roi_batch_ind) {
            continue;
          }

          // scale the ROI coordinates onto the feature map
          int roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
          int roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
          int roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
          int roi_end_h = round(offset_bottom_rois[4] * spatial_scale);

          // Skip if ROI doesn't include (h, w)
          const bool in_roi = (w >= roi_start_w && w <= roi_end_w &&
                               h >= roi_start_h && h <= roi_end_h);
          if (!in_roi) {
            continue;
          }

          // offset of this ROI's pooled output relative to the start of top_diff
          int offset = (roi_n * channels + c) * pooled_height * pooled_width;
          const Dtype* offset_top_diff = top_diff + offset;
          // argmax_data records, for each pooled value, its position on conv5
          const int* offset_argmax_data = argmax_data + offset;

          // Compute feasible set of pooled units that could have pooled
          // this bottom unit

          // Force malformed ROIs to be 1x1
          int roi_width = max(roi_end_w - roi_start_w + 1, 1);
          int roi_height = max(roi_end_h - roi_start_h + 1, 1);

          Dtype bin_size_h = static_cast<Dtype>(roi_height)
                             / static_cast<Dtype>(pooled_height);
          Dtype bin_size_w = static_cast<Dtype>(roi_width)
                             / static_cast<Dtype>(pooled_width);

          int phstart = floor(static_cast<Dtype>(h - roi_start_h) / bin_size_h);
          int phend = ceil(static_cast<Dtype>(h - roi_start_h + 1) / bin_size_h);
          int pwstart = floor(static_cast<Dtype>(w - roi_start_w) / bin_size_w);
          int pwend = ceil(static_cast<Dtype>(w - roi_start_w + 1) / bin_size_w);

          phstart = min(max(phstart, 0), pooled_height);
          phend = min(max(phend, 0), pooled_height);
          pwstart = min(max(pwstart, 0), pooled_width);
          pwend = min(max(pwend, 0), pooled_width);

          for (int ph = phstart; ph < phend; ++ph) {
            for (int pw = pwstart; pw < pwend; ++pw) {
              // a single conv5 point may contribute to several pooled
              // outputs, so the gradients are accumulated
              if (offset_argmax_data[ph * pooled_width + pw] == (h * width + w)) {
                gradient += offset_top_diff[ph * pooled_width + pw];
              }
            }
          }
        }
        bottom_diff[index] = gradient;
      }
    }
    template <typename Dtype>
    void ROIPoolingLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
      if (!propagate_down[0]) {
        return;
      }
      const Dtype* bottom_rois = bottom[1]->gpu_data();
      const Dtype* top_diff = top[0]->gpu_diff();
      Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
      const int count = bottom[0]->count();
      caffe_gpu_set(count, Dtype(0.), bottom_diff);
      const int* argmax_data = max_idx_.gpu_data();
      // NOLINT_NEXT_LINE(whitespace/operators)
      ROIPoolBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
          count, top_diff, argmax_data, top[0]->num(), spatial_scale_, channels_,
          height_, width_, pooled_height_, pooled_width_, bottom_diff, bottom_rois);
      CUDA_POST_KERNEL_CHECK;
    }

To sum up:
the kernel iterates over the feature map, decoding n, c, h, w from the flat index in preparation for writing bottom_diff, and maps every ROI onto feature-map coordinates. One point worth pausing on: if (h, w) does not lie inside an ROI, the kernel simply continues. This is not hard to justify: a point inside an ROI may contribute to that ROI's pooled output (by being the maximum of some bin), whereas a point outside the region cannot possibly contribute. Conversely, a single point may contribute to several regions, so as the loss flows back, the gradients arriving at the same point are accumulated.
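
As a minimal CPU sketch of this argmax-based gradient routing (one ROI pooled to 2x2; all values are illustrative, not from the source):

    #include <cstdio>

    int main() {
      const int H = 4, W = 4;                        // feature map size
      float top_diff[4] = {0.1f, 0.2f, 0.3f, 0.4f};  // gradients from above
      // argmax from the forward pass: the flat feature-map index of each
      // bin's maximum; here two bins happen to share the same winner (5)
      int argmax[4] = {5, 5, 9, 10};
      float bottom_diff[H * W] = {0};

      // every bottom element accumulates the gradients of all pooled units
      // whose recorded maximum is that element (one thread per element on
      // the GPU)
      for (int index = 0; index < H * W; ++index) {
        for (int p = 0; p < 4; ++p) {
          if (argmax[p] == index) bottom_diff[index] += top_diff[p];
        }
      }
      std::printf("bottom_diff[5] = %.1f\n", bottom_diff[5]);  // 0.1 + 0.2 = 0.3
      return 0;
    }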

References

http://blog.csdn.net/xyy19920105/article/details/50420779
http://blog.csdn.net/iamzhangzhuping/article/details/51500162
