[NVCaffe Source Code Analysis] MultiBoxLossLayer (Work in Progress)

    The MultiBoxLossLayer code in NVCaffe is honestly not that easy to follow; many of its details are best worked out by going back and forth with the SSD paper. Before diving into the source, here is a sample parameter configuration for this layer. The code walkthrough below uses this sample as its running reference.

layer {
    name: "mbox_loss"
    type: "MultiBoxLoss"
    bottom: "mbox_loc"
    bottom: "mbox_conf"
    bottom: "mbox_priorbox"
    bottom: "label"
    top: "mbox_loss"
    include {
      phase: TRAIN
    }
    propagate_down: true
    propagate_down: true
    propagate_down: false
    propagate_down: false
    loss_param {
      normalization: VALID
    }
    multibox_loss_param {
      loc_loss_type: SMOOTH_L1   # localization loss function
      conf_loss_type: SOFTMAX    # confidence loss function
      loc_weight: 1
      num_classes: 5             # number of classes (background + object classes)
      share_location: true
      match_type: PER_PREDICTION
      overlap_threshold: 0.5
      use_prior_for_matching: true
      background_label_id: 0     # background class id, usually 0
      use_difficult_gt: false
      neg_pos_ratio: 3           # negatives:positives ratio of 3:1
      neg_overlap: 0.5           # IoU threshold for negatives
      code_type: CENTER_SIZE
      ignore_cross_boundary_bbox: false
      mining_type: MAX_NEGATIVE  # hard-example mining strategy
    }
  }
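
With this configuration (num_classes: 5, share_location: true), the Reshape checks shown later pin down the bottom shapes: mbox_loc must carry num_priors_ * 4 channels and mbox_conf must carry num_priors_ * 5 channels. For an SSD300-style model with 8732 priors, for example, that works out to 34928 and 43660 channels respectively.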

LayerSetUp

    Every layer calls LayerSetUp to perform its layer-specific setup, which mainly consists of reading and processing the layer's parameters, setting the shapes of the blobs it owns, and so on.

template <typename Ftype, typename Btype>
void MultiBoxLossLayer<Ftype, Btype>::LayerSetUp(const vector<Blob*>& bottom,
      const vector<Blob*>& top) {
  LossLayer<Ftype, Btype>::LayerSetUp(bottom, top);
  // If the prototxt does not explicitly configure propagate_down, apply these defaults.
  if (this->layer_param_.propagate_down_size() == 0) {
    this->layer_param_.add_propagate_down(true);
    this->layer_param_.add_propagate_down(true);
    this->layer_param_.add_propagate_down(false);
    this->layer_param_.add_propagate_down(false);
  }
  const MultiBoxLossParameter& multibox_loss_param =
      this->layer_param_.multibox_loss_param();
  multibox_loss_param_ = this->layer_param_.multibox_loss_param();
 
  // Number of images, i.e. the N in (N, C, H, W).
  num_ = bottom[0]->num(); 
  /* bottom[2] is mbox_priorbox, the output of PriorBoxLayer, i.e. the
  generated anchors. Its shape is (1, 2, K): the first of the two channels
  stores the coordinates of all generated anchors (the second stores the
  variances). Each anchor takes 4 coordinates, so dividing by 4 yields the
  total number of anchors. */
  num_priors_ = bottom[2]->height() / 4;
  // Get other parameters.
  CHECK(multibox_loss_param.has_num_classes()) << "Must provide num_classes.";
  // Number of classes: note this is the object classes plus one background class.
  num_classes_ = multibox_loss_param.num_classes();
  CHECK_GE(num_classes_, 1) << "num_classes should not be less than 1.";
  /* What does this mean? Whether all classes share one set of location
  predictions, i.e. different classes are distinguished on the same box.
  Usually true. */
  share_location_ = multibox_loss_param.share_location();
  loc_classes_ = share_location_ ? 1 : num_classes_;
  background_label_id_ = multibox_loss_param.background_label_id();
  use_difficult_gt_ = multibox_loss_param.use_difficult_gt();
  // Hard-example mining strategy.
  mining_type_ = multibox_loss_param.mining_type();
  if (multibox_loss_param.has_do_neg_mining()) {
    LOG(WARNING) << "do_neg_mining is deprecated, use mining_type instead.";
    do_neg_mining_ = multibox_loss_param.do_neg_mining();
    CHECK_EQ(do_neg_mining_,
             mining_type_ != MultiBoxLossParameter_MiningType_NONE);
  }
  // A mining_type of NONE means hard-example mining is disabled.
  do_neg_mining_ = mining_type_ != MultiBoxLossParameter_MiningType_NONE;

  if (!this->layer_param_.loss_param().has_normalization() &&
      this->layer_param_.loss_param().has_normalize()) {
    normalization_ = this->layer_param_.loss_param().normalize() ?
                     LossParameter_NormalizationMode_VALID :
                     LossParameter_NormalizationMode_BATCH_SIZE;
  } else {
    normalization_ = this->layer_param_.loss_param().normalization();
  }

  if (do_neg_mining_) {
    CHECK(share_location_)
        << "Currently only support negative mining if share_location is true.";
  }

  vector<int> loss_shape(1, 1);
  // Set up localization loss layer.
  loc_weight_ = multibox_loss_param.loc_weight();
  loc_loss_type_ = multibox_loss_param.loc_loss_type();
  // fake shape.
  vector<int> loc_shape(1, 1);
  loc_shape.push_back(4);
  loc_pred_ = Blob::create<Dtype>();
  loc_pred_->Reshape(loc_shape);
  loc_gt_ = Blob::create<Dtype>();
  loc_gt_->Reshape(loc_shape);
  loc_bottom_vec_.push_back(loc_pred_.get());
  loc_bottom_vec_.push_back(loc_gt_.get());
  loc_loss_ = Blob::create<Dtype>();
  loc_loss_->Reshape(loss_shape);
  loc_top_vec_.push_back(loc_loss_.get());
  if (loc_loss_type_ == MultiBoxLossParameter_LocLossType_L2) {
    LayerParameter layer_param;
    layer_param.set_name(this->layer_param_.name() + "_l2_loc");
    layer_param.set_type("EuclideanLoss");
    layer_param.add_loss_weight(loc_weight_);
    loc_loss_layer_ = LayerRegistry::CreateLayer(layer_param, this->parent_rank());
    loc_loss_layer_->SetUp(loc_bottom_vec_, loc_top_vec_);
  } else if (loc_loss_type_ == MultiBoxLossParameter_LocLossType_SMOOTH_L1) {
    LayerParameter layer_param;
    layer_param.set_name(this->layer_param_.name() + "_smooth_L1_loc");
    layer_param.set_type("SmoothL1Loss");
    layer_param.add_loss_weight(loc_weight_);
    loc_loss_layer_ = LayerRegistry::CreateLayer(layer_param, this->parent_rank());
    loc_loss_layer_->SetUp(loc_bottom_vec_, loc_top_vec_);
  } else {
    LOG(FATAL) << "Unknown localization loss type.";
  }
  // Set up confidence loss layer.
  conf_weight_ = multibox_loss_param.conf_weight();
  conf_loss_type_ = multibox_loss_param.conf_loss_type();
  conf_pred_ = Blob::create<Dtype>();
  conf_gt_ = Blob::create<Dtype>();
  conf_loss_ = Blob::create<Dtype>();
  conf_loss_->Reshape(loss_shape);
  conf_top_vec_.push_back(conf_loss_.get());
  if (conf_loss_type_ == MultiBoxLossParameter_ConfLossType_SOFTMAX) {
    CHECK_GE(background_label_id_, 0)
        << "background_label_id should be within [0, num_classes) for Softmax.";
    CHECK_LT(background_label_id_, num_classes_)
        << "background_label_id should be within [0, num_classes) for Softmax.";
    LayerParameter layer_param;
    layer_param.set_name(this->layer_param_.name() + "_softmax_conf");
    layer_param.set_type("SoftmaxWithLoss");
    layer_param.add_loss_weight(conf_weight_);
    // layer_param.add_loss_weight(Dtype(1.));
    layer_param.mutable_loss_param()->set_normalization(
        LossParameter_NormalizationMode_NONE);
    SoftmaxParameter* softmax_param = layer_param.mutable_softmax_param();
    softmax_param->set_axis(1);
    // Fake reshape.
    vector<int> conf_shape(1, 1);
    conf_gt_->Reshape(conf_shape);
    conf_shape.push_back(num_classes_);
    conf_pred_->Reshape(conf_shape);
    conf_bottom_vec_.push_back(conf_pred_.get());
    conf_bottom_vec_.push_back(conf_gt_.get());
    conf_loss_layer_ = LayerRegistry::CreateLayer(layer_param, this->parent_rank());
    conf_loss_layer_->SetUp(conf_bottom_vec_, conf_top_vec_);
  } else if (conf_loss_type_ == MultiBoxLossParameter_ConfLossType_LOGISTIC) {
    LayerParameter layer_param;
    layer_param.set_name(this->layer_param_.name() + "_logistic_conf");
    layer_param.set_type("SigmoidCrossEntropyLoss");
    layer_param.add_loss_weight(conf_weight_);
    // layer_param.add_loss_weight(Dtype(1.));
    // Fake reshape.
    vector<int> conf_shape(1, 1);
    conf_shape.push_back(num_classes_);
    conf_gt_->Reshape(conf_shape);
    conf_pred_->Reshape(conf_shape);
    conf_bottom_vec_.push_back(conf_pred_.get());
    conf_bottom_vec_.push_back(conf_gt_.get());
    conf_loss_layer_ = LayerRegistry::CreateLayer(layer_param, this->parent_rank());
    conf_loss_layer_->SetUp(conf_bottom_vec_, conf_top_vec_);
  } else {
    LOG(FATAL) << "Unknown confidence loss type.";
  }
}

Reshape

template <typename Ftype, typename Btype>
void MultiBoxLossLayer<Ftype, Btype>::Reshape(const vector<Blob*>& bottom,
      const vector<Blob*>& top) {
  LossLayer<Ftype, Btype>::Reshape(bottom, top);
  num_ = bottom[0]->num();
  num_priors_ = bottom[2]->height() / 4; // number of anchors, as explained above
  num_gt_ = bottom[3]->height(); // bottom[3] is the label blob; this is the number of gt boxes
  CHECK_EQ(bottom[0]->num(), bottom[1]->num());
  CHECK_EQ(num_priors_ * loc_classes_ * 4, bottom[0]->channels())
      << "Number of priors must match number of location predictions.";
  CHECK_EQ(num_priors_ * num_classes_, bottom[1]->channels())
      << "Number of priors must match number of confidence predictions.";
}

bottom[3] is the label blob, and bottom[3]->height() gives the total number of gt boxes. Why? Take AnnotatedDataLayer as the data layer: its label output shape is normally (1, 1, N, 8), where N (the height) is, by design, the number of gt boxes across the whole batch, and the 8 values in the last dimension carry the per-box annotation (class label, box coordinates, and so on).
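
As a quick illustration, here is a minimal sketch of unpacking one row of that label blob; the field order follows GetGroundTruth below, while the helper name and the flat-array view are hypothetical:

#include <cmath>

// One row of the (1, 1, N, 8) label blob:
// item_id, group_label, instance_id, xmin, ymin, xmax, ymax, difficult.
inline void ReadGtRow(const float* gt_data, int i, int* item_id, int* label,
                      float box[4], bool* difficult) {
  const float* row = gt_data + i * 8;
  *item_id = static_cast<int>(row[0]);            // image index within the batch
  *label = static_cast<int>(std::round(row[1]));  // class label
  // row[2] is the instance id, unused here.
  box[0] = row[3];  // xmin (normalized coordinates)
  box[1] = row[4];  // ymin
  box[2] = row[5];  // xmax
  box[3] = row[6];  // ymax
  *difficult = row[7] != 0.f;                     // difficult flag
}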

Forward

    This is the core of the layer and also its most complex part. The main work: 1) matching and partitioning positive/negative samples; 2) hard-example mining; 3) computing the losses.

template <typename Ftype, typename Btype>
void MultiBoxLossLayer<Ftype, Btype>::Forward_cpu(const vector<Blob*>& bottom,
    const vector<Blob*>& top) {
  const Dtype* loc_data = bottom[0]->cpu_data<Dtype>();
  const Dtype* conf_data = bottom[1]->cpu_data<Dtype>();
  const Dtype* prior_data = bottom[2]->cpu_data<Dtype>();
  const Dtype* gt_data = bottom[3]->cpu_data<Dtype>();
  // refinedet
  const Dtype* arm_conf_data = NULL;
  const Dtype* arm_loc_data = NULL;
  vector<LabelBBox> all_arm_loc_preds;
  if (bottom.size() >= 5) {
	  arm_conf_data = bottom[4]->cpu_data<Dtype>();
  }
  if (bottom.size() >= 6) {
    arm_loc_data = bottom[5]->cpu_data<Dtype>();
    GetLocPredictions(arm_loc_data, num_, num_priors_, loc_classes_, share_location_,
                      &all_arm_loc_preds);
  }
  // Retrieve all ground truth.
  map<int, vector<NormalizedBBox> > all_gt_bboxes;
  GetGroundTruth(gt_data, num_classes_, num_gt_, background_label_id_, use_difficult_gt_,
                 &all_gt_bboxes);

  // Retrieve all prior bboxes. It is same within a batch since we assume all
  // images in a batch are of same dimension.
  vector<NormalizedBBox> prior_bboxes;
  vector<vector<float> > prior_variances;
  GetPriorBBoxes(prior_data, num_priors_, &prior_bboxes, &prior_variances);

  // Retrieve all predictions.
  vector<LabelBBox> all_loc_preds;
  GetLocPredictions(loc_data, num_, num_priors_, loc_classes_, share_location_,
                    &all_loc_preds);

  // Find matches between source bboxes and ground truth bboxes.
  vector<map<int, vector<float> > > all_match_overlaps;
  if (bottom.size() >= 6) {
    CasRegFindMatches(all_loc_preds, all_gt_bboxes, prior_bboxes, prior_variances,
			    multibox_loss_param_, &all_match_overlaps, &all_match_indices_, all_arm_loc_preds);
  } else {
    FindMatches(all_loc_preds, all_gt_bboxes, prior_bboxes, prior_variances,
                multibox_loss_param_, &all_match_overlaps, &all_match_indices_);
  }
  num_matches_ = 0;
  int num_negs = 0;
  // Sample hard negative (and positive) examples based on mining type.
  MineHardExamples<Dtype>(*bottom[1],
      all_loc_preds, all_gt_bboxes, prior_bboxes,
      prior_variances, all_match_overlaps, multibox_loss_param_,
      &num_matches_, &num_negs, &all_match_indices_, &all_neg_indices_, arm_conf_data, do_neg_mining_);

  if (num_matches_ >= 1) {
    // Form data to pass on to loc_loss_layer_.
    vector<int> loc_shape(2);
    loc_shape[0] = 1;
    loc_shape[1] = num_matches_ * 4;
    loc_pred_->Reshape(loc_shape);
    loc_gt_->Reshape(loc_shape);
    Dtype* loc_pred_data = loc_pred_->mutable_cpu_data<Dtype>();
    Dtype* loc_gt_data = loc_gt_->mutable_cpu_data<Dtype>();
    if (bottom.size() >= 6) {
      CasRegEncodeLocPrediction(all_loc_preds, all_gt_bboxes, all_match_indices_,
	  					prior_bboxes, prior_variances, multibox_loss_param_,
	  					loc_pred_data, loc_gt_data, all_arm_loc_preds);
    } else {
      EncodeLocPrediction(all_loc_preds, all_gt_bboxes, all_match_indices_,
                          prior_bboxes, prior_variances, multibox_loss_param_,
                          loc_pred_data, loc_gt_data);
    }
    loc_loss_layer_->Reshape(loc_bottom_vec_, loc_top_vec_);
    loc_loss_layer_->Forward(loc_bottom_vec_, loc_top_vec_);
  } else {
    loc_loss_->mutable_cpu_data<Dtype>()[0] = 0;
  }

  // Form data to pass on to conf_loss_layer_.
  if (do_neg_mining_) {
    num_conf_ = num_matches_ + num_negs;
  } else {
    num_conf_ = num_ * num_priors_;
  }

  // Disabled debug logging of the matching statistics.
  if (0) {
      const Solver* solver = this->parent_solver();

      if ((solver && solver->display()) || solver==0) {
        LOG(INFO) << cv::format("iter %d, do_neg_mining %d, num_matches %d, num_negs %d, num_conf %d, num %d, num_priors %d\n",
                              this->iter(), do_neg_mining_, num_matches_, num_negs, num_conf_, num_, num_priors_);
      }
  }

  if (num_conf_ >= 1) {
    // Reshape the confidence data.
    vector<int> conf_shape;
    if (conf_loss_type_ == MultiBoxLossParameter_ConfLossType_SOFTMAX) {
      conf_shape.push_back(num_conf_);
      conf_bottom_vec_[1]->Reshape(conf_shape);
      conf_shape.push_back(num_classes_);
      conf_bottom_vec_[0]->Reshape(conf_shape);
    } else if (conf_loss_type_ == MultiBoxLossParameter_ConfLossType_LOGISTIC) {
      conf_shape.push_back(1);
      conf_shape.push_back(num_conf_);
      conf_shape.push_back(num_classes_);
      conf_bottom_vec_[0]->Reshape(conf_shape);
      conf_bottom_vec_[1]->Reshape(conf_shape);
    } else {
      LOG(FATAL) << "Unknown confidence loss type.";
    }
    if (!do_neg_mining_) {
      // Consider all scores.
      // Share data and diff with bottom[1].
      CHECK_EQ(conf_pred_->count(), bottom[1]->count());
      conf_pred_->ShareData(*(bottom[1]));
    }
    Dtype* conf_pred_data = conf_pred_->mutable_cpu_data<Dtype>();
    Dtype* conf_gt_data = conf_gt_->mutable_cpu_data<Dtype>();
    caffe_set(conf_gt_->count(), Dtype(background_label_id_), conf_gt_data);
    EncodeConfPrediction(conf_data, num_, num_priors_, multibox_loss_param_,
                         all_match_indices_, all_neg_indices_, all_gt_bboxes,
                         conf_pred_data, conf_gt_data, do_neg_mining_);
    conf_loss_layer_->Reshape(conf_bottom_vec_, conf_top_vec_);
    conf_loss_layer_->Forward(conf_bottom_vec_, conf_top_vec_);
  } else {
    conf_loss_->mutable_cpu_data<Dtype>()[0] = 0;
  }

  top[0]->mutable_cpu_data<Dtype>()[0] = 0;
  if (this->layer_param_.propagate_down(0)) {
    Dtype normalizer = LossLayer<Ftype, Btype>::GetNormalizer(
        normalization_, num_, num_priors_, num_matches_);
    top[0]->mutable_cpu_data<Dtype>()[0] +=
        loc_weight_ * loc_loss_->cpu_data<Dtype>()[0] / normalizer;
  }
  if (this->layer_param_.propagate_down(1)) {
    Dtype normalizer = LossLayer<Ftype, Btype>::GetNormalizer(
        normalization_, num_, num_priors_, num_matches_);
    top[0]->mutable_cpu_data<Dtype>()[0] +=
        conf_weight_ * conf_loss_->cpu_data<Dtype>()[0] / normalizer;
  }
}
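
Note that both loss terms written into top[0] are divided by a normalizer from LossLayer::GetNormalizer, which lives in the base class. Assuming the standard Caffe LossParameter semantics, it resolves roughly as follows (a sketch of the assumed behavior, not the NVCaffe source):

// Assumed behavior of GetNormalizer(mode, num, num_priors, num_matches):
//   FULL       -> num * num_priors       (normalize by every prediction)
//   VALID      -> max(num_matches, 1)    (normalize by matched priors only;
//                                         the sample config above uses VALID)
//   BATCH_SIZE -> num
//   NONE       -> 1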

Forward_cpu first calls GetGroundTruth to collect all gt boxes of the current batch into all_gt_bboxes, a map keyed by image id.

template <typename Dtype>
void GetGroundTruth(const Dtype* gt_data, const int num_classes, const int num_gt,
      const int background_label_id, const bool use_difficult_gt,
      map<int, vector<NormalizedBBox> >* all_gt_bboxes) {
  all_gt_bboxes->clear();
  for (int i = 0; i < num_gt; ++i) {
    // 8 fields: item_id, group_label, instance_id, xmin, ymin, xmax, ymax, difficult
    int start_idx = i * 8; 
    int item_id = gt_data[start_idx];
    if (item_id == -1) {
      continue;
    }
    // group_label, i.e. the class label
    int label = std::round(gt_data[start_idx + 1]);
    if (label <= background_label_id) {
      DLOG(WARNING) << "Ignoring background label in the dataset: " << gt_data[start_idx + 1];
      continue;
    }
    if (label >= num_classes) {
      DLOG(WARNING) << "Ignoring label >= num_classes in the dataset: " << gt_data[start_idx + 1];
      continue;
    }
    bool difficult = static_cast<bool>(gt_data[start_idx + 7]);
    if (!use_difficult_gt && difficult) {
      // Skip reading difficult ground truth.
      continue;
    }
    NormalizedBBox bbox;
    bbox.set_label(label);
    bbox.set_xmin(gt_data[start_idx + 3]);
    bbox.set_ymin(gt_data[start_idx + 4]);
    bbox.set_xmax(gt_data[start_idx + 5]);
    bbox.set_ymax(gt_data[start_idx + 6]);
    bbox.set_difficult(difficult);
    // box area
    float bbox_size = BBoxSize(bbox);
    bbox.set_size(bbox_size);
    (*all_gt_bboxes)[item_id].push_back(bbox);
  }
}

Next comes GetPriorBBoxes. Despite the name, it does not compute the prior bboxes; they already exist in prior_data, which comes from bottom[2]. The function merely splits prior_data into its box part and its variance part, storing them in prior_bboxes and prior_variances respectively.
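
The memory layout assumed by the loop indices below is two consecutive blocks of num_priors * 4 floats each:

// prior_data layout (conceptual), with num_priors = N:
//   [ xmin_0 ymin_0 xmax_0 ymax_0 | ... | xmin_{N-1} ymin_{N-1} xmax_{N-1} ymax_{N-1} ]  N*4 coords
//   [ var_0[0..3]                 | ... | var_{N-1}[0..3]                              ]  N*4 variances
// Box i therefore starts at offset i*4, and its variances at (N + i)*4.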

template <typename Dtype>
void GetPriorBBoxes(const Dtype* prior_data, const int num_priors,
      vector<NormalizedBBox>* prior_bboxes,
      vector<vector<float> >* prior_variances) {
  prior_bboxes->clear();
  prior_variances->clear();
  for (int i = 0; i < num_priors; ++i) {
    int start_idx = i * 4;
    NormalizedBBox bbox;
    bbox.set_xmin(prior_data[start_idx]);
    bbox.set_ymin(prior_data[start_idx + 1]);
    bbox.set_xmax(prior_data[start_idx + 2]);
    bbox.set_ymax(prior_data[start_idx + 3]);
    float bbox_size = BBoxSize(bbox);
    bbox.set_size(bbox_size);
    prior_bboxes->push_back(bbox);
  }
  
  // prior_data consists of two parts: all the coordinates first, then all the variances.
  for (int i = 0; i < num_priors; ++i) {
    int start_idx = (num_priors + i) * 4;
    vector<float> var;
    for (int j = 0; j < 4; ++j) {
      var.push_back(prior_data[start_idx + j]);
    }
    prior_variances->push_back(var);
  }
}

The next call, GetLocPredictions, stores the predicted boxes into loc_preds. share_location is normally set to true, so all classes share one set of predicted box coordinates.

template <typename Dtype>
void GetLocPredictions(const Dtype* loc_data, const int num,
      const int num_preds_per_class, const int num_loc_classes,
      const bool share_location, vector<LabelBBox>* loc_preds) {
  loc_preds->clear();
  if (share_location) {
    CHECK_EQ(num_loc_classes, 1);
  }
  loc_preds->resize(num); // Important: this effectively pre-allocates the storage.
  for (int i = 0; i < num; ++i) {  
    //typedef map<int, vector<NormalizedBBox> > LabelBBox;
    LabelBBox& label_bbox = (*loc_preds)[i];
    for (int p = 0; p < num_preds_per_class; ++p) {
      int start_idx = p * num_loc_classes * 4;
      for (int c = 0; c < num_loc_classes; ++c) {
        // With share_location == true the label is the dummy value -1;
        // otherwise labels run from 0 to num_loc_classes - 1.
        int label = share_location ? -1 : c; 
        if (label_bbox.find(label) == label_bbox.end()) {
          label_bbox[label].resize(num_preds_per_class);
        }
        // label_bbox is a map, so a label of -1 is not a problem
        label_bbox[label][p].set_xmin(loc_data[start_idx + c * 4]);
        label_bbox[label][p].set_ymin(loc_data[start_idx + c * 4 + 1]);
        label_bbox[label][p].set_xmax(loc_data[start_idx + c * 4 + 2]);
        label_bbox[label][p].set_ymax(loc_data[start_idx + c * 4 + 3]);
      }
    }
    loc_data += num_preds_per_class * num_loc_classes * 4;
  }
}
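
For example, with share_location = true every prediction is stored under the dummy label -1, so the predicted box of prior p in image i would be looked up like this (a usage sketch):

// all_loc_preds[i] is a LabelBBox, i.e. map<int, vector<NormalizedBBox> >.
const NormalizedBBox& pred = all_loc_preds[i].find(-1)->second[p];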

The SSD paper's discussion of the matching strategy is worth recapping before reading the matching code:

During training we first have to decide which anchor each ground-truth box (gt) in the image is matched with; the matched anchor is then responsible for predicting that gt. In YOLO, a gt is assigned to the grid cell containing its center, and within that cell the bounding box with the highest IoU predicts it. SSD works quite differently and matches anchors to gts by two rules. First, every gt is matched with the anchor that has the highest IoU with it, which guarantees that each gt gets at least one matching anchor. An anchor matched to a gt is called a positive sample (strictly speaking it is the prediction box attached to the prior, but since they correspond one-to-one the terms are used interchangeably). Conversely, an anchor that matches no gt can only be matched to the background and becomes a negative sample.

An image contains very few gts but a great many anchors, so under the first rule alone almost all anchors would be negatives and the positive/negative imbalance would be extreme. Hence the second rule: any remaining unmatched anchor whose IoU with some gt exceeds a threshold (usually 0.5) is also matched with that gt. A single gt may therefore be matched by several anchors, i.e. multiple predicted boxes target the same object, which is fine. The reverse is not allowed: an anchor can match only one gt, so if several gts exceed the threshold with the same anchor, the anchor is matched with the gt of highest IoU. This anchor-to-gt matching is carried out by the FindMatches function; a simplified sketch of the two rules follows.
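
This is a minimal sketch of the matching logic, not the actual MatchBBox implementation: the real code processes candidate (prior, gt) pairs in descending-overlap order and also handles the BIPARTITE-only mode, and JaccardOverlap is assumed to compute the IoU of two NormalizedBBox (as in Caffe's bbox_util):

#include <algorithm>
#include <vector>
using std::vector;

void SketchMatch(const vector<NormalizedBBox>& gts,
                 const vector<NormalizedBBox>& priors,
                 float overlap_threshold,
                 vector<int>* match_indices,       // prior -> gt index, -1 = unmatched
                 vector<float>* match_overlaps) {  // prior -> best IoU over all gts
  const int num_priors = priors.size();
  const int num_gts = gts.size();
  match_indices->assign(num_priors, -1);
  match_overlaps->assign(num_priors, 0.f);
  vector<vector<float> > overlaps(num_priors, vector<float>(num_gts, 0.f));
  for (int p = 0; p < num_priors; ++p) {
    for (int g = 0; g < num_gts; ++g) {
      overlaps[p][g] = JaccardOverlap(priors[p], gts[g]);
      (*match_overlaps)[p] = std::max((*match_overlaps)[p], overlaps[p][g]);
    }
  }
  // Rule 1 (bipartite): every gt grabs its best still-free prior, so each gt
  // is guaranteed at least one match.
  for (int g = 0; g < num_gts; ++g) {
    int best_p = -1;
    float best_iou = -1.f;
    for (int p = 0; p < num_priors; ++p) {
      if ((*match_indices)[p] == -1 && overlaps[p][g] > best_iou) {
        best_iou = overlaps[p][g];
        best_p = p;
      }
    }
    if (best_p >= 0) (*match_indices)[best_p] = g;
  }
  // Rule 2 (per-prediction): an unmatched prior takes the gt with the highest
  // IoU, but only if that IoU clears overlap_threshold; otherwise it remains
  // a negative. Each prior matches at most one gt.
  for (int p = 0; p < num_priors; ++p) {
    if ((*match_indices)[p] != -1) continue;
    int best_g = -1;
    float best_iou = -1.f;
    for (int g = 0; g < num_gts; ++g) {
      if (overlaps[p][g] > best_iou) {
        best_iou = overlaps[p][g];
        best_g = g;
      }
    }
    if (best_g >= 0 && best_iou >= overlap_threshold) (*match_indices)[p] = best_g;
  }
}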

void FindMatches(const vector<LabelBBox>& all_loc_preds,
      const map<int, vector<NormalizedBBox> >& all_gt_bboxes,
      const vector<NormalizedBBox>& prior_bboxes,
      const vector<vector<float> >& prior_variances,
      const MultiBoxLossParameter& multibox_loss_param,
      vector<map<int, vector<float> > >* all_match_overlaps,
      vector<map<int, vector<int> > >* all_match_indices) {
  // all_match_overlaps->clear();
  // all_match_indices->clear();
  // Get parameters.
  CHECK(multibox_loss_param.has_num_classes()) << "Must provide num_classes.";
  const int num_classes = multibox_loss_param.num_classes();
  CHECK_GE(num_classes, 1) << "num_classes should not be less than 1.";
  const bool share_location = multibox_loss_param.share_location();
  const int loc_classes = share_location ? 1 : num_classes;
  const MatchType match_type = multibox_loss_param.match_type(); // matching type
  // IoU threshold used to separate positives from negatives.
  const float overlap_threshold = multibox_loss_param.overlap_threshold(); 
  const bool use_prior_for_matching = multibox_loss_param.use_prior_for_matching();
  const int background_label_id = multibox_loss_param.background_label_id();
  const CodeType code_type = multibox_loss_param.code_type();
  const bool encode_variance_in_target =
      multibox_loss_param.encode_variance_in_target();
  const bool ignore_cross_boundary_bbox =
      multibox_loss_param.ignore_cross_boundary_bbox();
  // Find the matches.
  int num = all_loc_preds.size(); // number of images
  for (int i = 0; i < num; ++i) {
    map<int, vector<int> > match_indices;
    map<int, vector<float> > match_overlaps;
    // Check if there is ground truth for current image.
    /* Does the current image have any gt? all_gt_bboxes is a
    map<int, vector<NormalizedBBox> > keyed by image id; images without gt
    are simply skipped when GetGroundTruth fills all_gt_bboxes. */
    if (all_gt_bboxes.find(i) == all_gt_bboxes.end()) {
      // There is no gt for current image. All predictions are negative.
      // This image has no gt, so naturally all predicted boxes are negatives.
      const vector<NormalizedBBox> gt_bboxes;
      vector<int> temp_match_indices;
      vector<float> temp_match_overlaps;
      const int label = -1;
      MatchBBox(gt_bboxes, prior_bboxes, label, match_type, overlap_threshold,
                ignore_cross_boundary_bbox, &temp_match_indices,
                &temp_match_overlaps, multibox_loss_param.ignore_difficult_gt());
      if (share_location) {
        match_indices[label] = temp_match_indices;
        match_overlaps[label] = temp_match_overlaps;
      }
      all_match_indices->push_back(match_indices);
      all_match_overlaps->push_back(match_overlaps);
      continue;
    }
    // Find match between predictions and ground truth.
    const vector<NormalizedBBox>& gt_bboxes = all_gt_bboxes.find(i)->second;
    // Whether to use the prior (default) boxes for matching; defaults to true.
    if (!use_prior_for_matching) {
      for (int c = 0; c < loc_classes; ++c) {
        int label = share_location ? -1 : c;
        if (!share_location && label == background_label_id) {
          // Ignore background loc predictions.
          continue;
        }
        // Decode the prediction into bbox first.
        vector<NormalizedBBox> loc_bboxes;
        bool clip_bbox = false;
        DecodeBBoxes(prior_bboxes, prior_variances,
                     code_type, encode_variance_in_target, clip_bbox,
                     all_loc_preds[i].find(label)->second, &loc_bboxes);
        MatchBBox(gt_bboxes, loc_bboxes, label, match_type,
                  overlap_threshold, ignore_cross_boundary_bbox,
                  &match_indices[label], &match_overlaps[label],
                  multibox_loss_param.ignore_difficult_gt());
      }
    } else {
      // Use prior bboxes to match against all ground truth.
      vector<int> temp_match_indices;
      vector<float> temp_match_overlaps;
      const int label = -1;
      // The matching of gts against prior boxes is done inside MatchBBox.
      MatchBBox(gt_bboxes, prior_bboxes, label, match_type, overlap_threshold,
                ignore_cross_boundary_bbox, &temp_match_indices,
                &temp_match_overlaps, multibox_loss_param.ignore_difficult_gt());
      if (share_location) {
        match_indices[label] = temp_match_indices;
        match_overlaps[label] = temp_match_overlaps;
      } else {
        // Get ground truth label for each ground truth bbox.
        vector<int> gt_labels;
        for (int g = 0; g < gt_bboxes.size(); ++g) {
          gt_labels.push_back(gt_bboxes[g].label());
        }
        // Distribute the matching results to different loc_class.
        for (int c = 0; c < loc_classes; ++c) {
          if (c == background_label_id) {
            // Ignore background loc predictions.
            continue;
          }
          match_indices[c].resize(temp_match_indices.size(), -1);
          match_overlaps[c] = temp_match_overlaps;
          for (int m = 0; m < temp_match_indices.size(); ++m) {
            if (temp_match_indices[m] > -1) {
              const int gt_idx = temp_match_indices[m];
              CHECK_LT(gt_idx, gt_labels.size());
              if (c == gt_labels[gt_idx]) {
                match_indices[c][m] = gt_idx;
              }
            }
          }
        }
      }
    }
    all_match_indices->push_back(match_indices);
    all_match_overlaps->push_back(match_overlaps);
  }
}

The MineHardExamples function implements SSD's OHNM (Online Hard Negative Mining) mechanism. mining_type can be set to one of three strategies: NONE, MAX_NEGATIVE, or HARD_EXAMPLE. NONE disables hard-example mining. MAX_NEGATIVE ranks candidates by the classification loss alone (no localization loss) and keeps only the highest-loss negatives, up to neg_pos_ratio times the number of positives (3x in the sample config). HARD_EXAMPLE ranks candidates by the sum of the classification and localization losses.
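
As a toy illustration of the MAX_NEGATIVE rule (the helper and its inputs are hypothetical; the real selection happens inside the function below):

#include <algorithm>
#include <functional>
#include <utility>
#include <vector>

// Keep at most neg_pos_ratio * num_pos negatives, picked by largest conf loss.
std::vector<int> PickMaxNegatives(
    std::vector<std::pair<float, int> > neg_loss_idx,  // (conf loss, prior index)
    int num_pos, float neg_pos_ratio) {
  std::sort(neg_loss_idx.begin(), neg_loss_idx.end(),
            std::greater<std::pair<float, int> >());   // highest loss first
  const int num_sel = std::min(static_cast<int>(num_pos * neg_pos_ratio),
                               static_cast<int>(neg_loss_idx.size()));
  std::vector<int> picked;
  for (int n = 0; n < num_sel; ++n) picked.push_back(neg_loss_idx[n].second);
  return picked;  // e.g. 20 positives with ratio 3 -> at most 60 negatives kept
}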

template <typename Dtype>
void MineHardExamples(const Blob& conf_blob,
    const vector<LabelBBox>& all_loc_preds,
    const map<int, vector<NormalizedBBox> >& all_gt_bboxes,
    const vector<NormalizedBBox>& prior_bboxes,
    const vector<vector<float> >& prior_variances,
    const vector<map<int, vector<float> > >& all_match_overlaps,
    const MultiBoxLossParameter& multibox_loss_param,
    int* num_matches, int* num_negs,
    vector<map<int, vector<int> > >* all_match_indices,
    vector<vector<int> >* all_neg_indices) {
  int num = all_loc_preds.size();
  // CHECK_EQ(num, all_match_overlaps.size());
  // CHECK_EQ(num, all_match_indices->size());
  // all_neg_indices->clear();
  *num_matches = CountNumMatches(*all_match_indices, num);
  *num_negs = 0;
  int num_priors = prior_bboxes.size();
  CHECK_EQ(num_priors, prior_variances.size());
  // Get parameters.
  CHECK(multibox_loss_param.has_num_classes()) << "Must provide num_classes.";
  const int num_classes = multibox_loss_param.num_classes();
  CHECK_GE(num_classes, 1) << "num_classes should not be less than 1.";
  const int background_label_id = multibox_loss_param.background_label_id();
  const bool use_prior_for_nms = multibox_loss_param.use_prior_for_nms();
  const ConfLossType conf_loss_type = multibox_loss_param.conf_loss_type();
  const MiningType mining_type = multibox_loss_param.mining_type();
  // NONE means no hard-example mining is performed.
  if (mining_type == MultiBoxLossParameter_MiningType_NONE) {
    return;
  }
  const LocLossType loc_loss_type = multibox_loss_param.loc_loss_type();
  // Ratio of negatives to positives (3:1 in the sample config).
  const float neg_pos_ratio = multibox_loss_param.neg_pos_ratio();
  // A prior whose best overlap is below this threshold may serve as a negative.
  const float neg_overlap = multibox_loss_param.neg_overlap();
  // Box encoding type, e.g. CENTER_SIZE = (cx, cy, w, h).
  const CodeType code_type = multibox_loss_param.code_type();
  const bool encode_variance_in_target =
      multibox_loss_param.encode_variance_in_target();
  const bool has_nms_param = multibox_loss_param.has_nms_param();
  float nms_threshold = 0;
  int top_k = -1;
  if (has_nms_param) {
    nms_threshold = multibox_loss_param.nms_param().nms_threshold();
    top_k = multibox_loss_param.nms_param().top_k();
  }
  const int sample_size = multibox_loss_param.sample_size();
  // Compute confidence losses based on matching results.
  // The conf loss is computed up front: whatever the mining type turns out to be, it is always needed.
  vector<vector<float> > all_conf_loss;
  ComputeConfLossGPU<Dtype>(conf_blob, num, num_priors, num_classes,
      background_label_id, conf_loss_type, *all_match_indices, all_gt_bboxes,
      &all_conf_loss);

  vector<vector<float> > all_loc_loss;
  // HARD_EXAMPLE additionally computes the localization loss.
  if (mining_type == MultiBoxLossParameter_MiningType_HARD_EXAMPLE) {
    // Compute localization losses based on matching results.
    TBlob<Dtype> loc_pred, loc_gt;
    if (*num_matches != 0) {
      vector<int> loc_shape(2, 1);
      loc_shape[1] = *num_matches * 4;
      loc_pred.Reshape(loc_shape);
      loc_gt.Reshape(loc_shape);
      Dtype* loc_pred_data = loc_pred.mutable_cpu_data();
      Dtype* loc_gt_data = loc_gt.mutable_cpu_data();
      EncodeLocPrediction(all_loc_preds, all_gt_bboxes, *all_match_indices,
                          prior_bboxes, prior_variances, multibox_loss_param,
                          loc_pred_data, loc_gt_data);
    }
    ComputeLocLoss(loc_pred, loc_gt, *all_match_indices, num,
                   num_priors, loc_loss_type, &all_loc_loss);
  } else {
    // No localization loss.
    // Under MAX_NEGATIVE only the classification loss is used.
    for (int i = 0; i < num; ++i) {
      vector<float> loc_loss(num_priors, 0.f); // all zeros
      all_loc_loss.push_back(loc_loss);
    }
  }
  for (int i = 0; i < num; ++i) {
    map<int, vector<int> >& match_indices = (*all_match_indices)[i];
    const map<int, vector<float> >& match_overlaps = all_match_overlaps[i];
    // loc + conf loss.
    const vector<float>& conf_loss = all_conf_loss[i];
    const vector<float>& loc_loss = all_loc_loss[i];
    vector<float> loss;
    std::transform(conf_loss.begin(), conf_loss.end(), loc_loss.begin(),
                   std::back_inserter(loss), std::plus<float>());
    // Pick negatives or hard examples based on loss.
    set<int> sel_indices;
    vector<int> neg_indices;
    for (map<int, vector<int> >::iterator it = match_indices.begin();
         it != match_indices.end(); ++it) {
      const int label = it->first;
      int num_sel = 0;
      // Get potential indices and loss pairs.
      vector<pair<float, int> > loss_indices;
      for (int m = 0; m < match_indices[label].size(); ++m) {
        if (IsEligibleMining(mining_type, match_indices[label][m],
            match_overlaps.find(label)->second[m], neg_overlap)) {
          loss_indices.push_back(std::make_pair(loss[m], m));
          ++num_sel;
        }
      }
      // MAX_NEGATIVE: cap the number of negatives at neg_pos_ratio times the positives.
      if (mining_type == MultiBoxLossParameter_MiningType_MAX_NEGATIVE) {
        int num_pos = 0;
        for (int m = 0; m < match_indices[label].size(); ++m) {
          if (match_indices[label][m] > -1) {
            ++num_pos;
          }
        }
        // Derive the negative count from the positive count and neg_pos_ratio,
        // then take the smaller of it and num_sel.
        num_sel = std::min(static_cast<int>(num_pos * neg_pos_ratio), num_sel);
      } else if (mining_type == MultiBoxLossParameter_MiningType_HARD_EXAMPLE) {
        CHECK_GT(sample_size, 0);
        num_sel = std::min(sample_size, num_sel);
      }
      // Select samples.
      if (has_nms_param && nms_threshold > 0) {
        // Do nms before selecting samples.
        vector<float> sel_loss;
        vector<NormalizedBBox> sel_bboxes;
        if (use_prior_for_nms) {
          for (int m = 0; m < match_indices[label].size(); ++m) {
            if (IsEligibleMining(mining_type, match_indices[label][m],
                match_overlaps.find(label)->second[m], neg_overlap)) {
              sel_loss.push_back(loss[m]);
              sel_bboxes.push_back(prior_bboxes[m]);
            }
          }
        } else {
          // Decode the prediction into bbox first.
          vector<NormalizedBBox> loc_bboxes;
          bool clip_bbox = false;
          DecodeBBoxes(prior_bboxes, prior_variances,
                       code_type, encode_variance_in_target, clip_bbox,
                       all_loc_preds[i].find(label)->second, &loc_bboxes);
          for (int m = 0; m < match_indices[label].size(); ++m) {
            if (IsEligibleMining(mining_type, match_indices[label][m],
                match_overlaps.find(label)->second[m], neg_overlap)) {
              sel_loss.push_back(loss[m]);
              sel_bboxes.push_back(loc_bboxes[m]);
            }
          }
        }
        // Do non-maximum suppression based on the loss.
        vector<int> nms_indices;
        ApplyNMS(sel_bboxes, sel_loss, nms_threshold, top_k, &nms_indices);
        if (nms_indices.size() < num_sel) {
          LOG(INFO) << "not enough sample after nms: " << nms_indices.size();
        }
        // Pick top example indices after nms.
        num_sel = std::min(static_cast<int>(nms_indices.size()), num_sel);
        for (int n = 0; n < num_sel; ++n) {
          sel_indices.insert(loss_indices[nms_indices[n]].second);
        }
      } else {
        // Pick top example indices based on loss.
        std::sort(loss_indices.begin(), loss_indices.end(),
                  SortScorePairDescend<int>);
        for (int n = 0; n < num_sel; ++n) {
          sel_indices.insert(loss_indices[n].second);
        }
      }
      // Update the match_indices and select neg_indices.
      for (int m = 0; m < match_indices[label].size(); ++m) {
        if (match_indices[label][m] > -1) {
          if (mining_type == MultiBoxLossParameter_MiningType_HARD_EXAMPLE &&
              sel_indices.find(m) == sel_indices.end()) {
            match_indices[label][m] = -1;
            *num_matches -= 1;
          }
        } else if (match_indices[label][m] == -1) {
          if (sel_indices.find(m) != sel_indices.end()) {
            neg_indices.push_back(m);
            *num_negs += 1;
          }
        }
      }
    }
    all_neg_indices->push_back(neg_indices);
  }
}

Backward

template <typename Ftype, typename Btype>
void MultiBoxLossLayer<Ftype, Btype>::Backward_cpu(const vector<Blob*>& top,
    const vector<bool>& propagate_down,
    const vector<Blob*>& bottom) {
  if (propagate_down[2]) {
    LOG(FATAL) << this->type()
        << " Layer cannot backpropagate to prior inputs.";
  }
  if (propagate_down[3]) {
    LOG(FATAL) << this->type()
        << " Layer cannot backpropagate to label inputs.";
  }
  // Back propagate on location prediction.
  if (propagate_down[0]) {
    Dtype* loc_bottom_diff = bottom[0]->mutable_cpu_diff<Dtype>();
    caffe_set(bottom[0]->count(), Dtype(0), loc_bottom_diff);
    if (num_matches_ >= 1) {
      vector<bool> loc_propagate_down;
      // Only back propagate on prediction, not ground truth.
      loc_propagate_down.push_back(true);
      loc_propagate_down.push_back(false);
      loc_loss_layer_->Backward(loc_top_vec_, loc_propagate_down,
                                loc_bottom_vec_);
      // Scale gradient.
      Dtype normalizer = LossLayer<Ftype, Btype>::GetNormalizer(
          normalization_, num_, num_priors_, num_matches_);
      Dtype loss_weight = top[0]->cpu_diff<Dtype>()[0] / normalizer;
      caffe_scal(loc_pred_->count(), loss_weight, loc_pred_->mutable_cpu_diff<Dtype>());
      // Copy gradient back to bottom[0].
      const Dtype* loc_pred_diff = loc_pred_->cpu_diff<Dtype>();
      int count = 0;
      for (int i = 0; i < num_; ++i) {
        for (map<int, vector<int> >::iterator it =
             all_match_indices_[i].begin();
             it != all_match_indices_[i].end(); ++it) {
          const int label = share_location_ ? 0 : it->first;
          const vector<int>& match_index = it->second;
          for (int j = 0; j < match_index.size(); ++j) {
            if (match_index[j] <= -1) {
              continue;
            }
            // Copy the diff to the right place.
            int start_idx = loc_classes_ * 4 * j + label * 4;
            caffe_copy(4, loc_pred_diff + count * 4,
                              loc_bottom_diff + start_idx);
            ++count;
          }
        }
        loc_bottom_diff += bottom[0]->offset(1);
      }
    }
  }

  // Back propagate on confidence prediction.
  if (propagate_down[1]) {
    Dtype* conf_bottom_diff = bottom[1]->mutable_cpu_diff<Dtype>();
    caffe_set(bottom[1]->count(), Dtype(0), conf_bottom_diff);
    if (num_conf_ >= 1) {
      vector<bool> conf_propagate_down;
      // Only back propagate on prediction, not ground truth.
      conf_propagate_down.push_back(true);
      conf_propagate_down.push_back(false);
      conf_loss_layer_->Backward(conf_top_vec_, conf_propagate_down,
                                 conf_bottom_vec_);
      // Scale gradient.
      Dtype normalizer = LossLayer<Ftype, Btype>::GetNormalizer(
          normalization_, num_, num_priors_, num_matches_);
      Dtype loss_weight = top[0]->cpu_diff<Dtype>()[0] / normalizer;
      caffe_scal(conf_pred_->count(), loss_weight,
                 conf_pred_->mutable_cpu_diff<Dtype>());
      // Copy gradient back to bottom[1].
      const Dtype* conf_pred_diff = conf_pred_->cpu_diff<Dtype>();
      if (do_neg_mining_) {
        int count = 0;
        for (int i = 0; i < num_; ++i) {
          // Copy matched (positive) bboxes scores' diff.
          const map<int, vector<int> >& match_indices = all_match_indices_[i];
          for (map<int, vector<int> >::const_iterator it =
               match_indices.begin(); it != match_indices.end(); ++it) {
            const vector<int>& match_index = it->second;
            CHECK_EQ(match_index.size(), num_priors_);
            for (int j = 0; j < num_priors_; ++j) {
              if (match_index[j] <= -1) {
                continue;
              }
              // Copy the diff to the right place.
              caffe_copy(num_classes_,
                                conf_pred_diff + count * num_classes_,
                                conf_bottom_diff + j * num_classes_);
              ++count;
            }
          }
          // Copy negative bboxes scores' diff.
          for (int n = 0; n < all_neg_indices_[i].size(); ++n) {
            int j = all_neg_indices_[i][n];
            CHECK_LT(j, num_priors_);
            caffe_copy(num_classes_,
                              conf_pred_diff + count * num_classes_,
                              conf_bottom_diff + j * num_classes_);
            ++count;
          }
          conf_bottom_diff += bottom[1]->offset(1);
        }
      } else {
        // The diff is already computed and stored.
        //bottom[1]->ShareDiff(*conf_pred_);
        caffe_copy(conf_pred_->count(), conf_pred_diff, conf_bottom_diff);
      }
    }
  }

  // Disabled debug dump of the per-component gradient magnitudes.
  if (0) {
      float loss_xy = 0, loss_wh = 0, loss_obj = 0, loss_cls = 0;
      Blob* loc = bottom[0];
      Blob* conf = bottom[1];
      int num = loc->shape()[0];
      int nboxes = loc->shape()[1] / 4;
      CHECK(nboxes == conf->shape()[1] / num_classes_);
      for (int n=0; n<num; n++) {
          for (int j=0; j<nboxes; j++) {
              const float* p = loc->cpu_diff<float>() + n*nboxes*4 + j*4;
              loss_xy += std::abs(p[0]);
              loss_xy += std::abs(p[1]);
              loss_wh += std::abs(p[2]);
              loss_wh += std::abs(p[3]);
          }
      }
      for (int n=0; n<num; n++) {
          for (int j=0; j<nboxes; j++) {
              const float* p = conf->cpu_diff<float>() + n*nboxes*num_classes_ + j*num_classes_;
              loss_obj += std::abs(p[0]);
              for (int c=1; c<num_classes_; c++) {
                  loss_cls += std::abs(p[c]);
              }
          }
      }
      const Solver* solver = this->parent_solver();
      if ((solver && solver->display()) || solver==0) {
          //LOG(INFO) << "the number of pred boxes is " << nboxes;
          float loss = loss_xy+loss_wh+loss_obj+loss_cls;
          char str[1024];
          snprintf(str, 1024, "%s, iter %d, loss %g, loss_xy %g, loss_wh %g, loss_obj %g, loss_cls %g\n",
                  this->name().c_str(), this->iter(),
                  loss, loss_xy, loss_wh, loss_obj, loss_cls);
          LOG(INFO) << str;
      }
  }

  // After backward, remove match statistics.
  all_match_indices_.clear();
  all_neg_indices_.clear();
}

 
