[NVCaffe Source Code Walkthrough] MultiBoxLossLayer (work in progress)

    The MultiBoxLossLayer code in NVCaffe is honestly not easy to understand; many of its details are best worked out by going back and forth with the SSD paper until they click. Before diving into the source, here is a sample parameter configuration for this layer; the code walkthrough below uses this sample as its reference.

layer {
    name: "mbox_loss"
    type: "MultiBoxLoss"
    bottom: "mbox_loc"
    bottom: "mbox_conf"
    bottom: "mbox_priorbox"
    bottom: "label"
    top: "mbox_loss"
    include {
      phase: TRAIN
    }
    propagate_down: true
    propagate_down: true
    propagate_down: false
    propagate_down: false
    loss_param {
      normalization: VALID
    }
    multibox_loss_param {
      loc_loss_type: SMOOTH_L1 // localization loss function
      conf_loss_type: SOFTMAX  // confidence loss function
      loc_weight: 1
      num_classes: 5           // number of classes (background + object classes)
      share_location: true
      match_type: PER_PREDICTION
      overlap_threshold: 0.5
      use_prior_for_matching: true
      background_label_id: 0   // background class id, usually 0
      use_difficult_gt: false
      neg_pos_ratio: 3         // negative:positive sample ratio of 3:1
      neg_overlap: 0.5         // IoU threshold for negative samples
      code_type: CENTER_SIZE
      ignore_cross_boundary_bbox: false
      mining_type: MAX_NEGATIVE  // hard example mining strategy
    }
}

LayerSetUp

    Every layer in the network calls LayerSetUp to perform layer-specific setup, which mainly consists of reading and processing the layer's parameters and setting the shapes of the output blobs.

template <typename Ftype, typename Btype>
void MultiBoxLossLayer<Ftype, Btype>::LayerSetUp(const vector<Blob*>& bottom,
      const vector<Blob*>& top) {
  LossLayer<Ftype, Btype>::LayerSetUp(bottom, top);
  // If the prototxt does not explicitly set propagate_down, fall back to these defaults.
  if (this->layer_param_.propagate_down_size() == 0) {
    this->layer_param_.add_propagate_down(true);
    this->layer_param_.add_propagate_down(true);
    this->layer_param_.add_propagate_down(false);
    this->layer_param_.add_propagate_down(false);
  }
  const MultiBoxLossParameter& multibox_loss_param =
      this->layer_param_.multibox_loss_param();
  multibox_loss_param_ = this->layer_param_.multibox_loss_param();
 
  // number of images, i.e. N in (N,C,H,W)
  num_ = bottom[0]->num(); 
  /* bottom[2] is mbox_priorbox, the output of PriorBoxLayer, i.e. the
  generated anchors. Its shape is (1, 2, N), where N holds the coordinates of
  all anchors generated by that layer. Since each anchor is represented by 4
  coordinates, dividing by 4 gives the total number of anchors. */
  num_priors_ = bottom[2]->height() / 4;
  // Get other parameters.
  CHECK(multibox_loss_param.has_num_classes()) << "Must provide num_classes.";
  // number of classes; note this is the number of object classes plus one background class
  num_classes_ = multibox_loss_param.num_classes();
  CHECK_GE(num_classes_, 1) << "num_classes should not be less than 1.";
  /* Whether all classes share a single set of location predictions, i.e.
  different classes are distinguished on the same box. Usually true. */
  share_location_ = multibox_loss_param.share_location();
  loc_classes_ = share_location_ ? 1 : num_classes_;
  background_label_id_ = multibox_loss_param.background_label_id();
  use_difficult_gt_ = multibox_loss_param.use_difficult_gt();
  // hard example mining strategy
  mining_type_ = multibox_loss_param.mining_type();
  if (multibox_loss_param.has_do_neg_mining()) {
    LOG(WARNING) << "do_neg_mining is deprecated, use mining_type instead.";
    do_neg_mining_ = multibox_loss_param.do_neg_mining();
    CHECK_EQ(do_neg_mining_,
             mining_type_ != MultiBoxLossParameter_MiningType_NONE);
  }
  // A mining_type of NONE means hard example mining is disabled.
  do_neg_mining_ = mining_type_ != MultiBoxLossParameter_MiningType_NONE;

  if (!this->layer_param_.loss_param().has_normalization() &&
      this->layer_param_.loss_param().has_normalize()) {
    normalization_ = this->layer_param_.loss_param().normalize() ?
                     LossParameter_NormalizationMode_VALID :
                     LossParameter_NormalizationMode_BATCH_SIZE;
  } else {
    normalization_ = this->layer_param_.loss_param().normalization();
  }

  if (do_neg_mining_) {
    CHECK(share_location_)
        << "Currently only support negative mining if share_location is true.";
  }

  vector<int> loss_shape(1, 1);
  // Set up localization loss layer.
  loc_weight_ = multibox_loss_param.loc_weight();
  loc_loss_type_ = multibox_loss_param.loc_loss_type();
  // fake shape.
  vector<int> loc_shape(1, 1);
  loc_shape.push_back(4);
  loc_pred_ = Blob::create<Dtype>();
  loc_pred_->Reshape(loc_shape);
  loc_gt_ = Blob::create<Dtype>();
  loc_gt_->Reshape(loc_shape);
  loc_bottom_vec_.push_back(loc_pred_.get());
  loc_bottom_vec_.push_back(loc_gt_.get());
  loc_loss_ = Blob::create<Dtype>();
  loc_loss_->Reshape(loss_shape);
  loc_top_vec_.push_back(loc_loss_.get());
  if (loc_loss_type_ == MultiBoxLossParameter_LocLossType_L2) {
    LayerParameter layer_param;
    layer_param.set_name(this->layer_param_.name() + "_l2_loc");
    layer_param.set_type("EuclideanLoss");
    layer_param.add_loss_weight(loc_weight_);
    loc_loss_layer_ = LayerRegistry::CreateLayer(layer_param, this->parent_rank());
    loc_loss_layer_->SetUp(loc_bottom_vec_, loc_top_vec_);
  } else if (loc_loss_type_ == MultiBoxLossParameter_LocLossType_SMOOTH_L1) {
    LayerParameter layer_param;
    layer_param.set_name(this->layer_param_.name() + "_smooth_L1_loc");
    layer_param.set_type("SmoothL1Loss");
    layer_param.add_loss_weight(loc_weight_);
    loc_loss_layer_ = LayerRegistry::CreateLayer(layer_param, this->parent_rank());
    loc_loss_layer_->SetUp(loc_bottom_vec_, loc_top_vec_);
  } else {
    LOG(FATAL) << "Unknown localization loss type.";
  }
  // Set up confidence loss layer.
  conf_weight_ = multibox_loss_param.conf_weight();
  conf_loss_type_ = multibox_loss_param.conf_loss_type();
  conf_pred_ = Blob::create<Dtype>();
  conf_gt_ = Blob::create<Dtype>();
  conf_loss_ = Blob::create<Dtype>();
  conf_loss_->Reshape(loss_shape);
  conf_top_vec_.push_back(conf_loss_.get());
  if (conf_loss_type_ == MultiBoxLossParameter_ConfLossType_SOFTMAX) {
    CHECK_GE(background_label_id_, 0)
        << "background_label_id should be within [0, num_classes) for Softmax.";
    CHECK_LT(background_label_id_, num_classes_)
        << "background_label_id should be within [0, num_classes) for Softmax.";
    LayerParameter layer_param;
    layer_param.set_name(this->layer_param_.name() + "_softmax_conf");
    layer_param.set_type("SoftmaxWithLoss");
    layer_param.add_loss_weight(conf_weight_);
    // layer_param.add_loss_weight(Dtype(1.));
    layer_param.mutable_loss_param()->set_normalization(
        LossParameter_NormalizationMode_NONE);
    SoftmaxParameter* softmax_param = layer_param.mutable_softmax_param();
    softmax_param->set_axis(1);
    // Fake reshape.
    vector<int> conf_shape(1, 1);
    conf_gt_->Reshape(conf_shape);
    conf_shape.push_back(num_classes_);
    conf_pred_->Reshape(conf_shape);
    conf_bottom_vec_.push_back(conf_pred_.get());
    conf_bottom_vec_.push_back(conf_gt_.get());
    conf_loss_layer_ = LayerRegistry::CreateLayer(layer_param, this->parent_rank());
    conf_loss_layer_->SetUp(conf_bottom_vec_, conf_top_vec_);
  } else if (conf_loss_type_ == MultiBoxLossParameter_ConfLossType_LOGISTIC) {
    LayerParameter layer_param;
    layer_param.set_name(this->layer_param_.name() + "_logistic_conf");
    layer_param.set_type("SigmoidCrossEntropyLoss");
    layer_param.add_loss_weight(conf_weight_);
    // layer_param.add_loss_weight(Dtype(1.));
    // Fake reshape.
    vector<int> conf_shape(1, 1);
    conf_shape.push_back(num_classes_);
    conf_gt_->Reshape(conf_shape);
    conf_pred_->Reshape(conf_shape);
    conf_bottom_vec_.push_back(conf_pred_.get());
    conf_bottom_vec_.push_back(conf_gt_.get());
    conf_loss_layer_ = LayerRegistry::CreateLayer(layer_param, this->parent_rank());
    conf_loss_layer_->SetUp(conf_bottom_vec_, conf_top_vec_);
  } else {
    LOG(FATAL) << "Unknown confidence loss type.";
  }
}

Reshape

template <typename Ftype, typename Btype>
void MultiBoxLossLayer<Ftype, Btype>::Reshape(const vector<Blob*>& bottom,
      const vector<Blob*>& top) {
  LossLayer<Ftype, Btype>::Reshape(bottom, top);
  num_ = bottom[0]->num();
  num_priors_ = bottom[2]->height() / 4; // number of anchors, as explained above
  num_gt_ = bottom[3]->height(); // bottom[3] is the label blob; this is the number of gt boxes
  CHECK_EQ(bottom[0]->num(), bottom[1]->num());
  CHECK_EQ(num_priors_ * loc_classes_ * 4, bottom[0]->channels())
      << "Number of priors must match number of location predictions.";
  CHECK_EQ(num_priors_ * num_classes_, bottom[1]->channels())
      << "Number of priors must match number of confidence predictions.";
}

bottom[3] is the label blob, and bottom[3]->height() gives the total number of ground-truth (gt) boxes. Why is that? Take AnnotatedDataLayer as the data input layer: its output shape is normally (1, 1, N, 8), where the dimension of size N is, by convention, the number of gt boxes in the batch, and the trailing dimension of 8 holds each box's record (image id, class label, coordinates, difficult flag, and so on).
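
As a concrete illustration, one such 8-float record could be unpacked as follows. This is only a sketch: GtRecord and ParseGtRecord are hypothetical names, not part of NVCaffe, but the field order matches what GetGroundTruth reads later in this post.

#include <cmath>  // for std::round

// Hypothetical helper: unpack the i-th 8-float gt record of the label blob.
struct GtRecord {
  int item_id;                   // image index within the batch (-1 = padding)
  int label;                     // class label (group_label)
  int instance_id;               // instance index within the image
  float xmin, ymin, xmax, ymax;  // normalized corner coordinates
  bool difficult;                // VOC-style "difficult" flag
};

GtRecord ParseGtRecord(const float* gt_data, int i) {
  const float* p = gt_data + i * 8;  // each record occupies 8 floats
  GtRecord r;
  r.item_id = static_cast<int>(p[0]);
  r.label = static_cast<int>(std::round(p[1]));
  r.instance_id = static_cast<int>(p[2]);
  r.xmin = p[3]; r.ymin = p[4]; r.xmax = p[5]; r.ymax = p[6];
  r.difficult = p[7] != 0.f;
  return r;
}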

Forward

    This is the most central and most complex part of the layer. Its main jobs are: 1) matching priors to ground truth and splitting positives from negatives; 2) hard example mining; 3) computing the losses. The objective being optimized is recalled below.
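
For reference, the overall training objective from the SSD paper that this layer evaluates is

L(x, c, l, g) = \frac{1}{N}\left(L_{conf}(x, c) + \alpha\, L_{loc}(x, l, g)\right)

where N is the number of matched priors (the loss is set to 0 when N = 0), \alpha corresponds to loc_weight, L_{loc} is the Smooth L1 loss between the predicted offsets and the encoded gt boxes, and L_{conf} is the softmax loss over class confidences. With normalization: VALID as in the sample config, GetNormalizer supplies the 1/N factor at the end of Forward_cpu.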

template <typename Ftype, typename Btype>
void MultiBoxLossLayer<Ftype, Btype>::Forward_cpu(const vector<Blob*>& bottom,
    const vector<Blob*>& top) {
  const Dtype* loc_data = bottom[0]->cpu_data<Dtype>();
  const Dtype* conf_data = bottom[1]->cpu_data<Dtype>();
  const Dtype* prior_data = bottom[2]->cpu_data<Dtype>();
  const Dtype* gt_data = bottom[3]->cpu_data<Dtype>();
  // refinedet
  const Dtype* arm_conf_data = NULL;
  const Dtype* arm_loc_data = NULL;
  vector<LabelBBox> all_arm_loc_preds;
  if (bottom.size() >= 5) {
    arm_conf_data = bottom[4]->cpu_data<Dtype>();
  }
  if (bottom.size() >= 6) {
    arm_loc_data = bottom[5]->cpu_data<Dtype>();
    GetLocPredictions(arm_loc_data, num_, num_priors_, loc_classes_, share_location_,
                      &all_arm_loc_preds);
  }
  // Retrieve all ground truth.
  map<int, vector<NormalizedBBox> > all_gt_bboxes;
  GetGroundTruth(gt_data, num_classes_, num_gt_, background_label_id_, use_difficult_gt_,
                 &all_gt_bboxes);

  // Retrieve all prior bboxes. It is same within a batch since we assume all
  // images in a batch are of same dimension.
  vector<NormalizedBBox> prior_bboxes;
  vector<vector<float> > prior_variances;
  GetPriorBBoxes(prior_data, num_priors_, &prior_bboxes, &prior_variances);

  // Retrieve all predictions.
  vector<LabelBBox> all_loc_preds;
  GetLocPredictions(loc_data, num_, num_priors_, loc_classes_, share_location_,
                    &all_loc_preds);

  // Find matches between source bboxes and ground truth bboxes.
  vector<map<int, vector<float> > > all_match_overlaps;
  if (bottom.size() >= 6) {
    CasRegFindMatches(all_loc_preds, all_gt_bboxes, prior_bboxes, prior_variances,
                      multibox_loss_param_, &all_match_overlaps, &all_match_indices_,
                      all_arm_loc_preds);
  } else {
    FindMatches(all_loc_preds, all_gt_bboxes, prior_bboxes, prior_variances,
                multibox_loss_param_, &all_match_overlaps, &all_match_indices_);
  }
  num_matches_ = 0;
  int num_negs = 0;
  // Sample hard negative (and positive) examples based on mining type.
  MineHardExamples<Dtype>(*bottom[1],
      all_loc_preds, all_gt_bboxes, prior_bboxes,
      prior_variances, all_match_overlaps, multibox_loss_param_,
      &num_matches_, &num_negs, &all_match_indices_, &all_neg_indices_, arm_conf_data, do_neg_mining_);

  if (num_matches_ >= 1) {
    // Form data to pass on to loc_loss_layer_.
    vector<int> loc_shape(2);
    loc_shape[0] = 1;
    loc_shape[1] = num_matches_ * 4;
    loc_pred_->Reshape(loc_shape);
    loc_gt_->Reshape(loc_shape);
    Dtype* loc_pred_data = loc_pred_->mutable_cpu_data<Dtype>();
    Dtype* loc_gt_data = loc_gt_->mutable_cpu_data<Dtype>();
    if (bottom.size() >= 6) {
      CasRegEncodeLocPrediction(all_loc_preds, all_gt_bboxes, all_match_indices_,
                                prior_bboxes, prior_variances, multibox_loss_param_,
                                loc_pred_data, loc_gt_data, all_arm_loc_preds);
    } else {
      EncodeLocPrediction(all_loc_preds, all_gt_bboxes, all_match_indices_,
                          prior_bboxes, prior_variances, multibox_loss_param_,
                          loc_pred_data, loc_gt_data);
    }
    loc_loss_layer_->Reshape(loc_bottom_vec_, loc_top_vec_);
    loc_loss_layer_->Forward(loc_bottom_vec_, loc_top_vec_);
  } else {
    loc_loss_->mutable_cpu_data<Dtype>()[0] = 0;
  }

  // Form data to pass on to conf_loss_layer_.
  if (do_neg_mining_) {
    num_conf_ = num_matches_ + num_negs;
  } else {
    num_conf_ = num_ * num_priors_;
  }

  if (0) {
      const Solver* solver = this->parent_solver();

      if ((solver && solver->display()) || solver==0) {
        LOG(INFO) << cv::format("iter %d, do_neg_mining %d, num_matches %d, num_negs %d, num_conf %d, num %d, num_priors %d\n",
                              this->iter(), do_neg_mining_, num_matches_, num_negs, num_conf_, num_, num_priors_);
      }
  }

  if (num_conf_ >= 1) {
    // Reshape the confidence data.
    vector<int> conf_shape;
    if (conf_loss_type_ == MultiBoxLossParameter_ConfLossType_SOFTMAX) {
      conf_shape.push_back(num_conf_);
      conf_bottom_vec_[1]->Reshape(conf_shape);
      conf_shape.push_back(num_classes_);
      conf_bottom_vec_[0]->Reshape(conf_shape);
    } else if (conf_loss_type_ == MultiBoxLossParameter_ConfLossType_LOGISTIC) {
      conf_shape.push_back(1);
      conf_shape.push_back(num_conf_);
      conf_shape.push_back(num_classes_);
      conf_bottom_vec_[0]->Reshape(conf_shape);
      conf_bottom_vec_[1]->Reshape(conf_shape);
    } else {
      LOG(FATAL) << "Unknown confidence loss type.";
    }
    if (!do_neg_mining_) {
      // Consider all scores.
      // Share data and diff with bottom[1].
      CHECK_EQ(conf_pred_->count(), bottom[1]->count());
      conf_pred_->ShareData(*(bottom[1]));
    }
    Dtype* conf_pred_data = conf_pred_->mutable_cpu_data<Dtype>();
    Dtype* conf_gt_data = conf_gt_->mutable_cpu_data<Dtype>();
    caffe_set(conf_gt_->count(), Dtype(background_label_id_), conf_gt_data);
    EncodeConfPrediction(conf_data, num_, num_priors_, multibox_loss_param_,
                         all_match_indices_, all_neg_indices_, all_gt_bboxes,
                         conf_pred_data, conf_gt_data, do_neg_mining_);
    conf_loss_layer_->Reshape(conf_bottom_vec_, conf_top_vec_);
    conf_loss_layer_->Forward(conf_bottom_vec_, conf_top_vec_);
  } else {
    conf_loss_->mutable_cpu_data<Dtype>()[0] = 0;
  }

  top[0]->mutable_cpu_data<Dtype>()[0] = 0;
  if (this->layer_param_.propagate_down(0)) {
    Dtype normalizer = LossLayer<Ftype, Btype>::GetNormalizer(
        normalization_, num_, num_priors_, num_matches_);
    top[0]->mutable_cpu_data<Dtype>()[0] +=
        loc_weight_ * loc_loss_->cpu_data<Dtype>()[0] / normalizer;
  }
  if (this->layer_param_.propagate_down(1)) {
    Dtype normalizer = LossLayer<Ftype, Btype>::GetNormalizer(
        normalization_, num_, num_priors_, num_matches_);
    top[0]->mutable_cpu_data<Dtype>()[0] +=
        conf_weight_ * conf_loss_->cpu_data<Dtype>()[0] / normalizer;
  }
}

The forward pass first calls GetGroundTruth to gather all gt boxes of the current batch into all_gt_bboxes, a map keyed by the image index within the batch.

template <typename Dtype>
void GetGroundTruth(const Dtype* gt_data, const int num_classes, const int num_gt,
      const int background_label_id, const bool use_difficult_gt,
      map<int, vector<NormalizedBBox> >* all_gt_bboxes) {
  all_gt_bboxes->clear();
  for (int i = 0; i < num_gt; ++i) {
    // each gt record has 8 fields: item_id, group_label, instance_id, xmin, ymin, xmax, ymax, difficult
    int start_idx = i * 8; 
    int item_id = gt_data[start_idx];
    if (item_id == -1) {
      continue;
    }
    // group_label, i.e. the class label
    int label = std::round(gt_data[start_idx + 1]);
    if (label <= background_label_id) {
      DLOG(WARNING) << "Ignoring background label in the dataset: " << gt_data[start_idx + 1];
      continue;
    }
    if (label >= num_classes) {
      DLOG(WARNING) << "Ignoring label >= num_classes in the dataset: " << gt_data[start_idx + 1];
      continue;
    }
    bool difficult = static_cast<bool>(gt_data[start_idx + 7]);
    if (!use_difficult_gt && difficult) {
      // Skip reading difficult ground truth.
      continue;
    }
    NormalizedBBox bbox;
    bbox.set_label(label);
    bbox.set_xmin(gt_data[start_idx + 3]);
    bbox.set_ymin(gt_data[start_idx + 4]);
    bbox.set_xmax(gt_data[start_idx + 5]);
    bbox.set_ymax(gt_data[start_idx + 6]);
    bbox.set_difficult(difficult);
    // box area
    float bbox_size = BBoxSize(bbox);
    bbox.set_size(bbox_size);
    (*all_gt_bboxes)[item_id].push_back(bbox);
  }
}

Next comes GetPriorBBoxes. Despite the name, it does not compute prior bboxes: they already exist in prior_data, which comes from bottom[2]. The function merely splits prior_data into its prior-box part and its variance part, storing them in prior_bboxes and prior_variances respectively, as the layout example below illustrates.
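
For instance, with num_priors_ = 2 the (1, 2, 8) prior blob would be laid out like this (the value names are invented purely for illustration):

// channel 0: the 4 corner coordinates of each prior
prior_data[0..7]  = { x0min, y0min, x0max, y0max, x1min, y1min, x1max, y1max }
// channel 1: the 4 encoding variances of each prior
prior_data[8..15] = { v00, v01, v02, v03, v10, v11, v12, v13 }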

template <typename Dtype>
void GetPriorBBoxes(const Dtype* prior_data, const int num_priors,
      vector<NormalizedBBox>* prior_bboxes,
      vector<vector<float> >* prior_variances) {
  prior_bboxes->clear();
  prior_variances->clear();
  for (int i = 0; i < num_priors; ++i) {
    int start_idx = i * 4;
    NormalizedBBox bbox;
    bbox.set_xmin(prior_data[start_idx]);
    bbox.set_ymin(prior_data[start_idx + 1]);
    bbox.set_xmax(prior_data[start_idx + 2]);
    bbox.set_ymax(prior_data[start_idx + 3]);
    float bbox_size = BBoxSize(bbox);
    bbox.set_size(bbox_size);
    prior_bboxes->push_back(bbox);
  }
  
  // prior_data has two parts: all the coordinates come first, followed by all the variances
  for (int i = 0; i < num_priors; ++i) {
    int start_idx = (num_priors + i) * 4;
    vector<float> var;
    for (int j = 0; j < 4; ++j) {
      var.push_back(prior_data[start_idx + j]);
    }
    prior_variances->push_back(var);
  }
}

Next, GetLocPredictions is called to store the predicted boxes in loc_preds. Since share_location is usually set to true, all classes share the coordinates of the same predicted box.

template <typename Dtype>
void GetLocPredictions(const Dtype* loc_data, const int num,
      const int num_preds_per_class, const int num_loc_classes,
      const bool share_location, vector<LabelBBox>* loc_preds) {
  loc_preds->clear();
  if (share_location) {
    CHECK_EQ(num_loc_classes, 1);
  }
  loc_preds->resize(num); // important: this allocates the per-image entries up front
  for (int i = 0; i < num; ++i) {  
    //typedef map<int, vector<NormalizedBBox> > LabelBBox;
    LabelBBox& label_bbox = (*loc_preds)[i];
    for (int p = 0; p < num_preds_per_class; ++p) {
      int start_idx = p * num_loc_classes * 4;
      for (int c = 0; c < num_loc_classes; ++c) { // a single iteration when share_location
        // If share_location is true the label is -1;
        // otherwise labels run from 0 to (num_loc_classes - 1).
        int label = share_location ? -1 : c; 
        if (label_bbox.find(label) == label_bbox.end()) {
          label_bbox[label].resize(num_preds_per_class);
        }
        // label_bbox is a map, so a key of -1 is perfectly fine
        label_bbox[label][p].set_xmin(loc_data[start_idx + c * 4]);
        label_bbox[label][p].set_ymin(loc_data[start_idx + c * 4 + 1]);
        label_bbox[label][p].set_xmax(loc_data[start_idx + c * 4 + 2]);
        label_bbox[label][p].set_ymax(loc_data[start_idx + c * 4 + 3]);
      }
    }
    loc_data += num_preds_per_class * num_loc_classes * 4;
  }
}

The SSD paper's discussion of the matching strategy boils down to the following:

During training we must first decide which anchor each ground-truth box (gt) in the image is matched to; the matched anchor is then responsible for predicting that gt. In YOLO, the cell containing the gt's center is chosen, and within that cell the bounding box with the highest IoU predicts it. SSD works quite differently, with two matching rules. First, for each gt in the image, the anchor with the highest IoU is matched to it, which guarantees that every gt has at least one matching anchor. Anchors matched to a gt are called positive samples (strictly speaking it is the prediction box corresponding to the prior that is positive, but since they are in one-to-one correspondence the terms are used interchangeably). Conversely, an anchor that matches no gt can only be matched to the background and becomes a negative sample. An image contains very few gts but a great many anchors, so with only the first rule most anchors would be negatives and the positives and negatives would be extremely unbalanced; hence the second rule. Second, any remaining unmatched anchor whose IoU with some gt exceeds a threshold (typically 0.5) is also matched to that gt. This means one gt may match multiple anchors, i.e. several prediction boxes target the same object, which is fine. The reverse is not allowed: an anchor can match only one gt, so if several gts exceed the threshold with one anchor, the anchor is matched to the gt with the highest IoU. This anchor-to-gt matching is implemented in the FindMatches function.
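
To make the two rules concrete, here is a condensed sketch for the share_location case. It is not the actual MatchBBox implementation: JaccardOverlap stands in for whatever IoU routine is used, and the bipartite step is simplified to a per-gt greedy pass.

// match_indices[p] == -1 means prior p is unmatched (a negative).
void SimpleMatch(const vector<NormalizedBBox>& gts,
                 const vector<NormalizedBBox>& priors,
                 float overlap_threshold,
                 vector<int>* match_indices) {
  match_indices->assign(priors.size(), -1);
  // Rule 1: each gt grabs the still-unmatched prior with the highest IoU,
  // so every gt gets at least one anchor.
  for (int g = 0; g < gts.size(); ++g) {
    int best_p = -1;
    float best_iou = 0.f;
    for (int p = 0; p < priors.size(); ++p) {
      float iou = JaccardOverlap(gts[g], priors[p]);
      if ((*match_indices)[p] == -1 && iou > best_iou) {
        best_iou = iou;
        best_p = p;
      }
    }
    if (best_p >= 0) (*match_indices)[best_p] = g;
  }
  // Rule 2 (PER_PREDICTION): every remaining prior is matched to the gt with
  // the highest IoU, but only if that IoU clears overlap_threshold; a prior
  // therefore matches at most one gt.
  for (int p = 0; p < priors.size(); ++p) {
    if ((*match_indices)[p] != -1) continue;
    int best_g = -1;
    float best_iou = overlap_threshold;
    for (int g = 0; g < gts.size(); ++g) {
      float iou = JaccardOverlap(gts[g], priors[p]);
      if (iou > best_iou) {
        best_iou = iou;
        best_g = g;
      }
    }
    if (best_g >= 0) (*match_indices)[p] = best_g;
  }
}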

void FindMatches(const vector<LabelBBox>& all_loc_preds,
      const map<int, vector<NormalizedBBox> >& all_gt_bboxes,
      const vector<NormalizedBBox>& prior_bboxes,
      const vector<vector<float> >& prior_variances,
      const MultiBoxLossParameter& multibox_loss_param,
      vector<map<int, vector<float> > >* all_match_overlaps,
      vector<map<int, vector<int> > >* all_match_indices) {
  // all_match_overlaps->clear();
  // all_match_indices->clear();
  // Get parameters.
  CHECK(multibox_loss_param.has_num_classes()) << "Must provide num_classes.";
  const int num_classes = multibox_loss_param.num_classes();
  CHECK_GE(num_classes, 1) << "num_classes should not be less than 1.";
  const bool share_location = multibox_loss_param.share_location();
  const int loc_classes = share_location ? 1 : num_classes;
  const MatchType match_type = multibox_loss_param.match_type(); // matching type
  // IoU threshold used to split positives from negatives
  const float overlap_threshold = multibox_loss_param.overlap_threshold(); 
  const bool use_prior_for_matching = multibox_loss_param.use_prior_for_matching();
  const int background_label_id = multibox_loss_param.background_label_id();
  const CodeType code_type = multibox_loss_param.code_type();
  const bool encode_variance_in_target =
      multibox_loss_param.encode_variance_in_target();
  const bool ignore_cross_boundary_bbox =
      multibox_loss_param.ignore_cross_boundary_bbox();
  // Find the matches.
  int num = all_loc_preds.size(); // number of images
  for (int i = 0; i < num; ++i) {
    map<int, vector<int> > match_indices;
    map<int, vector<float> > match_overlaps;
    // Check if there is ground truth for current image.
    /* Does the current image have any gt? all_gt_bboxes is a
    map<int, vector<NormalizedBBox>> keyed by image index; images without gt
    are simply skipped when GetGroundTruth fills all_gt_bboxes. */
    if (all_gt_bboxes.find(i) == all_gt_bboxes.end()) {
      // There is no gt for current image. All predictions are negative.
      // No gt for this image, so all predicted boxes are naturally negatives.
      const vector<NormalizedBBox> gt_bboxes;
      vector<int> temp_match_indices;
      vector<float> temp_match_overlaps;
      const int label = -1;
      MatchBBox(gt_bboxes, prior_bboxes, label, match_type, overlap_threshold,
                ignore_cross_boundary_bbox, &temp_match_indices,
                &temp_match_overlaps, multibox_loss_param.ignore_difficult_gt());
      if (share_location) {
        match_indices[label] = temp_match_indices;
        match_overlaps[label] = temp_match_overlaps;
      }
      all_match_indices->push_back(match_indices);
      all_match_overlaps->push_back(match_overlaps);
      continue;
    }
    // Find match between predictions and ground truth.
    const vector<NormalizedBBox>& gt_bboxes = all_gt_bboxes.find(i)->second;
    // whether to use the prior (default) boxes for matching; default true
    if (!use_prior_for_matching) {
      for (int c = 0; c < loc_classes; ++c) {
        int label = share_location ? -1 : c;
        if (!share_location && label == background_label_id) {
          // Ignore background loc predictions.
          continue;
        }
        // Decode the prediction into bbox first.
        vector<NormalizedBBox> loc_bboxes;
        bool clip_bbox = false;
        DecodeBBoxes(prior_bboxes, prior_variances,
                     code_type, encode_variance_in_target, clip_bbox,
                     all_loc_preds[i].find(label)->second, &loc_bboxes);
        MatchBBox(gt_bboxes, loc_bboxes, label, match_type,
                  overlap_threshold, ignore_cross_boundary_bbox,
                  &match_indices[label], &match_overlaps[label],
                  multibox_loss_param.ignore_difficult_gt());
      }
    } else {
      // Use prior bboxes to match against all ground truth.
      vector<int> temp_match_indices;
      vector<float> temp_match_overlaps;
      const int label = -1;
      // The matching between gt and prior boxes is done inside this function.
      MatchBBox(gt_bboxes, prior_bboxes, label, match_type, overlap_threshold,
                ignore_cross_boundary_bbox, &temp_match_indices,
                &temp_match_overlaps, multibox_loss_param.ignore_difficult_gt());
      if (share_location) {
        match_indices[label] = temp_match_indices;
        match_overlaps[label] = temp_match_overlaps;
      } else {
        // Get ground truth label for each ground truth bbox.
        vector<int> gt_labels;
        for (int g = 0; g < gt_bboxes.size(); ++g) {
          gt_labels.push_back(gt_bboxes[g].label());
        }
        // Distribute the matching results to different loc_class.
        for (int c = 0; c < loc_classes; ++c) {
          if (c == background_label_id) {
            // Ignore background loc predictions.
            continue;
          }
          match_indices[c].resize(temp_match_indices.size(), -1);
          match_overlaps[c] = temp_match_overlaps;
          for (int m = 0; m < temp_match_indices.size(); ++m) {
            if (temp_match_indices[m] > -1) {
              const int gt_idx = temp_match_indices[m];
              CHECK_LT(gt_idx, gt_labels.size());
              if (c == gt_labels[gt_idx]) {
                match_indices[c][m] = gt_idx;
              }
            }
          }
        }
      }
    }
    all_match_indices->push_back(match_indices);
    all_match_overlaps->push_back(match_overlaps);
  }
}

The MineHardExamples function implements SSD's online hard negative mining. The mining_type option can be set to NONE, MAX_NEGATIVE, or HARD_EXAMPLE. NONE disables mining entirely. MAX_NEGATIVE ranks candidates by the classification loss alone (the localization loss is left at zero) and keeps the highest-loss negatives, up to neg_pos_ratio (3 here) times the number of positives. HARD_EXAMPLE ranks candidates by the sum of the confidence loss and the localization loss.
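
Below is a minimal sketch of the MAX_NEGATIVE selection for one image, assuming conf_loss holds the per-prior confidence loss, match_indices marks positives with an index > -1, and overlaps holds each prior's best gt overlap. It is illustrative only; the real logic lives in MineHardExamples, which follows.

// Illustrative only: pick the highest-loss negatives, keeping at most
// neg_pos_ratio times the number of positives.
vector<int> PickMaxNegatives(const vector<float>& conf_loss,
                             const vector<int>& match_indices,
                             const vector<float>& overlaps,
                             float neg_overlap, float neg_pos_ratio) {
  int num_pos = 0;
  vector<pair<float, int> > candidates;  // (loss, prior index)
  for (int m = 0; m < match_indices.size(); ++m) {
    if (match_indices[m] > -1) {
      ++num_pos;  // matched prior => positive
    } else if (overlaps[m] < neg_overlap) {
      // Unmatched priors whose best overlap stays below neg_overlap
      // are eligible negatives.
      candidates.push_back(std::make_pair(conf_loss[m], m));
    }
  }
  // Highest confidence loss first.
  std::sort(candidates.begin(), candidates.end(),
            std::greater<pair<float, int> >());
  int num_sel = std::min(static_cast<int>(num_pos * neg_pos_ratio),
                         static_cast<int>(candidates.size()));
  vector<int> neg_indices;
  for (int n = 0; n < num_sel; ++n) {
    neg_indices.push_back(candidates[n].second);
  }
  return neg_indices;
}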

template <typename Dtype>
void MineHardExamples(const Blob& conf_blob,
    const vector<LabelBBox>& all_loc_preds,
    const map<int, vector<NormalizedBBox> >& all_gt_bboxes,
    const vector<NormalizedBBox>& prior_bboxes,
    const vector<vector<float> >& prior_variances,
    const vector<map<int, vector<float> > >& all_match_overlaps,
    const MultiBoxLossParameter& multibox_loss_param,
    int* num_matches, int* num_negs,
    vector<map<int, vector<int> > >* all_match_indices,
    vector<vector<int> >* all_neg_indices) {
  int num = all_loc_preds.size();
  // CHECK_EQ(num, all_match_overlaps.size());
  // CHECK_EQ(num, all_match_indices->size());
  // all_neg_indices->clear();
  *num_matches = CountNumMatches(*all_match_indices, num);
  *num_negs = 0;
  int num_priors = prior_bboxes.size();
  CHECK_EQ(num_priors, prior_variances.size());
  // Get parameters.
  CHECK(multibox_loss_param.has_num_classes()) << "Must provide num_classes.";
  const int num_classes = multibox_loss_param.num_classes();
  CHECK_GE(num_classes, 1) << "num_classes should not be less than 1.";
  const int background_label_id = multibox_loss_param.background_label_id();
  const bool use_prior_for_nms = multibox_loss_param.use_prior_for_nms();
  const ConfLossType conf_loss_type = multibox_loss_param.conf_loss_type();
  const MiningType mining_type = multibox_loss_param.mining_type();
  // NONE means no hard example mining
  if (mining_type == MultiBoxLossParameter_MiningType_NONE) {
    return;
  }
  const LocLossType loc_loss_type = multibox_loss_param.loc_loss_type();
  // ratio of negatives to positives (e.g. 3 means 3 negatives per positive)
  const float neg_pos_ratio = multibox_loss_param.neg_pos_ratio();
  // IoU threshold below which an unmatched prior may be kept as a negative
  const float neg_overlap = multibox_loss_param.neg_overlap();
  // box encoding, e.g. CENTER_SIZE: (cx, cy, w, h)
  const CodeType code_type = multibox_loss_param.code_type();
  const bool encode_variance_in_target =
      multibox_loss_param.encode_variance_in_target();
  const bool has_nms_param = multibox_loss_param.has_nms_param();
  float nms_threshold = 0;
  int top_k = -1;
  if (has_nms_param) {
    nms_threshold = multibox_loss_param.nms_param().nms_threshold();
    top_k = multibox_loss_param.nms_param().top_k();
  }
  const int sample_size = multibox_loss_param.sample_size();
  // Compute confidence losses based on matching results.
  // The conf loss is computed up front; it is needed regardless of the mining type.
  vector<vector<float> > all_conf_loss;
  ComputeConfLossGPU<Dtype>(conf_blob, num, num_priors, num_classes,
      background_label_id, conf_loss_type, *all_match_indices, all_gt_bboxes,
      &all_conf_loss);

  vector<vector<float> > all_loc_loss;
  // HARD_EXAMPLE additionally computes the localization loss
  if (mining_type == MultiBoxLossParameter_MiningType_HARD_EXAMPLE) {
    // Compute localization losses based on matching results.
    TBlob<Dtype> loc_pred, loc_gt;
    if (*num_matches != 0) {
      vector<int> loc_shape(2, 1);
      loc_shape[1] = *num_matches * 4;
      loc_pred.Reshape(loc_shape);
      loc_gt.Reshape(loc_shape);
      Dtype* loc_pred_data = loc_pred.mutable_cpu_data();
      Dtype* loc_gt_data = loc_gt.mutable_cpu_data();
      EncodeLocPrediction(all_loc_preds, all_gt_bboxes, *all_match_indices,
                          prior_bboxes, prior_variances, multibox_loss_param,
                          loc_pred_data, loc_gt_data);
    }
    ComputeLocLoss(loc_pred, loc_gt, *all_match_indices, num,
                   num_priors, loc_loss_type, &all_loc_loss);
  } else {
    // No localization loss.
    // MAX_NEGATIVE: only the conf loss is used; the loc loss stays at zero.
    for (int i = 0; i < num; ++i) {
      vector<float> loc_loss(num_priors, 0.f); // all zeros
      all_loc_loss.push_back(loc_loss);
    }
  }
  for (int i = 0; i < num; ++i) {
    map<int, vector<int> >& match_indices = (*all_match_indices)[i];
    const map<int, vector<float> >& match_overlaps = all_match_overlaps[i];
    // loc + conf loss.
    const vector<float>& conf_loss = all_conf_loss[i];
    const vector<float>& loc_loss = all_loc_loss[i];
    vector<float> loss;
    std::transform(conf_loss.begin(), conf_loss.end(), loc_loss.begin(),
                   std::back_inserter(loss), std::plus<float>());
    // Pick negatives or hard examples based on loss.
    set<int> sel_indices;
    vector<int> neg_indices;
    for (map<int, vector<int> >::iterator it = match_indices.begin();
         it != match_indices.end(); ++it) {
      const int label = it->first;
      int num_sel = 0;
      // Get potential indices and loss pairs.
      vector<pair<float, int> > loss_indices;
      for (int m = 0; m < match_indices[label].size(); ++m) {
        if (IsEligibleMining(mining_type, match_indices[label][m],
            match_overlaps.find(label)->second[m], neg_overlap)) {
          loss_indices.push_back(std::make_pair(loss[m], m));
          ++num_sel;
        }
      }
      // MAX_NEGATIVE: cap the number of selected negatives
      if (mining_type == MultiBoxLossParameter_MiningType_MAX_NEGATIVE) {
        int num_pos = 0;
        for (int m = 0; m < match_indices[label].size(); ++m) {
          if (match_indices[label][m] > -1) {
            ++num_pos;
          }
        }
        // Compute the negative count from the positive count and neg_pos_ratio,
        // then take the smaller of that and num_sel.
        num_sel = std::min(static_cast<int>(num_pos * neg_pos_ratio), num_sel);
      } else if (mining_type == MultiBoxLossParameter_MiningType_HARD_EXAMPLE) {
        CHECK_GT(sample_size, 0);
        num_sel = std::min(sample_size, num_sel);
      }
      // Select samples.
      if (has_nms_param && nms_threshold > 0) {
        // Do nms before selecting samples.
        vector<float> sel_loss;
        vector<NormalizedBBox> sel_bboxes;
        if (use_prior_for_nms) {
          for (int m = 0; m < match_indices[label].size(); ++m) {
            if (IsEligibleMining(mining_type, match_indices[label][m],
                match_overlaps.find(label)->second[m], neg_overlap)) {
              sel_loss.push_back(loss[m]);
              sel_bboxes.push_back(prior_bboxes[m]);
            }
          }
        } else {
          // Decode the prediction into bbox first.
          vector<NormalizedBBox> loc_bboxes;
          bool clip_bbox = false;
          DecodeBBoxes(prior_bboxes, prior_variances,
                       code_type, encode_variance_in_target, clip_bbox,
                       all_loc_preds[i].find(label)->second, &loc_bboxes);
          for (int m = 0; m < match_indices[label].size(); ++m) {
            if (IsEligibleMining(mining_type, match_indices[label][m],
                match_overlaps.find(label)->second[m], neg_overlap)) {
              sel_loss.push_back(loss[m]);
              sel_bboxes.push_back(loc_bboxes[m]);
            }
          }
        }
        // Do non-maximum suppression based on the loss.
        vector<int> nms_indices;
        ApplyNMS(sel_bboxes, sel_loss, nms_threshold, top_k, &nms_indices);
        if (nms_indices.size() < num_sel) {
          LOG(INFO) << "not enough sample after nms: " << nms_indices.size();
        }
        // Pick top example indices after nms.
        num_sel = std::min(static_cast<int>(nms_indices.size()), num_sel);
        for (int n = 0; n < num_sel; ++n) {
          sel_indices.insert(loss_indices[nms_indices[n]].second);
        }
      } else {
        // Pick top example indices based on loss.
        std::sort(loss_indices.begin(), loss_indices.end(),
                  SortScorePairDescend<int>);
        for (int n = 0; n < num_sel; ++n) {
          sel_indices.insert(loss_indices[n].second);
        }
      }
      // Update the match_indices and select neg_indices.
      for (int m = 0; m < match_indices[label].size(); ++m) {
        if (match_indices[label][m] > -1) {
          if (mining_type == MultiBoxLossParameter_MiningType_HARD_EXAMPLE &&
              sel_indices.find(m) == sel_indices.end()) {
            match_indices[label][m] = -1;
            *num_matches -= 1;
          }
        } else if (match_indices[label][m] == -1) {
          if (sel_indices.find(m) != sel_indices.end()) {
            neg_indices.push_back(m);
            *num_negs += 1;
          }
        }
      }
    }
    all_neg_indices->push_back(neg_indices);
  }
}

Backward

template <typename Ftype, typename Btype>
void MultiBoxLossLayer<Ftype, Btype>::Backward_cpu(const vector<Blob*>& top,
    const vector<bool>& propagate_down,
    const vector<Blob*>& bottom) {
  if (propagate_down[2]) {
    LOG(FATAL) << this->type()
        << " Layer cannot backpropagate to prior inputs.";
  }
  if (propagate_down[3]) {
    LOG(FATAL) << this->type()
        << " Layer cannot backpropagate to label inputs.";
  }
  // Back propagate on location prediction.
  if (propagate_down[0]) {
    Dtype* loc_bottom_diff = bottom[0]->mutable_cpu_diff<Dtype>();
    caffe_set(bottom[0]->count(), Dtype(0), loc_bottom_diff);
    if (num_matches_ >= 1) {
      vector<bool> loc_propagate_down;
      // Only back propagate on prediction, not ground truth.
      loc_propagate_down.push_back(true);
      loc_propagate_down.push_back(false);
      loc_loss_layer_->Backward(loc_top_vec_, loc_propagate_down,
                                loc_bottom_vec_);
      // Scale gradient.
      Dtype normalizer = LossLayer<Ftype, Btype>::GetNormalizer(
          normalization_, num_, num_priors_, num_matches_);
      Dtype loss_weight = top[0]->cpu_diff<Dtype>()[0] / normalizer;
      caffe_scal(loc_pred_->count(), loss_weight, loc_pred_->mutable_cpu_diff<Dtype>());
      // Copy gradient back to bottom[0].
      const Dtype* loc_pred_diff = loc_pred_->cpu_diff<Dtype>();
      int count = 0;
      for (int i = 0; i < num_; ++i) {
        for (map<int, vector<int> >::iterator it =
             all_match_indices_[i].begin();
             it != all_match_indices_[i].end(); ++it) {
          const int label = share_location_ ? 0 : it->first;
          const vector<int>& match_index = it->second;
          for (int j = 0; j < match_index.size(); ++j) {
            if (match_index[j] <= -1) {
              continue;
            }
            // Copy the diff to the right place.
            int start_idx = loc_classes_ * 4 * j + label * 4;
            caffe_copy(4, loc_pred_diff + count * 4,
                              loc_bottom_diff + start_idx);
            ++count;
          }
        }
        loc_bottom_diff += bottom[0]->offset(1);
      }
    }
  }

  // Back propagate on confidence prediction.
  if (propagate_down[1]) {
    Dtype* conf_bottom_diff = bottom[1]->mutable_cpu_diff<Dtype>();
    caffe_set(bottom[1]->count(), Dtype(0), conf_bottom_diff);
    if (num_conf_ >= 1) {
      vector<bool> conf_propagate_down;
      // Only back propagate on prediction, not ground truth.
      conf_propagate_down.push_back(true);
      conf_propagate_down.push_back(false);
      conf_loss_layer_->Backward(conf_top_vec_, conf_propagate_down,
                                 conf_bottom_vec_);
      // Scale gradient.
      Dtype normalizer = LossLayer<Ftype, Btype>::GetNormalizer(
          normalization_, num_, num_priors_, num_matches_);
      Dtype loss_weight = top[0]->cpu_diff<Dtype>()[0] / normalizer;
      caffe_scal(conf_pred_->count(), loss_weight,
                 conf_pred_->mutable_cpu_diff<Dtype>());
      // Copy gradient back to bottom[1].
      const Dtype* conf_pred_diff = conf_pred_->cpu_diff<Dtype>();
      if (do_neg_mining_) {
        int count = 0;
        for (int i = 0; i < num_; ++i) {
          // Copy matched (positive) bboxes scores' diff.
          const map<int, vector<int> >& match_indices = all_match_indices_[i];
          for (map<int, vector<int> >::const_iterator it =
               match_indices.begin(); it != match_indices.end(); ++it) {
            const vector<int>& match_index = it->second;
            CHECK_EQ(match_index.size(), num_priors_);
            for (int j = 0; j < num_priors_; ++j) {
              if (match_index[j] <= -1) {
                continue;
              }
              // Copy the diff to the right place.
              caffe_copy(num_classes_,
                                conf_pred_diff + count * num_classes_,
                                conf_bottom_diff + j * num_classes_);
              ++count;
            }
          }
          // Copy negative bboxes scores' diff.
          for (int n = 0; n < all_neg_indices_[i].size(); ++n) {
            int j = all_neg_indices_[i][n];
            CHECK_LT(j, num_priors_);
            caffe_copy(num_classes_,
                              conf_pred_diff + count * num_classes_,
                              conf_bottom_diff + j * num_classes_);
            ++count;
          }
          conf_bottom_diff += bottom[1]->offset(1);
        }
      } else {
        // The diff is already computed and stored.
        //bottom[1]->ShareDiff(*conf_pred_);
        caffe_copy(conf_pred_->count(), conf_pred_diff, conf_bottom_diff);
      }
    }
  }

  if (0) {
      float loss_xy = 0, loss_wh = 0, loss_obj = 0, loss_cls = 0;
      Blob* loc = bottom[0];
      Blob* conf = bottom[1];
      int num = loc->shape()[0];
      int nboxes = loc->shape()[1] / 4;
      CHECK(nboxes == conf->shape()[1] / num_classes_);
      for (int n=0; n<num; n++) {
          for (int j=0; j<nboxes; j++) {
              const float* p = loc->cpu_diff<float>() + n*nboxes*4 + j*4;
              loss_xy += std::abs(p[0]);
              loss_xy += std::abs(p[1]);
              loss_wh += std::abs(p[2]);
              loss_wh += std::abs(p[3]);
          }
      }
      for (int n=0; n<num; n++) {
          for (int j=0; j<nboxes; j++) {
              const float* p = conf->cpu_diff<float>() + n*nboxes*num_classes_ + j*num_classes_;
              loss_obj += std::abs(p[0]);
              for (int c=1; c<num_classes_; c++) {
                  loss_cls += std::abs(p[c]);
              }
          }
      }
      const Solver* solver = this->parent_solver();
      if ((solver && solver->display()) || solver==0) {
          //LOG(INFO) << "the number of pred boxes is " << nboxes;
          float loss = loss_xy+loss_wh+loss_obj+loss_cls;
          char str[1024];
          snprintf(str, 1024, "%s, iter %d, loss %g, loss_xy %g, loss_wh %g, loss_obj %g, loss_cls %g\n",
                  this->name().c_str(), this->iter(),
                  loss, loss_xy, loss_wh, loss_obj, loss_cls);
          LOG(INFO) << str;
      }
  }

  // After backward, remove match statistics.
  all_match_indices_.clear();
  all_neg_indices_.clear();
}

 
