这篇博客主要分析multibox_loss_layer,它是SSD中比较关键的部分:该层内建了两个子layer分别完成位置回归loss和分类loss的计算,同时涉及FindMatches、MineHardExamples、EncodeLocPrediction、EncodeConfPrediction等重要函数(其中一部分实现在bbox_util中,后面会介绍)。
代码:
#include <algorithm>
#include <map>
#include <utility>
#include <vector>

#include "caffe/layers/multibox_loss_layer.hpp"
#include "caffe/util/math_functions.hpp"

namespace caffe {

// Layer setup. Besides reading parameters, this function internally creates
// two child loss layers: one for localization (bbox regression) and one for
// confidence (classification) loss.
template <typename Dtype>
void MultiBoxLossLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  LossLayer<Dtype>::LayerSetUp(bottom, top);
  if (this->layer_param_.propagate_down_size() == 0) {
    this->layer_param_.add_propagate_down(true);   // bottom[0]: location predictions
    this->layer_param_.add_propagate_down(true);   // bottom[1]: confidence scores
    this->layer_param_.add_propagate_down(false);  // bottom[2]: prior boxes
    this->layer_param_.add_propagate_down(false);  // bottom[3]: ground truth
  }
  const MultiBoxLossParameter& multibox_loss_param =
      this->layer_param_.multibox_loss_param();
  // NOTE(review): this stores a second copy of the same parameter message;
  // the member copy is the one later passed to FindMatches/MineHardExamples
  // etc., so the local reference above is arguably redundant — verify.
  multibox_loss_param_ = this->layer_param_.multibox_loss_param();
  num_ = bottom[0]->num();  // batch size
  // Each prior contributes 4 values (two corner coordinates) along the
  // height axis of bottom[2].
  num_priors_ = bottom[2]->height() / 4;
  // Get other parameters.
  CHECK(multibox_loss_param.has_num_classes()) << "Must provide num_classes.";
  num_classes_ = multibox_loss_param.num_classes();  // number of classes
  CHECK_GE(num_classes_, 1) << "num_classes should not be less than 1.";
  // share_location_ (default true): all classes share a single location
  // prediction; otherwise each class regresses its own box.
  share_location_ = multibox_loss_param.share_location();
  loc_classes_ = share_location_ ? 1 : num_classes_;
  background_label_id_ = multibox_loss_param.background_label_id();
  // Whether ground-truth boxes flagged "difficult" are used for matching.
  use_difficult_gt_ = multibox_loss_param.use_difficult_gt();
  mining_type_ = multibox_loss_param.mining_type();
  if (multibox_loss_param.has_do_neg_mining()) {
    // Legacy flag; mining_type supersedes it and the two must agree.
    LOG(WARNING) << "do_neg_mining is deprecated, use mining_type instead.";
    do_neg_mining_ = multibox_loss_param.do_neg_mining();
    CHECK_EQ(do_neg_mining_,
             mining_type_ != MultiBoxLossParameter_MiningType_NONE);
  }
  // Hard-negative mining is enabled whenever mining_type is not NONE.
  do_neg_mining_ = mining_type_ != MultiBoxLossParameter_MiningType_NONE;
  // Loss normalization mode (from LossParameter); the legacy boolean
  // `normalize` maps to VALID/BATCH_SIZE, otherwise use `normalization`
  // directly (default VALID).
  if (!this->layer_param_.loss_param().has_normalization() &&
      this->layer_param_.loss_param().has_normalize()) {
    normalization_ = this->layer_param_.loss_param().normalize() ?
                     LossParameter_NormalizationMode_VALID :
                     LossParameter_NormalizationMode_BATCH_SIZE;
  } else {
    normalization_ = this->layer_param_.loss_param().normalization();
  }
  if (do_neg_mining_) {
    CHECK(share_location_)
        << "Currently only support negative mining if share_location is true.";
  }
  vector<int> loss_shape(1, 1);
  // Set up localization loss layer.
  loc_weight_ = multibox_loss_param.loc_weight();      // weight on loc loss (typ. 1.0)
  loc_loss_type_ = multibox_loss_param.loc_loss_type();  // SSD uses SMOOTH_L1
  // Fake shape: [1, 4]. The real shape ([1, num_matches*4]) is set each
  // Forward pass once the number of matches is known.
  vector<int> loc_shape(1, 1);
  loc_shape.push_back(4);
  loc_pred_.Reshape(loc_shape);
  loc_gt_.Reshape(loc_shape);
  // The bottom/top vectors hold pointers to the internal blobs, so later
  // Reshape calls on loc_pred_/loc_gt_ are seen by the child layer.
  loc_bottom_vec_.push_back(&loc_pred_);
  loc_bottom_vec_.push_back(&loc_gt_);
  loc_loss_.Reshape(loss_shape);
  loc_top_vec_.push_back(&loc_loss_);
  if (loc_loss_type_ == MultiBoxLossParameter_LocLossType_L2) {
    // Build an internal EuclideanLoss layer for L2 localization loss.
    LayerParameter layer_param;
    layer_param.set_name(this->layer_param_.name() + "_l2_loc");
    layer_param.set_type("EuclideanLoss");
    layer_param.add_loss_weight(loc_weight_);
    loc_loss_layer_ = LayerRegistry<Dtype>::CreateLayer(layer_param);
    loc_loss_layer_->SetUp(loc_bottom_vec_, loc_top_vec_);
  } else if (loc_loss_type_ == MultiBoxLossParameter_LocLossType_SMOOTH_L1) {
    // SMOOTH_L1 — the choice used by SSD.
    LayerParameter layer_param;
    layer_param.set_name(this->layer_param_.name() + "_smooth_L1_loc");
    layer_param.set_type("SmoothL1Loss");
    layer_param.add_loss_weight(loc_weight_);
    loc_loss_layer_ = LayerRegistry<Dtype>::CreateLayer(layer_param);
    // The child layer consumes {loc_pred_, loc_gt_} and produces loc_loss_.
    loc_loss_layer_->SetUp(loc_bottom_vec_, loc_top_vec_);
  } else {
    LOG(FATAL) << "Unknown localization loss type.";
  }
  // Set up confidence loss layer (classification).
  conf_loss_type_ = multibox_loss_param.conf_loss_type();  // typ. SOFTMAX
  conf_bottom_vec_.push_back(&conf_pred_);  // predicted scores blob
  conf_bottom_vec_.push_back(&conf_gt_);    // target labels blob
  conf_loss_.Reshape(loss_shape);
  conf_top_vec_.push_back(&conf_loss_);
  if (conf_loss_type_ == MultiBoxLossParameter_ConfLossType_SOFTMAX) {
    CHECK_GE(background_label_id_, 0)
        << "background_label_id should be within [0, num_classes) for Softmax.";
    CHECK_LT(background_label_id_, num_classes_)
        << "background_label_id should be within [0, num_classes) for Softmax.";
    LayerParameter layer_param;
    layer_param.set_name(this->layer_param_.name() + "_softmax_conf");
    layer_param.set_type("SoftmaxWithLoss");
    layer_param.add_loss_weight(Dtype(1.));
    // NONE: this layer normalizes the summed loss itself (see Forward_cpu),
    // so the child must not normalize again.
    layer_param.mutable_loss_param()->set_normalization(
        LossParameter_NormalizationMode_NONE);
    SoftmaxParameter* softmax_param = layer_param.mutable_softmax_param();
    softmax_param->set_axis(1);
    // Fake reshape; real shapes are set per Forward pass from num_conf_.
    vector<int> conf_shape(1, 1);
    conf_gt_.Reshape(conf_shape);
    conf_shape.push_back(num_classes_);
    conf_pred_.Reshape(conf_shape);
    conf_loss_layer_ = LayerRegistry<Dtype>::CreateLayer(layer_param);
    conf_loss_layer_->SetUp(conf_bottom_vec_, conf_top_vec_);
  } else if (conf_loss_type_ == MultiBoxLossParameter_ConfLossType_LOGISTIC) {
    // Per-class sigmoid cross-entropy alternative.
    LayerParameter layer_param;
    layer_param.set_name(this->layer_param_.name() + "_logistic_conf");
    layer_param.set_type("SigmoidCrossEntropyLoss");
    layer_param.add_loss_weight(Dtype(1.));
    // Fake reshape.
    vector<int> conf_shape(1, 1);
    conf_shape.push_back(num_classes_);
    conf_gt_.Reshape(conf_shape);
    conf_pred_.Reshape(conf_shape);
    conf_loss_layer_ = LayerRegistry<Dtype>::CreateLayer(layer_param);
    conf_loss_layer_->SetUp(conf_bottom_vec_, conf_top_vec_);
  } else {
    LOG(FATAL) << "Unknown confidence loss type.";
  }
}  // end of LayerSetUp

// Recompute per-batch sizes and validate that the prediction blobs agree
// with the number of priors.
template <typename Dtype>
void MultiBoxLossLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  LossLayer<Dtype>::Reshape(bottom, top);
  num_ = bottom[0]->num();                 // batch size
  num_priors_ = bottom[2]->height() / 4;   // 4 coords per prior along height
  num_gt_ = bottom[3]->height();           // number of ground-truth entries
  CHECK_EQ(bottom[0]->num(), bottom[1]->num());
  // loc_classes_ is 1 when location is shared, else num_classes_.
  CHECK_EQ(num_priors_ * loc_classes_ * 4, bottom[0]->channels())
      << "Number of priors must match number of location predictions.";
  CHECK_EQ(num_priors_ * num_classes_, bottom[1]->channels())
      << "Number of priors must match number of confidence predictions.";
}

// Forward pass.
// bottom[0]: location predictions, bottom[1]: confidence predictions,
// bottom[2]: prior boxes (+variances), bottom[3]: ground truth.
// (Exact blob layouts are decoded by the Get* helpers in bbox_util.)
template <typename Dtype>
void MultiBoxLossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const Dtype* loc_data = bottom[0]->cpu_data();
  const Dtype* conf_data = bottom[1]->cpu_data();
  const Dtype* prior_data = bottom[2]->cpu_data();
  const Dtype* gt_data = bottom[3]->cpu_data();
  // Retrieve all ground truth.
  /* For reference, the NormalizedBBox protobuf message:
     message NormalizedBBox {
       optional float xmin = 1;
       optional float ymin = 2;
       optional float xmax = 3;
       optional float ymax = 4;
       optional int32 label = 5;
       optional bool difficult = 6;
       optional float score = 7;
       optional float size = 8;
     } */
  // Decode ground-truth boxes into all_gt_bboxes, keyed by image index.
  map<int, vector<NormalizedBBox> > all_gt_bboxes;
  GetGroundTruth(gt_data, num_gt_, background_label_id_, use_difficult_gt_,
                 &all_gt_bboxes);
  // Retrieve all prior bboxes. It is same within a batch since we assume all
  // images in a batch are of same dimension.
  vector<NormalizedBBox> prior_bboxes;
  vector<vector<float> > prior_variances;
  GetPriorBBoxes(prior_data, num_priors_, &prior_bboxes, &prior_variances);
  // Retrieve all location predictions (from bottom[0]) into all_loc_preds.
  // LabelBBox is map<int, vector<NormalizedBBox> >.
  vector<LabelBBox> all_loc_preds;
  GetLocPredictions(loc_data, num_, num_priors_, loc_classes_, share_location_,
                    &all_loc_preds);
  // Find matches between priors and ground truth bboxes.
  vector<map<int, vector<float> > > all_match_overlaps;
  FindMatches(all_loc_preds, all_gt_bboxes, prior_bboxes, prior_variances,
              multibox_loss_param_, &all_match_overlaps, &all_match_indices_);
  num_matches_ = 0;
  int num_negs = 0;
  // Sample hard negative (and positive) examples based on mining type.
  MineHardExamples(*bottom[1], all_loc_preds, all_gt_bboxes, prior_bboxes,
                   prior_variances, all_match_overlaps, multibox_loss_param_,
                   &num_matches_, &num_negs, &all_match_indices_,
                   &all_neg_indices_);
  if (num_matches_ >= 1) {
    // Form data to pass on to loc_loss_layer_. The blob pointers were
    // already stored in loc_bottom_vec_ during LayerSetUp.
    vector<int> loc_shape(2);
    loc_shape[0] = 1;
    loc_shape[1] = num_matches_ * 4;
    loc_pred_.Reshape(loc_shape);
    loc_gt_.Reshape(loc_shape);
    Dtype* loc_pred_data = loc_pred_.mutable_cpu_data();
    Dtype* loc_gt_data = loc_gt_.mutable_cpu_data();
    // Fill loc_pred_/loc_gt_ with encoded (matched) prediction/target pairs.
    EncodeLocPrediction(all_loc_preds, all_gt_bboxes, all_match_indices_,
                        prior_bboxes, prior_variances, multibox_loss_param_,
                        loc_pred_data, loc_gt_data);
    loc_loss_layer_->Reshape(loc_bottom_vec_, loc_top_vec_);
    loc_loss_layer_->Forward(loc_bottom_vec_, loc_top_vec_);
  } else {
    // No matches: localization loss is zero.
    loc_loss_.mutable_cpu_data()[0] = 0;
  }
  // Form data to pass on to conf_loss_layer_.
  if (do_neg_mining_) {
    // Only matched positives plus mined negatives contribute.
    num_conf_ = num_matches_ + num_negs;
  } else {
    // All priors contribute.
    num_conf_ = num_ * num_priors_;
  }
  if (num_conf_ >= 1) {
    // Reshape the confidence data.
    vector<int> conf_shape;
    if (conf_loss_type_ == MultiBoxLossParameter_ConfLossType_SOFTMAX) {
      conf_shape.push_back(num_conf_);
      conf_gt_.Reshape(conf_shape);            // [num_conf]
      conf_shape.push_back(num_classes_);
      conf_pred_.Reshape(conf_shape);          // [num_conf, num_classes]
    } else if (conf_loss_type_ == MultiBoxLossParameter_ConfLossType_LOGISTIC) {
      conf_shape.push_back(1);
      conf_shape.push_back(num_conf_);
      conf_shape.push_back(num_classes_);
      conf_gt_.Reshape(conf_shape);
      conf_pred_.Reshape(conf_shape);
    } else {
      LOG(FATAL) << "Unknown confidence loss type.";
    }
    if (!do_neg_mining_) {
      // Consider all scores: share data (no copy) with bottom[1].
      CHECK_EQ(conf_pred_.count(), bottom[1]->count());
      conf_pred_.ShareData(*(bottom[1]));
    }
    Dtype* conf_pred_data = conf_pred_.mutable_cpu_data();
    Dtype* conf_gt_data = conf_gt_.mutable_cpu_data();
    // Default every target to background; EncodeConfPrediction overwrites
    // the entries that correspond to matched priors.
    caffe_set(conf_gt_.count(), Dtype(background_label_id_), conf_gt_data);
    EncodeConfPrediction(conf_data, num_, num_priors_, multibox_loss_param_,
                         all_match_indices_, all_neg_indices_, all_gt_bboxes,
                         conf_pred_data, conf_gt_data);
    conf_loss_layer_->Reshape(conf_bottom_vec_, conf_top_vec_);
    conf_loss_layer_->Forward(conf_bottom_vec_, conf_top_vec_);
  } else {
    conf_loss_.mutable_cpu_data()[0] = 0;
  }
  // Combine the two normalized losses into the scalar top blob.
  top[0]->mutable_cpu_data()[0] = 0;
  if (this->layer_param_.propagate_down(0)) {
    // Normalize and add the (weighted) localization loss.
    Dtype normalizer = LossLayer<Dtype>::GetNormalizer(
        normalization_, num_, num_priors_, num_matches_);
    top[0]->mutable_cpu_data()[0] +=
        loc_weight_ * loc_loss_.cpu_data()[0] / normalizer;
  }
  if (this->layer_param_.propagate_down(1)) {
    // Normalize and add the confidence loss.
    Dtype normalizer = LossLayer<Dtype>::GetNormalizer(
        normalization_, num_, num_priors_, num_matches_);
    top[0]->mutable_cpu_data()[0] += conf_loss_.cpu_data()[0] / normalizer;
  }
}  // end of Forward_cpu

// Backward pass: run the child loss layers' Backward, scale their gradients
// by top diff / normalizer, then scatter the per-match diffs back into the
// corresponding positions of bottom[0]/bottom[1].
template <typename Dtype>
void MultiBoxLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down,
    const vector<Blob<Dtype>*>& bottom) {
  if (propagate_down[2]) {
    LOG(FATAL) << this->type()
        << " Layer cannot backpropagate to prior inputs.";
  }
  if (propagate_down[3]) {
    LOG(FATAL) << this->type()
        << " Layer cannot backpropagate to label inputs.";
  }
  // Back propagate on location prediction.
  if (propagate_down[0]) {
    Dtype* loc_bottom_diff = bottom[0]->mutable_cpu_diff();
    caffe_set(bottom[0]->count(), Dtype(0), loc_bottom_diff);
    if (num_matches_ >= 1) {
      vector<bool> loc_propagate_down;
      // Only back propagate on prediction, not ground truth.
      loc_propagate_down.push_back(true);
      loc_propagate_down.push_back(false);
      loc_loss_layer_->Backward(loc_top_vec_, loc_propagate_down,
                                loc_bottom_vec_);
      // Scale gradient.
      Dtype normalizer = LossLayer<Dtype>::GetNormalizer(
          normalization_, num_, num_priors_, num_matches_);
      Dtype loss_weight = top[0]->cpu_diff()[0] / normalizer;
      caffe_scal(loc_pred_.count(), loss_weight,
                 loc_pred_.mutable_cpu_diff());
      // Copy gradient back to bottom[0]: walk the matches in the same order
      // EncodeLocPrediction produced them, scattering 4 values per match.
      const Dtype* loc_pred_diff = loc_pred_.cpu_diff();
      int count = 0;
      for (int i = 0; i < num_; ++i) {
        for (map<int, vector<int> >::iterator it =
             all_match_indices_[i].begin();
             it != all_match_indices_[i].end(); ++it) {
          const int label = share_location_ ? 0 : it->first;
          const vector<int>& match_index = it->second;
          for (int j = 0; j < match_index.size(); ++j) {
            if (match_index[j] <= -1) {
              continue;  // prior j is unmatched
            }
            // Copy the diff to the right place.
            int start_idx = loc_classes_ * 4 * j + label * 4;
            caffe_copy<Dtype>(4, loc_pred_diff + count * 4,
                              loc_bottom_diff + start_idx);
            ++count;
          }
        }
        loc_bottom_diff += bottom[0]->offset(1);  // advance to next image
      }
    }
  }
  // Back propagate on confidence prediction.
  if (propagate_down[1]) {
    Dtype* conf_bottom_diff = bottom[1]->mutable_cpu_diff();
    caffe_set(bottom[1]->count(), Dtype(0), conf_bottom_diff);
    if (num_conf_ >= 1) {
      vector<bool> conf_propagate_down;
      // Only back propagate on prediction, not ground truth.
      conf_propagate_down.push_back(true);
      conf_propagate_down.push_back(false);
      conf_loss_layer_->Backward(conf_top_vec_, conf_propagate_down,
                                 conf_bottom_vec_);
      // Scale gradient.
      Dtype normalizer = LossLayer<Dtype>::GetNormalizer(
          normalization_, num_, num_priors_, num_matches_);
      Dtype loss_weight = top[0]->cpu_diff()[0] / normalizer;
      caffe_scal(conf_pred_.count(), loss_weight,
                 conf_pred_.mutable_cpu_diff());
      // Copy gradient back to bottom[1].
      const Dtype* conf_pred_diff = conf_pred_.cpu_diff();
      if (do_neg_mining_) {
        // conf_pred_ holds only the mined subset; scatter its rows back to
        // the prior positions they came from.
        int count = 0;
        for (int i = 0; i < num_; ++i) {
          // Copy matched (positive) bboxes scores' diff.
          const map<int, vector<int> >& match_indices = all_match_indices_[i];
          for (map<int, vector<int> >::const_iterator it =
               match_indices.begin(); it != match_indices.end(); ++it) {
            const vector<int>& match_index = it->second;
            CHECK_EQ(match_index.size(), num_priors_);
            for (int j = 0; j < num_priors_; ++j) {
              if (match_index[j] <= -1) {
                continue;
              }
              // Copy the diff to the right place.
              caffe_copy<Dtype>(num_classes_,
                                conf_pred_diff + count * num_classes_,
                                conf_bottom_diff + j * num_classes_);
              ++count;
            }
          }
          // Copy negative bboxes scores' diff.
          for (int n = 0; n < all_neg_indices_[i].size(); ++n) {
            int j = all_neg_indices_[i][n];
            CHECK_LT(j, num_priors_);
            caffe_copy<Dtype>(num_classes_,
                              conf_pred_diff + count * num_classes_,
                              conf_bottom_diff + j * num_classes_);
            ++count;
          }
          conf_bottom_diff += bottom[1]->offset(1);  // next image
        }
      } else {
        // The diff is already computed and stored (shapes match bottom[1]).
        bottom[1]->ShareDiff(conf_pred_);
      }
    }
  }
  // After backward, remove match statistics so the next iteration starts
  // clean.
  all_match_indices_.clear();
  all_neg_indices_.clear();
}

INSTANTIATE_CLASS(MultiBoxLossLayer);
REGISTER_LAYER_CLASS(MultiBoxLoss);

}  // namespace caffe