MTCNN代码解读

代码基于bm1682芯片
#include "mtcnn.hpp"
#include "utils.hpp"

using namespace std;
using namespace bmruntime;

// Builds the three-stage cascade from compiled model files.
//
// bmodel[0], bmodel[1] and bmodel[2] are passed to the `Net` objects used as
// P-Net, R-Net and O-Net respectively, so `bmodel` must hold at least three
// entries (this is not checked here).
MTCNN::MTCNN(const vector<string>& bmodel) {
  // No profiler attached until enable_profiling() is called.
  ts_ = nullptr;

  // Fixed canvas every input image is rescaled/padded to before detection.
  in_w_ = 1920;
  in_h_ = 1080;

  // Image-pyramid parameters.
  min_size_ = 40;          // minimum face size to detect (externally configured, per the original note)
  min_pyramid_size_ = 12;  // pyramid construction stops once the short side drops below this
  factor_ = 0.5;           // scale multiplier between consecutive pyramid levels

  // One score threshold per stage, in P-Net, R-Net, O-Net order.
  const double stage_thresholds[] = {0.6, 0.7, 0.7};
  for (double t : stage_thresholds) {
    thresholds_.push_back(t);
  }

  // Per-channel mean subtracted during preprocessing.
  for (int channel = 0; channel < 3; ++channel) {
    means_.push_back(127.5);
  }

  pnet_ = new Net(bmodel[0]);
  rnet_ = new Net(bmodel[1]);
  onet_ = new Net(bmodel[2]);

  // Staging buffer large enough for one 3-channel float image of canvas size.
  in_data_ = new float[1 * 3 * in_w_ * in_h_];
}

// Releases everything the constructor allocated: the three networks first
// (in creation order), then the float staging buffer.
MTCNN::~MTCNN() {
  delete pnet_;
  delete rnet_;
  delete onet_;

  delete[] in_data_;
}

// Ordering predicate for std::sort: a box with a higher score sorts first.
static inline bool compareBBox(const FaceRect &a, const FaceRect &b) {
  return b.score < a.score;
}

/*
 * Letterboxes `src` onto the fixed in_w_ x in_h_ canvas.
 *
 * The image is scaled uniformly (aspect ratio preserved) so that it fits the
 * canvas, then zero-padded on the right and bottom only. Because the top-left
 * origin never moves, a coordinate on the canvas maps back to `src` by a plain
 * multiplication with the returned ratio.
 *
 * Example with a 1920x1080 canvas and a 640x480 source:
 *   ratio = max(480 / 1080, 640 / 1920) = 0.4444
 *   scaled size = 1440x1080, padding = 480 columns on the right, 0 rows below.
 *
 * src:     image to rescale.
 * dst:     receives the canvas-sized image. When `src` already has the canvas
 *          size (or is empty) it is assigned `src` directly, i.e. it shares
 *          the pixel data instead of copying it.
 * returns: source size divided by scaled size (1.0 when nothing was done).
 */
float MTCNN::rescale_image(const cv::Mat &src, cv::Mat *dst) {
  float ratio = 1.0;

  // An empty image cannot be scaled (the ratio would be 0), and an image that
  // already matches the canvas needs no work.
  if (src.empty() ||
      ((static_cast<size_t>(src.rows) == in_h_) &&
       (static_cast<size_t>(src.cols) == in_w_))) {
    *dst = src;
    return ratio;
  }

  // The larger of the two per-axis ratios guarantees both axes fit the canvas.
  ratio = std::max(1.0 * src.rows / in_h_, 1.0 * src.cols / in_w_);

  const int canvas_w = static_cast<int>(in_w_);
  const int canvas_h = static_cast<int>(in_h_);

  // Compute the scaled size once, as integers, and derive the padding from
  // it. Truncating the resize size and the padding independently from float
  // values could leave the result one pixel short of the canvas.
  int scaled_w = static_cast<int>(src.cols / ratio + 0.5f);
  int scaled_h = static_cast<int>(src.rows / ratio + 0.5f);
  // Clamp: never larger than the canvas, never zero (cv::resize rejects it).
  scaled_w = std::min(std::max(scaled_w, 1), canvas_w);
  scaled_h = std::min(std::max(scaled_h, 1), canvas_h);

  const int pad_right = canvas_w - scaled_w;
  const int pad_bottom = canvas_h - scaled_h;

  cv::resize(src, *dst, cv::Size(scaled_w, scaled_h), 0, 0, cv::INTER_NEAREST);

  if (pad_bottom || pad_right) {
    // Fill the unused right/bottom area of the canvas with zeros.
    cv::copyMakeBorder(*dst, *dst, 0, pad_bottom, 0, pad_right,
                       cv::BORDER_CONSTANT, cv::Scalar(0));
  }

  return ratio;
}

void MTCNN::nms(const std::vector<FaceRect> &proposals,
                    std::vector<FaceRect> &nmsProposals) {
  if (proposals.empty()) {
    nmsProposals.clear();
    return;
  }
  std::vector<FaceRect> bboxes = proposals;
  std::sort(bboxes.begin(), bboxes.end(), compareBBox);

  int select_idx = 0;
  int num_bbox = bboxes.size();
  std::vector<int> mask_merged(num_bbox, 0);
  bool all_merged = false;
  while (!all_merged) {
    while (select_idx < num_bbox && 1 == mask_merged[select_idx])
      ++select_idx;

    if (select_idx == num_bbox) {
      all_merged = true;
      continue;
    }
    nmsProposals.push_back(bboxes[select_idx]);
    mask_merged[select_idx] = 1;
    FaceRect select_bbox = bboxes[select_idx];
    float area1 = (select_bbox.x2 - select_bbox.x1 + 1) *
                  (select_bbox.y2 - select_bbox.y1 + 1);
    ++select_idx;
    for (int i = select_idx; i < num_bbox; ++i) {
      if (mask_merged[i] == 1)
        continue;
      FaceRect &bbox_i = bboxes[i];
      float x = std::max(select_bbox.x1, bbox_i.x1);
      float y = std::max(select_bbox.y1, bbox_i.y1);
      float w = std::min(select_bbox.x2, bbox_i.x2) - x + 1;
      float h = std::min(select_bbox.y2, bbox_i.y2) - y + 1;
      if (w <= 0 || h <= 0)
        continue;
      float area2 = (bbox_i.x2 - bbox_i.x1 + 1) * (bbox_i.y2 - bbox_i.y1 + 1);
      float area_intersect = w * h;
      // Union method
      if (area_intersect / (area1 + area2 - area_intersect) > nms_threshold_)
        mask_merged[i] = 1;
    }
  }
}

// Produces, for every box, the part of it that lies inside `image`.
//
// image:    image the boxes refer to; only its width and height are read.
// boxes:    candidate boxes, possibly extending beyond the image borders.
// paddings: cleared, then filled with one clamped box per input box, in the
//           same order. Only x1/y1/x2/y2 are written; every other FaceRect
//           field is left as default-constructed.
//
// Each coordinate is clamped into [0, cols - 1] x [0, rows - 1]. Clamping
// both ends matters for a box that lies completely outside the image:
// clamping only x1 from below and x2 from above would yield x1 > x2 (or
// x2 < 0), which is not a valid crop range.
void MTCNN::padding(const cv::Mat &image,
                        const std::vector<FaceRect> &boxes,
                        std::vector<FaceRect> &paddings) {
  paddings.clear();
  paddings.reserve(boxes.size());

  // Largest valid pixel index along each axis.
  const int max_x = image.cols - 1;
  const int max_y = image.rows - 1;

  for (const FaceRect &box : boxes) {
    FaceRect rect;
    rect.x1 = (box.x1 < 0) ? 0 : ((box.x1 > max_x) ? max_x : box.x1);
    rect.y1 = (box.y1 < 0) ? 0 : ((box.y1 > max_y) ? max_y : box.y1);
    rect.x2 = (box.x2 < 0) ? 0 : ((box.x2 > max_x) ? max_x : box.x2);
    rect.y2 = (box.y2 < 0) ? 0 : ((box.y2 > max_y) ? max_y : box.y2);
    paddings.push_back(rect);
  }
}

void MTCNN::bbox2square(std::vector<FaceRect> &bboxes) {
  for (uint32_t i = 0; i < bboxes.size(); ++i) {
    float w = bboxes[i].x2 - bboxes[i].x1 + 1;
    float h = bboxes[i].y2 - bboxes[i].y1 + 1;
    float side = std::max<float>(w, h);
    bboxes[i].x1 += (w - side) * 0.5;
    bboxes[i].y1 += (h - side) * 0.5;
    bboxes[i].x2 = (int)(bboxes[i].x1 + side - 1);
    bboxes[i].y2 = (int)(bboxes[i].y1 + side - 1);
    bboxes[i].x1 = (int)(bboxes[i].x1);
    bboxes[i].y1 = (int)(bboxes[i].y1);
  }
}

void MTCNN::boxRegress(const std::vector<FaceRect> &faceRects,
                           std::vector<FaceRect> &regressedRects) {
  for (uint32_t bboxId = 0; bboxId < faceRects.size(); ++bboxId) {
    FaceRect faceRect;
    float regw = faceRects[bboxId].x2 - faceRects[bboxId].x1 + 1;
    float regh = faceRects[bboxId].y2 - faceRects[bboxId].y1 + 1;
    faceRect.x1 =
        faceRects[bboxId].x1 + regw * faceRects[bboxId].regression[0] - 1;
    faceRect.y1 =
        faceRects[bboxId].y1 + regh * faceRects[bboxId].regression[1] - 1;
    faceRect.x2 =
        faceRects[bboxId].x2 + regw * faceRects[bboxId].regression[2] - 1;
    faceRect.y2 =
        faceRects[bboxId].y2 + regh * faceRects[bboxId].regression[3] - 1;
    if (faceRect.x1 >= faceRect.x2 || faceRect.y1 >= faceRect.y2)
      continue;
    faceRect.score = faceRects[bboxId].score;
    faceRect.regression = faceRects[bboxId].regression;
    faceRect.pts = faceRects[bboxId].pts;
    regressedRects.push_back(faceRect);
  }
}


/*
 * Turns the P-Net output maps for one pyramid level into candidate boxes.
 *
 * prob:      score blob; read as two consecutive planes of
 *            fm_height * fm_width floats, the second of which is used as the
 *            face score.
 * reg:       regression blob; read as four consecutive planes of the same
 *            size, one per regression component.
 * scale:     pyramid scale of this level; box coordinates are divided by it
 *            to express them on the unscaled image.
 * thresh:    minimum score for a cell to yield a candidate.
 * im_w/im_h: width/height of the image fed to the network at this level
 *            (e.g. 576x324 for a 1920x1080 image at scale 0.3).
 * proposals: candidates are appended here (the vector is not cleared).
 *
 * NOTE(review): the feature-map size is derived from im_w/im_h by formula and
 * is assumed to match the real blob dimensions - confirm against the model.
 */
void MTCNN::generateBoundingBox(Blob *prob, Blob *reg,
                                    float scale, float thresh, int im_w,
                                    int im_h,
                                    std::vector<FaceRect> &proposals) {
  const int stride = 2;     // input pixels between neighbouring output cells
  const int cellSize = 12;  // side of the input window behind one output cell

  // Output size of a sliding 12x12 window with stride 2 over the input,
  // i.e. (W - F) / S + 1 per axis.
  const int fm_width = ceil((im_w - cellSize) * 1.0 / stride) + 1;
  const int fm_height = ceil((im_h - cellSize) * 1.0 / stride) + 1;
  const int plane = fm_height * fm_width;  // elements in one feature-map plane

  // Skip the first plane of the score blob; the second one holds the scores
  // compared against `thresh`.
  const float *scores = reinterpret_cast<float *>(prob->data()) + plane;
  const float *deltas = reinterpret_cast<float *>(reg->data());

  for (int row = 0; row < fm_height; ++row) {
    for (int col = 0; col < fm_width; ++col) {
      const int index = row * fm_width + col;
      const float score = scores[index];
      if (!(score >= thresh))
        continue;

      // Map the cell back to its 12x12 window on the network input, then
      // divide by `scale` to get coordinates on the unscaled image.
      const float left = static_cast<int>((col * stride) / scale);
      const float top = static_cast<int>((row * stride) / scale);
      const float right =
          static_cast<int>((col * stride + cellSize - 1) / scale);
      const float bottom =
          static_cast<int>((row * stride + cellSize - 1) / scale);

      FaceRect faceRect;
      faceRect.x1 = left;
      faceRect.y1 = top;
      faceRect.x2 = right;
      faceRect.y2 = bottom;
      faceRect.score = score;
      // Gather this cell's regression vector, one component per plane.
      faceRect.regression = cv::Vec4f(
          deltas[index], deltas[plane + index],
          deltas[2 * plane + index], deltas[3 * plane + index]);
      proposals.push_back(faceRect);
    }
  }
}

// Picks the size of the next inference batch and deducts it from *numBox.
//
// While at least 128 boxes remain, the batch is 128. Otherwise the batch is
// the highest power of two (64, 32, ..., 1) present in the binary
// representation of the remaining count. Returns 0 once nothing is left, so
// callers can loop until the result is 0.
size_t MTCNN::getBoxPerBatch(int *numBox) {
  const int kFullBatch = 128;
  if (*numBox >= kFullBatch) {
    *numBox -= kFullBatch;
    return kFullBatch;
  }

  // Fewer than 128 remain: peel off the most significant set bit.
  for (int bit = 6; bit >= 0; --bit) {
    const int batch = 1 << bit;
    if (*numBox & batch) {
      *numBox &= ~batch;
      return batch;
    }
  }

  return 0;
}

void MTCNN::wrapInputLayer(float *input_data, int c, int h, int w, std::vector<cv::Mat>* input_channels) {

  for (int i = 0; i < c; ++i) {
    cv::Mat channel(h, w, CV_32FC1, input_data);
    input_channels->push_back(channel);
    input_data += h * w;
  }
}

// Runs a refinement network (R-Net or O-Net) over a set of candidate boxes.
//
// boxes:     candidate boxes; may extend beyond the image.
// paddings:  the same boxes clamped to the image, index-aligned with `boxes`.
// image:     image the boxes refer to.
// net:       network to run.
// threshold: a candidate is kept when its face score is greater than this.
// flag:      0 selects the 24x24 input and the "conv5-2" regression output;
//            any other value selects the 48x48 input, the "conv6-2"
//            regression output and additionally the "conv6-3" landmark
//            output.
// results:   accepted candidates are appended (the vector is not cleared).
//            Each carries the original box, the score, the regression vector
//            and, when flag != 0, five landmark points.
//
// Candidates are processed in batches whose sizes come from getBoxPerBatch().
// If inference or an output lookup fails, a message is printed and the
// function returns with whatever was appended so far.
void MTCNN::classify_face(const std::vector<FaceRect> &boxes,
                              const std::vector<FaceRect> &paddings,
                              const cv::Mat &image,
                              Net *net,
                              double threshold, int flag,
                              std::vector<FaceRect> &results) {
  int num_box = boxes.size();
  const int input_width = (flag == 0) ? 24 : 48;
  const int input_height = (flag == 0) ? 24 : 48;
  const cv::Size dsize(input_width, input_height);

  // Loop invariants: normalisation mean and the regression output name.
  const cv::Scalar mean(means_[0], means_[1], means_[2]);
  const std::string outPutLayerName = ((flag == 0) ? "conv5-2" : "conv6-2");

  int numBoxPerBatch = getBoxPerBatch(&num_box);
  int reg_idx = 0;  // index of the first box of the current batch

  while (numBoxPerBatch > 0) {
    // Fill the staging buffer with one normalised, planar crop per box.
    float *cur_input = in_data_;
    for (int i = 0; i < numBoxPerBatch; ++i) {
      const FaceRect &box = boxes[reg_idx + i];
      const FaceRect &pad = paddings[reg_idx + i];

      // How far the box sticks out of the image on each side.
      int pad_left = std::abs(pad.x1 - box.x1);
      int pad_top = std::abs(pad.y1 - box.y1);
      int pad_right = std::abs(pad.x2 - box.x2);
      int pad_bottom = std::abs(pad.y2 - box.y2);

      // Crop the in-image part of the box ...
      cv::Mat crop_img = image(cv::Range(pad.y1, pad.y2 + 1),
                               cv::Range(pad.x1, pad.x2 + 1));
      // ... and zero-fill the part that fell outside the image, so the crop
      // covers the whole box again.
      cv::copyMakeBorder(crop_img, crop_img, pad_top, pad_bottom, pad_left,
                         pad_right, cv::BORDER_CONSTANT, cv::Scalar(0));

      // Resize to the network input size and normalise:
      // (pixel - mean) * 0.0078125.
      cv::Mat res;
      cv::resize(crop_img, res, dsize, 0, 0);
      res.convertTo(res, CV_32FC3);
      res = (res - mean) * 0.0078125;

      // Split the interleaved image straight into this box's three planes of
      // the staging buffer.
      std::vector<cv::Mat> input_channels;
      wrapInputLayer(cur_input, 3, input_height, input_width, &input_channels);
      cur_input += 3 * input_height * input_width;
      cv::split(res, input_channels);
    }

    vector<Blob> input_blobs;
    shape_t input_shape = shape_t4(numBoxPerBatch, 3, input_height, input_width);
    input_blobs.push_back(Blob(in_data_, input_shape));
    int ret = net->forward(input_blobs);
    if (ret) {
      cout << "net forward failed." << endl;
      return;
    }

    Blob *reg_blob = net->output(outPutLayerName);
    if (reg_blob == nullptr) {
      cout << "get output failed." << endl;
      return;
    }

    Blob *prob_blob = net->output("prob1");
    if (prob_blob == nullptr) {
      cout << "get output failed." << endl;
      return;
    }

    // Landmarks exist only for flag != 0. The null check must test pts_blob
    // itself; the previous code re-tested prob_blob here, so a missing
    // landmark output was dereferenced further down.
    Blob *pts_blob = nullptr;
    if (flag) {
      pts_blob = net->output("conv6-3");
      if (pts_blob == nullptr) {
        cout << "get output failed." << endl;
        return;
      }
    }

    const float *confidence_data = reinterpret_cast<float *>(prob_blob->data());
    const float *reg_data = reinterpret_cast<float *>(reg_blob->data());
    const float *points_data =
        flag ? reinterpret_cast<float *>(pts_blob->data()) : nullptr;

    for (int i = 0; i < numBoxPerBatch; ++i) {
      // Per box: 2 scores (the second one is used as the face score),
      // 4 regression values and, for flag != 0, 10 landmark values.
      const float score = confidence_data[i * 2 + 1];
      if (!(score > threshold))
        continue;

      const FaceRect &box = boxes[reg_idx + i];
      FaceRect faceRect;
      faceRect.x1 = box.x1;
      faceRect.y1 = box.y1;
      faceRect.x2 = box.x2;
      faceRect.y2 = box.y2;
      faceRect.score = score;
      faceRect.regression = cv::Vec4f(reg_data[4 * i + 0], reg_data[4 * i + 1],
                                      reg_data[4 * i + 2], reg_data[4 * i + 3]);
      if (flag) {
        // Landmarks are stored as 5 x values followed by 5 y values, each
        // relative to the box; convert them to image coordinates.
        FacePts face_pts;
        float w = faceRect.x2 - faceRect.x1 + 1;
        float h = faceRect.y2 - faceRect.y1 + 1;
        for (int j = 0; j < 5; j++) {
          face_pts.x[j] = faceRect.x1 + points_data[j + 10 * i] * w - 1;
          face_pts.y[j] = faceRect.y1 + points_data[j + 10 * i + 5] * h - 1;
        }
        faceRect.pts = face_pts;
      }
      results.push_back(faceRect);
    }

    reg_idx += numBoxPerBatch;
    numBoxPerBatch = getBoxPerBatch(&num_box);
  } // while
}

// Attaches the timestamp recorder that detect() reports its stage timings to.
// The pointer is stored as-is and not owned; passing nullptr turns the
// reporting off, since detect() only records when the pointer is non-null.
void MTCNN::enable_profiling(TimeStamp *ts) {
  this->ts_ = ts;
}

/*
 * Runs the full three-stage cascade (P-Net, R-Net, O-Net) on one image.
 *
 * img:       source image (e.g. 640x480).
 * faceRects: cleared, then filled with the detected faces. Boxes and
 *            landmarks are expressed in `img` coordinates, and boxes are
 *            clamped to the bounds of `img`.
 *
 * Steps:
 *   1. Rescale/pad `img` onto the fixed canvas (see rescale_image).
 *   2. Build an image pyramid and run P-Net on every level to collect
 *      candidate boxes, with NMS (threshold 0.5) per level.
 *   3. NMS (0.7) across levels, box regression, squaring, clamping.
 *   4. R-Net on the candidates, then NMS (0.7), regression, squaring,
 *      clamping.
 *   5. O-Net on the survivors, then regression and NMS (0.5).
 *   6. Scale the results back to `img` coordinates.
 *
 * If a network call fails, a message is printed and the function returns
 * early; faceRects is then empty.
 */
void MTCNN::detect(const cv::Mat &img,
                   std::vector<FaceRect> &faceRects) {
  faceRects.clear();

  // Records a profiling mark when a recorder is attached. Marks come in
  // pairs with identical labels around each measured section.
  auto stamp = [this](const std::string &tag) {
    if (ts_)
      ts_->save(tag);
  };

  cv::Mat image;
  cv::Mat resized;
  // `image` is the canvas-sized version of `img`; `ratio` maps canvas
  // coordinates back to `img` (0.4444 for a 640x480 source on 1920x1080).
  float ratio = rescale_image(img, &image);

  int width = image.cols;
  int height = image.rows;
  int min_wh = std::min(height, width);
  int factor_count = 0;
  // 12 is the face size the network detects; scaling by 12 / min_size_ makes
  // a min_size_ face exactly 12 pixels on the largest pyramid level
  // (m = 0.3 for min_size_ = 40).
  double m = 12. / min_size_;
  min_wh *= m;
  std::vector<double> scales;

  // Pyramid scales: m, m * factor_, m * factor_^2, ... for as long as the
  // scaled short side is at least min_pyramid_size_. With a 1080-pixel short
  // side, m = 0.3 and factor_ = 0.5 this gives
  // scales = [0.3, 0.15, 0.075, 0.0375, 0.01875].
  while (min_wh >= min_pyramid_size_) {
    scales.push_back(m * std::pow(factor_, factor_count));
    min_wh *= factor_;
    ++factor_count;
  }

  // Per-channel mean, the same normalisation classify_face applies. A bare
  // `resized - 127.5` would convert the number to cv::Scalar(127.5, 0, 0, 0)
  // and centre only the first channel.
  const cv::Scalar mean(means_[0], means_[1], means_[2]);

  // ---- Stage 1: P-Net over every pyramid level ---------------------------
  std::vector<FaceRect> total_boxes;
  for (int i = 0; i < factor_count; ++i) {
    double scale = scales[i];
    // Size of this pyramid level (576x324 for 1920x1080 at scale 0.3).
    int ws = std::ceil(width * scale);
    int hs = std::ceil(height * scale);
    cv::Size dsize;
    dsize.width = ws;
    dsize.height = hs;

    stamp("resize factor #" + to_string(i));
    cv::resize(image, resized, dsize, 0, 0);
    stamp("resize factor #" + to_string(i));

    stamp("preprocess factor #" + to_string(i));
    // Normalise to (pixel - mean) * 0.0078125 and split into planes directly
    // inside the staging buffer.
    resized.convertTo(resized, CV_32FC3);
    resized = (resized - mean) * 0.0078125;

    std::vector<cv::Mat> input_channels;
    wrapInputLayer(in_data_, 3, hs, ws, &input_channels);
    cv::split(resized, input_channels);
    stamp("preprocess factor #" + to_string(i));

    vector<Blob> input_blobs;

    stamp("net-forward factor #" + to_string(i));
    shape_t input_shape = shape_t4(1, 3, hs, ws);
    input_blobs.push_back(Blob(in_data_, input_shape));
    int ret = pnet_->forward(input_blobs);
    if (ret != 0) {
      cout << "net forward failed: ret = " << ret << endl;
      return;
    }

    Blob* reg_blob = pnet_->output("conv4-2");
    if (reg_blob == nullptr) {
      cout << "get output failed." << endl;
      return;
    }

    Blob* prob_blob = pnet_->output("prob1");
    if (prob_blob == nullptr) {
      cout << "get output failed." << endl;
      return;
    }
    stamp("net-forward factor #" + to_string(i));

    // Candidate boxes plus their regression vectors for this level.
    std::vector<FaceRect> proposals, nmsProposals;
    generateBoundingBox(prob_blob, reg_blob, scale, thresholds_[0], ws, hs,
                        proposals);

    stamp("nms factor #" + to_string(i));
    nms_threshold_ = 0.5;
    nms(proposals, nmsProposals);
    stamp("nms factor #" + to_string(i));

    total_boxes.insert(total_boxes.end(), nmsProposals.begin(),
                       nmsProposals.end());
  }

  if (total_boxes.empty())
    return;

  // ---- P-Net post-processing ---------------------------------------------
  stamp("box reg post PNET");
  nms_threshold_ = 0.7;
  std::vector<FaceRect> temp, paddings;
  nms(total_boxes, temp);          // merge candidates across pyramid levels
  total_boxes.clear();
  boxRegress(temp, total_boxes);   // refine with the regression vectors
  bbox2square(total_boxes);        // make the boxes square
  padding(image, total_boxes, paddings);  // in-image part of every box
  stamp("box reg post PNET");

  // ---- Stage 2: R-Net ----------------------------------------------------
  temp.clear();
  stamp("RNET");
  classify_face(total_boxes, paddings, image, rnet_, thresholds_[1], 0, temp);
  stamp("RNET");

  total_boxes.clear();
  std::vector<FaceRect> temp1;
  stamp("box reg post RNET");
  nms(temp, temp1);                // still using the 0.7 threshold
  boxRegress(temp1, total_boxes);
  bbox2square(total_boxes);
  padding(image, total_boxes, paddings);
  stamp("box reg post RNET");

  if (total_boxes.empty())
    return;

  // ---- Stage 3: O-Net ----------------------------------------------------
  temp.clear();
  temp1.clear();
  stamp("ONET");
  classify_face(total_boxes, paddings, image, onet_, thresholds_[2], 1, temp);
  stamp("ONET");

  stamp("box reg post ONET");
  boxRegress(temp, temp1);
  nms_threshold_ = 0.5;
  nms(temp1, faceRects);

  // Map the results back to `img` coordinates and keep the boxes inside
  // `img`. Clamping against the padded canvas instead would let boxes reach
  // into the padding, i.e. beyond the source image.
  const float max_x = static_cast<float>(img.cols - 1);
  const float max_y = static_cast<float>(img.rows - 1);
  auto clamp_to = [](float value, float upper) {
    return value < 0.f ? 0.f : (value > upper ? upper : value);
  };
  for (FaceRect &face : faceRects) {
    face.x1 = clamp_to(face.x1 * ratio, max_x);
    face.y1 = clamp_to(face.y1 * ratio, max_y);
    face.x2 = clamp_to(face.x2 * ratio, max_x);
    face.y2 = clamp_to(face.y2 * ratio, max_y);
    for (int j = 0; j < 5; j++) {
      face.pts.x[j] *= ratio;
      face.pts.y[j] *= ratio;
    }
  }
  stamp("box reg post ONET");
}
猜你喜欢