caffe中backward(反向传播)的实现

人工智能/机器学习/深度学习交流QQ群：116270156 也可以扫一扫下面二维码加入微信群，如果二维码失效，可以添加博主个人微信，拉你进群

backward是利用损失函数求取关于网络中每个参数梯度的过程，为后面更新网络参数做准备。求取梯度的过程也是一个矩阵运算的过程，后面会有详细介绍，本身求取梯度的过程并不是很复杂，而且网络中的各层求取梯度的过程都是相似的。下面就按照backward的运行顺序，详细介绍Euclidean Loss和SoftmaxWithLoss层的实现。

Euclidean Loss层

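损失函数（$y_n$ 对应 bottom[0]，$y_n'$ 对应 bottom[1]，$N$ 为 bottom[0]->num()，即 batch 大小）：
$$E = \frac{1}{2N}\sum_{n=1}^{N}\left\|y_n - y_n'\right\|_2^2$$

对 $y_n$ 的偏导：
$$\frac{\partial E}{\partial y_n} = \frac{1}{N}\left(y_n - y_n'\right)$$
对 $y_n'$ 的偏导数：
$$\frac{\partial E}{\partial y_n'} = -\frac{1}{N}\left(y_n - y_n'\right)$$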

前向传播的CPU代码：

/**
 * CPU forward pass of the Euclidean loss.
 *
 * Stores the element-wise difference bottom[0] - bottom[1] in diff_ and
 * writes the scalar loss ||diff_||^2 / bottom[0]->num() / 2 into the first
 * data element of top[0].
 *
 * @param bottom  bottom[0] and bottom[1] are the two blobs being compared;
 *                bottom[0]->count() elements are read from each.
 * @param top     top[0] receives the loss value.
 */
template <typename Dtype>
void EuclideanLossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const int num_elements = bottom[0]->count();
  const Dtype* lhs = bottom[0]->cpu_data();
  const Dtype* rhs = bottom[1]->cpu_data();

  // diff_ = bottom[0] - bottom[1]
  caffe_sub(num_elements, lhs, rhs, diff_.mutable_cpu_data());

  // Squared L2 norm of the difference: ||diff_||^2
  const Dtype* delta = diff_.cpu_data();
  const Dtype squared_norm = caffe_cpu_dot(num_elements, delta, delta);

  // Divide by bottom[0]->num() and by 2 to obtain the reported loss.
  top[0]->mutable_cpu_data()[0] = squared_norm / bottom[0]->num() / Dtype(2);
}

反向传播的CPU代码：

/**
 * CPU backward pass of the Euclidean loss.
 *
 * For each of the two bottoms whose propagate_down flag is set, writes
 *   bottom[i]->diff = sign * top[0]->diff[0] / bottom[i]->num() * diff_
 * with sign = +1 for bottom[0] and -1 for bottom[1], since
 * diff_ = bottom[0] - bottom[1].
 *
 * @param top             top[0]->cpu_diff()[0] scales the gradient.
 * @param propagate_down  per-bottom flag; a bottom with a false flag (e.g. a
 *                        label input) is left untouched.
 * @param bottom          blobs whose diff is overwritten.
 */
template <typename Dtype>
void EuclideanLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  for (int idx = 0; idx < 2; ++idx) {
    if (!propagate_down[idx]) {
      continue;  // this input does not need a gradient
    }
    Blob<Dtype>* const target = bottom[idx];

    // The derivative w.r.t. bottom[1] carries the opposite sign.
    Dtype sign = 1;
    if (idx != 0) {
      sign = -1;
    }
    const Dtype scale = sign * top[0]->cpu_diff()[0] / target->num();

    // target->diff = scale * diff_ + 0 * target->diff
    caffe_cpu_axpby(
        target->count(),
        scale,
        diff_.cpu_data(),
        Dtype(0),
        target->mutable_cpu_diff());
  }
}

前向传播里就是按照损失函数的样式计算损失，并且将损失保存到top[0]->cpu_data()[0]中。其中对反向传播有用的是计算得到的diff_.cpu_data()。
反向传播代码里for循环的两次迭代（i=0和i=1）就是分别计算对 $y_n$（bottom[0]）的偏导和对 $y_n'$（bottom[1]）的偏导，其中sign是正负号，用于控制是对 $y_n$ 还是对 $y_n'$ 求偏导，而top[0]->cpu_diff()[0]是在网络的定义中（即prototxt文件中），loss层的定义中设置的loss_weight：1.0，即top[0]->cpu_diff()[0]=1.0，则alpha就是1/n或者-1/n，而diff_.cpu_data() = $y_n - y_n'$，caffe_cpu_axpby()得到了 $1\cdot(y_n - y_n')/n$ 即对 $y_n$ 的偏导，和 $-1\cdot(y_n - y_n')/n$ 即对 $y_n'$ 的偏导，并把结果存到bottom[i]->cpu_diff()中，用以传向前面的层。

SoftmaxWithLoss层

这里定义batch_size为网络输入的批大小，label_num表示标签的类别数。而loss层的输入blob是两个，一个是全连接层的输出，维度是batch_size×label_num，一个是标签层，维度是batch_size×1（每个样本一个标签）。为了通俗易懂，我们举个例子，比如mnist问题的LeNet网络，是一个10类的分类问题（数字0~9），训练时，每个batch大小为64，所以，这里的batch_size=64，label_num=10。这里Softmax 层的各种原理，以及根据loss反向传播时的梯度推导，因为这里写公式不方便，我就在word里写了，如下图，
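（原文此处为推导过程的图片，其结论整理如下）设样本 $i$ 的全连接层输出为 $z_{i,j}$（$j=0,\dots,\text{label\_num}-1$），$m_i=\max_k z_{i,k}$，真实标签为 $l_i$，$N$ 为 batch_size：
$$p_{i,j} = \frac{e^{z_{i,j}-m_i}}{\sum_{k} e^{z_{i,k}-m_i}},\qquad L = -\frac{1}{N}\sum_{i=1}^{N}\log p_{i,l_i},\qquad \frac{\partial L}{\partial z_{i,j}} = \frac{1}{N}\left(p_{i,j} - \mathbb{1}[j = l_i]\right)$$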
CPU代码：
头文件：

#ifndef CAFFE_MY_LOSS_LAYER_HPP_
#define CAFFE_MY_LOSS_LAYER_HPP_
 
#include <vector>
 
#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"
 
#include "caffe/layers/loss_layer.hpp"
#include "caffe/layers/softmax_layer.hpp"
 
namespace caffe {
 
/**
 * Custom loss layer registered under the type name "MyLoss".
 *
 * Takes its bottom/top handling from LossLayer<Dtype> and keeps a per-sample
 * confidence table (prob_) between the forward and backward passes.
 */
template <typename Dtype>
class MyLossLayer : public LossLayer<Dtype> {
 public:
  // label_num and batch_size are zero-initialized so they never hold
  // indeterminate values before the first Reshape() call.
  explicit MyLossLayer(const LayerParameter& param)
      : LossLayer<Dtype>(param), label_num(0), batch_size(0) {}
  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);

  virtual inline const char* type() const { return "MyLoss"; }
  // The layer produces exactly one top blob (the scalar loss). The former
  // MinTopBlobs()==1 / MaxTopBlobs()==2 overrides were removed: they
  // contradicted this exact count and advertised a second top that the layer
  // never fills.
  virtual inline int ExactNumTopBlobs() const { return 1; }

 protected:
  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

  vector<vector<Dtype> > prob_;   // per-sample, per-class confidences
  int label_num;    // number of label classes
  int batch_size;   // number of samples in a batch

};
 
}  // namespace caffe
 
#endif  // CAFFE_MY_LOSS_LAYER_HPP_

源文件，反向传播时，按照公式更新梯度就好了

#include <algorithm>
#include <cfloat>
#include <cmath>
#include <vector>
 
#include "caffe/layers/my_loss_layer.hpp"
#include "caffe/util/math_functions.hpp"
using namespace std;
namespace caffe {
 
// Layer setup: this layer adds no setup logic of its own and simply forwards
// the same bottom/top blobs to LossLayer<Dtype>::LayerSetUp.
template <typename Dtype>
void MyLossLayer<Dtype>::LayerSetUp(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  LossLayer<Dtype>::LayerSetUp(bottom, top);
}
 
// Caches the batch geometry and (re)sizes the confidence table.
//
// bottom[0] holds the scores, read as num() samples by channels() classes
// (e.g. 64 x 10 for MNIST); bottom[1] holds the labels. Forward_cpu and
// Backward_cpu only touch element (i, j, 0, 0) of bottom[0] and label[i] of
// bottom[1], so the shapes are validated here instead of silently ignoring
// extra spatial positions or labels.
template <typename Dtype>
void MyLossLayer<Dtype>::Reshape(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  LossLayer<Dtype>::Reshape(bottom, top);
  this->label_num = bottom[0]->channels();   // number of classes, 10 for MNIST
  this->batch_size = bottom[0]->num();       // samples per batch, e.g. 64
  CHECK_EQ(bottom[0]->count(), batch_size * label_num)
      << "MyLoss expects scores shaped N x C (x 1 x 1).";
  CHECK_EQ(bottom[1]->count(), batch_size)
      << "MyLoss expects exactly one label per sample.";
  // assign() reuses the storage already held by prob_ when the shape is
  // unchanged, instead of building a brand-new nested vector on every call.
  this->prob_.assign(batch_size, vector<Dtype>(label_num, Dtype(0)));
}
 
// Forward pass: softmax over the class scores of every sample, followed by
// the mean negative log-likelihood of the true label.
//
// bottom[0]: scores, read at (i, j, 0, 0) for sample i and class j.
// bottom[1]: labels, one value per sample, cast to int.
// top[0]:    receives the loss (sum of -log p[i][label_i] divided by
//            batch_size) in its first data element.
// Side effect: prob_[i][j] is filled with the softmax confidences, which
// Backward_cpu reads.
template <typename Dtype>
void MyLossLayer<Dtype>::Forward_cpu(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  const Dtype* label = bottom[1]->cpu_data();
  Dtype loss = 0;

  for (int i = 0; i < batch_size; ++i) {
    vector<Dtype>& prob = prob_[i];

    // Copy the scores and find their maximum. Starting from -FLT_MAX instead
    // of a magic constant keeps the maximum correct for any finite score.
    Dtype max_score = Dtype(-FLT_MAX);
    for (int j = 0; j < label_num; ++j) {
      prob[j] = bottom[0]->data_at(i, j, 0, 0);
      max_score = std::max(max_score, prob[j]);
    }

    // Subtract the maximum before exponentiating for numerical stability;
    // each exponential is computed once and reused for the denominator.
    Dtype sum = 0;
    for (int j = 0; j < label_num; ++j) {
      prob[j] = std::exp(prob[j] - max_score);
      sum += prob[j];
    }
    for (int j = 0; j < label_num; ++j) {
      prob[j] /= sum;
    }

    // Reject labels outside [0, label_num) instead of indexing prob out of
    // bounds.
    const int real_label = static_cast<int>(label[i]);
    CHECK_GE(real_label, 0) << "Negative label for sample " << i;
    CHECK_LT(real_label, label_num) << "Label out of range for sample " << i;

    // Clamp to FLT_MIN so that log() never receives zero.
    loss -= std::log(std::max(prob[real_label], Dtype(FLT_MIN)));
  }

  top[0]->mutable_cpu_data()[0] = loss / batch_size;
}
 
// Backward pass: gradient of the loss with respect to the scores.
//
// For sample i and class j the gradient is
//   top[0]->diff[0] / batch_size * (prob_[i][j] - (j == label_i ? 1 : 0)).
// top[0]->cpu_diff()[0] is included so that a scaled top gradient (a loss
// weight other than 1) reaches the scores; the original code dropped it.
// The batch normalization is folded into the same factor, so no second pass
// over bottom_diff is needed.
//
// top:             top[0]->cpu_diff()[0] scales the gradient.
// propagate_down:  [0] requests the gradient for the scores; [1] (labels)
//                  must be false.
// bottom:          bottom[0]->diff is overwritten; bottom[1] supplies labels.
template <typename Dtype>
void MyLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  if (propagate_down[1]) {
    // Labels are not differentiable; fail loudly instead of ignoring it.
    LOG(FATAL) << this->type()
               << " Layer cannot backpropagate to label inputs.";
  }
  if (!propagate_down[0]) {
    return;
  }

  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
  const Dtype* label = bottom[1]->cpu_data();
  const Dtype scale = top[0]->cpu_diff()[0] / batch_size;

  for (int i = 0; i < batch_size; ++i) {
    const int real_label = static_cast<int>(label[i]);  // true class of sample i
    for (int j = 0; j < label_num; ++j) {
      // Subtract 1 from the confidence of the true class, 0 elsewhere.
      const Dtype target = (j == real_label) ? Dtype(1) : Dtype(0);
      bottom_diff[bottom[0]->offset(i, j)] = scale * (prob_[i][j] - target);
    }
  }
}
 
 
// Caffe helper macros, defined outside this file. Presumably the first
// emits the explicit template instantiations of MyLossLayer and the second
// registers the layer under the type string "MyLoss" used in the prototxt
// -- confirm against the Caffe headers.
INSTANTIATE_CLASS(MyLossLayer);
REGISTER_LAYER_CLASS(MyLoss);
 
}  // namespace caffe

编译好后，用mnist的数据跑一下试试：

# Loss layer using the custom "MyLoss" type: takes the scores blob "ip2" and
# the "label" blob as bottoms and produces the "my_loss" top.
layer {
    name: "my_loss"
    type: "MyLoss"
    bottom: "ip2"
    bottom: "label"
    top: "my_loss"
}

最后结果：
在这里插入图片描述

最后

欢迎大家扫一扫下面二维码加入微信交流群，如果二维码失效，可以添加博主个人微信，拉你进群