1. Caffe Modifications
1.1 Modify caffe.proto
Add the following field inside message LayerParameter {}:
optional CenterLossParameter center_loss_param = 147;
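For context, this is roughly where the field ends up (a sketch assuming a stock BVLC caffe.proto; 147 is only valid if it is actually free, so check and bump the "next available layer-specific ID" comment above LayerParameter as well):
message LayerParameter {
  // ... existing fields such as name, type, bottom, top, loss_weight ...
  optional CenterLossParameter center_loss_param = 147;  // use the next free layer-specific ID
  // ... remaining layer-specific parameters ...
}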
Then append the following message at the end of caffe.proto:
message CenterLossParameter {
optional uint32 num_output = 1; // The number of outputs for the layer
optional FillerParameter center_filler = 2; // The filler for the centers
// The first axis to be lumped into a single inner product computation;
// all preceding axes are retained in the output.
// May be negative to index from the end (e.g., -1 for the last axis).
optional int32 axis = 3 [default = 1];
}
1.2 Add the header file center_loss_layer.hpp
#ifndef CAFFE_CENTER_LOSS_LAYER_HPP_
#define CAFFE_CENTER_LOSS_LAYER_HPP_
#include <vector>
#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"
#include "caffe/layers/loss_layer.hpp"
namespace caffe {
/**
 * @brief Computes the center loss @f$
 *          E = \frac{1}{2M} \sum\limits_{i=1}^M \left|\left| x_i - c_{y_i} \right|\right|_2^2
 *        @f$ where @f$ c_{y_i} @f$ is the learned center of class @f$ y_i @f$.
 *
 * @param bottom input Blob vector (length 2)
 *   -# the features @f$ x @f$ (e.g., the output of an InnerProduct layer)
 *   -# the integer class labels @f$ y @f$
 * @param top output Blob vector (length 1)
 *   -# the computed center loss
 */
template <typename Dtype>
class CenterLossLayer : public LossLayer<Dtype> {
 public:
  explicit CenterLossLayer(const LayerParameter& param)
      : LossLayer<Dtype>(param) {}
  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual inline const char* type() const { return "CenterLoss"; }
  virtual inline int ExactNumBottomBlobs() const { return 2; }
 protected:
  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
  int M_;  // batch size
  int K_;  // feature dimension
  int N_;  // number of classes (num_output)
  Blob<Dtype> distance_;       // x_i - c_{y_i}, cached for the backward pass
  Blob<Dtype> variation_sum_;  // per-class accumulated (c_j - x_i)
};
}  // namespace caffe
#endif  // CAFFE_CENTER_LOSS_LAYER_HPP_
1.3 Add center_loss_layer.cpp and center_loss_layer.cu
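Before diving into the implementation, it helps to write down what the layer computes; the forward and backward code below implements exactly these expressions (in the paper the center update is additionally scaled by a learning rate α, which here is left to the solver via the layer's param { lr_mult }):

L_C = \frac{1}{2M}\sum_{i=1}^{M} \left\lVert x_i - c_{y_i} \right\rVert_2^2,\qquad
\frac{\partial L_C}{\partial x_i} = \frac{1}{M}\left(x_i - c_{y_i}\right),\qquad
\Delta c_j = \frac{\sum_{i=1}^{M} \delta(y_i = j)\,(c_j - x_i)}{1 + \sum_{i=1}^{M}\delta(y_i = j)}

where M is the batch size (M_), x_i the feature of sample i, and c_{y_i} the center of its class.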
center_loss_layer.cpp:
#include <vector>
#include "caffe/filler.hpp"
#include "caffe/layers/center_loss_layer.hpp"
#include "caffe/util/math_functions.hpp"
namespace caffe {
template <typename Dtype>
void CenterLossLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
const int num_output = this->layer_param_.center_loss_param().num_output();
N_ = num_output;
const int axis = bottom[0]->CanonicalAxisIndex(
this->layer_param_.center_loss_param().axis());
// Dimensions starting from "axis" are "flattened" into a single
// length K_ vector. For example, if bottom[0]'s shape is (N, C, H, W),
// and axis == 1, N inner products with dimension CHW are performed.
K_ = bottom[0]->count(axis);
// Check if we need to set up the weights
if (this->blobs_.size() > 0) {
LOG(INFO) << "Skipping parameter initialization";
} else {
this->blobs_.resize(1);
// Initialize the weight
vector<int> center_shape(2);
center_shape[0] = N_;
center_shape[1] = K_;
this->blobs_[0].reset(new Blob<Dtype>(center_shape));
// fill the weights
shared_ptr<Filler<Dtype> > center_filler(GetFiller<Dtype>(
this->layer_param_.center_loss_param().center_filler()));
center_filler->Fill(this->blobs_[0].get());
} // parameter initialization
this->param_propagate_down_.resize(this->blobs_.size(), true);
}
template <typename Dtype>
void CenterLossLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
CHECK_EQ(bottom[1]->channels(), 1);
CHECK_EQ(bottom[1]->height(), 1);
CHECK_EQ(bottom[1]->width(), 1);
M_ = bottom[0]->num();
// The top shape will be the bottom shape with the flattened axes dropped,
// and replaced by a single axis with dimension num_output (N_).
LossLayer<Dtype>::Reshape(bottom, top);
distance_.ReshapeLike(*bottom[0]);
variation_sum_.ReshapeLike(*this->blobs_[0]);
}
template <typename Dtype>
void CenterLossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
const Dtype* bottom_data = bottom[0]->cpu_data();
const Dtype* label = bottom[1]->cpu_data();
const Dtype* center = this->blobs_[0]->cpu_data();
Dtype* distance_data = distance_.mutable_cpu_data();
// the i-th distance_data
for (int i = 0; i < M_; i++) {
const int label_value = static_cast<int>(label[i]);
// D(i,:) = X(i,:) - C(y(i),:)
caffe_sub(K_, bottom_data + i * K_, center + label_value * K_, distance_data + i * K_);
}
Dtype dot = caffe_cpu_dot(M_ * K_, distance_.cpu_data(), distance_.cpu_data());
Dtype loss = dot / M_ / Dtype(2);
top[0]->mutable_cpu_data()[0] = loss;
}
template <typename Dtype>
void CenterLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down,
const vector<Blob<Dtype>*>& bottom) {
// Gradient with respect to centers
if (this->param_propagate_down_[0]) {
const Dtype* label = bottom[1]->cpu_data();
Dtype* center_diff = this->blobs_[0]->mutable_cpu_diff();
Dtype* variation_sum_data = variation_sum_.mutable_cpu_data();
const Dtype* distance_data = distance_.cpu_data();
// \sum_{y_i==j}
caffe_set(N_ * K_, (Dtype)0., variation_sum_.mutable_cpu_data());
for (int n = 0; n < N_; n++) {
int count = 0;
for (int m = 0; m < M_; m++) {
const int label_value = static_cast<int>(label[m]);
if (label_value == n) {
count++;
caffe_sub(K_, variation_sum_data + n * K_, distance_data + m * K_, variation_sum_data + n * K_);
}
}
caffe_axpy(K_, (Dtype)1./(count + (Dtype)1.), variation_sum_data + n * K_, center_diff + n * K_);
}
}
// Gradient with respect to bottom data
if (propagate_down[0]) {
caffe_copy(M_ * K_, distance_.cpu_data(), bottom[0]->mutable_cpu_diff());
caffe_scal(M_ * K_, top[0]->cpu_diff()[0] / M_, bottom[0]->mutable_cpu_diff());
}
if (propagate_down[1]) {
LOG(FATAL) << this->type()
<< " Layer cannot backpropagate to label inputs.";
}
}
#ifdef CPU_ONLY
STUB_GPU(CenterLossLayer);
#endif
INSTANTIATE_CLASS(CenterLossLayer);
REGISTER_LAYER_CLASS(CenterLoss);
} // namespace caffe
center_loss_layer.cu:
#include <vector>
#include "caffe/filler.hpp"
#include "caffe/layers/center_loss_layer.hpp"
#include "caffe/util/math_functions.hpp"
namespace caffe {
template <typename Dtype>
__global__ void Compute_distance_data_gpu(int nthreads, const int K, const Dtype* bottom,
const Dtype* label, const Dtype* center, Dtype* distance) {
CUDA_KERNEL_LOOP(index, nthreads) {
int m = index / K;
int k = index % K;
const int label_value = static_cast<int>(label[m]);
// distance(i) = x(i) - c_{y(i)}
distance[index] = bottom[index] - center[label_value * K + k];
}
}
template <typename Dtype>
__global__ void Compute_center_diff_gpu(int nthreads, const int M, const int K,
const Dtype* label, const Dtype* distance, Dtype* variation_sum,
Dtype* center_diff) {
CUDA_KERNEL_LOOP(index, nthreads) {
int count = 0;
for (int m = 0; m < M; m++) {
const int label_value = static_cast<int>(label[m]);
if (label_value == index) {
count++;
for (int k = 0; k < K; k++) {
variation_sum[index * K + k] -= distance[m * K + k];
}
}
}
for (int k = 0; k < K; k++) {
center_diff[index * K + k] = variation_sum[index * K + k] /(count + (Dtype)1.);
}
}
}
template <typename Dtype>
void CenterLossLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
int nthreads = M_ * K_;
Compute_distance_data_gpu<Dtype><<<CAFFE_GET_BLOCKS(nthreads),
CAFFE_CUDA_NUM_THREADS>>>(nthreads, K_, bottom[0]->gpu_data(), bottom[1]->gpu_data(),
this->blobs_[0]->gpu_data(), distance_.mutable_gpu_data());
Dtype dot;
caffe_gpu_dot(M_ * K_, distance_.gpu_data(), distance_.gpu_data(), &dot);
Dtype loss = dot / M_ / Dtype(2);
top[0]->mutable_cpu_data()[0] = loss;
}
template <typename Dtype>
void CenterLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down,
const vector<Blob<Dtype>*>& bottom) {
int nthreads = N_;
caffe_gpu_set(N_ * K_, (Dtype)0., variation_sum_.mutable_gpu_data());
Compute_center_diff_gpu<Dtype><<<CAFFE_GET_BLOCKS(nthreads),
CAFFE_CUDA_NUM_THREADS>>>(nthreads, M_, K_, bottom[1]->gpu_data(), distance_.gpu_data(),
variation_sum_.mutable_gpu_data(), this->blobs_[0]->mutable_gpu_diff());
if (propagate_down[0]) {
caffe_gpu_scale(M_ * K_, top[0]->cpu_diff()[0] / M_,
distance_.gpu_data(), bottom[0]->mutable_gpu_diff());
}
if (propagate_down[1]) {
LOG(FATAL) << this->type()
<< " Layer cannot backpropagate to label inputs.";
}
}
INSTANTIATE_LAYER_GPU_FUNCS(CenterLossLayer);
} // namespace caffe
1.4 Rebuild Caffe
Place center_loss_layer.hpp under include/caffe/layers/ and center_loss_layer.cpp / center_loss_layer.cu under src/caffe/layers/, then rebuild:
make all -j8
make pycaffe -j8
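After rebuilding, a quick sanity check (a minimal sketch, assuming pycaffe built successfully and is on PYTHONPATH) is to confirm that the new layer type got registered:

import caffe

# REGISTER_LAYER_CLASS(CenterLoss) should make the type visible here
print('CenterLoss' in caffe.layer_type_list())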
2. Using center loss
Add the following layer to the training network to attach the center loss (a fuller joint-supervision sketch follows after this snippet):
############## center loss ###############
layer {
name: "center_loss"
type: "CenterLoss"
bottom: "InnerProduct1" # InnerProduct层输出
bottom: "Data2" # Label数据
top: "center_loss"
param {
lr_mult: 1
decay_mult: 2
}
center_loss_param {
num_output: 4 # number of classes, i.e. the num_output of the final InnerProduct (classification) layer
center_filler {
type: "xavier"
}
}
loss_weight: 0.008 # loss weight; too large a value destabilizes training, so tune it carefully
}
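For orientation, here is a minimal sketch of how center loss is typically combined with a softmax loss for joint supervision, following the setup in the paper; the layer/blob names (ip_feature, ip_class, label) and the bottom "pool_final" are illustrative placeholders, not taken from the original post:

# feature layer whose output is pulled toward its class center
layer {
  name: "ip_feature"
  type: "InnerProduct"
  bottom: "pool_final"
  top: "ip_feature"
  inner_product_param { num_output: 2 }  # 2-D only to make visualization easy
}
# classification layer supervised by the softmax loss
layer {
  name: "ip_class"
  type: "InnerProduct"
  bottom: "ip_feature"
  top: "ip_class"
  inner_product_param { num_output: 4 }  # number of classes
}
layer {
  name: "softmax_loss"
  type: "SoftmaxWithLoss"
  bottom: "ip_class"
  bottom: "label"
  top: "softmax_loss"
  loss_weight: 1
}
layer {
  name: "center_loss"
  type: "CenterLoss"
  bottom: "ip_feature"  # the features, not the class scores
  bottom: "label"
  top: "center_loss"
  param { lr_mult: 1 }
  center_loss_param {
    num_output: 4
    center_filler { type: "xavier" }
  }
  loss_weight: 0.008
}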
Here a local dataset was used for classification, and the learned features were visualized following the method given in the center loss paper. Before adding center loss:
After adding center loss:
Comparing the two, the features clearly become more tightly clustered within each class once center loss is added.
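For reference, a rough sketch of the visualization described above, using pycaffe and matplotlib. It assumes a test-phase prototxt with its own data/label layers, a trained caffemodel, a 2-D feature blob named "InnerProduct1" and a label blob named "Data2" as in the training net; the file paths and iteration count are illustrative only:

import caffe
import numpy as np
import matplotlib.pyplot as plt

caffe.set_mode_cpu()
net = caffe.Net('train_val.prototxt', 'trained.caffemodel', caffe.TEST)

features, labels = [], []
for _ in range(100):          # run enough batches to cover the evaluation set
    net.forward()             # the net's own data layer feeds each batch
    features.append(net.blobs['InnerProduct1'].data.copy())
    labels.append(net.blobs['Data2'].data.copy().ravel())
features = np.concatenate(features)
labels = np.concatenate(labels)

# one color per class, as in the scatter plots of the center loss paper
for c in np.unique(labels):
    pts = features[labels == c]
    plt.scatter(pts[:, 0], pts[:, 1], s=2, label=str(int(c)))
plt.legend()
plt.show()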