只看一眼学习笔记（一）

yolo的作者不久前发表了yolov3，网络结构变深了，但是速度依然非常快，准确度也得到了提升。

yolov3项目地址：https://pjreddie.com/darknet/yolo/

自己的工作需要使用yolo检测目标，记录一下学习内容以备日后翻阅。查看了新的yolov3.cfg，其中有一些新的参数，如[upsample]和[yolo]等。根据AlexeyAB在github上的解答（https://github.com/AlexeyAB/darknet/issues/504#issuecomment-377290060），upsample层实现的是类似yolov2中reorg层的功能。查看代码后，我的理解是：forward时，输入特征图上的每个点被扩充为4个点，每个点的值是原值的scale倍（scale默认为1，可以在cfg里设置），最后输出的w和h都成为原来的2倍。这样做是为了使高层特征的维度和低层特征的维度相同，然后concatenate起来，提高细粒度特征的提取能力。

yolo层是新的检测器层，它替代了region层，并使用了多尺度预测的方法。下面是解析cfg文件中yolo层的代码。

/*
 * Build a YOLO layer from the key/value options of one [yolo] cfg section.
 *
 * options: parsed options of the section (read via the option_find_* helpers).
 * params:  size information of the incoming data; batch, w, h and inputs are used.
 *
 * Returns the layer produced by make_yolo_layer() with max_boxes, jitter,
 * ignore_thresh, truth_thresh, random, map and the anchor values (l.biases)
 * filled in from the options.
 */
layer parse_yolo(list *options, size_params params)
{
	int classes = option_find_int(options, "classes", 20);    // "classes" option, defaults to 20
	int total = option_find_int(options, "num", 1);            // "num" option: total number of anchors
	int num = total;

	// "mask" lists the anchor indices this layer uses.
	// NOTE(review): parse_yolo_mask is defined elsewhere; presumably it converts the
	// comma-separated string into an int array and stores the entry count in num.
	char *a = option_find_str(options, "mask", 0);
	int *mask = parse_yolo_mask(a, &num);
	layer l = make_yolo_layer(params.batch, params.w, params.h, num, total, mask, classes);
	assert(l.outputs == params.inputs);

	l.max_boxes = option_find_int_quiet(options, "max", 90);
	l.jitter = option_find_float(options, "jitter", .2);        // presumably the data-augmentation jitter range; used outside this function

	l.ignore_thresh = option_find_float(options, "ignore_thresh", .5);    // IoU threshold compared against in forward_yolo_layer
	l.truth_thresh = option_find_float(options, "truth_thresh", 1);
	l.random = option_find_int_quiet(options, "random", 0);    // presumably enables random input resizing during training; confirm at the use site

	char *map_file = option_find_str(options, "map", 0);
	if (map_file) l.map = read_map(map_file);

	a = option_find_str(options, "anchors", 0);
	if (a) {
		// make_yolo_layer allocates l.biases with total*2 floats, so never write
		// more than that even if the cfg lists additional values; extra values
		// are ignored.
		int capacity = total * 2;
		int i;
		for (i = 0; a && i < capacity; ++i) {
			float bias = atof(a);
			l.biases[i] = bias;
			// Advance to the character after the next comma; strchr returns
			// NULL after the last value, which ends the loop instead of
			// forming an invalid pointer.
			a = strchr(a, ',');
			if (a) ++a;
		}
	}
	return l;
}

下面是make_yolo_layer函数的代码：

layer make_yolo_layer(int batch, int w, int h, int n, int total, int *mask, int classes)
{
    int i;
    layer l = {0};
    l.type = YOLO;

    l.n = n;                        // 该层使用的anchor数
    l.total = total;                // yolov3的总anchor数
    l.batch = batch;
    l.h = h;
    l.w = w;
    l.c = n*(classes + 4 + 1);    
    l.out_w = l.w;
    l.out_h = l.h;
    l.out_c = l.c;
    l.classes = classes;
    l.cost = calloc(1, sizeof(float));
    l.biases = calloc(total*2, sizeof(float));
    if(mask) l.mask = mask;
    else{
        l.mask = calloc(n, sizeof(int));
        for(i = 0; i < n; ++i){
            l.mask[i] = i;
        }
    }
    l.bias_updates = calloc(n*2, sizeof(float));
    l.outputs = h*w*n*(classes + 4 + 1);            
    l.inputs = l.outputs;
    l.truths = 90*(4 + 1);            // max boxes默认90，即每张图像中目标的最大个数
    l.delta = calloc(batch*l.outputs, sizeof(float));
    l.output = calloc(batch*l.outputs, sizeof(float));
    for(i = 0; i < total*2; ++i){
        l.biases[i] = .5;
    }

    l.forward = forward_yolo_layer;
    l.backward = backward_yolo_layer;
#ifdef GPU
    l.forward_gpu = forward_yolo_layer_gpu;
    l.backward_gpu = backward_yolo_layer_gpu;
    l.output_gpu = cuda_make_array(l.output, batch*l.outputs);
    l.delta_gpu = cuda_make_array(l.delta, batch*l.outputs);
#endif

    fprintf(stderr, "detection\n");
    srand(0);

    return l;
}

定义了一些输入输出维度和anchor个数以及分配内存后，主要还是forward和backward函数，首先需要了解一些存储结构，才能读懂源码中的索引。

首先是net.truth，其实是target的存储格式：x，y，w，h，class，x，y，w，h，class...

然后是output数组的存储格式：w×h×entry×n×batch，entry对应每个anchor生成的向量维度，长度是（4+1+classes），依次存储box的（x,y,w,h）和有无目标的confidence以及类别class。

/*
 * Forward pass of the YOLO layer.
 *
 * Copies net.input into l.output, applies the logistic activation to parts of
 * it (only when built without GPU) and clears l.delta. When net.train is zero
 * the function returns at that point. Otherwise it fills l.delta in two passes
 * (one over every predicted box, one over every ground-truth box), stores the
 * resulting cost in *(l.cost) and prints summary statistics.
 *
 * l:   the YOLO layer (passed by value; its buffers are written through pointers)
 * net: the network; input, truth, train, w, h and index are read here
 *
 * NOTE(review): entry_index, get_yolo_box, delta_yolo_box, delta_yolo_class,
 * box_iou, float_to_box, int_index and mag_array are defined elsewhere; the
 * comments below describe them as the surrounding code appears to use them.
 */
void forward_yolo_layer(const layer l, network net)
{
    int i,j,b,t,n;
    memcpy(l.output, net.input, l.outputs*l.batch*sizeof(float));

#ifndef GPU
    // For each image and anchor: logistic activation on the first 2*w*h values
    // starting at entry 0, and on the (1+classes)*w*h values starting at entry 4.
    // Presumably these are x,y and objectness + class scores; w,h are left raw.
    for (b = 0; b < l.batch; ++b){
        for(n = 0; n < l.n; ++n){
            int index = entry_index(l, b, n*l.w*l.h, 0);
            activate_array(l.output + index, 2*l.w*l.h, LOGISTIC);
            index = entry_index(l, b, n*l.w*l.h, 4);
            activate_array(l.output + index, (1+l.classes)*l.w*l.h, LOGISTIC);
        }
    }
#endif

    memset(l.delta, 0, l.outputs * l.batch * sizeof(float));            // clear the gradient buffer
    if(!net.train) return;                
    float avg_iou = 0;            // sum of IoU over matched truths (averaged when printed)
    float recall = 0;            // number of matched truths with IoU > .5
    float recall75 = 0;            // number of matched truths with IoU > .75
    float avg_cat = 0;            // accumulated by delta_yolo_class; presumably class score of the true class
    float avg_obj = 0;            // sum of objectness outputs at matched truth locations
    float avg_anyobj = 0;            // sum of objectness outputs over every predicted box
    int count = 0;                // number of truths assigned to an anchor of this layer
    int class_count = 0;
    *(l.cost) = 0;                // loss
    for (b = 0; b < l.batch; ++b) {
        // Pass 1: visit every cell (i,j) and every anchor n of this layer.
        for (j = 0; j < l.h; ++j) {
            for (i = 0; i < l.w; ++i) {
                for (n = 0; n < l.n; ++n) {
                    int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0);    
                    // Presumably the offset of the first box value predicted by anchor n at cell (i,j).
                    box pred = get_yolo_box(l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, net.w, net.h, l.w*l.h);
                    // Predicted box at (i,j); presumably x,y are relative to l.w,l.h and w,h relative to net.w,net.h.
                    float best_iou = 0;
                    int best_t = 0;
                    for(t = 0; t < l.max_boxes; ++t){
                        box truth = float_to_box(net.truth + t*(4 + 1) + b*l.truths, 1);
                        // Read ground-truth box t of image b; a zero x marks the end of the list.
                        if(!truth.x) break;
                        float iou = box_iou(pred, truth);    // IoU between prediction and this truth
                        if (iou > best_iou) {
                            best_iou = iou;            // keep the largest IoU seen for this prediction
                            best_t = t;
                        }
                    }
                    int obj_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4);
                    // Presumably the offset of the objectness value of anchor n at (i,j).
                    avg_anyobj += l.output[obj_index];    // accumulate predicted objectness
                    l.delta[obj_index] = 0 - l.output[obj_index];    
                    if (best_iou > l.ignore_thresh) {
                        l.delta[obj_index] = 0;                // overlap is high enough: no objectness gradient
                    }
                    if (best_iou > l.truth_thresh) {            // truth_thresh defaults to 1 in parse_yolo
                        l.delta[obj_index] = 1 - l.output[obj_index];

                        int class = net.truth[best_t*(4 + 1) + b*l.truths + 4];
                        if (l.map) class = l.map[class];
                        int class_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4 + 1);
                        delta_yolo_class(l.output, l.delta, class_index, class, l.classes, l.w*l.h, 0);
                        box truth = float_to_box(net.truth + best_t*(4 + 1) + b*l.truths, 1);
                        delta_yolo_box(truth, l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, net.w, net.h, l.delta, (2-truth.w*truth.h), l.w*l.h);
                    }
                }
            }
        }
        // Pass 2: visit every ground-truth box of image b.
        for(t = 0; t < l.max_boxes; ++t){
            box truth = float_to_box(net.truth + t*(4 + 1) + b*l.truths, 1);
             // Convert the truth to a box; x, y, w, h are presumably normalised values.

            if(!truth.x) break;
            float best_iou = 0;
            int best_n = 0;
            // Cell (i,j) containing the truth centre.
            // NOTE(review): no clamping is visible here; truth.x or truth.y equal to 1 would give an index of l.w or l.h.
            i = (truth.x * l.w);
            j = (truth.y * l.h);
            box truth_shift = truth;
            truth_shift.x = truth_shift.y = 0;
            for(n = 0; n < l.total; ++n){        
                                            // find which of all l.total anchors has the largest IoU with this truth (both placed at the origin)
                box pred = {0};
                pred.w = l.biases[2*n]/net.w;
                pred.h = l.biases[2*n+1]/net.h;
                float iou = box_iou(pred, truth_shift);
                if (iou > best_iou){
                    best_iou = iou;
                    best_n = n;
                }
            }

            int mask_n = int_index(l.mask, best_n, l.n);    // position of best_n within this layer's mask; presumably negative when absent
            if(mask_n >= 0){                                // only continue when the best anchor belongs to this layer
                int box_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 0);    // presumably the box predicted by anchor mask_n at cell (i,j)
                float iou = delta_yolo_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, net.w, net.h, l.delta, (2-truth.w*truth.h), l.w*l.h);
                // Presumably writes the box gradient into l.delta and returns the IoU between that prediction and the truth.

                int obj_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4);
                avg_obj += l.output[obj_index];            // objectness predicted at the truth's cell
                l.delta[obj_index] = 1 - l.output[obj_index];

                int class = net.truth[t*(4 + 1) + b*l.truths + 4];    // truth layout: x, y, w, h, class
                if (l.map) class = l.map[class];
                int class_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4 + 1);
                                                        // presumably the first class score of anchor mask_n at (i,j); 4 + 1 skips box and objectness

                delta_yolo_class(l.output, l.delta, class_index, class, l.classes, l.w*l.h, &avg_cat);
                                                        // presumably writes the class gradient and accumulates into avg_cat

                ++count;
                ++class_count;
                if(iou > .5) recall += 1;
                if(iou > .75) recall75 += 1;
                avg_iou += iou;
            }
        }
    }
    *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2);        // squared value returned by mag_array; presumably the sum of squared deltas
    // NOTE(review): count and class_count stay 0 when no truth was assigned to this layer, so the averages below divide by zero.
    printf("Region %d Avg IOU: %f, Class: %f, Obj: %f, No Obj: %f, .5R: %f, .75R: %f,  count: %d\n", net.index, avg_iou/count, avg_cat/class_count, avg_obj/count, avg_anyobj/(l.w*l.h*l.n*l.batch), recall/count, recall75/count, count);
}

参考：

https://github.com/AlexeyAB/darknet

https://www.cnblogs.com/makefile/p/YOLOv3.html

http://www.cnblogs.com/YiXiaoZhou/p/7429481.html

https://www.cnblogs.com/zf-blog/p/7142463.html

猜你喜欢

目录

热门文章