RPN Code: A Detailed Walkthrough

Copyright notice: if you use the content of this blog, please obtain the author's consent first. https://blog.csdn.net/qq_40994943/article/details/86500305
# coding: UTF-8
from __future__ import absolute_import
import numpy as np
import cv2
import random
import copy
# Here C is a configuration object (an instance of the Config class mentioned above, C = Config())
def calc_rpn(C, img_data, width, height, resized_width, resized_height, img_length_calc_function):


    # A few parameters are read first. downscale is the scaling factor from the input image to the feature map (16.0 by default),
    # matching the integer division done by img_length_calc_function (i.e. get_img_output_length in the actual VGG code).
    # anchor_sizes and anchor_ratios define the candidate box shapes; e.g. 3 sizes and 3 ratios combine into 9 different anchor shapes.
    downscale = float(C.rpn_stride)
    anchor_sizes = C.anchor_box_scales
    anchor_ratios = C.anchor_box_ratios
    num_anchors = len(anchor_sizes) * len(anchor_ratios)
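    # A worked example (assuming the common keras-frcnn defaults of anchor_box_scales = [128, 256, 512]
    # and anchor_box_ratios = [[1, 1], [1, 2], [2, 1]]; other configurations will differ):
    # the 3 * 3 = 9 anchor shapes, written as width x height in pixels, would be
    #   128x128, 128x256, 256x128,
    #   256x256, 256x512, 512x256,
    #   512x512, 512x1024, 1024x512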

    # calculate the output map size based on the network architecture
    # img_length_calc_function returns the feature-map size; for VGG16 it is the width and height integer-divided by 16
    # (the same as get_img_output_length), e.g. output_width = output_height = 600 // 16 = 37
    (output_width, output_height) = img_length_calc_function(resized_width, resized_height)


    # Next come several variable initialisations; they can be skimmed for now and revisited when they are used later.

    # n_anchratios = 3
    n_anchratios = len(anchor_ratios)

    # initialise empty output objectives
    y_rpn_overlap = np.zeros((output_height, output_width, num_anchors))
    y_is_box_valid = np.zeros((output_height, output_width, num_anchors))
    y_rpn_regr = np.zeros((output_height, output_width, num_anchors * 4))

    num_bboxes = len(img_data['bboxes'])

    num_anchors_for_bbox = np.zeros(num_bboxes).astype(int)
    best_anchor_for_bbox = -1*np.ones((num_bboxes, 4)).astype(int)
    best_iou_for_bbox = np.zeros(num_bboxes).astype(np.float32)
    best_x_for_bbox = np.zeros((num_bboxes, 4)).astype(int)
    best_dx_for_bbox = np.zeros((num_bboxes, 4)).astype(np.float32)


    # All calculations are done on the resized image, so the bbox coordinates x1, x2, y1, y2 are scaled to the resized image here.
    # The result is stored in gta, an array of shape (num_bboxes, 4).
    # get the GT box coordinates, and resize to account for image resizing
    gta = np.zeros((num_bboxes, 4))
    for bbox_num, bbox in enumerate(img_data['bboxes']):
        # get the GT box coordinates, and resize to account for image resizing
        gta[bbox_num, 0] = bbox['x1'] * (resized_width / float(width))
        gta[bbox_num, 1] = bbox['x2'] * (resized_width / float(width))
        gta[bbox_num, 2] = bbox['y1'] * (resized_height / float(height))
        gta[bbox_num, 3] = bbox['y2'] * (resized_height / float(height))
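    # A quick numeric illustration of the scaling above (the numbers are made up, not taken from this code):
    # if width = 800 and resized_width = 600, a GT x1 of 400 becomes 400 * (600 / 800.0) = 300 on the resized image.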

    # rpn ground truth
    # This block computes each anchor's width and height, then treats every point of the feature map as an anchor centre:
    # multiplying by downscale maps that centre back onto the resized image, and anchors that stick out of the image are skipped.
    # The result is a grid of candidate boxes of various sizes and aspect ratios.
    # outer loop: len(anchor_sizes) iterations (3 by default)
    # inner loop: n_anchratios iterations (3 by default)
    for anchor_size_idx in range(len(anchor_sizes)):
        for anchor_ratio_idx in range(n_anchratios):
            # anchor width and height for this size/ratio combination
            anchor_x = anchor_sizes[anchor_size_idx] * anchor_ratios[anchor_ratio_idx][0]
            anchor_y = anchor_sizes[anchor_size_idx] * anchor_ratios[anchor_ratio_idx][1]
            # e.g. for an image resized 1024 -> 600 with stride 16, output_width = 600 // 16 = 37
            # the anchor's corner coordinates x1_anc/x2_anc and y1_anc/y2_anc are computed below
            for ix in range(output_width):  # 37
                # x-coordinates of the current anchor box
                x1_anc = downscale * (ix + 0.5) - anchor_x / 2
                x2_anc = downscale * (ix + 0.5) + anchor_x / 2

                # ignore boxes that go across image boundaries
                if x1_anc < 0 or x2_anc > resized_width:
                    continue

                for jy in range(output_height):

                    # y-coordinates of the current anchor box
                    y1_anc = downscale * (jy + 0.5) - anchor_y / 2  # coordinates on the resized input image
                    y2_anc = downscale * (jy + 0.5) + anchor_y / 2

                    # ignore boxes that go across image boundaries
                    if y1_anc < 0 or y2_anc > resized_height:
                        continue


                    # Two variables are defined here, bbox_type and best_iou_for_loc, and used below. iou() computes the overlap between the anchor and each gta box;
                    # if it exceeds best_iou_for_bbox[bbox_num] or the configured threshold, the centres of the GT box and of the anchor are computed.
                    # bbox_type indicates whether an anchor should be a target
                    bbox_type = 'neg'

                    # this is the best IOU for the (x,y) coord and the current anchor
                    # note that this is different from the best IOU for a GT bbox
                    best_iou_for_loc = 0.0

                    # For the current anchor, check whether any of the image's GT bboxes overlaps it by more than the configured threshold.
                    for bbox_num in range(num_bboxes):  # loop over every GT box in the image

                        # get IOU of the current GT box and the current anchor box
                        curr_iou = iou([gta[bbox_num, 0], gta[bbox_num, 2], gta[bbox_num, 1], gta[bbox_num, 3]], [x1_anc, y1_anc, x2_anc, y2_anc])
                        # calculate the regression targets if they will be needed
                        # by default the upper RPN overlap threshold (rpn_max_overlap) is 0.7 and the lower one (rpn_min_overlap) is 0.3
                        if curr_iou > best_iou_for_bbox[bbox_num] or curr_iou > C.rpn_max_overlap:
                            cx = (gta[bbox_num, 0] + gta[bbox_num, 1]) / 2.0
                            cy = (gta[bbox_num, 2] + gta[bbox_num, 3]) / 2.0
                            cxa = (x1_anc + x2_anc)/2.0
                            cya = (y1_anc + y2_anc)/2.0

                            # Compute the regression targets for x, y, w, h.
                            # Why regress at all? With only 9 anchor shapes the proposed region is rarely an exact fit,
                            # so at prediction time a regression step refines the box instead of using the anchor coordinates directly.
                            tx = (cx - cxa) / (x2_anc - x1_anc)
                            ty = (cy - cya) / (y2_anc - y1_anc)
                            tw = np.log((gta[bbox_num, 1] - gta[bbox_num, 0]) / (x2_anc - x1_anc))
                            th = np.log((gta[bbox_num, 3] - gta[bbox_num, 2]) / (y2_anc - y1_anc))
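                            # This is the standard Faster R-CNN box parameterisation:
                            # tx = (cx - cxa) / wa, ty = (cy - cya) / ha, tw = log(w / wa), th = log(h / ha),
                            # where (cx, cy, w, h) describe the GT box and (cxa, cya, wa, ha) the anchor.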

                        # only proceed if the current GT box is not background ('bg')
                        if img_data['bboxes'][bbox_num]['class'] != 'bg':

                            # all GT boxes should be mapped to an anchor box, so we keep track of which anchor box was best
                            if curr_iou > best_iou_for_bbox[bbox_num]:
                                # jy is the row (height) index, ix is the column (width) index on the feature map
                                best_anchor_for_bbox[bbox_num] = [jy, ix, anchor_ratio_idx, anchor_size_idx]
                                best_iou_for_bbox[bbox_num] = curr_iou
                                best_x_for_bbox[bbox_num,:] = [x1_anc, x2_anc, y1_anc, y2_anc]
                                best_dx_for_bbox[bbox_num,:] = [tx, ty, tw, th]

                            # we set the anchor to positive if the IOU is >0.7 (it does not matter if there was another better box, it just indicates overlap)
                            if curr_iou > C.rpn_max_overlap:
                                bbox_type = 'pos'
                                # num_anchors_for_bbox starts as all zeros, e.g. [0, 0, 0, 0];
                                # count one more positive anchor for this GT box
                                num_anchors_for_bbox[bbox_num] += 1
                                # we update the regression layer target if this IOU is the best for the current (x,y) and anchor position
                                if curr_iou > best_iou_for_loc:
                                    # keep updating the best IoU and its regression targets for this anchor position
                                    best_iou_for_loc = curr_iou
                                    best_grad = (tx, ty, tw, th)

                            # if the IOU is >0.3 and <0.7, it is ambiguous and not included in the objective
                            if C.rpn_min_overlap < curr_iou < C.rpn_max_overlap:
                                # gray zone between neg and pos
                                if bbox_type != 'pos':
                                    bbox_type = 'neutral'

                    # turn on or off outputs depending on IOUs
                    # Next the anchor is labelled according to bbox_type: y_is_box_valid marks whether the anchor is usable (pos or neg, not neutral) and y_rpn_overlap marks whether it contains an object.
                    if bbox_type == 'neg':
                        y_is_box_valid[jy, ix, anchor_ratio_idx + n_anchratios * anchor_size_idx] = 1
                        y_rpn_overlap[jy, ix, anchor_ratio_idx + n_anchratios * anchor_size_idx] = 0
                    elif bbox_type == 'neutral':
                        y_is_box_valid[jy, ix, anchor_ratio_idx + n_anchratios * anchor_size_idx] = 0
                        y_rpn_overlap[jy, ix, anchor_ratio_idx + n_anchratios * anchor_size_idx] = 0
                    elif bbox_type == 'pos':
                        y_is_box_valid[jy, ix, anchor_ratio_idx + n_anchratios * anchor_size_idx] = 1
                        y_rpn_overlap[jy, ix, anchor_ratio_idx + n_anchratios * anchor_size_idx] = 1
                        # num_anchors * 4 = 36 regression channels by default; the anchor with
                        # index k = anchor_ratio_idx + n_anchratios * anchor_size_idx uses channels 4k .. 4k+3
                        start = 4 * (anchor_ratio_idx + n_anchratios * anchor_size_idx)  # e.g. 4 * (2 + 3*2) = 32
                        y_rpn_regr[jy, ix, start:start+4] = best_grad  # store the best regression targets at this anchor's slot


    # we ensure that every bbox has at least one positive RPN region
    # A potential problem appears here: a GT bbox may end up with no suitable anchor, and that training data would then be wasted.
    # As a compromise we guarantee that every bbox gets at least one anchor.
    # The fix below is simple: for a bbox with no positive anchor, promote the best of its (possibly neutral) anchors, provided the two at least overlap a little.
    for idx in range(num_anchors_for_bbox.shape[0]):
        if num_anchors_for_bbox[idx] == 0:
            # no anchor with an IoU greater than zero for this box, so there is nothing to promote
            if best_anchor_for_bbox[idx, 0] == -1:
                continue
            y_is_box_valid[
                best_anchor_for_bbox[idx,0], best_anchor_for_bbox[idx,1], best_anchor_for_bbox[idx,2] + n_anchratios *
                best_anchor_for_bbox[idx,3]] = 1
            y_rpn_overlap[
                best_anchor_for_bbox[idx,0], best_anchor_for_bbox[idx,1], best_anchor_for_bbox[idx,2] + n_anchratios *
                best_anchor_for_bbox[idx,3]] = 1
            start = 4 * (best_anchor_for_bbox[idx,2] + n_anchratios * best_anchor_for_bbox[idx,3])
            y_rpn_regr[
                best_anchor_for_bbox[idx,0], best_anchor_for_bbox[idx,1], start:start+4] = best_dx_for_bbox[idx, :]

    # y_rpn_overlap was created with shape (output_height, output_width, num_anchors);
    # after the transpose it becomes (num_anchors, output_height, output_width)
    y_rpn_overlap = np.transpose(y_rpn_overlap, (2, 0, 1))
    # expand_dims adds a leading batch axis: (1, num_anchors, output_height, output_width)
    y_rpn_overlap = np.expand_dims(y_rpn_overlap, axis=0)

    y_is_box_valid = np.transpose(y_is_box_valid, (2, 0, 1))
    y_is_box_valid = np.expand_dims(y_is_box_valid, axis=0)

    y_rpn_regr = np.transpose(y_rpn_regr, (2, 0, 1))
    y_rpn_regr = np.expand_dims(y_rpn_regr, axis=0)

    # pos: anchors assigned to an object, neg: background anchors
    pos_locs = np.where(np.logical_and(y_rpn_overlap[0, :, :, :] == 1, y_is_box_valid[0, :, :, :] == 1))
    neg_locs = np.where(np.logical_and(y_rpn_overlap[0, :, :, :] == 0, y_is_box_valid[0, :, :, :] == 1))

    num_pos = len(pos_locs[0])

    # one issue is that the RPN has many more negative than positive regions, so we turn off some of the negative
    # regions. We also limit it to 256 regions.
    # Because negative anchors vastly outnumber positive ones, some negatives are switched off
    # and the total number of sampled regions is capped at num_regions = 256, balancing pos and neg.
    num_regions = 256

    # if there are more than num_regions // 2 (= 128) positive anchors, randomly disable the excess
    if len(pos_locs[0]) > num_regions // 2:
        # val_locs is a list of randomly chosen positive indices to switch off
        # (integer division is needed here so random.sample receives an int count)
        val_locs = random.sample(range(len(pos_locs[0])), len(pos_locs[0]) - num_regions // 2)
        y_is_box_valid[0, pos_locs[0][val_locs], pos_locs[1][val_locs], pos_locs[2][val_locs]] = 0
        num_pos = num_regions // 2
    # keep the number of negative (background) anchors equal to the number of positives
    if len(neg_locs[0]) + num_pos > num_regions:
        val_locs = random.sample(range(len(neg_locs[0])), len(neg_locs[0]) - num_pos)
        y_is_box_valid[0, neg_locs[0][val_locs], neg_locs[1][val_locs], neg_locs[2][val_locs]] = 0
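    # A worked example of the sampling above (illustrative numbers only):
    # with 200 positive and 5000 negative anchors, 200 - 128 = 72 positives are randomly disabled
    # (num_pos becomes 128) and 5000 - 128 = 4872 negatives are disabled, leaving 128 + 128 = 256 valid regions;
    # with only 20 positives, all of them are kept and the negatives are cut down to 20 as well,
    # so fewer than 256 regions may stay valid.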
    # axis=1 concatenates along the second (channel) axis; for 2-D arrays this joins columns, e.g.
    # a = [[1 2 3]
    #      [4 5 6]]
    # b = [[11 21 31]
    #      [ 7  8  9]]
    # np.concatenate((a, b), axis=1) =
    # [[ 1  2  3 11 21 31]
    #  [ 4  5  6  7  8  9]]

    y_rpn_cls = np.concatenate([y_is_box_valid, y_rpn_overlap], axis=1)
    y_rpn_regr = np.concatenate([np.repeat(y_rpn_overlap, 4, axis=1), y_rpn_regr], axis=1)
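    # Resulting shapes, with H and W the feature-map size and num_anchors = 9 by default:
    # y_rpn_cls  is (1, 2 * num_anchors, H, W): the first num_anchors channels are the "is this anchor
    #            usable (pos or neg)" mask, the last num_anchors channels are the object/background labels;
    # y_rpn_regr is (1, 8 * num_anchors, H, W): the first 4 * num_anchors channels repeat the positive mask
    #            (so a regression loss can ignore non-positive anchors), the last 4 * num_anchors channels
    #            hold the (tx, ty, tw, th) targets.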
    # Finally, the two return values y_rpn_cls and y_rpn_regr are produced: the classification targets
    # (does this anchor contain an object?) and the regression targets for the RPN.
    # Note that the number of anchor targets in y_rpn_cls / y_rpn_regr is much larger than the number
    # of GT bboxes in the input image.
    return np.copy(y_rpn_cls), np.copy(y_rpn_regr)
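The iou() helper called inside calc_rpn is not part of this excerpt. In keras-frcnn-style implementations it is a plain intersection-over-union on (x1, y1, x2, y2) boxes; below is a minimal sketch of what it is assumed to look like (the helper name intersection and the epsilon are illustrative, not copied from the original repository):

def intersection(a, b):
    # a and b are boxes in (x1, y1, x2, y2) order
    x = max(a[0], b[0])
    y = max(a[1], b[1])
    w = min(a[2], b[2]) - x
    h = min(a[3], b[3]) - y
    if w < 0 or h < 0:
        return 0
    return w * h

def iou(a, b):
    # intersection over union of two (x1, y1, x2, y2) boxes
    if a[0] >= a[2] or a[1] >= a[3] or b[0] >= b[2] or b[1] >= b[3]:
        return 0.0
    area_i = intersection(a, b)
    area_a = (a[2] - a[0]) * (a[3] - a[1])
    area_b = (b[2] - b[0]) * (b[3] - b[1])
    return float(area_i) / float(area_a + area_b - area_i + 1e-6)

Note that calc_rpn calls it as iou([x1, y1, x2, y2], [x1_anc, y1_anc, x2_anc, y2_anc]) (gta stores coordinates as [x1, x2, y1, y2], so indices 0, 2, 1, 3 re-order them), which matches the argument order assumed above.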

