[CV学习笔记] yolo&tensorrt多线程推理-第二部分

1、前言

在上一篇博客中介绍了yolo&tensorrt多线程推理代码，并成功运行了infer代码。本文将继续学习其实现代码，主要学习单张图片推理的内容，对应的代码为 src/main.cpp 中的 single_inference 函数。
yolo&tensorrt项目代码:https://github.com/shouxieai/infer
第一部分代码:https://blog.csdn.net/weixin_42108183/article/details/129411759

2、实现代码学习

yolo.hpp

// 实例分割结果
struct InstanceSegmentMap {
    
    }

// 目标检测结果
struct Box {
    
    }

// 推理图片，直接使用Mat速度可能会慢一些，因此使用Mat的data
struct Image{
    
    
    const void *bgrptr = nullptr;
}

// Inference interface: an abstract class exposing two pure-virtual entry points,
// one for a single image and one for a batch of images.
class Infer {
    
    
public:
    // Run inference on a single image and return its BoxArray.
    // `stream` is an opaque handle defaulting to nullptr
    // (presumably a CUDA stream -- TODO confirm).
    virtual BoxArray forward(const Image &image, void *stream = nullptr) = 0;
    // Run inference on several images; returns one BoxArray per call result
    // in a vector. `stream` has the same meaning as in forward().
    virtual std::vector<BoxArray> forwards(const std::vector<Image> &images,
                                         void *stream = nullptr) = 0;
};

// 构造 Infer 加载模型等等...
std::shared_ptr<Infer> load(const std::string &engine_file, Type type,
                                float confidence_threshold = 0.25f,
                                float nms_threshold = 0.5f);

yolo.cu

/* 归一化操作，可以支持均值标准差，alpha beta，和swap RB */
struct Norm {
    
    };

    // Number of floats stored per decoded box. Slots 0-6 are the ones written by
    // decode_kernel_common: left, top, right, bottom, confidence, class, keepflag.
    // Slot 7 is read back as `row_index` in forwards() when has_segment_ is set
    // (presumably the box's row in the raw model output -- TODO confirm).
    const int NUM_BOX_ELEMENT = 8;
    // Upper bound on the number of boxes kept for a single image.
    const int MAX_IMAGE_BOXES = 1024;
    // Round `n` up to the nearest multiple of `align` (32 by default),
    // e.g. upbound(33) == 64 and upbound(32) == 32.
    // Assumes n >= 0 and align > 0.
    inline int upbound(int n, int align = 32){
        // Adding align - 1 before the integer division makes it a ceil-division.
        return (n + align - 1) / align * align;
    }
    tatic __host__ __device__ void affine_project(){
    
    } // 计算预处理中的仿射变换
    // Decode kernel: turns raw prediction rows into candidate boxes.
    // Each thread handles one prediction row. (Parameter list elided in these notes.)
    static __global__ void decode_kernel_common(...){
        // `predict` points at the model output; this thread handles row `position`.
        int position = blockDim.x * blockIdx.x + threadIdx.x;
        if (position >= num_bboxes)
            return;

        // Start of row `position`; every row holds 5 + num_classes floats.
        float *pitem = predict + (5 + num_classes) * position;

        // Objectness (foreground probability) of this box; drop weak boxes early.
        float objectness = pitem[4];
        if (objectness < confidence_threshold)
            return;

        // Per-class probabilities follow the first five values.
        float *class_confidence = pitem + 5;

        // Arg-max over the class probabilities: best score and its label index.
        float confidence = *class_confidence++;
        int label = 0;
        for (int i = 1; i < num_classes; ++i, ++class_confidence)
        {
            if (*class_confidence > confidence)
            {
                confidence = *class_confidence;
                label = i;
            }
        }

        // Final score = best class probability * objectness.
        confidence *= objectness;
        if (confidence < confidence_threshold)
            return;

        // parray[0] is the per-image box counter. atomicAdd returns the value held
        // before the increment, so `index` is the slot reserved for this box.
        int index = atomicAdd(parray, 1);

        // Slots beyond the capacity are dropped. The counter itself may still exceed
        // MAX_IMAGE_BOXES, which is why readers clamp it with min().
        if (index >= MAX_IMAGE_BOXES)
            return;

        // Convert centre/size to corners, then map back to original-image coordinates.
        float cx = *pitem++;
        float cy = *pitem++;
        float width = *pitem++;
        float height = *pitem++;
        float left = cx - width * 0.5f;
        float top = cy - height * 0.5f;
        float right = cx + width * 0.5f;
        float bottom = cy + height * 0.5f;
        affine_project(invert_affine_matrix, left, top, &left, &top);
        affine_project(invert_affine_matrix, right, bottom, &right, &bottom);

        // Store the box; +1 skips the counter kept in parray[0].
        float *pout_item = parray + 1 + index * NUM_BOX_ELEMENT;
        *pout_item++ = left;
        *pout_item++ = top;
        *pout_item++ = right;
        *pout_item++ = bottom;
        *pout_item++ = confidence;
        *pout_item++ = label;
        *pout_item++ = 1; // 1 = keep, 0 = ignore
    }
    
    // 与上面函数一致
    static __global__ void decode_kernel_v8(){
    
    }
    // 计算两个box之间的交并比
    static __device__ float box_iou(){
    
    }
    // NMS on the GPU: every thread decides whether its own box survives by
    // comparing it against all other boxes of the same class.
    // (Parameter list elided in these notes.)
    static __global__ void fast_nms_kernel(){
        // This thread handles box `position`.
        int position = (blockDim.x * blockIdx.x + threadIdx.x);
        // bboxes[0] holds the box counter; clamp it to the per-image capacity.
        int count = min((int)*bboxes, MAX_IMAGE_BOXES);
        if (position >= count)
            return;

        // Address of the current box (+1 skips the counter).
        float *pcurrent = bboxes + 1 + position * NUM_BOX_ELEMENT;
        for (int i = 0; i < count; ++i)
        {
            float *pitem = bboxes + 1 + i * NUM_BOX_ELEMENT;
            // Skip the box itself and boxes of a different class.
            if (i == position || pcurrent[5] != pitem[5])
                continue;

            // Only a box whose score is at least as high can suppress the current one.
            if (pitem[4] >= pcurrent[4])
            {
                // Tie-break on equal scores: an earlier box never suppresses a later
                // one, so two tied boxes cannot suppress each other.
                if (pitem[4] == pcurrent[4] && i < position)
                    continue;

                float iou = box_iou(
                    pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3],
                    pitem[0], pitem[1], pitem[2], pitem[3]);

                if (iou > threshold)
                {
                    // Mark the current box as suppressed and stop.
                    pcurrent[6] = 0; // 1=keep, 0=ignore
                    return;
                }
            }
        }
    }
    // 调用解码函数
    static void decode_kernel_invoker(...)
    
    // Preprocessing kernel: each thread produces one destination pixel (dx, dy),
    // sampling the source image, optionally swapping channels, then normalising.
    // NOTE(review): src_x/src_y, c0/c1/c2 and hx/hy/lx/ly are used below but are not
    // declared in this excerpt; their computation appears to be elided from the notes.
    static __global__ void warp_affine_bilinear_and_normalize_plane_kernel(...){
    
    
        // Destination pixel handled by this thread; threads outside the image exit.
        int dx = blockDim.x * blockIdx.x + threadIdx.x;
        int dy = blockDim.y * blockIdx.y + threadIdx.y;
        if (dx >= dst_width || dy >= dst_height)
            return;
        
        // Source coordinates outside the image are filled with const_value_st.
        if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height)
        {
    
    
            // out of range
            c0 = const_value_st;
            c1 = const_value_st;
            c2 = const_value_st;
        } 
        else{
    
    
            // The source coordinate is generally fractional, hence bilinear interpolation.
            // Integer coordinates of the four neighbouring source pixels.
            int y_low = floorf(src_y);
            int x_low = floorf(src_x);
            int y_high = y_low + 1;
            int x_high = x_low + 1;
            // The four interpolation weights (area products of the h*/l* factors).
            float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
            ...
            // Interpolated channel values, in b, g, r order.
            c0 = ... 
            c1 = ... 
            c2 = ...
        }
        // Swap the first and third channel (BGR -> RGB) when requested.
        if (norm.channel_type == ChannelType::SwapRB)
        {
    
    
            float t = c2;
            c2 = c0;
            c0 = t;
        }
        // Normalisation: one branch per NormType (bodies elided in these notes).
        if (norm.type == NormType::MeanStd){
    
    }
        else if (norm.type == NormType::AlphaBeta){
    
    }
    }
    
    // Host-side launcher for the preprocessing kernel.
    // (Parameter list and the launch itself are elided in these notes.)
    static void warp_affine_bilinear_and_normalize_plane(...){
    
    
        // Blocks per axis = destination size / 32, rounded up, so the grid covers
        // every destination pixel even when the size is not a multiple of 32.
        dim3 grid((dst_width + 31) / 32, (dst_height + 31) / 32);
        // Each block holds 32 x 32 threads.
        dim3 block(32, 32);
        ...
    }
    // Decodes the instance-segmentation mask of ONE detected box.
    // Each thread produces one output pixel (dx, dy) of the box-sized mask.
    // (Parameter list elided in these notes.)
    static __global__ void decode_single_mask_kernel(...){
        // mask_predict -> mask_out:
        // mask_weights (mask_dim values) @ mask_predict (mask_dim x H x W) gives one
        // grey-scale value per pixel (the author notes mask_dim = 32, H = W = 160).
        int dx = blockDim.x * blockIdx.x + threadIdx.x;
        int dy = blockDim.y * blockIdx.y + threadIdx.y;
        // Threads that fall outside the output image must not write.
        // NOTE(review): out_height is assumed to be a kernel parameter next to
        // out_width -- confirm against the full signature.
        if (dx >= out_width || dy >= out_height)
            return;

        // (left, top) is the box's top-left corner in mask coordinates, so
        // (sx, sy) is the mask pixel this thread samples.
        int sx = left + dx;
        int sy = top + dy;
        // Pixels outside the predicted mask are written as 0.
        if (sx < 0 || sx >= mask_width || sy < 0 || sy >= mask_height)
        {
            mask_out[dy * out_width + dx] = 0;
            return;
        }

        // Dot product between the box's coefficients and the prototype stack.
        float cumprod = 0;
        for (int ic = 0; ic < mask_dim; ++ic){
            float cval = mask_predict[(ic * mask_height + sy) * mask_width + sx]; // pixel of prototype ic
            float wval = mask_weights[ic];                                        // coefficient of prototype ic
            cumprod += cval * wval;
        }

        float alpha = 1.0f / (1.0f + exp(-cumprod)); // sigmoid
        mask_out[dy * out_width + dx] = alpha * 255; // scale to the 0..255 image range
    }
    // Host-side launcher for the mask decode kernel.
    // (Parameter list and body are elided in these notes.)
    static void decode_single_mask(...){
        ...
    }
    // 仿射变换
    struct AffineMatrix{
    
    }
    
    // 接口实现类，使用时使用父类指针(接口类)指向子类对象
    class InferImpl : public Infer(){
    
    
        // 根据输入分配cpu、gpu内存大小
        void adjust_memory(int batch_size){
    
    }
    }
    
    void preprocess(){
    
    
        // 计算仿射变换矩阵
        affine.compute(){
    
    }
        // 图片预处理
        warp_affine_bilinear_and_normalize_plane(){
    
    }
    }
     bool load(){
    
    
        // 加载engine
        trt_ = trt::load(engine_file); 
        
        // 网络宽高
        network_input_width_ = input_dim[3];
        network_input_height_ = input_dim[2];
        // batch==-1说明该engine是动态batch
        isdynamic_model_ = trt_->has_dynamic_dim();
     }
     
     // inference
    virtual BoxArray forward(){
    
    }
    
    virtual vector<BoxArray> forwards(){
    
    
        ...
        // 预处理
        for (int i = 0; i < num_image; ++i){
    
    
            preprocess(i, images[i], preprocess_buffers_[i], affine_matrixs[i],stream);
        }
    }
    // 绑定输入输出
    vector<void *> bindings{
    
    input_buffer_.gpu(), bbox_output_device};
    if (has_segment_)
    {
    
    
        bindings = {
    
    input_buffer_.gpu(), segment_predict_.gpu(),
                    bbox_output_device};
    }
    
    // 执行推理
    if (!trt_->forward(bindings, stream)){
    
    
        for (int ib = 0; ib < num_image; ++ib){
    
    
            // 第ib张图片推理的结果
            float *parray = output_boxarray_.cpu() + ib * (32 + MAX_IMAGE_BOXES * NUM_BOX_ELEMENT);
            // 数组parray储存的该图片目标框个数
            int count = min(MAX_IMAGE_BOXES, (int)*parray);
            
            // 遍历目标框
            for (int i = 0; i < count; ++i){
    
    
                // 第i个目标框
                float *pbox = parray + 1 + i * NUM_BOX_ELEMENT;
                int label = pbox[5]; // 标签
                int keepflag = pbox[6]; // 是否被保留
                if (keepflag == 1){
    
    
                    Box result_object_box(pbox[0], pbox[1], pbox[2], pbox[3], pbox[4],label);
                    if (has_segment_){
    
    
                        int row_index = pbox[7]; 
                        int mask_dim = segment_head_dims_[1]; // masks维度 32
                        float *mask_weights = ...  // 权重系数的地址 长度为32
                        
                        if (mask_out_width > 0 && mask_out_height > 0){
    
    
                            decode_single_mask(){
    
    } // 获得mask
                        }
    
                    }
                    // 添加坐标框
                    output.emplace_back(result_object_box);
                }
            }
        }
    }
    // Create an InferImpl and load the local engine into it.
    // Returns the new object, or nullptr when loading fails. The caller takes
    // ownership of the returned pointer. (Parameter list elided in these notes.)
    Infer *loadraw(...){
        InferImpl *impl = new InferImpl(); // construct the implementation object
        // Load the local engine; on failure release the object and report nullptr.
        if (!impl->load(engine_file, type, confidence_threshold, nms_threshold))
        {
            delete impl;
            impl = nullptr;
        }
        return impl;
    }
    
    // Public factory: builds the implementation through loadraw() and hands it
    // back behind the Infer interface, so callers rely on polymorphism only.
    shared_ptr<Infer> load(const string &engine_file, Type type,
                           float confidence_threshold, float nms_threshold)
    {
        Infer *raw = loadraw(engine_file, type, confidence_threshold, nms_threshold);
        InferImpl *impl = static_cast<InferImpl *>(raw);
        return std::shared_ptr<InferImpl>(impl);
    }
}

3、总结

本次学习了许多cuda编程的内容，包括预处理、后处理的核函数，接口实现方式等等，在第三部分会学习基于生产者消费者的推理方式。

[CV学习笔记] yolo&tensorrt多线程推理-第二部分

1、前言

2、实现代码学习

3、总结

猜你喜欢