硬件解码example

C++11中std::move、std::forward内存转移

环境：

Intel® Pentium® Silver N6000 @ 1.10GHz 4核4线程

GPU的型号

lspci -nn | grep -i vga

00:02.0 VGA compatible controller [0300]: Intel Corporation JasperLake [UHD Graphics] [8086:4e71] (rev 01)

sudo lshw -C display

*-display
description: VGA compatible controller
product: JasperLake [UHD Graphics]
vendor: Intel Corporation
physical id: 2
bus info: pci@0000:00:02.0
logical name: /dev/fb0
version: 01
width: 64 bits
clock: 33MHz
capabilities: pciexpress msi pm vga_controller bus_master cap_list rom fb
configuration: depth=32 driver=i915 latency=0 mode=800x480 resolution=1920,1080 visual=truecolor xres=800 yres=480
resources: iomemory:600-5ff iomemory:400-3ff irq:137 memory:6000000000-6000ffffff memory:4000000000-400fffffff
ioport:3000(size=64) memory:c0000-dffff

测试系统io写速度，创建testfile文件，使用IO模式直接写入1GB数据操作

dd if=/dev/zero of=testfile bs=1G count=1 oflag=direct

1+0 records in
1+0 records out 1073741824 bytes (1.1 GB, 1.0 GiB)
copied, 23.9032 s, 44.9 MB/s

测试系统io读速度，从testfile文件中读取1GB数据，并将其丢弃到/dev/null设备

dd if=testfile of=/dev/null bs=1G count=1 iflag=direct

1+0 records in
1+0 records out 1073741824 bytes (1.1 GB, 1.0 GiB)
copied, 2.54137 s, 423 MB/s

使用方法：
./hw_decode vaapi juren-30s.mp4 juren-30s.yuv
验证播放：
ffplay -video_size 1920x1080 -pixel_format yuv420p juren-30s.yuv

av_hwdevice_iterate_types(type)

如果你填入的参数不对，那么这个函数的作用相当于以下命令：列举支持的硬件加速方式。

ffmpeg -hwaccels

type对应以下宏，会根据你的输入（比如vaapi / qsv）找到对应的宏：
type = av_hwdevice_find_type_by_name(argv[1])
/*
 * Hardware device types selectable by name on the command line.
 * AV_HWDEVICE_TYPE_NONE is the value the sample program treats as
 * "no such device type". The numeric values are written out explicitly;
 * they are the same ones implicit numbering would assign.
 */
enum AVHWDeviceType {
    AV_HWDEVICE_TYPE_NONE         = 0,
    AV_HWDEVICE_TYPE_VDPAU        = 1,
    AV_HWDEVICE_TYPE_CUDA         = 2,
    AV_HWDEVICE_TYPE_VAAPI        = 3,
    AV_HWDEVICE_TYPE_DXVA2        = 4,
    AV_HWDEVICE_TYPE_QSV          = 5,
    AV_HWDEVICE_TYPE_VIDEOTOOLBOX = 6,
    AV_HWDEVICE_TYPE_D3D11VA      = 7,
    AV_HWDEVICE_TYPE_DRM          = 8,
    AV_HWDEVICE_TYPE_OPENCL       = 9,
    AV_HWDEVICE_TYPE_MEDIACODEC   = 10,
    AV_HWDEVICE_TYPE_VULKAN       = 11,
};

这个函数会从input_ctx的流结构体中找到Video对应的解码器。

av_find_best_stream(input_ctx, AVMEDIA_TYPE_VIDEO, -1, -1, &decoder, 0);

还有另外的方式，手动去结构体里找：

int video_index = -1;    
for (int i = 0; i < ic->nb_streams; i++)
    {
    
    
        if (ic->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_VIDEO)
        {
    
    
            video_index = i;
        }
    }
        
ZlogInfo("id = %d\n", ic->streams[video_index]->codecpar->codec_id);

av_hwdevice_get_type_name(type)

上面这个函数与type = av_hwdevice_find_type_by_name("qsv");互为逆操作：av_hwdevice_get_type_name(type)的返回值是"qsv"，而av_hwdevice_find_type_by_name("qsv")的返回值是AV_HWDEVICE_TYPE_QSV。

av_hwframe_transfer_data(sw_frame, frame, 0)

这个函数非常消耗cpu，在i9，28核56线程pc上测试，硬解码总共耗费15%的cpu，它自己就占14%，也就是说硬解码本身才占用1%。
下面这个函数可以做像素格式转换，但它是纯CPU实现，只能处理已经位于系统内存中的帧（例如av_hwframe_transfer_data的输出），不能直接完成GPU到CPU的数据搬运。

int sws_scale(struct SwsContext *c, const uint8_t *const srcSlice[],
              const int srcStride[], int srcSliceY, int srcSliceH,
              uint8_t *const dst[], const int dstStride[]);

//仅将“metadata”字段从src复制到dst。

av_frame_copy_props(sw_frame, frame);

@return the buffer size in bytes, a negative error code in case of failure
int av_image_get_buffer_size(enum AVPixelFormat pix_fmt, int width, int height, int align);

根据给定的参数算出存放这样的图片需要多大的byte

int av_hwframe_transfer_get_formats(AVBufferRef *hwframe_ctx,
                                    enum AVHWFrameTransferDirection dir,
                                    enum AVPixelFormat **formats, int flags);

上面这个函数，是用来得到av_hwframe_transfer_data转换过来的帧是什么像素格式。

int av_hwframe_map(AVFrame *dst, const AVFrame *src, int flags);

上面这个函数是做映射，将YUV/NV12转RGB的步骤由CPU转换改为使用GPU转换，
使用av_hwframe_map在原位置替换av_hwframe_transfer_data即可，耗时约为原来的2/3。
下面是两个函数的对比（1080p50）：CPU为系统总空闲率（idle），GPU为GPU占用，cpu为应用本身的CPU占用。

av_hwframe_transfer_data(sw_pframe, praw_frame, 0);//1080p50 CPU:65 GPU:75 cpu:128%
av_hwframe_map(sw_pframe, praw_frame, 0);//1080p50 CPU:75 GPU:72 cpu:103%

av_hwframe_unmap

#include <stdio.h>

#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>
#include <libavutil/pixdesc.h>
#include <libavutil/hwcontext.h>
#include <libavutil/opt.h>
#include <libavutil/avassert.h>
#include <libavutil/imgutils.h>

/* Hardware device context created in hw_decoder_init(); main() drops this
 * reference with av_buffer_unref() before exiting. */
static AVBufferRef *hw_device_ctx = NULL;
/* Hardware pixel format picked in main() from the decoder's HW config;
 * compared against in get_hw_format() and decode_write(). */
static enum AVPixelFormat hw_pix_fmt;
/* Raw-frame dump file: opened in main(), written by decode_write(). */
static FILE *output_file = NULL;

/**
 * Create the hardware device context for the requested device type and
 * attach a new reference to it to the decoder context.
 *
 * The created context is kept in the file-scope hw_device_ctx; the decoder
 * context receives its own reference through av_buffer_ref().
 *
 * @param ctx  decoder context whose hw_device_ctx field is set
 * @param type hardware device type to open
 * @return the non-negative result of av_hwdevice_ctx_create() on success,
 *         a negative AVERROR code on failure
 */
static int hw_decoder_init(AVCodecContext *ctx, const enum AVHWDeviceType type)
{
    int err = av_hwdevice_ctx_create(&hw_device_ctx, type, NULL, NULL, 0);

    if (err < 0) {
        fprintf(stderr, "Failed to create specified HW device.\n");
        return err;
    }

    /* av_buffer_ref() returns NULL when it cannot allocate the new
     * reference; without this check the failure would be reported as
     * success and the decoder would be opened without a device. */
    ctx->hw_device_ctx = av_buffer_ref(hw_device_ctx);
    if (!ctx->hw_device_ctx) {
        fprintf(stderr, "Failed to reference HW device context.\n");
        av_buffer_unref(&hw_device_ctx);
        return AVERROR(ENOMEM);
    }

    return err;
}


/**
 * Format-negotiation callback installed as decoder_ctx->get_format.
 *
 * Walks the offered list and accepts only the hardware pixel format stored
 * in hw_pix_fmt, so decoding is refused when that format is not offered.
 *
 * @param ctx      decoder context (unused)
 * @param pix_fmts candidate formats, terminated by AV_PIX_FMT_NONE (-1)
 * @return hw_pix_fmt when it is offered, otherwise AV_PIX_FMT_NONE
 */
static enum AVPixelFormat get_hw_format(AVCodecContext *ctx,
                                        const enum AVPixelFormat *pix_fmts)
{
    int idx = 0;

    while (pix_fmts[idx] != AV_PIX_FMT_NONE) {
        /* Accept only the hardware format we asked for. */
        if (pix_fmts[idx] == hw_pix_fmt) {
            return pix_fmts[idx];
        }
        idx++;
    }

    fprintf(stderr, "Failed to get HW surface format.\n");
    return AV_PIX_FMT_NONE;
}

/**
 * Send one packet to the decoder and dump every frame it yields to
 * output_file as tightly packed raw image data.
 *
 * Frames whose format equals hw_pix_fmt are first copied into a software
 * frame with av_hwframe_transfer_data(); other frames are written as they
 * are.
 *
 * @param avctx  opened decoder context
 * @param packet packet to decode; main() passes NULL to flush the decoder
 * @return 0 once the decoder needs more input or reports end of stream,
 *         a negative AVERROR code on failure
 */
static int decode_write(AVCodecContext *avctx, AVPacket *packet)
{
    AVFrame *frame = NULL, *sw_frame = NULL;
    AVFrame *tmp_frame = NULL;
    uint8_t *buffer = NULL;
    int size;
    int ret = 0;

    ret = avcodec_send_packet(avctx, packet);
    if (ret < 0) {
        fprintf(stderr, "Error during decoding\n");
        return ret;
    }

    while (1) {
        if (!(frame = av_frame_alloc()) || !(sw_frame = av_frame_alloc())) {
            fprintf(stderr, "Can not alloc frame\n");
            ret = AVERROR(ENOMEM);
            goto fail;
        }

        ret = avcodec_receive_frame(avctx, frame);
        if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
            /* Not an error: no more frames are available for now. */
            av_frame_free(&frame);
            av_frame_free(&sw_frame);
            return 0;
        } else if (ret < 0) {
            fprintf(stderr, "Error while decoding\n");
            goto fail;
        }

        if (frame->format == hw_pix_fmt) {
            /* retrieve data from GPU to CPU */
            if ((ret = av_hwframe_transfer_data(sw_frame, frame, 0)) < 0) {
                fprintf(stderr, "Error transferring the data to system memory\n");
                goto fail;
            }
            tmp_frame = sw_frame;
        } else {
            tmp_frame = frame;
        }

        size = av_image_get_buffer_size(tmp_frame->format, tmp_frame->width,
                                        tmp_frame->height, 1);
        if (size < 0) {
            /* A negative size is an error code; do not hand it to av_malloc(). */
            fprintf(stderr, "Can not compute image buffer size\n");
            ret = size;
            goto fail;
        }

        buffer = av_malloc(size);
        if (!buffer) {
            fprintf(stderr, "Can not alloc buffer\n");
            ret = AVERROR(ENOMEM);
            goto fail;
        }

        ret = av_image_copy_to_buffer(buffer, size,
                                      (const uint8_t * const *)tmp_frame->data,
                                      (const int *)tmp_frame->linesize, tmp_frame->format,
                                      tmp_frame->width, tmp_frame->height, 1);
        if (ret < 0) {
            fprintf(stderr, "Can not copy image to buffer\n");
            goto fail;
        }

        /* fwrite() returns the number of items written and never a negative
         * value, so a failure shows up as a short count. */
        if (fwrite(buffer, 1, (size_t)size, output_file) != (size_t)size) {
            fprintf(stderr, "Failed to dump raw data.\n");
            ret = AVERROR(EIO);
            goto fail;
        }

    fail:
        /* Shared per-iteration cleanup; also reached on success, in which
         * case ret is non-negative and the loop continues. */
        av_frame_free(&frame);
        av_frame_free(&sw_frame);
        av_freep(&buffer);
        if (ret < 0)
            return ret;
    }
}

/**
 * Entry point: hw_decode <device type> <input file> <output file>.
 *
 * Opens the input, selects the best video stream, configures its decoder
 * for the requested hardware device type, then decodes every video packet
 * and dumps the raw frames to the output file.
 *
 * @return 0 after the decode stage has run (as before, decode errors during
 *         that stage are not reflected in the exit status), -1 or a negative
 *         AVERROR code when setup fails
 */
int main(int argc, char *argv[])
{
    AVFormatContext *input_ctx = NULL;
    int video_stream, ret;
    AVStream *video = NULL;
    AVCodecContext *decoder_ctx = NULL;
    const AVCodec *decoder = NULL;
    AVPacket *packet = NULL;
    enum AVHWDeviceType type;
    int exit_code = -1;
    int i;

    if (argc < 4) {
        fprintf(stderr, "Usage: %s <device type> <input file> <output file>\n", argv[0]);
        return -1;
    }

    type = av_hwdevice_find_type_by_name(argv[1]);
    if (type == AV_HWDEVICE_TYPE_NONE) {
        fprintf(stderr, "Device type %s is not supported.\n", argv[1]);
        fprintf(stderr, "Available device types:");
        while ((type = av_hwdevice_iterate_types(type)) != AV_HWDEVICE_TYPE_NONE)
            fprintf(stderr, " %s", av_hwdevice_get_type_name(type));
        fprintf(stderr, "\n");
        return -1;
    }

    packet = av_packet_alloc();
    if (!packet) {
        fprintf(stderr, "Failed to allocate AVPacket\n");
        return -1;
    }

    /* From here on every failure goes through the cleanup label so that
     * the packet, contexts and output file are released on all paths. */

    /* open the input file */
    if (avformat_open_input(&input_ctx, argv[2], NULL, NULL) != 0) {
        fprintf(stderr, "Cannot open input file '%s'\n", argv[2]);
        goto cleanup;
    }

    if (avformat_find_stream_info(input_ctx, NULL) < 0) {
        fprintf(stderr, "Cannot find input stream information.\n");
        goto cleanup;
    }

    /* find the video stream information */
    ret = av_find_best_stream(input_ctx, AVMEDIA_TYPE_VIDEO, -1, -1, &decoder, 0);
    if (ret < 0) {
        fprintf(stderr, "Cannot find a video stream in the input file\n");
        goto cleanup;
    }
    video_stream = ret;

    /* Why a loop? One decoder (e.g. H.264) can be backed by several hardware
     * decoding methods (NVIDIA, Intel, and Intel itself offers both qsv and
     * vaapi). Each method is described by one AVCodecHWConfig, so they are
     * enumerated until the one matching the requested type is found. */
    for (i = 0;; i++) {
        const AVCodecHWConfig *config = avcodec_get_hw_config(decoder, i);
        if (!config) {
            fprintf(stderr, "Decoder %s does not support device type %s.\n",
                    decoder->name, av_hwdevice_get_type_name(type));
            goto cleanup;
        }
        if (config->methods & AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX &&
            config->device_type == type) {
            /* e.g. for qsv, type is AV_HWDEVICE_TYPE_QSV here */
            hw_pix_fmt = config->pix_fmt; /* pixel format of this HW config */
            break;
        }
    }

    if (!(decoder_ctx = avcodec_alloc_context3(decoder))) {
        exit_code = AVERROR(ENOMEM);
        goto cleanup;
    }

    video = input_ctx->streams[video_stream];
    if (avcodec_parameters_to_context(decoder_ctx, video->codecpar) < 0)
        goto cleanup;

    decoder_ctx->get_format = get_hw_format;

    if (hw_decoder_init(decoder_ctx, type) < 0)
        goto cleanup;

    if ((ret = avcodec_open2(decoder_ctx, decoder, NULL)) < 0) {
        /* video_stream is an int, so it is printed with %d. */
        fprintf(stderr, "Failed to open codec for stream #%d\n", video_stream);
        goto cleanup;
    }

    /* open the file to dump raw data */
    output_file = fopen(argv[3], "w+b");
    if (!output_file) {
        /* decode_write() writes to output_file unconditionally. */
        fprintf(stderr, "Cannot open output file '%s'\n", argv[3]);
        goto cleanup;
    }

    /* actual decoding and dump the raw data */
    while (ret >= 0) {
        if ((ret = av_read_frame(input_ctx, packet)) < 0)
            break;

        if (video_stream == packet->stream_index)
            ret = decode_write(decoder_ctx, packet);

        av_packet_unref(packet);
    }

    /* flush the decoder */
    ret = decode_write(decoder_ctx, NULL);

    exit_code = 0;

cleanup:
    if (output_file) {
        fclose(output_file);
        output_file = NULL;
    }
    av_packet_free(&packet);
    avcodec_free_context(&decoder_ctx);
    avformat_close_input(&input_ctx);
    av_buffer_unref(&hw_device_ctx);

    return exit_code;
}

/*
 * Flags accepted by av_hwframe_map(). Each flag occupies its own bit, so
 * several of them can be combined with bitwise OR.
 */
enum {
    /* The mapping has to be readable. */
    AV_HWFRAME_MAP_READ      = 0x01,

    /* The mapping has to be writeable. */
    AV_HWFRAME_MAP_WRITE     = 0x02,

    /* Later operations will overwrite the whole mapped frame, so the
     * current frame data does not have to be loaded. Values that are not
     * overwritten are unspecified. */
    AV_HWFRAME_MAP_OVERWRITE = 0x04,

    /* The mapping has to be direct: no copying may happen in the map or
     * unmap steps. Direct mappings may perform much worse than normal
     * memory. */
    AV_HWFRAME_MAP_DIRECT    = 0x08,
};

AV_HWFRAME_MAP_READ: 允许读取映射后的帧数据。
AV_HWFRAME_MAP_WRITE: 允许写入映射后的帧数据。
AV_HWFRAME_MAP_DIRECT: 如果可能，使用直接映射，避免数据拷贝。如果不支持直接映射，则会回退到内部拷贝。
填入0是使用默认值，测试发现AV_HWFRAME_MAP_DIRECT和填0，CPU消耗没有差别，说明直接映射是默认的。
这个函数是做映射的，直接在GPU硬件上的数据无法直接使用，因此需要做映射。映射过程可能存在拷贝，但这个拷贝应该是使用dma不过cpu，cpu不做调度，只是GPU和内存硬件之间的交互，不怎么花费cpu。AV_HWFRAME_MAP_DIRECT是直接映射不拷贝，它让我们能访问GPU里的内存。
这个函数的src必须是硬件帧，dst可以是AVFrame，也可以是硬件帧。
flags: 映射标志，用于指定映射的行为。可以是以上值的按位或组合

int av_hwframe_map(AVFrame *dst, const AVFrame *src, int flags)

对以上函数测试发现：
把1080p50数据，copy到共享内存，消耗CPU 10个id
把1080p50数据，通过udp发送出去，几乎不消耗id
解码后每帧在GPU显存中存放的位置各不相同，因此每帧都需要映射

dst->data[0]:139931650500608 dst->data[0]:139931646380032
dst->data[0]:139931642701824 dst->data[0]:139931638581248

int av_hwframe_transfer_data(AVFrame *dst, const AVFrame *src, int flags);

这个函数的src/dst中至少有一个是硬件帧，它支持做简单的copy。它可以输出不同的像素格式，但前提是该格式在以下函数返回的支持列表中；它不能进行分辨率大小转换。

           // Query the pixel formats usable when transferring data out of the hardware frames.
            // The return value is a status code (negative on failure), not an element count;
            // the returned list is terminated by AV_PIX_FMT_NONE.
            int num_formats = av_hwframe_transfer_get_formats(device_ref, AV_HWFRAME_TRANSFER_DIRECTION_FROM, &formats, 0);
            if (num_formats < 0)
            {
                // Error handling: formats is not valid here and must not be read.
                av_buffer_unref(&device_ref);
                cout << "av_hwframe_transfer_get_formats" << endl;
            }
            // AV_HWFRAME_TRANSFER_DIRECTION_FROM
            cout << "9-09-00-0-0-0-0-0-0-0-0-0-0-0-0:" << num_formats << endl;
            if (num_formats >= 0)
            {
                // Print every entry up to the AV_PIX_FMT_NONE terminator; a fixed
                // iteration count could read past the end of the list.
                for (int i = 0; formats[i] != AV_PIX_FMT_NONE; i++)
                {
                    cout << "22222222222222222222222222:" << num_formats << endl;
                    const char *format_name = av_get_pix_fmt_name(formats[i]);
                    // av_get_pix_fmt_name() can return NULL; streaming a null char* is undefined.
                    cout << "Format:" << (format_name ? format_name : "unknown") << endl;
                }
                // NOTE(review): the list is allocated by FFmpeg and must be released
                // with av_free() by whoever owns `formats` once it is no longer needed.
            }

/*
 * Direction argument for av_hwframe_transfer_get_formats(): selects whether
 * the query is about data leaving or entering the queried hardware frame.
 */
enum AVHWFrameTransferDirection {
    /* Data is transferred from the queried hw frame. */
    AV_HWFRAME_TRANSFER_DIRECTION_FROM = 0,

    /* Data is transferred to the queried hw frame. */
    AV_HWFRAME_TRANSFER_DIRECTION_TO   = 1,
};

猜你喜欢