With the motherland's great rejuvenation, the march of technology, and the demands of our customers, the project finally needed hardware decoding: cameras have advanced to the point where low-resolution video no longer satisfies anyone's ever-growing appetite, so there was nothing for it but to roll up our sleeves and do it.
First, the dead ends I hit while investigating:
1. My first instinct was to lean on what I already knew, which of course meant FFmpeg. I downloaded the latest FFmpeg, found the hw_decode.c example, and hacked away at it, assuming it would be that simple. Heaven does not always reward the diligent: the finished demo topped out at about 12 concurrent 1080p streams with multiple threads, and it was wildly unstable, liable to crash at any moment (this was FFmpeg's dxva2 path). If DXVA2 is unfamiliar, it is worth a look: it is Microsoft's standard hardware-decoding API, which apparently calls down into the GPU driver underneath.
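For reference, here is roughly what that dxva2 path looks like — a minimal sketch based on FFmpeg's hw_decode.c example, assuming FFmpeg 3.4 to 4.x; the function name, stream URL, and error handling are mine, not the example's:
extern "C" {
#include "libavcodec/avcodec.h"
#include "libavformat/avformat.h"
#include "libavutil/hwcontext.h"
}
// Pick the DXVA2 hardware surface format when the decoder offers it.
static enum AVPixelFormat get_dxva2_format(AVCodecContext *ctx, const enum AVPixelFormat *fmts)
{
for (const enum AVPixelFormat *p = fmts; *p != AV_PIX_FMT_NONE; p++)
if (*p == AV_PIX_FMT_DXVA2_VLD)
return *p;
return AV_PIX_FMT_NONE;
}
bool decode_dxva2(const char *url)
{
AVFormatContext *fmt = NULL;
if (avformat_open_input(&fmt, url, NULL, NULL) != 0) return false;
avformat_find_stream_info(fmt, NULL);
AVCodec *dec = NULL;
int vid = av_find_best_stream(fmt, AVMEDIA_TYPE_VIDEO, -1, -1, &dec, 0);
if (vid < 0) return false;
AVCodecContext *ctx = avcodec_alloc_context3(dec);
avcodec_parameters_to_context(ctx, fmt->streams[vid]->codecpar);
AVBufferRef *hwdev = NULL;
if (av_hwdevice_ctx_create(&hwdev, AV_HWDEVICE_TYPE_DXVA2, NULL, NULL, 0) < 0)
return false; // no usable DXVA2 device
ctx->get_format = get_dxva2_format;
ctx->hw_device_ctx = av_buffer_ref(hwdev);
if (avcodec_open2(ctx, dec, NULL) < 0) return false;
AVPacket pkt;
AVFrame *hw = av_frame_alloc(), *sw = av_frame_alloc();
while (av_read_frame(fmt, &pkt) >= 0) {
if (pkt.stream_index == vid && avcodec_send_packet(ctx, &pkt) == 0)
while (avcodec_receive_frame(ctx, hw) == 0) {
av_frame_unref(sw);
av_hwframe_transfer_data(sw, hw, 0); // GPU surface -> system memory (NV12)
}
av_packet_unref(&pkt);
}
av_frame_free(&hw); av_frame_free(&sw);
avcodec_free_context(&ctx);
av_buffer_unref(&hwdev);
avformat_close_input(&fmt);
return true;
}
The decoded frames live in GPU memory (AV_PIX_FMT_DXVA2_VLD); av_hwframe_transfer_data() is the readback to system memory, which is where much of the cost shows up.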
2. While I was stuck, an expert in a chat group posted a demo built on the Intel Media SDK, which of course I envied. He generously shares a blog covering the Intel Media SDK and building FFmpeg with QSV for hardware decoding (https://blog.csdn.net/ww506772362/article/details/49865403); much respect. If you don't need many concurrent streams, FFmpeg built with QSV on an ordinary machine with an Intel chip can probably solve the problem. His blog explains how to build FFmpeg with QSV hardware decoding; I tried it, and since this project needs the decoded data back on the CPU, I verified that QSV handles 16 concurrent 1080p streams with the frames copied back to system memory.
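The QSV route boils down to selecting the decoder by name once FFmpeg has been configured with --enable-libmfx; a minimal sketch (the function name is mine, and the fallback behavior is an assumption about how you'd want to degrade):
extern "C" {
#include "libavcodec/avcodec.h"
}
// Returns an opened QSV H.264 decoder when the build has --enable-libmfx,
// otherwise falls back to the plain software decoder.
AVCodecContext *open_h264_decoder_prefer_qsv()
{
AVCodec *dec = avcodec_find_decoder_by_name("h264_qsv");
if (!dec)
dec = avcodec_find_decoder(AV_CODEC_ID_H264); // software fallback
AVCodecContext *ctx = avcodec_alloc_context3(dec);
if (avcodec_open2(ctx, dec, NULL) < 0) {
avcodec_free_context(&ctx);
return NULL;
}
// With no hw_frames_ctx attached, h264_qsv hands back NV12 frames in
// system memory, which is why the CPU readback "just works" in my tests.
return ctx;
}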
3. NVIDIA hardware decoding. This was the painful part: hunting for demos, reading the code, getting familiar with the API — Read the Fucking Code, as they say. The code follows; I won't paste NVIDIA's stock sample, only my modifications plus my own wrapper, starting with the VideoSource header:
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef NV_VIDEO_SOURCE
#define NV_VIDEO_SOURCE
#include "dynlink_nvcuvid.h" // <nvcuvid.h>
#include <string>
extern "C"
{
#include "libavutil/avstring.h"
#include "libavutil/mathematics.h"
#include "libavutil/pixdesc.h"
#include "libavutil/imgutils.h"
#include "libavutil/dict.h"
#include "libavutil/parseutils.h"
#include "libavutil/samplefmt.h"
#include "libavutil/avassert.h"
#include "libavutil/time.h"
#include "libavformat/avformat.h"
#include "libavdevice/avdevice.h"
#include "libswscale/swscale.h"
#include "libavutil/opt.h"
#include "libavcodec/avfft.h"
#include "libswresample/swresample.h"
#include "libavfilter/avfiltergraph.h"
#include "libavfilter/buffersink.h"
#include "libavfilter/buffersrc.h"
#include "libavutil/avutil.h"
}
#pragma comment(lib, "avcodec.lib")
#pragma comment(lib, "avformat.lib")
#pragma comment(lib, "avutil.lib")
#pragma comment(lib, "avfilter.lib")
#pragma comment(lib, "avdevice.lib")
#pragma comment(lib, "postproc.lib")
#pragma comment(lib, "swresample.lib")
#pragma comment(lib, "swscale.lib")
#include "thread.hpp"
typedef struct
{
int codecs;
const char *name;
} _sVideoFormats;
static _sVideoFormats eVideoFormats[] =
{
{ cudaVideoCodec_MPEG1, "MPEG-1" },
{ cudaVideoCodec_MPEG2, "MPEG-2" },
{ cudaVideoCodec_MPEG4, "MPEG-4 (ASP)" },
{ cudaVideoCodec_VC1, "VC-1/WMV" },
{ cudaVideoCodec_H264, "AVC/H.264" },
{ cudaVideoCodec_JPEG, "M-JPEG" },
{ cudaVideoCodec_H264_SVC, "H.264/SVC" },
{ cudaVideoCodec_H264_MVC, "H.264/MVC" },
{ cudaVideoCodec_HEVC, "H.265/HEVC" },
{ cudaVideoCodec_VP8, "VP8" },
{ cudaVideoCodec_VP9, "VP9" },
{ cudaVideoCodec_NumCodecs,"Invalid" },
{ cudaVideoCodec_YUV420, "YUV 4:2:0" },
{ cudaVideoCodec_YV12, "YV12 4:2:0" },
{ cudaVideoCodec_NV12, "NV12 4:2:0" },
{ cudaVideoCodec_YUYV, "YUYV 4:2:2" },
{ cudaVideoCodec_UYVY, "UYVY 4:2:2" },
{ -1 , "Unknown" },
};
// forward declarations
class FrameQueue;
class VideoParser;
// A wrapper class around the CUvideosource entity and API.
// The CUvideosource manages video-source streams (currently
// by opening a file containing the stream). After successfully
// opening a video stream, one can query its properties, such as
// video and audio compression format, frame rate, etc.
//
// The video-source spawns its own thread for processing the stream.
// The user can register callback methods for handling chunks of demuxed
// audio and video data.
class VideoSource
{
public:
VideoSource();
~VideoSource();
CUVIDEOFORMAT format();
void setParser(VideoParser &rVideoParser, CUcontext cuCtx);
// Begin processing the video stream.
void start();
// End processing the video stream.
void stop();
// Has video processing been started?
bool isStarted();
// Retrieve source dimensions (width, height) from the video
void getSourceDimensions(unsigned int &width, unsigned int &height);
// Retrieve display dimensions (width, height) for the video
void getDisplayDimensions(unsigned int &width, unsigned int &height);
void getProgressive(bool &progressive);
bool init(const std::string sFileName, FrameQueue *pFrameQueue);
private:
// This struct contains the data we need inside the source's
// video callback in order to process the video data.
struct VideoSourceData
{
CUvideoparser hVideoParser;
FrameQueue *pFrameQueue;
};
// Assignment operator. Don't implement.
void operator= (const VideoSource &);
VideoSourceData oSourceData_; // Instance of the user-data struct we use in the video-data handle callback.
CUVIDEOFORMAT g_stFormat;
void play_thread(LPVOID lpParam);
Common::ThreadPtr play_thread_ptr;
BOOL bThreadExit;
CUcontext g_oContext ;
bool bStarted;
AVDictionary* m_formatOpts;
AVCodecContext * m_pCodecCtx;
AVFormatContext* m_pFormatCtx;
int videoindex;
AVBitStreamFilterContext* h264bsfc = NULL;
};
std::ostream & operator << (std::ostream &rOutputStream, const CUVIDEOFORMAT &rCudaVideoFormat);
#endif // NV_VIDEO_SOURCE
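The matching implementation, VideoSource.cpp: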
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include "VideoSource.h"
#include "FrameQueue.h"
#include "VideoParser.h"
#include <assert.h>
#include "helper_cuda_drvapi.h"
static bool g_initVideoSource = false;
VideoSource::VideoSource()
{
bThreadExit = false;
bStarted = false;
play_thread_ptr = NULL;
m_formatOpts = NULL;
m_pCodecCtx = NULL; // the destructor tests these, so they must start out NULL
m_pFormatCtx = NULL;
videoindex = -1;
if (!g_initVideoSource)
{
av_register_all();
avformat_network_init();
g_initVideoSource = true;
}
}
VideoSource::~VideoSource()
{
stop();
if (h264bsfc)
{
av_bitstream_filter_close(h264bsfc); // release the annex-b bitstream filter
h264bsfc = NULL;
}
if (m_pCodecCtx)
{
avcodec_close(m_pCodecCtx);
m_pCodecCtx = NULL;
}
if (m_pFormatCtx)
{
avformat_close_input(&m_pFormatCtx);
m_pFormatCtx = NULL;
}
}
bool VideoSource::init(const std::string sFileName, FrameQueue *pFrameQueue)
{
assert(0 != pFrameQueue);
oSourceData_.hVideoParser = 0;
oSourceData_.pFrameQueue = pFrameQueue;
int i;
AVCodec *pCodec;
m_pFormatCtx = avformat_alloc_context();
AVDictionary *pOptions = NULL;
//av_dict_set(&pOptions, "probesize", "4096", 0);
av_dict_set(&pOptions, "max_delay", "1000", 0); // cap the demuxer delay (the value is in microseconds)
// carry RTSP over TCP instead of UDP to avoid packet loss
av_dict_set(&pOptions, "rtsp_transport", "tcp", 0);
if (avformat_open_input(&m_pFormatCtx, sFileName.c_str(), NULL, &pOptions) != 0){
printf("Couldn't open input stream.\n");
av_dict_free(&pOptions);
return false;
}
av_dict_free(&pOptions); // release any options the demuxer did not consume
if (avformat_find_stream_info(m_pFormatCtx, NULL)<0){
printf("Couldn't find stream information.\n");
return false;
}
videoindex = -1;
for (i = 0; i<m_pFormatCtx->nb_streams; i++)
if (m_pFormatCtx->streams[i]->codec->codec_type == AVMEDIA_TYPE_VIDEO){
videoindex = i;
break;
}
if (videoindex == -1){
printf("Didn't find a video stream.\n");
return false;
}
m_pCodecCtx = m_pFormatCtx->streams[videoindex]->codec;
pCodec = avcodec_find_decoder(m_pCodecCtx->codec_id);
if (pCodec == NULL){
printf("Codec not found.\n");
return false;
}
//Output Info-----------------------------
printf("--------------- File Information ----------------\n");
av_dump_format(m_pFormatCtx, 0, sFileName.c_str(), 0);
printf("-------------------------------------------------\n");
memset(&g_stFormat, 0, sizeof(CUVIDEOFORMAT));
switch (m_pCodecCtx->codec_id) {
case AV_CODEC_ID_H263:
g_stFormat.codec = cudaVideoCodec_MPEG4;
break;
case AV_CODEC_ID_H264:
g_stFormat.codec = cudaVideoCodec_H264;
break;
case AV_CODEC_ID_HEVC:
g_stFormat.codec = cudaVideoCodec_HEVC;
break;
case AV_CODEC_ID_MJPEG:
g_stFormat.codec = cudaVideoCodec_JPEG;
break;
case AV_CODEC_ID_MPEG1VIDEO:
g_stFormat.codec = cudaVideoCodec_MPEG1;
break;
case AV_CODEC_ID_MPEG2VIDEO:
g_stFormat.codec = cudaVideoCodec_MPEG2;
break;
case AV_CODEC_ID_MPEG4:
g_stFormat.codec = cudaVideoCodec_MPEG4;
break;
case AV_CODEC_ID_VP8:
g_stFormat.codec = cudaVideoCodec_VP8;
break;
case AV_CODEC_ID_VP9:
g_stFormat.codec = cudaVideoCodec_VP9;
break;
case AV_CODEC_ID_VC1:
g_stFormat.codec = cudaVideoCodec_VC1;
break;
default:
return false;
}
//The FFmpeg-to-cuvid chroma mapping here is not entirely certain, but sw_pix_fmt seems to be the most reliable field to key off
switch (m_pCodecCtx->sw_pix_fmt)
{
case AV_PIX_FMT_YUV420P:
g_stFormat.chroma_format = cudaVideoChromaFormat_420;
break;
case AV_PIX_FMT_YUV422P:
g_stFormat.chroma_format = cudaVideoChromaFormat_422;
break;
case AV_PIX_FMT_YUV444P:
g_stFormat.chroma_format = cudaVideoChromaFormat_444;
break;
default:
g_stFormat.chroma_format = cudaVideoChromaFormat_420;
break;
}
//It took a long time to find the FFmpeg field that distinguishes field (interlaced) from frame (progressive) coding
//Interlaced content needs deinterlacing
switch (m_pCodecCtx->field_order)
{
case AV_FIELD_PROGRESSIVE:
case AV_FIELD_UNKNOWN:
g_stFormat.progressive_sequence = true;
break;
default:
g_stFormat.progressive_sequence = false;
break;
}
m_pCodecCtx->thread_safe_callbacks = 1;
g_stFormat.coded_width = m_pCodecCtx->coded_width;
g_stFormat.coded_height = m_pCodecCtx->coded_height;
g_stFormat.display_area.right = m_pCodecCtx->width;
g_stFormat.display_area.left = 0;
g_stFormat.display_area.bottom = m_pCodecCtx->height;
g_stFormat.display_area.top = 0;
if (m_pCodecCtx->codec_id == AV_CODEC_ID_H264 || m_pCodecCtx->codec_id == AV_CODEC_ID_HEVC) {
if (m_pCodecCtx->codec_id == AV_CODEC_ID_H264)
h264bsfc = av_bitstream_filter_init("h264_mp4toannexb");
else
h264bsfc = av_bitstream_filter_init("hevc_mp4toannexb");
}
return true;
}
CUVIDEOFORMAT VideoSource::format()
{
return g_stFormat;
}
void VideoSource::getSourceDimensions(unsigned int &width, unsigned int &height)
{
CUVIDEOFORMAT rCudaVideoFormat= format();
width = rCudaVideoFormat.coded_width;
height = rCudaVideoFormat.coded_height;
}
void VideoSource::getDisplayDimensions(unsigned int &width, unsigned int &height)
{
CUVIDEOFORMAT rCudaVideoFormat= format();
width = abs(rCudaVideoFormat.display_area.right - rCudaVideoFormat.display_area.left);
height = abs(rCudaVideoFormat.display_area.bottom - rCudaVideoFormat.display_area.top);
}
void VideoSource::getProgressive(bool &progressive)
{
CUVIDEOFORMAT rCudaVideoFormat = format();
progressive = rCudaVideoFormat.progressive_sequence ? true : false;
}
void VideoSource::setParser(VideoParser &rVideoParser, CUcontext cuCtx)
{
oSourceData_.hVideoParser = rVideoParser.hParser_;
g_oContext = cuCtx;
}
void VideoSource::start()
{
bThreadExit = TRUE;
if (play_thread_ptr)
{
play_thread_ptr->join();
play_thread_ptr = NULL;
}
LPVOID arg_ = NULL;
Common::ThreadCallback cb = BIND_MEM_CB(&VideoSource::play_thread, this);
play_thread_ptr = new Common::CThread(cb, TRUE);
if (!play_thread_ptr)
{
return ;
}
bThreadExit = FALSE;
play_thread_ptr->start(arg_);
}
void VideoSource::stop()
{
bThreadExit = TRUE;
if (play_thread_ptr)
{
play_thread_ptr->join();
play_thread_ptr = NULL;
}
}
bool VideoSource::isStarted()
{
return bStarted;
}
void VideoSource::play_thread(LPVOID lpParam)
{
AVPacket *avpkt;
avpkt = (AVPacket *)av_malloc(sizeof(AVPacket));
CUVIDSOURCEDATAPACKET cupkt;
int iPkt = 0;
CUresult oResult;
while (av_read_frame(m_pFormatCtx, avpkt) >= 0){
if (bThreadExit){
av_free_packet(avpkt);
break;
}
bStarted = true;
if (avpkt->stream_index == videoindex){
AVPacket new_pkt = *avpkt;
int filtered = 0;
cuCtxPushCurrent(g_oContext);
if (avpkt->size) {
if (h264bsfc)
{
// a positive return value means the filter allocated a new
// annex-b buffer that we own and must av_free() below
filtered = av_bitstream_filter_filter(h264bsfc, m_pFormatCtx->streams[videoindex]->codec, NULL,
&new_pkt.data, &new_pkt.size, avpkt->data, avpkt->size, avpkt->flags & AV_PKT_FLAG_KEY);
}
cupkt.flags = 0;
cupkt.payload_size = (unsigned long)new_pkt.size;
cupkt.payload = (const unsigned char*)new_pkt.data;
if (avpkt->pts != AV_NOPTS_VALUE) {
cupkt.flags |= CUVID_PKT_TIMESTAMP;
if (m_pCodecCtx->pkt_timebase.num && m_pCodecCtx->pkt_timebase.den){
AVRational tb;
tb.num = 1;
tb.den = AV_TIME_BASE;
cupkt.timestamp = av_rescale_q(avpkt->pts, m_pCodecCtx->pkt_timebase, tb);
}
else
cupkt.timestamp = avpkt->pts;
}
}
else {
cupkt.flags = CUVID_PKT_ENDOFSTREAM;
}
oResult = cuvidParseVideoData(oSourceData_.hVideoParser, &cupkt);
checkCudaErrors(cuCtxPopCurrent(NULL)); // pop before any early exit so the context never leaks
if (filtered > 0)
av_free(new_pkt.data); // buffer allocated by the bitstream filter
if ((cupkt.flags & CUVID_PKT_ENDOFSTREAM) || (oResult != CUDA_SUCCESS)){
av_free_packet(avpkt);
break;
}
iPkt++;
//printf("Succeed to read avpkt %d !\n", iPkt);
}
av_free_packet(avpkt); // release the demuxer's packet every iteration
Sleep(40); // crude pacing, roughly 25 fps
}
av_free(avpkt); // release the AVPacket struct itself
oSourceData_.pFrameQueue->endDecode();
bStarted = false;
}
std::ostream & operator << (std::ostream &rOutputStream, const CUVIDEOFORMAT &rCudaVideoFormat)
{
rOutputStream << "\tVideoCodec : ";
if ((rCudaVideoFormat.codec <= cudaVideoCodec_UYVY) &&
(rCudaVideoFormat.codec >= cudaVideoCodec_MPEG1) &&
(rCudaVideoFormat.codec != cudaVideoCodec_NumCodecs))
{
rOutputStream << eVideoFormats[rCudaVideoFormat.codec].name << "\n";
}
else
{
rOutputStream << "unknown\n";
}
rOutputStream << "\tFrame rate : " << rCudaVideoFormat.frame_rate.numerator << "/" << rCudaVideoFormat.frame_rate.denominator;
rOutputStream << "fps ~ " << rCudaVideoFormat.frame_rate.numerator/static_cast<float>(rCudaVideoFormat.frame_rate.denominator) << "fps\n";
rOutputStream << "\tSequence format : ";
if (rCudaVideoFormat.progressive_sequence)
rOutputStream << "Progressive\n";
else
rOutputStream << "Interlaced\n";
rOutputStream << "\tCoded frame size: [" << rCudaVideoFormat.coded_width << ", " << rCudaVideoFormat.coded_height << "]\n";
rOutputStream << "\tDisplay area : [" << rCudaVideoFormat.display_area.left << ", " << rCudaVideoFormat.display_area.top;
rOutputStream << ", " << rCudaVideoFormat.display_area.right << ", " << rCudaVideoFormat.display_area.bottom << "]\n";
rOutputStream << "\tChroma format : ";
switch (rCudaVideoFormat.chroma_format)
{
case cudaVideoChromaFormat_Monochrome:
rOutputStream << "Monochrome\n";
break;
case cudaVideoChromaFormat_420:
rOutputStream << "4:2:0\n";
break;
case cudaVideoChromaFormat_422:
rOutputStream << "4:2:2\n";
break;
case cudaVideoChromaFormat_444:
rOutputStream << "4:4:4\n";
break;
default:
rOutputStream << "unknown\n";
}
rOutputStream << "\tBitrate : ";
if (rCudaVideoFormat.bitrate == 0)
rOutputStream << "unknown\n";
else
rOutputStream << rCudaVideoFormat.bitrate/1024 << "kBit/s\n";
rOutputStream << "\tAspect ratio : " << rCudaVideoFormat.display_aspect_ratio.x << ":" << rCudaVideoFormat.display_aspect_ratio.y << "\n";
return rOutputStream;
}
Pay special attention to the memory-leak fixes: the packet and bitstream-filter buffer ownership in play_thread is where the original code leaked.
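To spell out the ownership rule behind those fixes in isolation — a minimal sketch of the old (pre-FFmpeg-4.0) bitstream-filter API used in play_thread above; `stream` and `feed_to_parser` are hypothetical stand-ins for the surrounding code:
// av_bitstream_filter_filter() has three outcomes that matter for memory:
//   ret > 0 : the filter allocated a fresh annex-b buffer; new_pkt.data is
//             ours and must be released with av_free() after use
//   ret == 0: new_pkt.data still aliases avpkt->data; free nothing extra
//   ret < 0 : filtering failed; fall back to the original packet
AVPacket new_pkt = *avpkt;
int ret = av_bitstream_filter_filter(h264bsfc, stream->codec, NULL,
&new_pkt.data, &new_pkt.size,
avpkt->data, avpkt->size,
avpkt->flags & AV_PKT_FLAG_KEY);
feed_to_parser(new_pkt.data, new_pkt.size); // hypothetical consumer
if (ret > 0)
av_free(new_pkt.data); // only the filter-allocated buffer
av_free_packet(avpkt); // the demuxer's packet, always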
The wrapper around the hardware decoder, NvDecode.h:
#pragma once
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <windowsx.h>
#pragma comment(lib, "version.lib")
#endif
// CUDA Header includes
#include "dynlink_nvcuvid.h" // <nvcuvid.h>
#include "dynlink_cuda.h" // <cuda.h>
#include "dynlink_cudaD3D9.h" // <cudaD3D9.h>
#include "dynlink_builtin_types.h" // <builtin_types.h>
// CUDA utilities and system includes
#include "helper_functions.h"
#include "helper_cuda_drvapi.h"
// cudaDecodeD3D9 related helper functions
#include "FrameQueue.h"
#include "VideoSource.h"
#include "VideoParser.h"
#include "VideoDecoder.h"
#define MAX_YUV_PANAR 6
#define WRITE_YUV_FILE
class NvDecode
{
public:
NvDecode();
~NvDecode();
public:
static bool NV_Init();
static bool NV_InitResources(CUdevice deviceID);
static void NV_FreeCudaResources();
bool NV_OpenStream(const char *video_file);
bool NV_CloseStream();
bool NV_CopyDecodedFrameToTexture();
//reserved for future use
bool NV_CreateDecoder(int width, int height, int coded_width, int coded_height, cudaVideoCodec cuda_codec);
//留着以后使用
bool NV_DecodeFrame(BYTE*pbInput,int iInputLen,BYTE*& pbOutPut,int& iOutPutLen);
public:
static CUcontext g_oContext;
static CUdevice g_oDevice;
protected:
// Begin processing the video stream.
void start();
// End processing the video stream.
void stop();
void NV_InitCudaVideo();
void NV_UnitCudaVideo();
bool NV_LoadVideoSource(const char *video_file, unsigned int &width, unsigned int &height, unsigned int &dispWidth, unsigned int &dispHeight);
void NV_ProcessPayload(CUvideopacketflags cuvidPktFlags, const uint8_t* in_payload, uint64_t payloadSize, CUvideotimestamp in_timestamp);
bool NV_GetDecodeFrame(uint8_t* start, uint32_t length);
//reserved for future use
//bool NV_CopyDecodedFrameToTexture(BYTE*& pbOutPut,int& iOutPutLen/*unsigned int &nRepeats, int bUseInterop, int *pbIsProgressive*/);
private:
//CUcontext m_oContext;
//CUdevice m_oDevice;
CUvideoctxlock m_CtxLock;
CUVIDEOFORMAT m_stFormat;
// System Memory surface we want to readback to
BYTE *m_pFrameYUV[MAX_YUV_PANAR];
FrameQueue *m_pFrameQueue;
VideoSource *m_pVideoSource;
VideoParser *m_pVideoParser;
VideoDecoder *m_pVideoDecoder;
cudaVideoCreateFlags m_eVideoCreateFlags;
CUstream m_ReadbackSID;
CUstream m_KernelSID;
bool m_bFirstFrame;
unsigned int m_DecodeFrameCount;
void decode_thread(LPVOID lpParam);
Common::ThreadPtr play_thread_ptr;
BOOL bThreadExit;
#ifdef WRITE_YUV_FILE
FILE* fp;
#endif
};
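And the implementation, NvDecode.cpp: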
#include "NvDecode.h"
CUdevice NvDecode::g_oDevice = 0;
CUcontext NvDecode::g_oContext = NULL;
void SaveFrameAsYUV(FILE* fpWriteYUV,unsigned char *pdst,
const unsigned char *psrc,
int width, int height, int pitch)
{
int x, y, width_2, height_2;
int xy_offset = width*height;
int uvoffs = (width / 2)*(height / 2);
const unsigned char *py = psrc;
const unsigned char *puv = psrc + height*pitch;
// luma
for (y = 0; y < height; y++)
{
memcpy(&pdst[y*width], py, width);
py += pitch;
}
// De-interleave chroma
width_2 = width >> 1;
height_2 = height >> 1;
for (y = 0; y < height_2; y++)
{
for (x = 0; x < width_2; x++)
{
pdst[xy_offset + y*(width_2)+x] = puv[x * 2];
pdst[xy_offset + uvoffs + y*(width_2)+x] = puv[x * 2 + 1];
}
puv += pitch;
}
fwrite(pdst, 1, width*height + (width*height) / 2, fpWriteYUV);
}
NvDecode::NvDecode()
{
m_CtxLock = NULL;
// System Memory surface we want to readback to
//g_pFrameYUV[6] = { 0, 0, 0, 0, 0, 0 };
bThreadExit = false;
//bStarted = false;
play_thread_ptr = NULL;
m_pFrameQueue = NULL;
m_pVideoSource = NULL;
m_pVideoParser = NULL;
m_pVideoDecoder = NULL;
m_bFirstFrame = true;
m_ReadbackSID = 0;
m_KernelSID = 0;
m_DecodeFrameCount = 0;
m_eVideoCreateFlags = cudaVideoCreate_Default;
memset(&m_stFormat, 0, sizeof(CUVIDEOFORMAT));
memset(&m_pFrameYUV, 0, sizeof(BYTE*)*MAX_YUV_PANAR);
#ifdef WRITE_YUV_FILE
char szTmp[32] = {};
sprintf(szTmp, "%p.yuv", (void *)this); // %p avoids pointer truncation on x64
fp = fopen(szTmp, "wb");
#endif
}
NvDecode::~NvDecode()
{
m_DecodeFrameCount = 0;
#ifdef WRITE_YUV_FILE
if (fp)
fclose(fp);
#endif
stop();
NV_UnitCudaVideo();
}
bool NvDecode::NV_Init()
{
CUDADRIVER hHandleDriver = 0;
CUresult cuResult;
cuResult = cuInit(0, __CUDA_API_VERSION, hHandleDriver);
if (cuResult != CUDA_SUCCESS)
return false;
cuResult = cuvidInit(0);
return (cuResult == CUDA_SUCCESS);
}
bool NvDecode::NV_InitResources(CUdevice deviceID)
{
CUresult cuResult;
CUdevice cuda_device;
cuda_device = gpuGetMaxGflopsDeviceIdDRV();
cuResult = cuDeviceGet(&deviceID, cuda_device);
if (cuResult != CUDA_SUCCESS)
return false;
g_oDevice = deviceID;
checkCudaErrors(cuCtxCreate(&g_oContext, CU_CTX_BLOCKING_SYNC, deviceID));
//optional: query compute capability and the device name
//int major, minor;
//size_t totalGlobalMem;
//char deviceName[256];
//checkCudaErrors(cuDeviceComputeCapability(&major, &minor, deviceID));
//checkCudaErrors(cuDeviceGetName(deviceName, 256, deviceID));
//printf("> Using GPU Device %d: %s has SM %d.%d compute capability\n", cuda_device, deviceName, major, minor);
//checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, deviceID));
//printf(" Total amount of global memory: %4.4f MB\n", (float)totalGlobalMem / (1024 * 1024));
return true;
}
void NvDecode::NV_InitCudaVideo()
{
checkCudaErrors(cuCtxPushCurrent(g_oContext));
// bind the context lock to the CUDA context
CUresult result = cuvidCtxLockCreate(&m_CtxLock, g_oContext);
CUVIDEOFORMATEX oFormatEx;
memset(&oFormatEx, 0, sizeof(CUVIDEOFORMATEX));
oFormatEx.format = m_stFormat;
if (result != CUDA_SUCCESS)
{
printf("cuvidCtxLockCreate failed: %d\n", result);
assert(0);
}
std::auto_ptr<VideoDecoder> apVideoDecoder(new VideoDecoder(m_pVideoSource->format(), g_oContext, m_eVideoCreateFlags, m_CtxLock));
std::auto_ptr<VideoParser> apVideoParser(new VideoParser(apVideoDecoder.get(), m_pFrameQueue, &oFormatEx));
m_pVideoSource->setParser(*apVideoParser.get(), g_oContext);
m_pVideoParser = apVideoParser.release();
m_pVideoDecoder = apVideoDecoder.release();
// Create a Stream ID for handling Readback
checkCudaErrors(cuStreamCreate(&m_ReadbackSID, 0));
checkCudaErrors(cuStreamCreate(&m_KernelSID, 0));
printf("> initCudaVideo()\n");
printf(" CUDA Streams (%s) <g_ReadbackSID = %p>\n", ((m_ReadbackSID == 0) ? "Disabled" : "Enabled"), m_ReadbackSID);
printf(" CUDA Streams (%s) <g_KernelSID = %p>\n", ((m_KernelSID == 0) ? "Disabled" : "Enabled"), m_KernelSID);
CUcontext cuCurrent = NULL;
checkCudaErrors(cuCtxPopCurrent(&cuCurrent));
}
bool NvDecode::NV_LoadVideoSource(const char *video_file, unsigned int &width, unsigned int &height, unsigned int &dispWidth, unsigned int &dispHeight)
{
std::auto_ptr<FrameQueue> apFrameQueue(new FrameQueue);
std::auto_ptr<VideoSource> apVideoSource(new VideoSource());
// retrieve the video source (width,height)
if (!apVideoSource->init(video_file, apFrameQueue.get())){
return false;
};
apVideoSource->getSourceDimensions(width, height);
apVideoSource->getDisplayDimensions(dispWidth, dispHeight);
memset(&m_stFormat, 0, sizeof(CUVIDEOFORMAT));
std::cout << (m_stFormat = apVideoSource->format()) << std::endl;
m_pFrameQueue = apFrameQueue.release();
m_pVideoSource = apVideoSource.release();
if (m_pVideoSource->format().codec == cudaVideoCodec_JPEG)
{
m_eVideoCreateFlags = cudaVideoCreate_PreferCUDA;
}
return true;
}
void NvDecode::NV_FreeCudaResources()
{
//GPU_UnitCudaVideo();
if (g_oContext)
{
checkCudaErrors(cuCtxDestroy(g_oContext));
g_oContext = NULL;
}
}
void NvDecode::NV_UnitCudaVideo()
{
if (m_pVideoParser)
{
delete m_pVideoParser;
m_pVideoParser = NULL;
}
if (m_pVideoDecoder)
{
delete m_pVideoDecoder;
m_pVideoDecoder = NULL;
}
if (m_pVideoSource)
{
delete m_pVideoSource;
m_pVideoSource = NULL;
}
if (m_pFrameQueue)
{
delete m_pFrameQueue;
m_pFrameQueue = NULL;
}
if (m_ReadbackSID)
{
cuStreamDestroy(m_ReadbackSID);
m_ReadbackSID = 0;
}
if (m_KernelSID)
{
cuStreamDestroy(m_KernelSID);
m_KernelSID = 0;
}
if (m_CtxLock)
{
checkCudaErrors(cuvidCtxLockDestroy(m_CtxLock));
m_CtxLock = NULL;
}
if (m_pFrameYUV[0])
{
for (int i = 0; i < MAX_YUV_PANAR; i++)
{
cuMemFreeHost((void *)m_pFrameYUV[i]);
m_pFrameYUV[i] = NULL;
}
}
}
// Run the Cuda part of the computation
//reserved for future use
//bool NvDecode::NV_CopyDecodedFrameToTexture(BYTE*& pbOutPut, int& iOutPutLen/*unsigned int &nRepeats, int bUseInterop, int *pbIsProgressive*/)
bool NvDecode::NV_CopyDecodedFrameToTexture()
{
CUVIDPARSERDISPINFO oDisplayInfo;
if (m_pFrameQueue->dequeue(&oDisplayInfo))
{
CCtxAutoLock lck(m_CtxLock);
// Push the current CUDA context (only if we are using CUDA decoding path)
CUresult result = cuCtxPushCurrent(g_oContext);
CUdeviceptr pDecodedFrame[3] = { 0, 0, 0 };
CUdeviceptr pInteropFrame[3] = { 0, 0, 0 };
//*pbIsProgressive = oDisplayInfo.progressive_frame;
int num_fields = 1;
//nRepeats = num_fields;
CUVIDPROCPARAMS oVideoProcessingParameters;
memset(&oVideoProcessingParameters, 0, sizeof(CUVIDPROCPARAMS));
oVideoProcessingParameters.progressive_frame = oDisplayInfo.progressive_frame;
oVideoProcessingParameters.top_field_first = oDisplayInfo.top_field_first;
oVideoProcessingParameters.unpaired_field = (oDisplayInfo.repeat_first_field < 0);
for (int active_field = 0; active_field < num_fields; active_field++)
{
unsigned int nDecodedPitch = 0;
unsigned int nWidth = 0;
unsigned int nHeight = 0;
oVideoProcessingParameters.second_field = active_field;
// map decoded video frame to CUDA surface
if (m_pVideoDecoder->mapFrame(oDisplayInfo.picture_index, &pDecodedFrame[active_field], &nDecodedPitch, &oVideoProcessingParameters) != CUDA_SUCCESS)
{
// release the frame, so it can be re-used in decoder
m_pFrameQueue->releaseFrame(&oDisplayInfo);
// Detach from the Current thread
checkCudaErrors(cuCtxPopCurrent(NULL));
return false;
}
nWidth = m_pVideoDecoder->targetWidth();
nHeight = m_pVideoDecoder->targetHeight();
// map DirectX texture to CUDA surface
size_t nTexturePitch = 0;
// If we are Encoding and this is the 1st Frame, we make sure we allocate system memory for readbacks
if (m_bFirstFrame && m_ReadbackSID)
{
CUresult result = CUDA_SUCCESS;
// one page-locked host buffer per field slot, sized for NV12 (pitch * height * 3/2)
for (int i = 0; i < MAX_YUV_PANAR; i++)
checkCudaErrors(result = cuMemAllocHost((void **)&m_pFrameYUV[i], (nDecodedPitch * nHeight + nDecodedPitch * nHeight / 2)));
m_bFirstFrame = false;
if (result != CUDA_SUCCESS)
{
printf("cuMemAllocHost returned %d\n", (int)result);
checkCudaErrors(result);
}
}
// If streams are enabled, we can perform the readback to the host while the kernel is executing
if ( m_ReadbackSID)
{
CUresult result = cuMemcpyDtoHAsync(m_pFrameYUV[active_field], pDecodedFrame[active_field], (nDecodedPitch * nHeight * 3 / 2), m_ReadbackSID);
if (result != CUDA_SUCCESS)
{
printf("cuMemAllocHost returned %d\n", (int)result);
checkCudaErrors(result);
}
}
m_pVideoDecoder->unmapFrame(pDecodedFrame[active_field]);
m_DecodeFrameCount++;
#ifdef WRITE_YUV_FILE
//if (g_bWriteFile)
{
checkCudaErrors(cuStreamSynchronize(m_ReadbackSID));
if (fp)
{
SaveFrameAsYUV(fp,m_pFrameYUV[active_field + 3],
m_pFrameYUV[active_field],
nWidth, nHeight, nDecodedPitch);
}
}
#endif
}
// Detach from the Current thread
checkCudaErrors(cuCtxPopCurrent(NULL));
// release the frame, so it can be re-used in decoder
m_pFrameQueue->releaseFrame(&oDisplayInfo);
return true;
}
// Frame Queue has no frames, we don't compute FPS until we start
return false;
}
void NvDecode::start()
{
bThreadExit = TRUE;
if (play_thread_ptr)
{
play_thread_ptr->join();
play_thread_ptr = NULL;
}
LPVOID arg_ = NULL;
Common::ThreadCallback cb = BIND_MEM_CB(&NvDecode::decode_thread, this);
play_thread_ptr = new Common::CThread(cb, TRUE);
if (!play_thread_ptr)
{
return;
}
bThreadExit = FALSE;
play_thread_ptr->start(arg_);
}
void NvDecode::stop()
{
if (m_pVideoSource)
m_pVideoSource->stop();
bThreadExit = TRUE;
if (play_thread_ptr)
{
play_thread_ptr->join();
play_thread_ptr = NULL;
}
}
void NvDecode::decode_thread(LPVOID lpParam)
{
m_pVideoSource->start();
while (!bThreadExit && m_pFrameQueue)
{
if (!NV_CopyDecodedFrameToTexture())
Sleep(1); // frame queue empty; yield instead of spinning a core
}
}
bool NvDecode::NV_OpenStream(const char *video_file)
{
unsigned int nWidth, nHeight,nDisplayWidth,nDisplayHeight;
if (NV_LoadVideoSource(video_file, nWidth, nHeight, nDisplayWidth, nDisplayHeight))
{
NV_InitCudaVideo();
start();
return true;
}
return false;
}
bool NvDecode::NV_CloseStream()
{
stop();
NV_UnitCudaVideo();
return true;
}
void NvDecode::NV_ProcessPayload(CUvideopacketflags cuvidPktFlags, const uint8_t* in_payload, uint64_t payloadSize, CUvideotimestamp in_timestamp)
{
CUVIDSOURCEDATAPACKET sdp;
sdp.flags = cuvidPktFlags;
sdp.payload_size = payloadSize;
sdp.payload = in_payload;
sdp.timestamp = in_timestamp;
cuvidParseVideoData(m_pVideoParser->hParser_, &sdp);
}
// callback for coded frame input
bool NvDecode::NV_GetDecodeFrame(uint8_t* start, uint32_t length)
{
cuCtxPushCurrent(g_oContext);
if (length == 0) {
NV_ProcessPayload(CUVID_PKT_ENDOFSTREAM, (const uint8_t*)start, length, 0);
}
else {
NV_ProcessPayload((CUvideopacketflags)0, (const uint8_t*)start, length, 0);
}
CUcontext cuCurrent = NULL;
CUresult result = cuCtxPopCurrent(&cuCurrent);
if (result != CUDA_SUCCESS){
printf("cuCtxPopCurrent: %d\n", result);
return false;
}
return true;
}
bool NvDecode::NV_DecodeFrame(BYTE*pbInput, int iInputLen, BYTE*& pbOutPut, int& iOutPutLen)
{
if (NV_GetDecodeFrame(pbInput, iInputLen) && m_pFrameQueue){
//reserved for future use
//return NV_CopyDecodedFrameToTexture(pbOutPut, iOutPutLen);
return true;
}
return false;
}
If you want the complete project, the download costs a few credits; it took real effort to put together, so I hope a few credits is understandable.
Full download: https://download.csdn.net/download/huangyifei_1111/10904486
