Directx11进阶之基于GerstnerWave模拟二之基于ComputeShader的实现

上一篇博客《Directx11进阶之基于GerstnerWave模拟一之基于CPU计算的实现》简单讲解了GerstnerWave的实现,不过是基于CPU计算实现的,在仅仅80 * 80 * 2 = 12800个三角形下,平均只能达到17帧的渲染帧数,性能实在是糟糕。这篇博客说下如何用ComputeShader计算GerstnerWave。

其实跟上节差不多,就是关于GerstnerWave的海浪参数变为了ComputeShader的常量缓冲,而计算GerstnerWave的网格顶点数据我放在了RWStructuredBuffer里。

常量缓存对应的结构体

// CPU-side mirror of the compute shader's per-frame constant buffer
// (the HLSL cbuffer of the same name holds a float followed by a float3).
struct CBGerstnerWaveUpdate
{
	float fCurrentTime; // time value passed to the wave functions
	XMFLOAT3 pad;       // padding that mirrors the float3 pad in the HLSL cbuffer
};

//严格对应ComputerShader的ConstantBuffer
struct GerstnerParam
{
public:
	float m_fAmplitude;
	float m_fSpeed;
	float m_fWaveLength;
	float m_fsteepness;
	XMFLOAT2 m_direction;
	XMFLOAT2 m_pad;


	GerstnerParam::GerstnerParam(XMFLOAT2 direction, float fAmplitude, float fSpeed, float fWaveLength, float fSteepnees):
		m_direction(direction),
		m_fAmplitude(fAmplitude),
		m_fSpeed(fSpeed),
		m_fWaveLength(fWaveLength),
		m_fsteepness(fSteepnees),
		m_pad(XMFLOAT2(0.0f,0.0f))
	{

	}

	GerstnerParam::GerstnerParam()
	{

	}
};

// CPU-side mirror of the compute shader's constant buffer that is not updated
// every frame. Field order follows the HLSL cbuffer of the same name.
struct CBGerstnerWaveNoUpdate
{
	GerstnerParam gerstnerData[3]; // one entry per summed wave (the shader loops over 3)
	float fWaveGridSize;           // multiplied by the thread id to get a vertex's x/z position
	float fGroundCountX; // number of thread groups along X
	float fGroundCountY; // number of thread groups along Y
	float fUVTile;                 // divisor applied to the grid coordinate when generating UVs
	float fUVMoveSpeed;            // UV scroll speed, multiplied by the current time
	XMFLOAT3 pad;                  // padding that mirrors the float3 pad1 in the HLSL cbuffer
};

计算GerstnerWave数据的ComputeShader



/*
* Thread-group grid dimensions: (fGroundCountX, fGroundCountY, 1)
* Threads per thread group: (GroundThreadSize, GroundThreadSize, 1)
* SV_DispatchThreadID range: (0,0,0) - (fGroundCountX * GroundThreadSize, fGroundCountY * GroundThreadSize,0)
*/


// Edge length of one thread group (used by both numthreads attributes).
#define GroundThreadSize 16
// Pi, used to turn a wavelength into a frequency.
static const float  XM_PI = 3.141592f;
// Number of waves summed per vertex; also the size of gerstnerData.
static const int WaveParamArraySize = 3;

// Vertex format stored in the WaveData buffer
// (position, color, normal, tangent, texture coordinate).
struct VertexPCNTT
{
	float3 pos;     // displaced vertex position
	float3 color;   // vertex color (the first pass writes white)
	float3 normal;  // written by the normal/tangent pass
	float3 tangent; // written by the normal/tangent pass
	float2 uv;      // scrolling texture coordinate
};

// Parameters of a single Gerstner wave, as stored in the constant buffer.
struct GerstnerParam
{
	float fAmplitude;  // wave amplitude
	float fSpeed;      // propagation speed
	float fWaveLength; // wavelength; used as a divisor when computing the frequency
	float fSteepness;  // scales the horizontal (x/z) displacement
	float2 direction;  // propagation direction in the x/z plane
	float2 pad;        // padding, not read by the shader code shown here
};


// Constant buffer whose contents do not change every frame
cbuffer CBGerstnerWaveNoUpdate:register(b0)
{
	GerstnerParam gerstnerData[WaveParamArraySize]; // the waves summed per vertex
	float fWaveGridSize; // multiplied by the thread id to get a vertex's x/z position
	float fGroundCountX; // number of thread groups along X
	float fGroundCountY; // number of thread groups along Y
	float fUVTile;       // divisor applied to the grid coordinate when generating UVs
	float fUVMoveSpeed;  // UV scroll speed, multiplied by the current time
	float3 pad1;         // padding, not read by the shader code shown here
};

// Constant buffer that is updated every frame
cbuffer CBGerstnerWaveUpdate:register(b1)
{
	float fCurrentTime; // time value passed to the wave functions
	float3 pad;         // padding, not read by the shader code shown here
}


// Read/write vertex buffer shared by both compute passes: the first pass writes pos/color/uv, the second reads them and writes normal/tangent.
RWStructuredBuffer<VertexPCNTT> WaveData : register(u0);

// Returns the summed vertical displacement of all configured waves at the
// undisplaced grid position (fPosX, fPosZ) for the given time.
float GetGerstnerHeight(float fPosX, float fPosZ, float time)
{
	float heightSum = 0.0f;

	for (int waveIdx = 0; waveIdx < WaveParamArraySize; ++waveIdx)
	{
		// Frequency derived from the wavelength
		float frequency = 2.0f * XM_PI / gerstnerData[waveIdx].fWaveLength;

		// Phase constant: how fast the phase advances with time
		float phaseSpeed = gerstnerData[waveIdx].fSpeed * frequency;

		// Position projected onto the wave's travel direction
		float alongWave = gerstnerData[waveIdx].direction.x * fPosX + gerstnerData[waveIdx].direction.y * fPosZ;

		float phase = alongWave * frequency + time * phaseSpeed;

		heightSum += gerstnerData[waveIdx].fAmplitude * sin(phase);
	}

	return heightSum;
}


// Returns the summed horizontal displacement along Z of all configured waves
// at the undisplaced grid position (x, z) for the given time.
float GetPosZOffset(float x, float z, float time)
{
	float offsetSum = 0.0f;

	for (int waveIdx = 0; waveIdx < WaveParamArraySize; ++waveIdx)
	{
		float2 travelDir = gerstnerData[waveIdx].direction;

		// Frequency derived from the wavelength
		float frequency = 2.0f * XM_PI / gerstnerData[waveIdx].fWaveLength;

		// Phase constant: how fast the phase advances with time
		float phaseSpeed = gerstnerData[waveIdx].fSpeed * frequency;

		// Position projected onto the wave's travel direction
		float alongWave = travelDir.x * x + travelDir.y * z;

		offsetSum += gerstnerData[waveIdx].fAmplitude * gerstnerData[waveIdx].fSteepness *
			travelDir.y * cos(alongWave * frequency + time * phaseSpeed);
	}

	return offsetSum;
}


// Returns the summed horizontal displacement along X of all configured waves
// at the undisplaced grid position (x, z) for the given time.
float GetPosXOffset(float x, float z, float time)
{
	float offsetSum = 0.0f;

	for (int waveIdx = 0; waveIdx < WaveParamArraySize; ++waveIdx)
	{
		float2 travelDir = gerstnerData[waveIdx].direction;

		// Frequency derived from the wavelength
		float frequency = 2.0f * XM_PI / gerstnerData[waveIdx].fWaveLength;

		// Phase constant: how fast the phase advances with time
		float phaseSpeed = gerstnerData[waveIdx].fSpeed * frequency;

		// Position projected onto the wave's travel direction
		float alongWave = travelDir.x * x + travelDir.y * z;

		offsetSum += gerstnerData[waveIdx].fAmplitude * gerstnerData[waveIdx].fSteepness *
			travelDir.x * cos(alongWave * frequency + time * phaseSpeed);
	}

	return offsetSum;
}



// First compute pass: one thread per grid vertex. Writes the displaced
// position, a white color and the scrolling UV, and clears normal and tangent
// (those are filled in by the second pass).
[numthreads(GroundThreadSize, GroundThreadSize, 1)]
void WaveVertexPosUVColor_CS(uint3 DTid : SV_DispatchThreadID)
{
	// Row-major slot of this vertex in WaveData
	int vertexSlot = DTid.x + DTid.y * fGroundCountX * GroundThreadSize;

	// Undisplaced grid position
	float restX = (float)DTid.x * fWaveGridSize;
	float restZ = (float)DTid.y * fWaveGridSize;

	float shiftX = GetPosXOffset(restX, restZ, fCurrentTime);
	float shiftZ = GetPosZOffset(restX, restZ, fCurrentTime);
	float waveHeight = GetGerstnerHeight(restX, restZ, fCurrentTime);

	VertexPCNTT vertexOut;
	vertexOut.pos = float3(restX + shiftX, waveHeight, restZ + shiftZ);
	vertexOut.color = float3(1.0f, 1.0f, 1.0f);
	vertexOut.normal = float3(0.0f, 0.0f, 0.0f);
	vertexOut.tangent = float3(0.0f, 0.0f, 0.0f);
	// Both UV axes scroll by the same time-dependent amount
	vertexOut.uv.x = (DTid.x + fUVMoveSpeed * fUVTile * fCurrentTime) / fUVTile;
	vertexOut.uv.y = (DTid.y + fUVMoveSpeed * fUVTile * fCurrentTime) / fUVTile;

	WaveData[vertexSlot] = vertexOut;
}



//----------------------------------------------------------------------------

// Returns the (unnormalized) cross product of the two edges that run from
// vertex1 to vertex2 and from vertex1 to vertex3.
float3 CrossNormal(float3 vertex1, float3 vertex2, float3 vertex3)
{
	float3 edgeTo2 = vertex2 - vertex1;
	float3 edgeTo3 = vertex3 - vertex1;
	return cross(edgeTo2, edgeTo3);
}

// Computes the normal of the vertex addressed by DTid from the positions of
// its four grid neighbours and stores the normalized result in WaveData.
//
// Neighbour coordinates are clamped to the grid. Without clamping,
// "DTid.y - 1" / "DTid.x - 1" underflow on the unsigned thread id, and
// "x + 1" / "x - 1" at a row end address a vertex of the adjacent row, so
// border vertices received wrong normals. Clamping uses min/max only, so no
// if/else branch is introduced.
void CalculateNormal(uint3 DTid)
{
	int width = fGroundCountX * GroundThreadSize;
	int height = fGroundCountY * GroundThreadSize;
	int x = (int)DTid.x;
	int y = (int)DTid.y;
	int index = x + y * width;

	// Clamped neighbour coordinates; on a border the neighbour collapses
	// onto the vertex itself.
	int xPrev = max(x - 1, 0);
	int xNext = min(x + 1, width - 1);
	int yPrev = max(y - 1, 0);
	int yNext = min(y + 1, height - 1);

	float3 center = WaveData[index].pos;
	float3 toNextRow = WaveData[yNext * width + x].pos - center;
	float3 toNextCol = WaveData[y * width + xNext].pos - center;
	float3 toPrevRow = WaveData[yPrev * width + x].pos - center;
	float3 toPrevCol = WaveData[y * width + xPrev].pos - center;

	// Sum of the four quadrant normals around the vertex, same winding as
	// before. A collapsed neighbour yields a zero edge, so the quadrants that
	// fall outside the grid contribute nothing. The former division by 4 is
	// dropped because the sum is normalized anyway.
	float3 normalSum = cross(toNextRow, toNextCol)
		+ cross(toNextCol, toPrevRow)
		+ cross(toPrevRow, toPrevCol)
		+ cross(toPrevCol, toNextRow);

	WaveData[index].normal = normalize(normalSum);
}


// Computes the tangent of the vertex addressed by DTid from the triangle
// formed with its next-row and next-column neighbours, makes it perpendicular
// to the vertex normal already stored in WaveData, and writes it back.
void CalculateTangent(uint3 DTid)
{
	int width = fGroundCountX * GroundThreadSize;
	int index = DTid.x + DTid.y * width;

	VertexPCNTT self = WaveData[index];
	VertexPCNTT nextRow = WaveData[(DTid.y + 1) * width + DTid.x];
	VertexPCNTT nextCol = WaveData[DTid.y * width + DTid.x + 1];

	// The two position edges of the triangle
	float3 posEdge1 = nextRow.pos - self.pos;
	float3 posEdge2 = nextCol.pos - self.pos;

	// The matching texture-space edges
	float2 uvEdge1 = nextRow.uv - self.uv;
	float2 uvEdge2 = nextCol.uv - self.uv;

	// Reciprocal determinant of the texture-edge matrix
	float invDet = 1.0f / (uvEdge1.x * uvEdge2.y - uvEdge1.y * uvEdge2.x);

	// Solve for the tangent, then normalize and flip it
	float3 tangent = invDet * (uvEdge2.y * posEdge1 - uvEdge1.y * posEdge2);
	tangent = -normalize(tangent);

	// Remove the component along the normal so the tangent is perpendicular to it
	float3 normal = WaveData[index].normal;
	float alongNormal = dot(normal, tangent);
	tangent = normalize(tangent - alongNormal * normal);

	WaveData[index].tangent = tangent;
}

// Second compute pass: one thread per grid vertex. Reads the positions and
// UVs stored in WaveData and fills in each vertex's normal and tangent.
[numthreads(GroundThreadSize, GroundThreadSize, 1)]
void WaveNormalTangent_CS(uint3 DTid : SV_DispatchThreadID)
{

	// The normal must be written first: CalculateTangent reads it back.
	CalculateNormal(DTid);

	CalculateTangent(DTid);
}

【1】从上面代码可以看出,我把“计算VertexPos,VertexUV”归为一次CS,再把“计算VertexNormal,VertexTangent”归为另外一次CS。为什么不一次性计算所有的VertexPos,VertexUV,VertexNormal,VertexTangent呢?理由很简单,每个顶点的法线和切线的计算依赖其四周顶点的位置和UV,也就是说你得保证包围每个顶点的周围顶点的VertexPos,VertexUV已经计算好了。所以我把计算拆分成了先后执行的两个CS。

【2】上面的代码没有像上篇博客那样做边界判断,是因为考虑到大型海浪网格的边缘一般看不到,同时出于代码整洁性以及GPU上if else分支影响效率的考虑,最终将边缘判断去掉。

void WaveVertexPosUVColor_CS(uint3 DTid : SV_DispatchThreadID)
void WaveNormalTangent_CS(uint3 DTid : SV_DispatchThreadID)

WaveVertexPosUVColor_CS计算得到的RWStructuredBuffer<VertexPCNTT> WaveData又作为WaveNormalTangent_CS的输入资源。

关于ComputeShader计算得到的GerstnerWave顶点数据,在最后的绘制阶段是直接在GPU中作为VertexShader的输入数据使用,还是得Copy回CPU端更新VertexBuffer?

因为最后着色时我们的顶点数据是作为VertexShader的输入数据的,刚开始我想的是:既然是GPU端计算的数据,是否可以直接在VertexShader中使用呢?于是我修改了创建ID3D11Buffer时的BindFlag

{
	// Attempt to create one buffer that serves as the compute shader's UAV,
	// as an SRV and as a vertex buffer at the same time. According to the
	// accompanying article text this attempt fails once
	// D3D11_BIND_VERTEX_BUFFER is added: the resulting buffer pointer is null.
	ID3D11Device* d3dDevice = D3DClass::GetInstance()->GetDevice();
	// Total number of vertices; presumably m_nWaveWidth/m_nWaveHeight are the thread-group counts per axis (TODO confirm).
	int nWaveArraySize = m_nWaveWidth * m_nWaveHeight * GroundThreadSize * GroundThreadSize;
	m_pWaveDataBuffer = nullptr;
	D3D11_BUFFER_DESC desc;
	ZeroMemory(&desc, sizeof(desc));
	desc.BindFlags = D3D11_BIND_UNORDERED_ACCESS | D3D11_BIND_SHADER_RESOURCE | D3D11_BIND_VERTEX_BUFFER;
	desc.ByteWidth = sizeof(VertexPCNTT) * nWaveArraySize;
	// NOTE(review): the structured-buffer flag combined with the vertex-buffer bind flag is the likely cause of the failure — confirm with the D3D11 debug layer.
	desc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED;
	desc.StructureByteStride = sizeof(VertexPCNTT);
	// NOTE(review): the result of CreateBuffer is not checked here.
	d3dDevice->CreateBuffer(&desc, nullptr, &m_pWaveDataBuffer);
	// NOTE(review): this assignment runs after CreateBuffer and repeats the value set above, so it does not affect the buffer.
	desc.BindFlags = D3D11_BIND_UNORDERED_ACCESS | D3D11_BIND_SHADER_RESOURCE | D3D11_BIND_VERTEX_BUFFER;

看起来好像很完美,

D3D11_BIND_UNORDERED_ACCESS告诉我们可以创建UAV,

D3D11_BIND_SHADER_RESOURCE告诉我们可以创建SRV,

D3D11_BIND_VERTEX_BUFFER告诉我们可以作为VertexBuffer,

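然后我们就可以在ComputeShader计算完GerstnerWaveData之后,直接设置VertexShader阶段的VertexBuffer,从而避免GPU数据到CPU端数据的Copy带来的额外消耗,然而现实很残酷。当你多添加了D3D11_BIND_VERTEX_BUFFER标志后,ID3D11Buffer创建得到的是空指针。为什么?我当时也不清楚,到stackoverflow和GameDev以及MSDN的论坛找到了两三个跟我一样想法的帖子,也给不了答案。(补充说明:D3D11不允许带有D3D11_RESOURCE_MISC_BUFFER_STRUCTURED标志的结构化缓冲同时指定D3D11_BIND_VERTEX_BUFFER绑定标志,所以CreateBuffer会失败;打开D3D11调试层可以看到对应的错误信息。)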

在一篇MSDN的帖子里 Using UAV data from compute shader as vertex data 和 Stackoverflow中一篇帖子How can I feed compute shader results into vertex shader w/o using a vertex buffer?,都给出了:

运用RawBuffer (RWByteAddressBuffer) 可以解决,当然也会碰上新的问题。

最终我还是用了将GPU数据Copy回到CPU端的方案, 有点可惜,经历了一次GPU到CPU的Copy和一次CPU到CPU的Copy

	// Copies the compute shader's output (m_pWaveDataBuffer) into the dynamic
	// vertex buffer (m_pWaveVertexBuffer) through a CPU-readable staging buffer.
	ID3D11Device* pDevice = D3DClass::GetInstance()->GetDevice();
	ID3D11DeviceContext* d3dContext = D3DClass::GetInstance()->GetDeviceContext();

	// Describe a staging twin of the compute output: same size, CPU read
	// access, no bind or misc flags.
	D3D11_BUFFER_DESC desc;
	ZeroMemory(&desc, sizeof(desc));
	m_pWaveDataBuffer->GetDesc(&desc);
	desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
	desc.Usage = D3D11_USAGE_STAGING;
	desc.BindFlags = 0;
	desc.MiscFlags = 0;

	ID3D11Buffer* debugbuf = nullptr;
	if (FAILED(pDevice->CreateBuffer(&desc, nullptr, &debugbuf)))
	{
		return;
	}

	// GPU -> staging copy
	d3dContext->CopyResource(debugbuf, m_pWaveDataBuffer);

	D3D11_MAPPED_SUBRESOURCE MappedResourceSrc;
	if (FAILED(d3dContext->Map(debugbuf, 0, D3D11_MAP_READ, 0, &MappedResourceSrc)))
	{
		// Nothing was mapped, so only the staging buffer has to be released.
		ReleaseCOM(debugbuf);
		return;
	}

	D3D11_MAPPED_SUBRESOURCE MappedResourceDest;
	if (SUCCEEDED(d3dContext->Map(m_pWaveVertexBuffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &MappedResourceDest)))
	{
		const int nWaveArraySize = m_nWaveWidth * m_nWaveHeight * GroundThreadSize * GroundThreadSize;

		// The source pointer is only valid while debugbuf is still mapped, so
		// the copy has to happen before the staging buffer is unmapped.
		memcpy(MappedResourceDest.pData, MappedResourceSrc.pData, sizeof(VertexPCNTT) * nWaveArraySize);
		d3dContext->Unmap(m_pWaveVertexBuffer, 0);
	}

	// Unmap the staging buffer only after its data has been consumed.
	d3dContext->Unmap(debugbuf, 0);

	ReleaseCOM(debugbuf);

着色器代码和上一篇博客一样的。

最终运行效果图:

在256 * 256 * 2 = 131 072个三角形下,达到了200帧左右,性能秒杀了基于CPU端的,证明了基于ComputeShader在计算并行数据的强大。

碰见的坑

【1】这次在使用ComputeShader时碰见一个坑,就是常量缓存里time的数值过大(我原先使用的是从系统启动到目前的时间,往往是六七位数以上,例如:548712.673,后面改了),导致了溢出。这个浮点数在CPU端是完全没问题的,在GPU中就出了问题,而且我用GraphicsDebug断点调试ComputeShader时数据也是没问题的。只能说Shader得远离太大的数。

源码

https://github.com/2047241149/SDEngine 的 GerstnerWaveCS.h , GerstnerWaveCS.cpp, WaveComputerShader.fx 文件

未来改进

【1】改善光照和着色,考虑引入光照的散射,反射,折射

【2】考虑浪花的渲染

【3】考虑海底的渲染

【4】考虑完善海洋网格模型,增加等等随机性。

【5】继续优化性能,考虑引入曲面细分。

【6】用RawBuffer和GroupSharedMemory来进一步优化CS计算次数

猜你喜欢

转载自blog.csdn.net/qq_29523119/article/details/81074290