当相机接近模型时,SSAO 采样核(sample kernel)会导致性能下降?
SSAO sample kernels cause a performance drop when the camera is close to the model?
我有一个问题,当相机靠近模型时,性能会下降。
我发现这与 SSAO 采样核(sample kernel)有关,但我似乎无法弄清楚为什么它们在相机靠近网格时会导致性能问题。
当我注释掉 SSAO 渲染代码中遍历采样点(samples)的 for 循环时,性能会恢复到应有的状态,因此这个 for 循环显然是导致问题的原因。我最初认为这可能是着色器问题,但我也找不到任何问题。
有什么想法吗?这是您需要的所有代码...
SSAO 设置代码
// SSAO setup: creates the two render targets, builds the 64-entry sample kernel
// (and looks up the matching shader uniform locations), then builds a 4x4
// texture of random vectors used by the shader to vary the kernel per pixel.
//
// Create two frame buffers, one for ssao colour and another for ssao blur
// NOTE(review): each attachment is given both GL_RED and GL_RGB — confirm which
// argument FboAttachment treats as the internal format and which as the pixel
// format, since the shader writes a single float.
_fbos.push_back(new Fbo(width, height, { new FboAttachment(width, height, GL_RED, GL_RGB, GL_FLOAT, GL_COLOR_ATTACHMENT0) }, false));
_fbos.push_back(new Fbo(width, height, { new FboAttachment(width, height, GL_RED, GL_RGB, GL_FLOAT, GL_COLOR_ATTACHMENT0) }, false));
//////////////////////////////////////////////////////////////////////////////////////////////////////////
std::uniform_real_distribution<GLfloat> rand_floats(0.0f, 1.0f); // Uniform random floats in [0.0, 1.0)
std::default_random_engine rand_generator; // Default-constructed engine (no explicit seed is supplied)
// Build the sample kernel: 64 vectors plus the uniform location of each one
for (unsigned int i = 0; i < 64; ++i) // Iterate through each sample...
{
glm::vec3 sample(rand_floats(rand_generator) * 2.0f - 1.0f, rand_floats(rand_generator) * 2.0f - 1.0f, rand_floats(rand_generator)); // x and y are remapped to [-1, 1]; z stays in [0, 1], so every sample has a non-negative z
sample = glm::normalize(sample); // Normalise the sample to unit length
sample *= rand_floats(rand_generator); // Give the sample a random length in [0, 1) instead of leaving it at unit length
float scale = static_cast<float>(i) / 64.0f; // Fraction of the way through the kernel, in [0, 1)
scale = Math::lerpf(0.1f, 1.0f, scale * scale); // Presumably a linear interpolation from 0.1 to 1.0 driven by scale^2 — confirm Math::lerpf's argument order
sample *= scale; // Apply the scale to the whole sample vector
_ssao_kernals.push_back(sample); // Append the sample to the kernel array
_u_samples.push_back(glGetUniformLocation(shader_programs[0], ("samples[" + std::to_string(i) + "]").c_str())); // Look up the location of samples[i] in the first shader program
}
for (unsigned int i = 0; i < 16; i++) // 16 noise vectors: one per texel of the 4x4 noise texture created below
{
glm::vec3 noise(rand_floats(rand_generator) * 2.0f - 1.0f, rand_floats(rand_generator) * 2.0f - 1.0f, 0.0f); // Random vector with x and y in [-1, 1] and z fixed at 0
_ssao_noise.push_back(noise); // Append the noise vector to the noise array
}
/*
* Create a 4x4 floating-point noise texture from the vectors above.
* The original author's stated purpose is to remove banding from the ssao.
*/
glGenTextures(1, &_noise_texture); // generate the texture name
glBindTexture(GL_TEXTURE_2D, _noise_texture); // bind it so the calls below apply to it
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB32F, 4, 4, 0, GL_RGB, GL_FLOAT, &_ssao_noise[0]); // upload the 16 noise vectors as a 4x4 RGB float image
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); // minification filter: nearest texel, no interpolation
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); // magnification filter: nearest texel, no interpolation
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT); // wrap mode (not filtering): repeat along s so the texture tiles
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT); // wrap mode (not filtering): repeat along t so the texture tiles
SSAO 渲染函数
// Pass 1: compute raw ambient occlusion into _fbos[0].
_fbos[0]->Bind(); // presumably makes _fbos[0] the current render target — confirm in Fbo::Bind
glClear(GL_COLOR_BUFFER_BIT); // clear the colour buffer of the currently bound framebuffer
glUseProgram(_shader_programs[0]); // Use the first shader pass
// Upload the sample kernel, one vec3 uniform per call.
// NOTE(review): nothing visible here modifies _ssao_kernals between frames, and
// uniform values persist in the program object, so this upload looks like it
// could be done once (or as a single array upload) — confirm before changing.
for (unsigned int i = 0; i < SSAO_SAMPLE_RESOLUTION; ++i) // For each ssao sample (SSAO_SAMPLE_RESOLUTION is presumably 64, matching the setup loop — verify)
glUniform3fv(_u_samples[i], 1, glm::value_ptr(_ssao_kernals[i])); // Assign kernel sample i to its uniform
glUniformMatrix4fv(_u_projection, 1, GL_FALSE, glm::value_ptr(Content::_map->GetCamera()->GetProjectionMatrix())); // Assign camera projection uniform data (not transposed)
// Bind the shader inputs to texture units 0-2.
// NOTE(review): the sampler uniforms are not assigned to these units here —
// presumably that happens elsewhere; confirm.
glActiveTexture(GL_TEXTURE0); // Set active texture to index 0
glBindTexture(GL_TEXTURE_2D, _g_buffer_data->GetAttachments()[0]->_texture); // Bind positions
glActiveTexture(GL_TEXTURE1); // Set active texture to index 1
glBindTexture(GL_TEXTURE_2D, _g_buffer_data->GetAttachments()[1]->_texture); // Bind normals
glActiveTexture(GL_TEXTURE2); // Set active texture to index 2
glBindTexture(GL_TEXTURE_2D, _noise_texture); // Bind the noise texture
_screen_rect->Render(1); // Draw the screen rectangle, which runs the SSAO shader (meaning of the argument 1 is not visible here)
// Pass 2: blur the raw SSAO result from _fbos[0] into _fbos[1]
_fbos[1]->Bind();
glClear(GL_COLOR_BUFFER_BIT);
glUseProgram(_shader_programs[1]); // Use the second shader pass
glActiveTexture(GL_TEXTURE0); // Set active texture to index 0
glBindTexture(GL_TEXTURE_2D, _fbos[0]->GetAttachments()[0]->_texture); // Bind the pass-1 SSAO output as the blur input
_screen_rect->Render(1); // Draw the screen rectangle again, this time with the blur shader
SSAO 片段着色器
#version 330 core
// SSAO fragment shader.
// For each pixel, builds an orientation basis around the surface normal,
// offsets the fragment position by every kernel sample, projects each offset
// position back to the screen and compares it against the position buffer.
// Writes a single-channel occlusion factor (1.0 = unoccluded).
out float FragColor;
in vec2 _texcoord;
uniform sampler2D gPosition; // per-pixel positions (used as view-space positions below)
uniform sampler2D gNormal;   // per-pixel normals
uniform sampler2D texNoise;  // small texture of random vectors, tiled via noiseScale
uniform vec3 samples[64];    // sample kernel in tangent space; size must stay >= kernelSize
// Tuning values. They are never written, so they are declared const.
const int kernelSize = 64;
const float radius = 0.3;
const float bias = 0.025;
// Scales screen texcoords so the noise texture repeats across the screen.
// NOTE(review): assumes a 1920x1080 target and a 4-texel-wide noise texture —
// confirm, or pass the real size in if the framebuffer size can differ.
const vec2 noiseScale = vec2(1920.0 / 4.0, 1080.0 / 4.0);
uniform mat4 proj; // projection matrix (view-space -> clip-space)

void main()
{
    // Inputs for this pixel.
    vec3 fragPos = texture(gPosition, _texcoord).xyz;
    vec3 normal = normalize(texture(gNormal, _texcoord).rgb);
    vec3 randomVec = normalize(texture(texNoise, _texcoord * noiseScale).xyz);

    // Gram-Schmidt: make the random vector orthogonal to the normal, then
    // complete the basis that carries tangent-space samples into view space.
    vec3 tangent = normalize(randomVec - normal * dot(randomVec, normal));
    vec3 bitangent = cross(normal, tangent);
    mat3 TBN = mat3(tangent, bitangent, normal);

    float occlusion = 0.0;
    for (int i = 0; i < kernelSize; ++i)
    {
        // Sample position. Named samplePos rather than `sample` because
        // `sample` is a keyword from GLSL 4.00 onward and some compilers
        // reject it as an identifier.
        vec3 samplePos = TBN * samples[i]; // from tangent to view-space
        samplePos = fragPos + samplePos * radius;

        // Project the sample position to find where it lands on the screen.
        vec4 offset = vec4(samplePos, 1.0);
        offset = proj * offset;              // from view to clip-space
        offset.xyz /= offset.w;              // perspective divide
        offset.xyz = offset.xyz * 0.5 + 0.5; // transform to range 0.0 - 1.0

        // Depth stored in the position buffer at the projected location.
        float sampleDepth = texture(gPosition, offset.xy).z;

        // Range check: fade out contributions from geometry whose depth is
        // far from this fragment's depth relative to the radius.
        float rangeCheck = smoothstep(0.0, 1.0, radius / abs(fragPos.z - sampleDepth));
        // Count the sample as occluded when the stored depth is at or beyond
        // the sample's depth plus the bias.
        occlusion += (sampleDepth >= samplePos.z + bias ? 1.0 : 0.0) * rangeCheck;
    }

    // Convert the occluded fraction to an openness factor, then raise it to
    // the third power before writing it out.
    occlusion = 1.0 - (occlusion / float(kernelSize));
    FragColor = pow(occlusion, 3.0);
}
这是 SSAO 的预期性能特征。
着色器中的采样半径在视图空间中是固定的,因此被计算 AO 的像素离相机越近,这个半径投影到屏幕空间后就越大,各采样点在屏幕上彼此相距也就越远;这些被采样的纹素同时位于 GPU 纹理缓存中的可能性随之降低,从而造成巨大的性能损失。
相关文章:
- 删除一个线程上有数百万个字符串的大型哈希映射会影响另一个线程的性能
- OpenMP阵列性能较差
- 递归列出所有目录中的C++与Python与Ruby的性能
- 大小相等但成员数量不同的结构之间的性能差异
- 如何在内核C++中使用1920x1080x16M图形或类似的16M颜色?(VGA)
- 为什么constexpr的性能比正常表达式差
- CUDA内核和数学函数的显式命名空间
- Docker 化(容器化)的C++应用程序是否向后兼容早期的内核版本
- 在类中使用随机生成器时出现性能问题
- 当相机接近模型时,SSAO样本内核会导致性能下降?
- 如何提高此 OpenCL 缩减内核代码的性能?
- 删除内核中的倍数需要提高性能
- 特定的OpenCL内核在移动和PC上的性能不同
- perf 如何对内核堆栈进行采样
- 当运行的线程数超过内核数时,CUDA性能会得到提高
- 调用GPU内核后,CPU性能下降
- 调用多个内核,全局内存性能 - CUDA
- 调用cuda内核时的性能损失
- Julia集的CUDA内核与CPU版本相比性能较慢
- 我是否应该使用"if"语句统一两个相似的内核,冒着性能损失的风险?