GLSL 计算着色器不会为大型输入运行

GLSL Compute Shader doesn't run for large inputs

本文关键字:大型 输入 运行 计算 GLSL      更新时间:2023-10-16

着色器采用具有位置、方向、波长和强度的光子的 SSBO,每个线程负责通过网格精确追踪一个光子,其中光子击中的每个网格单元,每个波长的强度累积以创建每个网格单元的光谱分布。

问题是着色器可以完美地处理 100,000 个光子,但不会返回 1,000,000 个光子的结果。

我研究了SSBO的大小,所有的大小都在我的GPU(NVIDIA Quadro P6000)2GB的限制范围内:

  • SSBO 网格大小:1.5GB
  • SSBO 光子大小:0.02GB

如果我在某些地方改变逻辑,它可以处理一百万个光子(有关注释,请参阅第 87 行和第 114 行)。

我目前看不到任何解释为什么着色器在 1,000,000 个光子上失败,但对 100,000 个光子有效。逻辑相同,缓冲区大小在限制范围内。(缓冲区大小不会成为问题也通过它在更改逻辑时起作用来确认。

下面是源代码。如果你想自己尝试一下,这里是github上的代码:https://github.com/TheJhonny007/TextureTracerDebug

计算着色器:

#version 430
#extension GL_EXT_compute_shader: enable
#extension GL_EXT_shader_storage_buffer_object: enable
#extension GL_ARB_compute_variable_group_size: enable
const uint TEX_WIDTH = 1024u;
const uint TEX_HEIGHT = TEX_WIDTH;
const uint MIN_WAVELENGTH = 380u;
const uint MAX_WAVELENGTH = 740u;
const uint NUM_WAVELENGTHS = MAX_WAVELENGTH - MIN_WAVELENGTH;
// Size: 24 bytes -> ~40,000,000 photons per available gigabyte of ram
struct Photon {
vec2 position;// m
vec2 direction;// normalized
uint wavelength;// nm
float intensity;// 0..1 should start at 1
};
layout(std430, binding = 0) buffer Photons {
Photon photons[];
};
// Size: 1440 bytes -> ~700,000 pixels per available gigabyte of ram
struct Pixel {
uint intensityAtWavelengths[NUM_WAVELENGTHS];// [0..1000]
};
layout(std430, binding = 1) buffer Pixels {
//Pixel pixels[TEX_WIDTH][TEX_HEIGHT];
// NVIDIAs linker takes ages to link if the sizes are specified :(
Pixel[] pixels;
};
uniform float xAxisScalingFactor;
vec2 getHorizontalRectangleAt(int i) {
float x = pow(float(i), xAxisScalingFactor);
float w = pow(float(i + 1), xAxisScalingFactor);
return vec2(x, w);
}
uniform float rectangleHeight;
struct Rectangle {
float x;
float y;
float w;
float h;
};
layout (local_size_variable) in;
void addToPixel(uvec2 idx, uint wavelength, uint intensity) {
if (idx.x >= 0u && idx.x < TEX_WIDTH && idx.y >= 0u && idx.y < TEX_HEIGHT) {
uint index = (idx.y * TEX_WIDTH) + idx.x;
atomicAdd(pixels[index].intensityAtWavelengths[wavelength - MIN_WAVELENGTH], intensity);
}
}
/// Returns the rectangle at the given indices.
Rectangle getRectangleAt(ivec2 indices) {
vec2 horRect = getHorizontalRectangleAt(indices.x);
return Rectangle(horRect.x, rectangleHeight * float(indices.y), horRect.y, rectangleHeight);
}
uniform float shadowLength;
uniform float shadowHeight;
/// Returns the indices of the rectangle at the given location
ivec2 getRectangleIdxAt(vec2 location) {
int x = 0;
int y = int(location.y / rectangleHeight);
return ivec2(x, y);
}
float getRayIntersectAtX(Photon ray, float x) {
float slope = ray.direction.y / ray.direction.x;
return slope * (x - ray.position.x) + ray.position.y;
}
ivec2 getRayRectangleExitEdge(Photon ray, Rectangle rect) {
float intersectHeight = getRayIntersectAtX(ray, rect.x + rect.w);
// IF ONE OF THE FIRST TWO CONDITIONS GETS REMOVED IT WORKS WITH 1'000'000 PHOTONS OTHERWISE ONLY 100'000 WHY?
if (intersectHeight < rect.y) {
return ivec2(0, -1);
} else if (intersectHeight > rect.y + rect.h) {
return ivec2(0, 1);
} else {
return ivec2(1, 0);
}
}
void main() {
uint gid = gl_GlobalInvocationID.x;
if (gid >= photons.length()) return;
Photon photon = photons[gid];
ivec2 photonTexIndices = getRectangleIdxAt(photon.position);
while (photonTexIndices.x < TEX_WIDTH && photonTexIndices.y < TEX_HEIGHT &&
photonTexIndices.x >= 0        && photonTexIndices.y >= 0) {
// need to convert to uint for atomic add operations...
addToPixel(uvec2(photonTexIndices), photon.wavelength, uint(photon.intensity * 100.0));
ivec2 dir = getRayRectangleExitEdge(photon, getRectangleAt(photonTexIndices));
photonTexIndices += dir;
// When the ray goes out of bounds on the bottom then mirror it to simulate rays coming from
// the other side of the planet. This works because of the rotational symmetry of the system.
// IF COMMENTET OUT IT WORKS WITH 1'000'000 PHOTONS OTHERWISE ONLY 100'000 WHY?
if (photonTexIndices.y < 0) {
photonTexIndices.y = 0;
photon.position.y *= -1.0;
photon.direction.y *= -1.0;
}
}
}

Tracer.hpp

#ifndef TEXTURE_TRACER_HPP
#define TEXTURE_TRACER_HPP
#include <glm/glm.hpp>
#include <random>
namespace gpu {
// 6 * 4 = 24 Bytes
struct Photon {
glm::vec2 position;  // m
glm::vec2 direction; // normalized
uint32_t waveLength; // nm
float intensity;     // 0..1 should start at 1
};
class TextureTracer {
public:
TextureTracer();
uint32_t createShadowMap(size_t numPhotons);
private:
void initTextureTracer();
void traceThroughTexture(uint32_t ssboPhotons, size_t numPhotons);
Photon emitPhoton();
std::vector<Photon> generatePhotons(uint32_t count);
struct {
uint32_t uRectangleHeight;
uint32_t uShadowLength;
uint32_t uShadowHeight;
uint32_t uXAxisScalingFactor;
} mTextureTracerUniforms;
uint32_t mTextureTracerProgram;
std::mt19937_64 mRNG;
std::uniform_real_distribution<> mDistributionSun;
std::uniform_int_distribution<uint32_t> mDistributionWavelength;
std::bernoulli_distribution mDistributionBoolean;
};
} // namespace gpu
#endif // TEXTURE_TRACER_HPP

示踪剂.cpp

#include "TextureTracer.hpp"
#include <GL/glew.h>
#include <algorithm>
#include <fstream>
#include <iostream>
#include <random>
#include <string>
#include <vector>
void GLAPIENTRY MessageCallback(GLenum source, GLenum type, GLuint id,
GLenum severity, GLsizei length,
const GLchar *message, const void *userParam) {
if (type == GL_DEBUG_TYPE_ERROR)
fprintf(stderr, "GL ERROR: type = 0x%x, severity = 0x%x, message = %sn",
type, severity, message);
else
fprintf(stdout, "GL INFO: type = 0x%x, severity = 0x%x, message = %sn",
type, severity, message);
}
namespace gpu {
const double TEX_HEIGHT_TO_RADIUS_FACTOR = 4;
const double TEX_SHADOW_LENGTH_FACTOR = 8;
const uint32_t TEX_WIDTH = 1024u;
const uint32_t TEX_HEIGHT = TEX_WIDTH;
const double RADIUS = 6'371'000.0;
const double RADIUS_FACTORED = RADIUS * TEX_HEIGHT_TO_RADIUS_FACTOR;
const double SUN_RADIUS = 695'510'000.0;
const double DIST_TO_SUN = 149'600'000'000.0;
const double ATMO_HEIGHT = 42'000.0;
std::string loadShader(const std::string &fileName) {
std::ifstream shaderFileStream(fileName, std::ios::in);
if (!shaderFileStream.is_open()) {
std::cerr << "Could not load the GLSL shader from '" << fileName << "'!"
<< std::endl;
exit(-1);
}
std::string shaderCode;
while (!shaderFileStream.eof()) {
std::string line;
std::getline(shaderFileStream, line);
shaderCode.append(line + "n");
}
return shaderCode;
}
void TextureTracer::initTextureTracer() {
mTextureTracerProgram = glCreateProgram();
uint32_t rayTracingComputeShader = glCreateShader(GL_COMPUTE_SHADER);
std::string code = loadShader("../resources/TextureTracer.glsl");
const char *shader = code.c_str();
glShaderSource(rayTracingComputeShader, 1, &shader, nullptr);
glCompileShader(rayTracingComputeShader);
glAttachShader(mTextureTracerProgram, rayTracingComputeShader);
glLinkProgram(mTextureTracerProgram);
mTextureTracerUniforms.uRectangleHeight =
glGetUniformLocation(mTextureTracerProgram, "rectangleHeight");
mTextureTracerUniforms.uShadowHeight =
glGetUniformLocation(mTextureTracerProgram, "shadowHeight");
mTextureTracerUniforms.uShadowLength =
glGetUniformLocation(mTextureTracerProgram, "shadowLength");
mTextureTracerUniforms.uXAxisScalingFactor =
glGetUniformLocation(mTextureTracerProgram, "xAxisScalingFactor");
glDetachShader(mTextureTracerProgram, rayTracingComputeShader);
glDeleteShader(rayTracingComputeShader);
}
TextureTracer::TextureTracer()
: mRNG(1L), mDistributionSun(
std::uniform_real_distribution<>(-SUN_RADIUS, SUN_RADIUS)),
mDistributionWavelength(
std::uniform_int_distribution<uint32_t>(380, 739)),
mDistributionBoolean(std::bernoulli_distribution(0.5)) {
glEnable(GL_DEBUG_OUTPUT);
glDebugMessageCallback(MessageCallback, nullptr);
initTextureTracer();
}
double raySphereDistance(glm::dvec2 origin, glm::dvec2 direction,
glm::dvec2 center, double radius) {
glm::dvec2 m = origin - center;
double b = glm::dot(m, direction);
double c = glm::dot(m, m) - (radius * radius);
if (c > 0.0 && b > 0.0)
return -1.0;
double discr = b * b - c;
// A negative discriminant corresponds to ray missing sphere
if (discr < 0.0)
return -1.0;
// Ray now found to intersect sphere, compute smallest t value of intersection
return glm::max(0.0, -b - glm::sqrt(discr));
}
Photon TextureTracer::emitPhoton() {
std::uniform_real_distribution<> distributionEarth(0.0, ATMO_HEIGHT);
glm::dvec2 target = {0.0, RADIUS + distributionEarth(mRNG)};
double d;
do {
d = glm::length(glm::dvec2(mDistributionSun(mRNG), mDistributionSun(mRNG)));
} while (d > SUN_RADIUS);
glm::dvec2 startPosition =
glm::dvec2(-DIST_TO_SUN, mDistributionBoolean(mRNG) ? d : -d);
glm::dvec2 direction = glm::normalize(target - startPosition);
startPosition +=
direction * raySphereDistance(startPosition, direction, {0.0, 0.0},
RADIUS + ATMO_HEIGHT);
return {glm::vec2(0.0, startPosition.y), glm::vec2(direction),
mDistributionWavelength(mRNG), 1.0f};
}
std::vector<Photon> TextureTracer::generatePhotons(uint32_t count) {
std::vector<Photon> photons(count);
std::generate(photons.begin(), photons.end(),
[this]() { return emitPhoton(); });
return photons;
}
void TextureTracer::traceThroughTexture(uint32_t ssboPhotons,
size_t numPhotons) {
glUseProgram(mTextureTracerProgram);
glUniform1f(mTextureTracerUniforms.uRectangleHeight,
RADIUS_FACTORED / TEX_HEIGHT);
const double shadowLength =
TEX_SHADOW_LENGTH_FACTOR * (DIST_TO_SUN * RADIUS) / (SUN_RADIUS - RADIUS);
glUniform1f(mTextureTracerUniforms.uShadowLength, shadowLength);
glUniform1f(mTextureTracerUniforms.uShadowHeight, RADIUS_FACTORED);
const double xAxisScalingFactor =
glm::log(shadowLength) / glm::log(static_cast<double>(TEX_WIDTH));
glUniform1f(mTextureTracerUniforms.uXAxisScalingFactor,
static_cast<float>(xAxisScalingFactor));
const uint32_t MIN_WAVELENGTH = 380u;
const uint32_t MAX_WAVELENGTH = 740u;
const uint32_t NUM_WAVELENGTHS = MAX_WAVELENGTH - MIN_WAVELENGTH;
size_t pixelBufferSize =
TEX_WIDTH * TEX_HEIGHT * NUM_WAVELENGTHS * sizeof(uint32_t);
uint32_t ssboPixels;
glGenBuffers(1, &ssboPixels);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssboPixels);
glBufferData(GL_SHADER_STORAGE_BUFFER, pixelBufferSize, nullptr,
GL_DYNAMIC_COPY);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, ssboPhotons);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, ssboPixels);
const uint32_t numThreads = 32u;
const uint32_t numBlocks = numPhotons / numThreads;
std::cout << "numBlocks: " << numBlocks << std::endl;
glDispatchComputeGroupSizeARB(numBlocks, 1, 1, numThreads, 1, 1);
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
struct Pixel {
uint32_t intensityAtWavelengths[NUM_WAVELENGTHS];
};
std::vector<Pixel> pixels(TEX_WIDTH * TEX_HEIGHT);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssboPixels);
glGetBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, pixelBufferSize,
pixels.data());
glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
for (int y = 0; y < TEX_HEIGHT; ++y) {
printf("%4i | ", y);
for (int x = 0; x < TEX_WIDTH; ++x) {
Pixel p = pixels[y * TEX_WIDTH + x];
int counter = 0;
for (uint32_t i : p.intensityAtWavelengths) {
counter += i;
}
if (counter == 0) {
printf("  ");
} else if (counter > 100'000'000) {
printf("%4s", "u25A0");
} else if (counter > 10'000'000) {
printf("%4s", "u25A3");
} else if (counter > 1'000'000) {
printf("%4s", "u25A6");
} else if (counter > 100'000) {
printf("%4s", "u25A4");
} else {
printf("%4s", "u25A1");
}
}
std::cout << std::endl;
}
glDeleteBuffers(1, &ssboPixels);
glUseProgram(0);
}
uint32_t TextureTracer::createShadowMap(size_t numPhotons) {
std::vector<Photon> photons = generatePhotons(numPhotons);
uint32_t ssboPhotons;
glGenBuffers(1, &ssboPhotons);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssboPhotons);
glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(Photon) * photons.size(),
photons.data(), GL_DYNAMIC_COPY);
traceThroughTexture(ssboPhotons, photons.size());
glDeleteBuffers(1, &ssboPhotons);
glDeleteProgram(mTextureTracerProgram);
glDisable(GL_DEBUG_OUTPUT);
glDebugMessageCallback(nullptr, nullptr);
return 0;
}
}

主.cpp

#include <GL/glew.h>
#include <GL/glut.h>
#include "TextureTracer.hpp"
int main(int argc, char *argv[]) {
glutInit(&argc, argv);
glutCreateWindow("OpenGL needs a window o.O");
glewInit();
auto mapper = gpu::TextureTracer();
// WITH 100'000 PHOTONS IT WORKS, WITH 1'000'000 PHOTONS NOT WHY?
mapper.createShadowMap(100'000);
return 0;
}

如果 GPU 程序执行时间过长,操作系统会取消它们。在Windows上通常是两秒,在Linux上大部分时间是五秒,但它可能会有所不同。

这是为了检测卡住的GPU程序并取消它们。有不同的方法可以解决此超时问题,但它们都需要管理员/root 权限,这并不总是可用的。

如果可能,可以将执行拆分为多个调用,如以下代码片段所示:

const uint32_t passSize   = 2048u;
const uint32_t numPasses  = (numPhotons / passSize) + 1;
const uint32_t numThreads = 64u;
const uint32_t numBlocks  = passSize / numThreads;
glUniform1ui(glGetUniformLocation(mTextureTracerProgram, "passSize"), passSize);
for (uint32_t pass = 0u; pass < numPasses; ++pass) {
glUniform1ui(glGetUniformLocation(mTextureTracerProgram, "pass"), pass);
glDispatchComputeGroupSizeARB(numBlocks, 1, 1, numThreads, 1, 1);
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
glFlush();
glFinish();
}

glFlush()glFinish()调用很重要,否则执行将捆绑在一起,操作系统无论如何都会触发超时。

在着色器中,您只需要访问输入数据的正确部分,如下所示:

// other stuff
uniform uint pass;
uniform uint passSize;
void main() {
uint gid = gl_GlobalInvocationID.x;
uint passId = pass * passSize + gid;
if (passId >= photons.length()) return;
Photon photon = photons[passId];
// rest of program
}

仅此而已。

如果你想禁用操作系统超时,这里有一个相关的帖子 Linux: https://stackoverflow.com/a/30520538/5543884

这是一篇关于Windows的帖子:https://stackoverflow.com/a/29759823/5543884