How do you measure peak memory bandwidth in OpenGL?
To get an idea of what speed I should expect, I have been trying to benchmark transfers between global memory and shaders, rather than relying on GPU spec sheets. However, I can't get anywhere near the theoretical maximum. In fact, I'm out by a factor of 50!
I'm using a GTX Titan X, which is quoted at 336.5 GB/s. Linux x64 driver 352.21.
I found a CUDA benchmark here which gives me ~240–250 GB/s (which is more what I expect).
I'm trying to match exactly what they do with shaders. I've tried vertex shaders, compute shaders, accessing buffer objects via image_load_store and NV_shader_buffer_store, using floats, vec4s, loops inside the shader (with coalesced addressing within the work group), and various methods of timing. I'm stuck at ~7 GB/s (see the update below).
Why is GL so much slower? Am I doing something wrong and, if so, how should it be done?
Here is my MWE with three methods (1. vertex shader with image_load_store, 2. vertex shader with bindless graphics, 3. compute shader with bindless graphics):
//#include <windows.h>
#include <assert.h>
#include <stdio.h>
#include <memory.h>
#include <GL/glew.h>
#include <GL/glut.h>
const char* imageSource =
"#version 440\n"
"uniform layout(r32f) imageBuffer data;\n"
"uniform float val;\n"
"void main() {\n"
" imageStore(data, gl_VertexID, vec4(val, 0.0, 0.0, 0.0));\n"
" gl_Position = vec4(0.0);\n"
"}\n";
const char* bindlessSource =
"#version 440\n"
"#extension GL_NV_gpu_shader5 : enable\n"
"#extension GL_NV_shader_buffer_load : enable\n"
"uniform float* data;\n"
"uniform float val;\n"
"void main() {\n"
" data[gl_VertexID] = val;\n"
" gl_Position = vec4(0.0);\n"
"}\n";
const char* bindlessComputeSource =
"#version 440\n"
"#extension GL_NV_gpu_shader5 : enable\n"
"#extension GL_NV_shader_buffer_load : enable\n"
"layout(local_size_x = 256) in;\n"
"uniform float* data;\n"
"uniform float val;\n"
"void main() {\n"
" data[gl_GlobalInvocationID.x] = val;\n"
"}\n";
GLuint compile(GLenum type, const char* shaderSrc)
{
GLuint shader = glCreateShader(type);
glShaderSource(shader, 1, (const GLchar**)&shaderSrc, NULL);
glCompileShader(shader);
int success = 0;
int loglen = 0;
glGetShaderiv(shader, GL_COMPILE_STATUS, &success);
glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &loglen);
GLchar* log = new GLchar[loglen];
glGetShaderInfoLog(shader, loglen, &loglen, log);
if (!success)
{
printf("%sn", log);
exit(0);
}
GLuint program = glCreateProgram();
glAttachShader(program, shader);
glLinkProgram(program);
return program;
}
GLuint timerQueries[2];
void start()
{
glGenQueries(2, timerQueries);
glQueryCounter(timerQueries[0], GL_TIMESTAMP);
}
float stop()
{
glMemoryBarrier(GL_ALL_BARRIER_BITS);
GLsync sync = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
glWaitSync(sync, 0, GL_TIMEOUT_IGNORED);
glQueryCounter(timerQueries[1], GL_TIMESTAMP);
GLint available = 0;
while (!available) //sometimes gets stuck here for whatever reason
glGetQueryObjectiv(timerQueries[1], GL_QUERY_RESULT_AVAILABLE, &available);
GLuint64 a, b;
glGetQueryObjectui64v(timerQueries[0], GL_QUERY_RESULT, &a);
glGetQueryObjectui64v(timerQueries[1], GL_QUERY_RESULT, &b);
glDeleteQueries(2, timerQueries);
return b - a;
}
int main(int argc, char** argv)
{
float* check;
glutInit(&argc, argv);
glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB | GLUT_DEPTH);
glutCreateWindow("test");
glewInit();
int bufferSize = 64 * 1024 * 1024; //64MB
int loops = 500;
glEnable(GL_RASTERIZER_DISCARD);
float* dat = new float[bufferSize/sizeof(float)];
memset(dat, 0, bufferSize);
//create a buffer with data
GLuint buffer;
glGenBuffers(1, &buffer);
glBindBuffer(GL_TEXTURE_BUFFER, buffer);
glBufferData(GL_TEXTURE_BUFFER, bufferSize, NULL, GL_STATIC_DRAW);
//get a bindless address
GLuint64 address;
glMakeBufferResidentNV(GL_TEXTURE_BUFFER, GL_READ_WRITE);
glGetBufferParameterui64vNV(GL_TEXTURE_BUFFER, GL_BUFFER_GPU_ADDRESS_NV, &address);
//make a texture alias for it
GLuint bufferTexture;
glGenTextures(1, &bufferTexture);
glBindTexture(GL_TEXTURE_BUFFER, bufferTexture);
glTexBuffer(GL_TEXTURE_BUFFER, GL_R32F, buffer);
glBindImageTextureEXT(0, bufferTexture, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R32F);
//compile the shaders
GLuint imageShader = compile(GL_VERTEX_SHADER, imageSource);
GLuint bindlessShader = compile(GL_VERTEX_SHADER, bindlessSource);
GLuint bindlessComputeShader = compile(GL_COMPUTE_SHADER, bindlessComputeSource);
//warm-up and check values
glBufferData(GL_TEXTURE_BUFFER, bufferSize, dat, GL_STATIC_DRAW);
glUseProgram(imageShader);
glUniform1i(glGetUniformLocation(imageShader, "data"), 0);
glUniform1f(glGetUniformLocation(imageShader, "val"), 1.0f);
glDrawArrays(GL_POINTS, 0, bufferSize/sizeof(float));
glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
//check = (float*)glMapBuffer(GL_TEXTURE_BUFFER, GL_READ_ONLY);
//for (int i = 0; i < bufferSize/sizeof(float); ++i)
// assert(check[i] == 1.0f);
//glUnmapBuffer(GL_TEXTURE_BUFFER);
glBufferData(GL_TEXTURE_BUFFER, bufferSize, dat, GL_STATIC_DRAW);
glUseProgram(bindlessShader);
glProgramUniformui64NV(bindlessShader, glGetUniformLocation(bindlessShader, "data"), address);
glUniform1f(glGetUniformLocation(bindlessShader, "val"), 1.0f);
glDrawArrays(GL_POINTS, 0, bufferSize/sizeof(float));
//glMemoryBarrier(GL_ALL_BARRIER_BITS); //this causes glDispatchCompute to segfault later, so don't uncomment
//check = (float*)glMapBuffer(GL_TEXTURE_BUFFER, GL_READ_ONLY);
//for (int i = 0; i < bufferSize/sizeof(float); ++i)
// assert(check[i] == 1.0f);
//glUnmapBuffer(GL_TEXTURE_BUFFER);
glBufferData(GL_TEXTURE_BUFFER, bufferSize, dat, GL_STATIC_DRAW);
glUseProgram(bindlessComputeShader);
glProgramUniformui64NV(bindlessComputeShader, glGetUniformLocation(bindlessComputeShader, "data"), address);
glUniform1f(glGetUniformLocation(bindlessComputeShader, "val"), 1.0f);
glDispatchCompute(bufferSize/(sizeof(float) * 256), 1, 1);
glMemoryBarrier(GL_ALL_BARRIER_BITS);
//check = (float*)glMapBuffer(GL_TEXTURE_BUFFER, GL_READ_ONLY);
//for (int i = 0; i < bufferSize/sizeof(float); ++i)
// assert(check[i] == 1.0f); //glDispatchCompute doesn't actually write anything with bindless graphics
//glUnmapBuffer(GL_TEXTURE_BUFFER);
glFinish();
//time image_load_store
glUseProgram(imageShader);
glUniform1i(glGetUniformLocation(imageShader, "data"), 0);
glUniform1f(glGetUniformLocation(imageShader, "val"), 1.0f);
start();
for (int i = 0; i < loops; ++i)
glDrawArrays(GL_POINTS, 0, bufferSize/sizeof(float));
GLuint64 imageTime = stop();
printf("image_load_store: %.2fGB/sn", (float)((bufferSize * (double)loops) / imageTime));
//time bindless
glUseProgram(bindlessShader);
glProgramUniformui64NV(bindlessShader, glGetUniformLocation(bindlessShader, "data"), address);
glUniform1f(glGetUniformLocation(bindlessShader, "val"), 1.0f);
start();
for (int i = 0; i < loops; ++i)
glDrawArrays(GL_POINTS, 0, bufferSize/sizeof(float));
GLuint64 bindlessTime = stop();
printf("bindless: %.2fGB/sn", (float)((bufferSize * (double)loops) / bindlessTime));
//time bindless in a compute shader
glUseProgram(bindlessComputeShader);
glProgramUniformui64NV(bindlessComputeShader, glGetUniformLocation(bindlessComputeShader, "data"), address);
glUniform1f(glGetUniformLocation(bindlessComputeShader, "val"), 1.0f);
start();
for (int i = 0; i < loops; ++i)
glDispatchCompute(bufferSize/(sizeof(float) * 256), 1, 1);
GLuint64 bindlessComputeTime = stop();
printf("bindless compute: %.2fGB/sn", (float)((bufferSize * (double)loops) / bindlessComputeTime));
assert(glGetError() == GL_NO_ERROR);
return 0;
}
My output:
image_load_store: 6.66GB/s
bindless: 6.68GB/s
bindless compute: 6.65GB/s
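(For scale: GL_TIMESTAMP results are in nanoseconds, so the printed figure is just bufferSize × loops bytes divided by the elapsed nanoseconds, which comes out numerically in GB/s. With 64 MiB × 500 ≈ 33.5 GB written, ~6.7 GB/s corresponds to about 5 seconds of GPU time per test.)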
Some notes:
- The compute shader with bindless graphics doesn't appear to write anything (the commented-out asserts fail), or at least the data isn't retrieved with glMapBuffer, even though the speed matches the other methods. Using image_load_store in the compute shader works and gives the same speed as the vertex shaders (though I figured that was one too many permutations to post).
- Calling glMemoryBarrier(GL_ALL_BARRIER_BITS) before glDispatchCompute causes a crash in the driver.
- Commenting out the three glBufferData(GL_TEXTURE_BUFFER, bufferSize, dat, GL_STATIC_DRAW); calls that are used to check the output raises the speed of the first two tests to 17 GB/s and the compute shader rockets to 292 GB/s, which is much closer to what I want, but it can't be trusted because of point 1.
- Sometimes while (!available) hangs for ages (ctrl-c when I get tired of waiting shows it was still in the loop); a blocking alternative is sketched just after this list.
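As an aside on the timing code, a minimal sketch, assuming only a current GL context, of reading the timestamps by blocking on GL_QUERY_RESULT (which waits until each result is available) rather than spinning on GL_QUERY_RESULT_AVAILABLE; the helper name is mine:
// Reads two GL_TIMESTAMP queries; GL_QUERY_RESULT blocks until each result
// is available, so no manual availability polling loop is needed.
GLuint64 elapsedNanoseconds(GLuint startQuery, GLuint endQuery)
{
    GLuint64 t0 = 0, t1 = 0;
    glGetQueryObjectui64v(startQuery, GL_QUERY_RESULT, &t0);
    glGetQueryObjectui64v(endQuery, GL_QUERY_RESULT, &t1);
    return t1 - t0; // GL_TIMESTAMP values are in nanoseconds
}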
For reference, here is the CUDA code:
//http://www.ks.uiuc.edu/Research/vmd/doxygen/CUDABench_8cu-source.html
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
#define CUERR { cudaError_t err; \
  if ((err = cudaGetLastError()) != cudaSuccess) { \
  printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
  return -1; }}
//
// GPU device global memory bandwidth benchmark
//
template <class T>
__global__ void gpuglobmemcpybw(T *dest, const T *src) {
const unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
dest[idx] = src[idx];
}
template <class T>
__global__ void gpuglobmemsetbw(T *dest, const T val) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
dest[idx] = val;
}
typedef float4 datatype;
static int cudaglobmembw(int cudadev, double *gpumemsetgbsec, double *gpumemcpygbsec) {
int i;
int len = 1 << 22; // one thread per data element
int loops = 500;
datatype *src, *dest;
datatype val=make_float4(1.0f, 1.0f, 1.0f, 1.0f);
// initialize to zero for starters
float memsettime = 0.0f;
float memcpytime = 0.0f;
*gpumemsetgbsec = 0.0;
*gpumemcpygbsec = 0.0;
// attach to the selected device
cudaError_t rc;
rc = cudaSetDevice(cudadev);
if (rc != cudaSuccess) {
#if CUDART_VERSION >= 2010
rc = cudaGetLastError(); // query last error and reset error state
if (rc != cudaErrorSetOnActiveProcess)
return -1; // abort and return an error
#else
cudaGetLastError(); // just ignore and reset error state, since older CUDA
// revs don't have a cudaErrorSetOnActiveProcess enum
#endif
}
cudaMalloc((void **) &src, sizeof(datatype)*len);
CUERR
cudaMalloc((void **) &dest, sizeof(datatype)*len);
CUERR
dim3 BSz(256, 1, 1);
dim3 GSz(len / (BSz.x * BSz.y * BSz.z), 1, 1);
// do a warm-up pass
gpuglobmemsetbw<datatype><<< GSz, BSz >>>(src, val);
CUERR
gpuglobmemsetbw<datatype><<< GSz, BSz >>>(dest, val);
CUERR
gpuglobmemcpybw<datatype><<< GSz, BSz >>>(dest, src);
CUERR
cudaEvent_t start, end;
cudaEventCreate(&start);
cudaEventCreate(&end);
// execute the memset kernel
cudaEventRecord(start, 0);
for (i=0; i<loops; i++) {
gpuglobmemsetbw<datatype><<< GSz, BSz >>>(dest, val);
}
CUERR
cudaEventRecord(end, 0);
CUERR
cudaEventSynchronize(start);
CUERR
cudaEventSynchronize(end);
CUERR
cudaEventElapsedTime(&memsettime, start, end);
CUERR
// execute the memcpy kernel
cudaEventRecord(start, 0);
for (i=0; i<loops; i++) {
gpuglobmemcpybw<datatype><<< GSz, BSz >>>(dest, src);
}
cudaEventRecord(end, 0);
CUERR
cudaEventSynchronize(start);
CUERR
cudaEventSynchronize(end);
CUERR
cudaEventElapsedTime(&memcpytime, start, end);
CUERR
cudaEventDestroy(start);
CUERR
cudaEventDestroy(end);
CUERR
*gpumemsetgbsec = (len * sizeof(datatype) / (1024.0 * 1024.0)) / (memsettime / loops);
*gpumemcpygbsec = (2 * len * sizeof(datatype) / (1024.0 * 1024.0)) / (memcpytime / loops);
cudaFree(dest);
cudaFree(src);
CUERR
return 0;
}
int main()
{
double a, b;
cudaglobmembw(0, &a, &b);
printf("%f %fn", (float)a, (float)b);
return 0;
}
Update:
It appears the buffer is being made non-resident by my glBufferData calls, which are there to check that the output is being written. As per the extension:
A buffer is also made non-resident implicitly as a result of being respecified via BufferData or being deleted.
...
BufferData is specified to "delete the existing data store", so the GPU address of that data should become invalid. The buffer is hence made non-resident in the current context.
At a guess, OpenGL then streams in the buffer object data each frame and doesn't cache it in video memory. This explains why the compute shader failed the assert, although there's a slight anomaly in that bindless graphics in the vertex shader still worked when not resident, but I'll ignore that for now. I have no idea why a 64 MB buffer object wouldn't default to being resident (though perhaps only after first use) when there's 12 GB available.
So after each call to glBufferData I make it resident again and grab the address in case it has changed:
glBufferData(GL_TEXTURE_BUFFER, bufferSize, dat, GL_STATIC_DRAW);
glMakeBufferResidentNV(GL_TEXTURE_BUFFER, GL_READ_WRITE);
glGetBufferParameterui64vNV(GL_TEXTURE_BUFFER, GL_BUFFER_GPU_ADDRESS_NV, &address);
assert(glIsBufferResidentNV(GL_TEXTURE_BUFFER)); //sanity check
I now get 270–290 GB/s with the compute shader using either image_load_store or bindless graphics. Now my questions include:
- Given that the buffer seems to be resident for each test and the compute shader is nice and fast, why are the vertex shader versions still so slow?
- Without the bindless graphics extension, how are regular OpenGL users supposed to put data in video memory (actually put, not idly suggest that the driver might just like to)?
- I'm pretty sure I would have noticed this problem in real-world situations, and it's this contrived benchmark that hits a slow path, so how could I trick the driver into making a buffer object resident? Running a compute shader first doesn't change anything.
You are asking the driver to read from your process memory, dat. This causes extensive cache coherency traffic. When the GPU reads that memory, it can't be sure that it is up to date; it might be sitting in the CPU cache, modified, and not yet written back to RAM. This forces the GPU to actually read from the CPU cache, which is far more expensive than bypassing the CPU and reading straight from RAM. The RAM is often idle during normal operation, because a modern CPU's hit rate is typically 95% to 99%; the cache is used continuously.
To achieve maximum performance, you need to let the driver allocate the memory. Normal memory that your program uses, like global variables and the heap, is allocated in write-back memory. Driver-allocated memory will usually be allocated as write-combining or uncacheable, which eliminates the coherency traffic.
Peak advertised bandwidth numbers will be achieved only without cache coherency overhead.
To let the driver allocate the data store, call glBufferData with nullptr for the data.
It isn't all rosy, though, if you manage to force the driver to use a system-memory write-combining buffer. CPU reads of such addresses will be very slow. Sequential writes are optimized by the CPU, but random writes will cause the write-combining buffer to flush frequently, hurting performance.
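A minimal sketch of that pattern, assuming the same GL_TEXTURE_BUFFER target and the bufferSize/dat variables from the question's MWE: allocate the store with a null data pointer so the driver chooses the placement, then upload the initial contents separately instead of handing glBufferData a pointer into process memory.
GLuint buf;
glGenBuffers(1, &buf);
glBindBuffer(GL_TEXTURE_BUFFER, buf);
// No client pointer: the driver allocates the data store in memory it controls.
glBufferData(GL_TEXTURE_BUFFER, bufferSize, nullptr, GL_STATIC_DRAW);
// Upload initial contents as a separate step if they are needed, so the
// shaders never have to read through the CPU's write-back cached pages.
glBufferSubData(GL_TEXTURE_BUFFER, 0, bufferSize, dat);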