CUDA 程序没有预期的那么快

CUDA Program not working as fast as expected

本文关键字：程序 CUDA 更新时间：2023-10-16

我一直在试验CUDA/C++，并决定制作一个n身体模拟器。它模拟了4096粒子之间的引力。它以大约 2 或 3 FPS 的速度运行，我不完全确定为什么。正在使用的显卡是GTX 980 Ti，所以我希望该程序能够顺利运行。我知道它可能没有尽其所能进行优化，但我没想到它会运行得这么慢。

代码只应该是一个原型，所以代码无论如何都不整洁(或正确编写)。

main.cu

#include <Windows.h>
#include <GL/glew.h>
#include <GL/freeglut.h>
#include <iostream>
#include <vector>
#include <math.h>
#include "Particle.h"
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <ctime>
#include <string>
#define N 4096
#define DT 0.00001
# define M_PI           3.14159265358979323846  /* pi */
using namespace std;
Particle p[N];
int frames = 0;
clock_t starttime = clock();
clock_t timepassed = 0;
bool first = true;
float fps = 0.0f;
__global__ void updateParticle(Particle* out, Particle *pin)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
double velx = 0;
double vely = 0;
out[i].mass = pin[i].mass;
for(int j = 0; j < N; j++)
{
if (i == j || pin[j].mass == 0 || pin[i].mass == 0)
continue;
double difx = pin[i].posx - pin[j].posx;
double dify = pin[i].posy - pin[j].posy;
double len = difx * difx + dify * dify;
if (len == 0)
continue;
double force = (pin[i].mass * pin[j].mass) / len;
len = sqrt(len);
double dirx = -difx / len;
double diry = -dify / len;
dirx *= force;
diry *= force;
velx += (dirx / pin[i].mass + pin[i].velx) * DT;
vely += (diry / pin[i].mass + pin[i].vely) * DT;
}
out[i].posx = pin[i].posx + velx;
out[i].posy = pin[i].posy + vely;
out[i].velx = pin[i].velx;
out[i].vely = pin[i].vely;
while (out[i].posx > 1)
out[i].posx--;
while (out[i].posx < -1)
out[i].posx++;
while (out[i].posy > 1)
out[i].posy--;
while (out[i].posy < -1)
out[i].posy++;
}
void changeViewPort(int w, int h)
{
glViewport(0, 0, w, h);
}
void renderMore()
{
for (int i = 0; i < N; ++i)
{
if (p[i].mass == 0)
continue;
if (p[i].mass == 1)
glColor3f(1, 1, 1);
else
glColor3f(1, 0, 0);
glBegin(GL_LINE_LOOP);
for (int j = 0; j <= 4; j++) {
double angle = 2 * M_PI * j / 300;
double x = cos(angle) * 0.001;
double y = sin(angle) * 0.001;
x *= p[i].mass;
y *= p[i].mass;
glVertex2d(x + p[i].posx, y + p[i].posy);
}
glEnd();
}
}
void render(void)
{
if(first)
{
frames = 0;
starttime = clock();
first = false;
}
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
renderMore();
glutSwapBuffers();
frames++;
}
void moveCuda(Particle* in, Particle* out)
{
Particle *device_p = nullptr;
Particle *device_res = nullptr;
cudaError_t cudaStatus;
int size = N * sizeof(Particle);
cudaStatus = cudaMalloc((void**)&device_res, size);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
}
cudaStatus = cudaMalloc((void**)&device_p, size);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
}
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
}
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(device_p, in, size, cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
updateParticle << <N / 1024, 1024 >> >(device_res, device_p);
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "kernel launch failed: %sn", cudaGetErrorString(cudaStatus));
}
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!n", cudaStatus);
}
cudaStatus = cudaMemcpy(out, device_res, size, cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
cudaFree(device_res);
cudaFree(device_p);
}
void update(int)
{
Particle temp[N] = {};
moveCuda(p, temp);
for (int i = 0; i < N; ++i)
p[i] = temp[i];
fps = (double)frames / ((clock() - starttime) / 1000);
const string a = "FPS: " + to_string(fps);
glutSetWindowTitle(a.c_str());
glutTimerFunc(100.0 / 60, update, -1);
}
void idle()
{
glutPostRedisplay();
}
int main(int argc, char* argv[])
{
for (int i = 0; i < N; ++i)
{
p[i] = Particle();
}
// Initialize GLUT
glutInit(&argc, argv);
// Set up some memory buffers for our display
glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA | GLUT_DEPTH);
// Set the window size
glutInitWindowSize(1000, 1000);
// Create the window with the title "Hello,GL"
glutCreateWindow("Hello World");
// Bind the two functions (above) to respond when necessary
glutReshapeFunc(changeViewPort);
glutDisplayFunc(render);
glutTimerFunc(100.0 / 60, update, -1);
glutIdleFunc(idle);

// Very important!  This initializes the entry points in the OpenGL driver so we can 
// call all the functions in the API.
GLenum err = glewInit();
if (GLEW_OK != err) {
fprintf(stderr, "GLEW error");
return 1;
}
render();
glutMainLoop();
return 0;
}

粒子.cpp

#include "Particle.h"
#include "stdlib.h"
#include <host_defines.h>
Particle::Particle()
{
posx = (((double)rand() / (RAND_MAX)) * 2) - 1;
posy = (((double)rand() / (RAND_MAX)) * 2) - 1;
velx = ((((double)rand() / (RAND_MAX)) * 2) - 1) / 4;
vely = ((((double)rand() / (RAND_MAX)) * 2) - 1) / 4;
mass = 1;
}

粒子.h

#pragma once
class Particle
{
public:
Particle();
void Update();
double posx;
double posy;
double velx;
double vely;
double mass;
};

当我删除设置图形设备的行时，它会抛出错误，但继续以 2-3 fps 的速度运行。这可能表明它在获取我的显卡时遇到问题，尽管我不确定该怎么做。当我将其设置为 cudaSetDevice(0) 时，它不会抛出错误。显卡工作正常，显示器已连接到它并正常工作。

如果有人能提供一些指示或建议，我将不胜感激。

首先，您可能想研究 CUDA nbody 示例代码，因为它在公开编写良好的代码方面会比我做得更好。另请注意，该示例包含指向本章的链接，该链接也很有启发性。

我将介绍一个似乎比原始代码运行得快得多的代码。以下是我应用的一般策略：

确保生成的是发布项目，而不是调试项目。
不要做不必要的cudaMalloc/cudaFree或cudaMemcpy操作。最好执行一次这些分配，然后重复使用它们。由于您没有修改主机代码中的任何内容(位置、速度)，这也意味着我们实际上不需要为moveCuda的每次迭代更新设备。只需将数据留在设备上即可。这将我们减少到单个cudaMemcpy操作，以便我们可以执行OpenGL内容(但见下文)。我似乎从中得到了大约 3 倍的提升。我还实现了"乒乓球"缓冲策略，以避免不必要的复制。
使用float而不是double。这有几个好处。首先，它减少了内存流量，因为您检索的数据量是原来的一半。其次，您使用的 GPU 的float吞吐量(数学运算)明显高于double。我真的不认为这是一个计算绑定的内核，所以我认为内存流量是更大的问题。我似乎从中又得到了 3 倍的提升。
将您的粒子从 AoS 转换为 SoA。这个主题在cuda标签以及许多其他地方都有介绍，所以我不打算在这里回顾它。我还没有完全做到这一点，相反，我做了部分转换(将质量移除到一个单独的数组)，然后对剩余float4量的速度 x/y 和位置 x/y 使用"矢量载荷"策略。这是一个示例答案，讨论了 AoS->SoA 转换及其价值原因，以及我在此处采用的向量加载"快捷方式"。
4096 是现代 GPU 相对较少的线程数。从 1024 线程块切换到 512 线程块可能会带来一点好处。这为内核提供了更好的机会来填充 GPU 上的可用 SM。如果您只有 4 条或更少的 SM，这不会有太大区别，但您的 980 Ti 有22条短信，因此我们见证最高性能的最佳机会是在每个 SM 上至少放置 1 个块。因此，您甚至可能想尝试 256 个线程的块(总共 16 个线程块)。
这是一组相当"昂贵"的计算：
```
len = sqrt(len);
double dirx = -difx / len;
double diry = -dify / len;
```
事实证明，rsqrtf()和sqrtf()一样容易计算，这样我们就可以将随后的浮点除法运算转换为浮点乘法运算。

通过这些基本步骤，我能够在非常旧的 GPU 上达到大约 30fps，您可能应该见证比这更好的东西。我正在研究 linux，但我不相信我所做的任何更改都应该在 Windows 下"中断"。

#include <GL/glew.h>
#include <GL/freeglut.h>
#include <iostream>
#include <vector>
#include <math.h>
#include <ctime>
#include <string>
#include <cstdlib>
#include <cstdio>
#include <time.h>
#define N 4096
#define DT 0.00001
#define M_PI           3.14159265358979323846  /* pi */

class Particle
{
public:
Particle();
float4 p;
};
Particle::Particle()
{
p.x = (((double)rand() / (RAND_MAX)) * 2) - 1;
p.y = (((double)rand() / (RAND_MAX)) * 2) - 1;
p.z = ((((double)rand() / (RAND_MAX)) * 2) - 1) / 4;
p.w = ((((double)rand() / (RAND_MAX)) * 2) - 1) / 4;
}
const int size = N * sizeof(Particle);

using namespace std;
Particle p[N];
float pmass[N];
Particle *d_p1, *d_p2;
float *d_pmass1, *d_pmass2;
int ping_pong = 0;
float et;
cudaEvent_t start, stop;
int frames = 0;
clock_t starttime = clock();
clock_t timepassed = 0;
bool first = true;
float fps = 0.0f;
__global__ void updateParticle(Particle * __restrict__  out, float * __restrict__ pmass_out,  const Particle * __restrict__ pin, const float * __restrict__ pmass_in)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
float velx = 0;
float vely = 0;
Particle my_i = pin[i];
float my_mass_i = pmass_in[i];
pmass_out[i] = my_mass_i;
for(int j = 0; j < N; j++)
{
float my_mass_j = pmass_in[j];
if (i == j || my_mass_i == 0 || my_mass_j == 0)
continue;
Particle my_j = pin[j];
float difx = my_i.p.x - my_j.p.x;
float dify = my_i.p.y - my_j.p.y;
float len = difx * difx + dify * dify;
if (len == 0)
continue;
float force = (my_mass_i * my_mass_j) / len;
len = rsqrtf(len);
float dirx = -difx * len;
float diry = -dify * len;
dirx *= force;
diry *= force;
velx += (dirx / my_mass_i + my_i.p.z) * DT;
vely += (diry / my_mass_i + my_i.p.w) * DT;
}
Particle my_out_i = my_i;
my_out_i.p.x = my_i.p.x + velx;
my_out_i.p.y = my_i.p.y + vely;
my_out_i.p.z = my_i.p.z;
my_out_i.p.w = my_i.p.w;
if (my_out_i.p.x > 1)
my_out_i.p.x = 1;
if (my_out_i.p.x < -1)
my_out_i.p.x = -1;
if (my_out_i.p.y > 1)
my_out_i.p.y = 1;
if (my_out_i.p.y < -1)
my_out_i.p.y = -1;
out[i] = my_out_i;
}
void changeViewPort(int w, int h)
{
glViewport(0, 0, w, h);
}
void renderMore()
{
for (int i = 0; i < N; ++i)
{
if (pmass[i] == 0)
continue;
if (pmass[i] == 1)
glColor3f(1, 1, 1);
else
glColor3f(1, 0, 0);
glBegin(GL_LINE_LOOP);
for (int j = 0; j <= 4; j++) {
double angle = 2 * M_PI * j / 300;
double x = cos(angle) * 0.001;
double y = sin(angle) * 0.001;
x *= pmass[i];
y *= pmass[i];
glVertex2d(x + p[i].p.x, y + p[i].p.y);
}
glEnd();
}
}
void render(void)
{
if(first)
{
frames = 0;
starttime = clock();
first = false;
}
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
renderMore();
glutSwapBuffers();
frames++;
}
void moveCuda(Particle* in, Particle* out)
{
Particle *d_pi;
Particle *d_po;
float *d_pmassi, *d_pmasso;
cudaError_t cudaStatus;
if (ping_pong) {
d_pi = d_p2;
d_po = d_p1;
d_pmassi = d_pmass2;
d_pmasso = d_pmass1;
ping_pong = 0;}
else {
d_pi = d_p1;
d_po = d_p2;
d_pmassi = d_pmass1;
d_pmasso = d_pmass2;
ping_pong = 1;}
cudaEventRecord(start);
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
}
updateParticle << <N / 256, 256 >> >(d_po, d_pmasso,  d_pi, d_pmassi);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "kernel launch failed: %sn", cudaGetErrorString(cudaStatus));
}
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!n", cudaStatus);
}
cudaEventRecord(stop);
cudaStatus = cudaMemcpy(out, d_po, size, cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
//cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&et, start, stop);
}
void update(int)
{
Particle temp[N] = {};
moveCuda(p, temp);
for (int i = 0; i < N; ++i)
p[i] = temp[i];
char a[64];
fps = (float)frames / ((clock() - starttime) / CLOCKS_PER_SEC);
sprintf(a, "FPS: %f, et: %f", fps, et);
glutSetWindowTitle(a);
glutTimerFunc(100.0 / 60, update, -1);
}
void idle()
{
glutPostRedisplay();
}
int main(int argc, char* argv[])
{
for (int i = 0; i < N; ++i)
{
p[i] = Particle();
pmass[i] = 1;
//    p[i].p();
}
cudaMalloc((void**)&d_p2, size);
cudaMalloc((void**)&d_p1, size);
cudaMalloc((void**)&d_pmass2, N*sizeof(float));
cudaMalloc((void**)&d_pmass1, N*sizeof(float));
cudaMemcpy(d_p1, p, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_pmass1, pmass, N*sizeof(float), cudaMemcpyHostToDevice);
cudaEventCreate(&start); cudaEventCreate(&stop);
// Initialize GLUT
glutInit(&argc, argv);
// Set up some memory buffers for our display
glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA | GLUT_DEPTH);
// Set the window size
glutInitWindowSize(1000, 1000);
// Create the window with the title "Hello,GL"
glutCreateWindow("Hello World");
// Bind the two functions (above) to respond when necessary
glutReshapeFunc(changeViewPort);
glutDisplayFunc(render);
glutTimerFunc(100.0 / 60, update, -1);
glutIdleFunc(idle);

// Very important!  This initializes the entry points in the OpenGL driver so we can
// call all the functions in the API.
GLenum err = glewInit();
if (GLEW_OK != err) {
fprintf(stderr, "GLEW error");
return 1;
}
render();
glutMainLoop();
return 0;
}

我不声称这是无缺陷的代码，(我认为你的不是)，但它在图形上的行为似乎与您的原始代码大致相同。例如，在你的代码中，你在内核的末尾有这个：

out[i].velx = pin[i].velx;
out[i].vely = pin[i].vely;

这对我来说看起来不对，但它不是这里讨论的性能的核心。

如果你知道你的质量总是 1 或 0，那么你可以对这段代码进行大量的额外优化，但我没有追求这一点。

作为附加说明，您可能需要考虑 CUDA/OpenGL 互操作策略，以摆脱保留在此处的设备>主机副本，并将数据永久移动到 GPU。同样，CUDA nbody 示例代码可以是一个路线图，如果你想开始使用 CUDA/GL 互操作，我认为这个演示文稿有点过时，但是一个很好的起点。