在GPU中共享许多高斯-勒让德求积的根和权重

Sharing roots and weights for many Gauss-Legendre Quadrature in GPUs

本文关键字：权重、GPU、共享、高斯-勒让德求积　　更新时间：2023-10-16

我打算以并行方式计算大量数值求积（quadrature），这些求积归根结底都使用同一组公共数据进行计算（一个相当大的根和权重数组，占用大约25 KB的内存）。作为起步，高斯-勒让德求积法足够简单。我想通过声明 `__device__ double *d_droot, *d_dweight;` 使根和权重对设备中的所有线程都可用。但我遗漏了一些东西，因为我必须显式地把指向数组的指针传给内核，内核才能正常工作。正确的做法是什么？此外，为了在设备上留出更多可用内存，是否可以把根和权重固化到设备内存的某个常量区域中？

代码附有

#include <math.h>
#include <stdlib.h>
#include <stdio.h>

/* File-scope device pointers meant to expose the quadrature roots and weights
   to every kernel.
   NOTE(review): main() passes &d_droot / &d_dweight to cudaMalloc and the
   variables themselves to cudaMemcpy from host code; host code cannot take
   the address of, or directly read, a __device__ variable. */
__device__  double *d_droot, *d_dweight;

__device__ __host__
double f(double alpha,double x)
{
  /* Integrand of the Gauss-Legendre quadrature: exp(alpha * x).
     Callable from both host and device code. */
  const double exponent = alpha * x;
  return exp(exponent);
}
__global__
void lege_inte2(int n, double alpha, double a, double b, double *lroots, double *weight, double *result)
{
  /*
    Evaluates n identical 5-point Gauss-Legendre quadratures of f(alpha, x)
    over [a, b]. The thread with flat 1-D index i writes its value to
    result[i].

    Parameters:
    n: Total number of quadratures; threads with index >= n do nothing
    alpha: Parameter forwarded to the integrand f
    a: Lower integration limit
    b: Upper integration limit
    lroots[]: roots of the rule; each is mapped to [a, b] via c1 * root + c2
              (the first 5 entries are read)
    weight[]: weights of the rule (the first 5 entries are read)
    result[]: output array with room for at least n values
   */
  const int nodes = 5;            /* roots/weights consumed per quadrature */
  const double c1 = (b - a) / 2;  /* half-width of [a, b] */
  const double c2 = (b + a) / 2;  /* midpoint of [a, b] */
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n)
    {
      /* Accumulate in a local variable and store once, rather than doing a
         read-modify-write on result[i] in every iteration. */
      double sum = 0.0;
      for (int k = 0; k < nodes; k++)
        sum += weight[k] * f(alpha, c1 * lroots[k] + c2) * c1;
      result[i] = sum;
    }
}
__global__
void lege_inte2_shared(int n,double alpha, double a, double b,  double *result)
{
  /* NOTE(review): these block-scope declarations name dynamically sized
     shared memory and shadow the file-scope __device__ pointers of the same
     name. Nothing in this kernel writes to them before they are read below,
     and a launch would have to supply the shared-memory size as the third
     <<<...>>> argument. */
  extern __shared__ double *d_droot;
  extern __shared__ double *d_dweight;
  /*
    Intended to compute the same 5-point Gauss-Legendre quadrature as
    lege_inte2 without receiving the roots and weights as arguments; the
    thread with flat 1-D index i writes to result[i].

    Parameters:
    n: Total number of quadratures; threads with index >= n do nothing
    alpha: Parameter forwarded to the integrand f
    a: Lower integration limit (the roots are mapped via c1 * root + c2)
    b: Upper integration limit
    d_droot[]: roots for the quadrature (not a parameter; see note above)
    d_dweight[]: weights for the quadrature (not a parameter; see note above)
    result[]: receives the results of the n quadratures.
   */
  /* c1: half-width of [a, b]; c2: midpoint of [a, b]; sum is never used. */
  double c1 = (b - a) / 2, c2 = (b + a) / 2, sum = 0;
  int dummy;
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n)
    {
      result[i] = 0.0;
      for (dummy = 0; dummy < 5; dummy++)
    {
      result[i] += d_dweight[dummy] * f(alpha,c1 * d_droot[dummy] + c2)*c1;
      /* Debug output, once per thread per node.
         NOTE(review): the format ends in " n" -- presumably a "\n" whose
         backslash was lost; confirm against the original source. */
      printf(" Vale: %f n", d_dweight[dummy]);
    }
    }
}

/* Host driver: fills the 5-point Gauss-Legendre tables, launches one GPU
   thread per quadrature, and reports the largest deviation from the
   reference value 20.03574985 (approximately exp(3) - exp(-3)). */
int main(void)
{
  int N = 1<<23;      /* number of quadratures (2^23) */
  int N_nodes = 5;    /* nodes per quadrature */

  double *droot, *dweight, *dresult, *d_dresult;

  /* host-side buffers */
  droot =(double*)malloc(N_nodes*sizeof(double));
  dweight =(double*)malloc(N_nodes*sizeof(double));
  dresult =(double*)malloc(N*sizeof(double)); /* will receive the results of the N quadratures */

  /* device-side buffers
     NOTE(review): d_droot and d_dweight are the file-scope __device__
     variables; host code takes their address here, which nvcc flags. */
  cudaMalloc(&d_droot, N_nodes*sizeof(double));
  cudaMalloc(&d_dweight, N_nodes*sizeof(double));
  cudaMalloc(&d_dresult, N*sizeof(double)); /* results of the N quadratures are stored here */

  /* roots and weights of the 5-point rule */
  droot[0] = 0.90618;
  droot[1] = 0.538469;
  droot[2] = 0.0;
  droot[3] = -0.538469;
  droot[4] = -0.90618;

  dweight[0] = 0.236927;
  dweight[1] = 0.478629;
  dweight[2] = 0.568889;
  dweight[3] = 0.478629;
  dweight[4] = 0.236927;

  /* copy the tables host -> device
     NOTE(review): d_droot / d_dweight are __device__ variables read directly
     from host code here. */
  cudaMemcpy(d_droot, droot, N_nodes*sizeof(double), cudaMemcpyHostToDevice);
  cudaMemcpy(d_dweight, dweight, N_nodes*sizeof(double), cudaMemcpyHostToDevice);

  /* Launch one thread per quadrature, 256 threads per block, integrating
     f(1.0, x) over [-3, 3]. */
  lege_inte2<<<(N+255)/256, 256>>>(N,1.0,  -3.0, 3.0, d_droot, d_dweight, d_dresult); /* this kernel works OK */
  /* NOTE(review): the disabled launch below passes four arguments, while
     lege_inte2_shared takes five (alpha is missing). */
  //lege_inte2_shared<<<(N+255)/256, 256>>>(N,  -3.0, 3.0,  d_dresult); /*why this one does not work? */


  cudaMemcpy(dresult, d_dresult, N*sizeof(double), cudaMemcpyDeviceToHost); 
  double maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = max(maxError, abs(dresult[i]-20.03574985));
  /* NOTE(review): both formats end in " n" -- presumably "\n" with the
     backslash lost; confirm against the original source. */
  printf("Max error: %f in %i quadratures n", maxError, N);
  printf("integral: %f  n" ,dresult[0]);

  /* NOTE(review): dresult came from malloc, not cudaMalloc; d_dresult and the
     malloc'ed host buffers are never released. */
  cudaFree(dresult);
  cudaFree(d_droot);
  cudaFree(d_dweight);
}

以及编译它的makefile：

# Builds the `gauss` executable from main.cpp, compiled as CUDA source.
# The compiler and target architecture can be overridden on the command
# line, e.g.  make ARCH=sm_70
NVCC ?= nvcc
ARCH ?= sm_20
objects = main.o

.PHONY: all clean

all: gauss

# Link step; nvcc also performs the device link needed by the -dc objects.
gauss: $(objects)
	$(NVCC) -arch=$(ARCH) $(objects) -o gauss

# -x cu forces CUDA compilation of the .cpp source; -dc emits relocatable
# device code.
%.o: %.cpp
	$(NVCC) -x cu -arch=$(ARCH) -I. -dc $< -o $@

clean:
	rm -f *.o gauss

提前感谢您的任何建议

您对d_droot和d_dweight的处理存在各种错误。当我编译你的代码时，我会收到这样的各种警告：

t640.cu(86): warning: address of a __shared__ variable "d_droot" cannot be directly taken in a host function
t640.cu(87): warning: address of a __shared__ variable "d_dweight" cannot be directly taken in a host function
t640.cu(108): warning: a __shared__ variable "d_droot" cannot be directly read in a host function
t640.cu(109): warning: a __shared__ variable "d_dweight" cannot be directly read in a host function

这一点不应被忽视。

这些声明：
```
__device__  double *d_droot, *d_dweight;
```
没有定义__shared__变量，所以这些行：
```
extern __shared__ double *d_droot;
extern __shared__ double *d_dweight;
```
毫无意义。此外，如果您确实希望这些是动态分配的共享变量（extern __shared__的用途），则需要将分配大小作为第三个内核启动参数传递，而您没有这样做。
这些语句不正确：
```
cudaMalloc(&d_droot, N_nodes*sizeof(double));
cudaMalloc(&d_dweight, N_nodes*sizeof(double));
```
您不能在主机代码中获取__device__变量的地址，而且我们无论如何都不会使用cudaMalloc来分配__device__变量；根据定义，它是一种静态分配。
我建议进行适当的CUDA错误检查。作为快速测试，您也可以使用cuda-memcheck运行您的代码。这两种方法都会指出代码中存在运行时错误（尽管这并非问题的症结所在）。

这些语句也不正确：

cudaMemcpy(d_droot, droot, N_nodes*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_dweight, dweight, N_nodes*sizeof(double), cudaMemcpyHostToDevice);

cudaMemcpy不是用于__device__变量的正确API。请改用cudaMemcpyToSymbol。

下面的代码修复了这些不同的使用错误，将干净地编译，并且看起来运行正确。它证明了不需要传递__device__变量作为内核参数：

#include <math.h>
#include <stdlib.h>
#include <stdio.h>

/* Device-side pointers to the root and weight tables. The host stores the
   addresses of cudaMalloc'ed buffers into them with cudaMemcpyToSymbol, and
   the kernels read them directly instead of taking them as arguments. */
__device__  double *d_droot, *d_dweight;

__device__ __host__
double f(double alpha,double x)
{
  /* Function integrated by the Gauss-Legendre quadrature: exp(alpha * x).
     Usable from host and device code alike. */
  double scaled = alpha * x;
  double value = exp(scaled);
  return value;
}
__global__
void lege_inte2(int n, double alpha, double a, double b, double *result)
{
  /*
    Evaluates n identical 5-point Gauss-Legendre quadratures of f(alpha, x)
    over [a, b]. The thread with flat 1-D index i writes its value to
    result[i].

    The roots and weights are not parameters: they are read through the
    file-scope __device__ pointers d_droot and d_dweight, which the host must
    point at device arrays of at least 5 doubles before launching.

    Parameters:
    n: Total number of quadratures; threads with index >= n do nothing
    alpha: Parameter forwarded to the integrand f
    a: Lower integration limit
    b: Upper integration limit
    result[]: output array with room for at least n values
   */
  const int nodes = 5;            /* roots/weights consumed per quadrature */
  const double c1 = (b - a) / 2;  /* half-width of [a, b] */
  const double c2 = (b + a) / 2;  /* midpoint of [a, b] */
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n)
    {
      /* Accumulate in a local variable and store once, rather than doing a
         read-modify-write on result[i] in every iteration. */
      double sum = 0.0;
      for (int k = 0; k < nodes; k++)
        sum += d_dweight[k] * f(alpha, c1 * d_droot[k] + c2) * c1;
      result[i] = sum;
    }
}
__global__
void lege_inte2_shared(int n,double alpha, double a, double b,  double *result)
{
  /*
    Debug variant of the 5-point Gauss-Legendre kernel: computes the same
    quadrature of f(alpha, x) over [a, b] and additionally prints every
    weight it reads. Despite its name, this kernel declares no __shared__
    memory; roots and weights come from the file-scope __device__ pointers
    d_droot and d_dweight.

    Every thread prints one line per node, so only launch it with a small n.

    Parameters:
    n: Total number of quadratures; threads with index >= n do nothing
    alpha: Parameter forwarded to the integrand f
    a: Lower integration limit
    b: Upper integration limit
    result[]: output array with room for at least n values
   */
  const int nodes = 5;            /* roots/weights consumed per quadrature */
  const double c1 = (b - a) / 2;  /* half-width of [a, b] */
  const double c2 = (b + a) / 2;  /* midpoint of [a, b] */
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n)
    {
      /* Accumulate locally and store once instead of updating result[i]
         in global memory on every iteration. */
      double sum = 0.0;
      for (int k = 0; k < nodes; k++)
        {
          sum += d_dweight[k] * f(alpha, c1 * d_droot[k] + c2) * c1;
          printf(" Vale: %f \n", d_dweight[k]);
        }
      result[i] = sum;
    }
}

/* Aborts with a diagnostic when a CUDA runtime call did not succeed. */
static void checkCuda(cudaError_t status, const char *what)
{
  if (status != cudaSuccess)
    {
      fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
      exit(EXIT_FAILURE);
    }
}

/* Host driver: uploads the 5-point Gauss-Legendre tables, publishes their
   device addresses through the __device__ pointers d_droot / d_dweight,
   runs one quadrature per GPU thread and reports the largest deviation from
   the reference value. Returns 0 on success; exits with EXIT_FAILURE on any
   allocation or CUDA error. */
int main(void)
{
  const int N = 1<<23;                    /* number of quadratures */
  const int N_nodes = 5;                  /* nodes per quadrature */
  const int threads = 256;
  const int blocks = (N + threads - 1) / threads;
  const double expected = 20.03574985;    /* exp(3) - exp(-3) */

  double *droot, *dweight, *dresult, *d_dresult, *d_droot_temp, *d_dweight_temp;

  /* host-side buffers */
  droot = (double*)malloc(N_nodes*sizeof(double));
  dweight = (double*)malloc(N_nodes*sizeof(double));
  dresult = (double*)malloc(N*sizeof(double)); /* receives the N results */
  if (droot == NULL || dweight == NULL || dresult == NULL)
    {
      fprintf(stderr, "host allocation failed\n");
      free(droot);
      free(dweight);
      free(dresult);
      return EXIT_FAILURE;
    }

  /* device-side buffers */
  checkCuda(cudaMalloc(&d_droot_temp, N_nodes*sizeof(double)), "cudaMalloc(roots)");
  checkCuda(cudaMalloc(&d_dweight_temp, N_nodes*sizeof(double)), "cudaMalloc(weights)");
  checkCuda(cudaMalloc(&d_dresult, N*sizeof(double)), "cudaMalloc(results)");

  /* roots and weights of the 5-point rule */
  droot[0] = 0.90618;
  droot[1] = 0.538469;
  droot[2] = 0.0;
  droot[3] = -0.538469;
  droot[4] = -0.90618;

  dweight[0] = 0.236927;
  dweight[1] = 0.478629;
  dweight[2] = 0.568889;
  dweight[3] = 0.478629;
  dweight[4] = 0.236927;

  /* copy the tables host -> device */
  checkCuda(cudaMemcpy(d_droot_temp, droot, N_nodes*sizeof(double), cudaMemcpyHostToDevice),
            "cudaMemcpy(roots)");
  checkCuda(cudaMemcpy(d_dweight_temp, dweight, N_nodes*sizeof(double), cudaMemcpyHostToDevice),
            "cudaMemcpy(weights)");
  /* Store the device addresses (not the data) into the __device__ pointers. */
  checkCuda(cudaMemcpyToSymbol(d_droot, &d_droot_temp, sizeof(double *)),
            "cudaMemcpyToSymbol(d_droot)");
  checkCuda(cudaMemcpyToSymbol(d_dweight, &d_dweight_temp, sizeof(double *)),
            "cudaMemcpyToSymbol(d_dweight)");

  /* One thread per quadrature: integrate f(1.0, x) over [-3, 3]. */
  lege_inte2<<<blocks, threads>>>(N, 1.0, -3.0, 3.0, d_dresult);
  /* Debug variant (prints from every thread; use a small N):
     lege_inte2_shared<<<blocks, threads>>>(N, 1.0, -3.0, 3.0, d_dresult); */
  checkCuda(cudaGetLastError(), "kernel launch");

  /* The blocking copy also surfaces errors raised while the kernel ran. */
  checkCuda(cudaMemcpy(dresult, d_dresult, N*sizeof(double), cudaMemcpyDeviceToHost),
            "cudaMemcpy(results)");

  double maxError = 0.0;
  for (int i = 0; i < N; i++)
    maxError = fmax(maxError, fabs(dresult[i] - expected));
  printf("Max error: %f in %i quadratures \n", maxError, N);
  printf("integral: %f  \n", dresult[0]);

  checkCuda(cudaFree(d_dresult), "cudaFree(results)");
  checkCuda(cudaFree(d_droot_temp), "cudaFree(roots)");
  checkCuda(cudaFree(d_dweight_temp), "cudaFree(weights)");
  free(dresult);
  free(dweight);
  free(droot);
  return 0;
}

（我不能保证结果。）

现在，关于这个问题：

此外，为了在设备上留出更多可用内存，是否可以把根和权重固化到设备内存的某个常量区域中？

由于同一线程束（warp）中的各线程对d_dweight和d_droot的访问看起来是统一的（uniform，即同时读取同一个元素）：

result[i] += d_dweight[dummy] * f(alpha,c1 * d_droot[dummy] + c2)*c1;

那么将它们定义为__constant__内存空间中的变量可能会有帮助。当一个线程束（warp）中的每个线程都请求常量内存中的同一个值（同一位置）时，常量内存的访问效率最高。然而，__constant__内存不能动态分配，而且（仅仅）把指针存放在常量内存中是没有意义的；那样无法获得常量缓存机制的任何好处。

因此，以下对代码的进一步修改演示了如何将这些值存储在__constant__内存中，但它需要静态分配。此外，这并不能真正"节省"任何设备内存。无论是使用cudaMalloc动态分配，使用__device__变量静态分配，还是通过__constant__变量定义（也是静态分配）进行分配，所有这些方法都需要全局内存备份存储在设备内存（板载DRAM）中。

演示常量内存可能用法的代码：

#include <math.h>
#include <stdlib.h>
#include <stdio.h>
/* Number of Gauss-Legendre nodes; sizes the two tables below. */
#define N_nodes 5
/* Roots and weights held in statically allocated __constant__ memory; the
   host fills them with cudaMemcpyToSymbol before launching a kernel. */
__constant__   double d_droot[N_nodes], d_dweight[N_nodes];

__device__ __host__
double f(double alpha,double x)
{
  /* The integrand handed to the Gauss-Legendre rule, exp(alpha * x);
     compiled for both host and device. */
  const double arg = alpha * x;
  return exp(arg);
}
__global__
void lege_inte2(int n, double alpha, double a, double b, double *result)
{
  /*
    Evaluates n identical Gauss-Legendre quadratures of f(alpha, x) over
    [a, b]. The thread with flat 1-D index i writes its value to result[i].

    The roots and weights are not parameters: they are read from the
    file-scope __constant__ arrays d_droot and d_dweight, and the number of
    nodes is taken from the size of d_dweight.

    Parameters:
    n: Total number of quadratures; threads with index >= n do nothing
    alpha: Parameter forwarded to the integrand f
    a: Lower integration limit
    b: Upper integration limit
    result[]: output array with room for at least n values
   */
  /* Follows the table size rather than a hard-coded 5. */
  const int nodes = (int)(sizeof(d_dweight) / sizeof(d_dweight[0]));
  const double c1 = (b - a) / 2;  /* half-width of [a, b] */
  const double c2 = (b + a) / 2;  /* midpoint of [a, b] */
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n)
    {
      /* Accumulate in a local variable and store once, rather than doing a
         read-modify-write on result[i] in every iteration. */
      double sum = 0.0;
      for (int k = 0; k < nodes; k++)
        sum += d_dweight[k] * f(alpha, c1 * d_droot[k] + c2) * c1;
      result[i] = sum;
    }
}
__global__
void lege_inte2_shared(int n,double alpha, double a, double b,  double *result)
{
  /*
    Debug variant of the Gauss-Legendre kernel: computes the same quadrature
    of f(alpha, x) over [a, b] and additionally prints every weight it
    reads. Despite its name, this kernel declares no __shared__ memory;
    roots and weights come from the file-scope __constant__ arrays d_droot
    and d_dweight, and the node count follows the size of d_dweight.

    Every thread prints one line per node, so only launch it with a small n.

    Parameters:
    n: Total number of quadratures; threads with index >= n do nothing
    alpha: Parameter forwarded to the integrand f
    a: Lower integration limit
    b: Upper integration limit
    result[]: output array with room for at least n values
   */
  /* Follows the table size rather than a hard-coded 5. */
  const int nodes = (int)(sizeof(d_dweight) / sizeof(d_dweight[0]));
  const double c1 = (b - a) / 2;  /* half-width of [a, b] */
  const double c2 = (b + a) / 2;  /* midpoint of [a, b] */
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n)
    {
      /* Accumulate locally and store once instead of updating result[i]
         in global memory on every iteration. */
      double sum = 0.0;
      for (int k = 0; k < nodes; k++)
        {
          sum += d_dweight[k] * f(alpha, c1 * d_droot[k] + c2) * c1;
          printf(" Vale: %f \n", d_dweight[k]);
        }
      result[i] = sum;
    }
}

/* Aborts with a diagnostic when a CUDA runtime call did not succeed. */
static void checkCuda(cudaError_t status, const char *what)
{
  if (status != cudaSuccess)
    {
      fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
      exit(EXIT_FAILURE);
    }
}

/* Host driver: copies the Gauss-Legendre tables into the __constant__ arrays
   d_droot / d_dweight, runs one quadrature per GPU thread and reports the
   largest deviation from the reference value. Returns 0 on success; exits
   with EXIT_FAILURE on any allocation or CUDA error. */
int main(void)
{
  const int N = 1<<23;                    /* number of quadratures */
  const int threads = 256;
  const int blocks = (N + threads - 1) / threads;
  const double expected = 20.03574985;    /* exp(3) - exp(-3) */

  /* Roots and weights of the 5-point rule. The initializers assume
     N_nodes == 5; with a larger N_nodes the extra entries would be zero. */
  const double droot[N_nodes] = {0.90618, 0.538469, 0.0, -0.538469, -0.90618};
  const double dweight[N_nodes] = {0.236927, 0.478629, 0.568889, 0.478629, 0.236927};

  double *dresult, *d_dresult;

  /* host-side result buffer */
  dresult = (double*)malloc(N*sizeof(double));
  if (dresult == NULL)
    {
      fprintf(stderr, "host allocation failed\n");
      return EXIT_FAILURE;
    }

  /* device-side result buffer */
  checkCuda(cudaMalloc(&d_dresult, N*sizeof(double)), "cudaMalloc(results)");

  /* copy the tables host -> __constant__ memory */
  checkCuda(cudaMemcpyToSymbol(d_droot, droot, sizeof(droot)),
            "cudaMemcpyToSymbol(d_droot)");
  checkCuda(cudaMemcpyToSymbol(d_dweight, dweight, sizeof(dweight)),
            "cudaMemcpyToSymbol(d_dweight)");

  /* One thread per quadrature: integrate f(1.0, x) over [-3, 3]. */
  lege_inte2<<<blocks, threads>>>(N, 1.0, -3.0, 3.0, d_dresult);
  /* Debug variant (prints from every thread; use a small N):
     lege_inte2_shared<<<blocks, threads>>>(N, 1.0, -3.0, 3.0, d_dresult); */
  checkCuda(cudaGetLastError(), "kernel launch");

  /* The blocking copy also surfaces errors raised while the kernel ran. */
  checkCuda(cudaMemcpy(dresult, d_dresult, N*sizeof(double), cudaMemcpyDeviceToHost),
            "cudaMemcpy(results)");

  double maxError = 0.0;
  for (int i = 0; i < N; i++)
    maxError = fmax(maxError, fabs(dresult[i] - expected));
  printf("Max error: %f in %i quadratures \n", maxError, N);
  printf("integral: %f  \n", dresult[0]);

  checkCuda(cudaFree(d_dresult), "cudaFree(results)");
  free(dresult);
  return 0;
}