带有大型矢量大小的CUDA内核的点产品返回错误的结果

Dot Product with a CUDA kernel for big vector sizes returns wrong results

本文关键字：返回结果错误内核 CUDA 大型更新时间：2023-10-16

我正在尝试实现一个CUDA内核，该核心正在计算两个向量的点产物。对于小矢量规模，代码正常工作，我得到了正确的结果，但是对于较大的结果，它只是在计算错误的结果。我正在实施三种计算点产品的不同方法：

序列版本（C 直接而没有任何优化）
cuda 内核
cublas 版本

我在CPP文件中的主要主角看起来如下：

float *h_x,*h_y;
float res1=0.0, res2=0.0, res3=0.0;
h_x=(float*)malloc(Main::N*sizeof(float)); random_ints_Vec(h_x);
h_y=(float*)malloc(Main::N*sizeof(float)); random_ints_Vec(h_y);
double serialTimer;
double cublasTimer;
double cudaTimer;
res1=serial_dotProd(h_x,h_y,&serialTimer);  
res2=cublas_dotProd(h_x,h_y,&cublasTimer);
res3=cuda_dotProd(h_x,h_y,&cudaTimer);      
free(h_x); free(h_y);

序列版本：

float Main::serial_dotProd(float* x, float* y, double* time){
std::clock_t start;
start = std::clock();
float res1=0.0;
for (int i=0;i<Main::N;++i) {
    res1+=x[i]*y[i];
}
*time= ( std::clock() - start ) / (double) CLOCKS_PER_SEC;
return res1;}

cuda 版本：

float Main::cuda_dotProd(float *h_x,float *h_y,double* time){
float *d_x,*d_y,*d_res, *h_res;
h_res=(float*)malloc(Main::BLOCKS_PER_GRID*sizeof(float));
size_t bfree, afree, total;
cudaMemGetInfo(&bfree,&total);
cudaMalloc((void**) &d_x,Main::N*sizeof(float));
cudaMalloc((void**) &d_y,Main::N*sizeof(float));
cudaMalloc((void**) &d_res,Main::BLOCKS_PER_GRID*sizeof(float));
cudaCheckErrors("cuda malloc fail");
cudaMemGetInfo(&afree,&total);
std::cout<<" > memory used for cuda-version:"<< (bfree -afree)/1048576<< "MB ("<<total/1048576 <<"MB)" <<"n";
cudaMemcpy(d_x,h_x,Main::N*sizeof(float),cudaMemcpyHostToDevice);
cudaMemcpy(d_y,h_y,Main::N*sizeof(float),cudaMemcpyHostToDevice);   
std::clock_t start;
start = std::clock();   
DotProdWrapper(d_x,d_y,d_res,(Main::N+Main::THREADS_PER_BLOCK-1)/Main::THREADS_PER_BLOCK,Main::THREADS_PER_BLOCK,Main::N);
*time= ( std::clock() - start ) / (double) CLOCKS_PER_SEC;
cudaMemcpy(h_res,d_res,Main::BLOCKS_PER_GRID*sizeof(float),cudaMemcpyDeviceToHost);
float c= 0;
for (int i=0;i<Main::BLOCKS_PER_GRID;++i){
  c+=h_res[i];
}
cudaFree(d_x);
cudaFree(d_y);
cudaFree(d_res);    
free(h_res);
return c;}

cuda 内核：

__global__ void DotProd(float* x, float* y, float* scalar,unsigned long int N){
    extern __shared__ float cache[];
    int tid = threadIdx.x+ blockIdx.x*blockDim.x;
    int cacheIndex = threadIdx.x;
    float temp=0;
    while (tid<N){
        temp+=x[tid]*y[tid];
        tid +=blockDim.x*gridDim.x; 
    }
    cache[cacheIndex]=temp;
    __syncthreads();
    int i=blockDim.x/2;
    while(i!=0){
        if (cacheIndex<i)
            cache[cacheIndex]+=cache[cacheIndex+i];
        __syncthreads();
        i/=2;
    }
    if(cacheIndex==0)
        scalar[blockIdx.x]=cache[cacheIndex];
}

cublas 版本：

float Main::cublas_dotProd(float *h_x,float *h_y, double* time){
    float *d_x,*d_y;
    float *res;
    float result=0.0;
    cublasHandle_t h;
    cublasCreate(&h);
    cublasSetPointerMode(h, CUBLAS_POINTER_MODE_DEVICE);
    size_t bfree, afree, total;
    cudaMemGetInfo(&bfree,&total);
    cudaMalloc((void**) &d_x,Main::N*sizeof(float));
    cudaMalloc((void**) &d_y,Main::N*sizeof(float));
    cudaMalloc( (void **)(&res), sizeof(float) );
    cudaCheckErrors("cublas malloc fail");
    cudaMemGetInfo(&afree,&total);
     cudaMemcpy(d_x, h_x, Main::N*sizeof(float), cudaMemcpyHostToDevice);
     cudaMemcpy(d_y, h_y, Main::N*sizeof(float), cudaMemcpyHostToDevice);
    cublasSetVector(Main::N,sizeof(float),h_x,1,d_x,1);
    cublasSetVector(Main::N,sizeof(float),h_y,1,d_y,1);

    std::clock_t start;
    start = std::clock();
    cublasSdot(h,Main::N,d_x,1,d_y,1,res);
    *time= ( std::clock() - start ) / (double) CLOCKS_PER_SEC;
    cudaMemcpy(&result, res, sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(res);
    return result;
}

结果我在计算以不同设置的计算后获得：

n = 204800，threads_per_block：512，blocks_per_grid：400serial_dotprod = 4.15369e 06 ;cublas_dotprod = 4.15369e 06 ;cuda_dotprod = 4.15369e 06
n = 20480000，threads_per_block：512，blocks_per_grid：40000serial_dotprod = 4.04149e 08 ;cublas_dotprod = 4.14834e 08 ;cuda_dotprod = 4.14833e 08

我不知道为什么，但是在我的矢量有一定尺寸之后，我就会发现结果。向量确实适合SDRAM，并且每个块的共享内存还足以分配内存。预先感谢。

这个问题经常出现，以至于NVIDIA专用CUDA浮点的整个部分和IEEE 754指南。在CUDA C最佳实践指南中也简要提及。

简短的解释是双重的：

与相应的精确数学操作不同，由于所涉及的舍入误差，浮点算术操作不是关联的。这意味着将总序列总和重新排序为适合并行执行的树结构将稍微改变结果（更多的是越来越多的值求和）。巧合的是，在大多数情况下，树的排列也使结果更接近确切的数学总和，而不是顺序总和。
CUDA编译器在使用融合的多重ADD时倾向于更具侵略性（FMA，省略了中间圆形阶段的多重添加操作）。同样，数学上正确的结果往往更接近使用FMA获得的结果。

因此，可能的答案是，使用CUDA获得的结果可能比简单的串行CPU版本更接近确切的结果（这就是为什么我要求您使用提高的精度再次执行实验）。