如何将带有指针数组的C++类传递给CUDA

How to pass a C++ class with array of pointers to CUDA?

本文关键字:C++ CUDA 数组 指针      更新时间:2023-10-16

更清楚地说,我想要的是将指针和它们指向设备的所有数据传递给设备。为了测试我如何实现这个目标,我写了一个简单的类:

class vecarray{
    public:
        int * vecptr[N];                //array of pointers pointing to array
        int dim[N];                     //store length of each array pointed to
        __device__ __host__ vecarray(); //constructor
        __device__ __host__ int sum();  //sum up all the elements in the array being              
                                       //pointed to
}
vecarray::vecarray(){
    for(int i = 0; i<N; i++)
    {
        vecptr[i] = NULL;
        dim[i] = 0;
    }
}
int vecarray::sum(){
    int i=0, j=0, s=0;
    for (i=0; i<N; i++)
        for(j=0; j < dim[i]; j++)
            s += vecptr[i][j];
    return s;
}

然后我在下面的代码中使用这个类:

#define N 2
__global__ void addvecarray( vecarray * v, int *s){
    *s = v->sum();
}
int main(){                                 //copy *V to device, do sum() and pass back 
    vecarray *v, *dev_v;                    //the result by dev_v
    v = new vecarray;
    dev_v = new vecarray;
    int a[3] = {1,2,3};                     //initialize v manually
    int b[4] = {4,5,6,7};
    int result = 0;
    int * dev_result;
    v->vecptr[0] = a;
    v->vecptr[1] = b;
    v->dim[0] = 3; v->dim[1] = 4;

    cudaMalloc((void**)&dev_v, sizeof(vecarray));      
    cudaMemcpy(dev_v, v, sizeof(vecarray),cudaMemcpyHostToDevice); //copy class object 
    for(int i = 0; i < N; i++){
        cudaMalloc((void**)&(dev_v->vecptr[i]), v->dim[i]*sizeof(int));
    }
    for(int i = 0; i<N; i++ ){                   //copy arrays
    cudaMemcpy(dev_v->vecptr[i], v->vecptr[i], v->dim[i]*sizeof(int), cudaMemcpyHostToDevice));
    }
    addvecarray<<<1,1>>>(dev_v, dev_result);
    cudaMemcpy(&result, dev_result, sizeof(int), cudaMemcpyDeviceToHost);
    printf("the result is %dn", result);
}

代码通过了nvcc编译器,但在运行时由于分段错误而失败。我已经检查了for循环中的两个cudaMalloc和cudaMemcpy操作中的问题。所以我的问题是,我应该如何将这个对象传递给CUDA?提前谢谢。

您的代码中有几个错误。正如我在评论中提到的,其中一个关键错误是如何为类中指针引用的数据区域分配内存。关键的错误是,您正在将指针传递给已经存在于设备内存中的cudaMalloc。我们可以通过创建一组额外的指针来解决这个问题,这些指针将用于为类中指向的数组分配所需的设备存储。此外,还有一些其他错误,例如您没有为dev_result正确分配设备存储。下面的代码修复了我能找到的所有错误,我相信会给出正确的结果。我还添加了一个cuda错误检查的参考形式,你可能会发现它在你的项目中很有用:

#include <stdio.h>
#define N 2
#define cudaCheckErrors(msg) 
    do { 
        cudaError_t __err = cudaGetLastError(); 
        if (__err != cudaSuccess) { 
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)n", 
                msg, cudaGetErrorString(__err), 
                __FILE__, __LINE__); 
            fprintf(stderr, "*** FAILED - ABORTINGn"); 
            exit(1); 
        } 
    } while (0)
using namespace std;
class vecarray{
    public:
        int *vecptr[N];                //array of pointers pointing to array
        int dim[N];                     //store length of each array pointed to
        __device__ __host__ vecarray(); //constructor
        __device__ __host__ int sum();  //sum up all the elements in the array being
                                       //pointed to
};
vecarray::vecarray(){
    for(int i = 0; i<N; i++)
    {
        vecptr[i] = NULL;
        dim[i] = 0;
    }
}
__device__ __host__ int vecarray::sum(){
    int i=0, j=0, s=0;
    for (i=0; i<N; i++)
        for(j=0; j < dim[i]; j++)
            s += vecptr[i][j];
    return s;
}
__global__ void addvecarray( vecarray * v, int *s){
    *s = v->sum();
}
int main(){                                 //copy *V to device, do sum() and pass back
    vecarray *v, *dev_v;                    //the result by dev_v
    v = new vecarray;
    int a[3] = {1,2,3};                     //initialize v manually
    int b[4] = {4,5,6,7};
    int result = 0;
    int *dev_result;
    v->vecptr[0] = a;
    v->vecptr[1] = b;
    v->dim[0] = 3; v->dim[1] = 4;
    int *vptr[N];
    cudaMalloc((void**)&dev_v, sizeof(vecarray));
    cudaCheckErrors("cudaMalloc1 fail");
    cudaMemcpy(dev_v, v, sizeof(vecarray),cudaMemcpyHostToDevice); //copy class object
    cudaCheckErrors("cudaMemcpy1 fail");
    for(int i = 0; i < N; i++){
        cudaMalloc((void**)&(vptr[i]), v->dim[i]*sizeof(int));
        cudaCheckErrors("cudaMalloc2 fail");
        cudaMemcpy(&(dev_v->vecptr[i]), &vptr[i], sizeof(int*), cudaMemcpyHostToDevice);
        cudaCheckErrors("cudaMemcpy2 fail");
    }
    for(int i = 0; i<N; i++ ){                   //copy arrays
        cudaMemcpy(vptr[i], v->vecptr[i], v->dim[i]*sizeof(int), cudaMemcpyHostToDevice);
        cudaCheckErrors("cudaMemcpy3 fail");
    }
    cudaMalloc((void **)&dev_result, sizeof(int));
    cudaCheckErrors("cudaMalloc3 fail");
    addvecarray<<<1,1>>>(dev_v, dev_result);
    cudaMemcpy(&result, dev_result, sizeof(int), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy4 fail");
    printf("the result is %dn", result);
    return 0;
}