CUDA 将用户定义的结构传递给内核失败

cuda passing user defined structure to a kernel failed

本文关键字：内核失败结构用户定义 CUDA 更新时间：2023-10-16

这是我的问题。我的kernel.h中有以下结构。

/*
 * One vector record: a pointer to its double values plus two integer
 * attributes. Both the tag name (struct __Q_VECTOR__) and the alias
 * (VQ_VECTOR) remain available.
 */
typedef struct __Q_VECTOR__ {
    double *Data;       /* pointer to the vector's values */
    int     Dimension;  /* number of doubles stored behind Data */
    int     Cluster;    /* integer cluster tag -- presumably the cluster this
                           vector is assigned to; confirm against the callers */
} VQ_VECTOR;

在 kernel.cu 中，我有以下代码：

// Reproduction case from the question: builds N host-side VQ_VECTOR records,
// copies the array of structs to the GPU, launches testKernel on it, copies the
// structs back and prints the first two records again.
//
// NOTE(review): this block is kept exactly as posted. Known problems:
//   * Only the structs are copied to the device. Each Data field was filled by
//     host malloc(), so the device copy carries host addresses; the per-element
//     arrays themselves are never allocated on or copied to the device.
//   * Both "A[%d]" printf calls pass no argument for %d.
//   * String literals start/end with a bare 'n' -- presumably "\n" whose
//     backslash was lost; confirm against the original post.
//   * Several cudaStatus values are overwritten before they are examined.
//   * Neither A nor the Data buffers are freed.
int main(void){
 int L = 3, // number of doubles in each Data array
    N = 100;
VQ_VECTOR   *A,
            *device_VQ_VECTOR;
cudaError_t cudaStatus;
A =   (VQ_VECTOR*)malloc(N*sizeof(VQ_VECTOR));
// Fill every record: a fresh host buffer of L doubles with Data[j] = i*j.
for(int i=0; i<N; i++){
    VQ_VECTOR a;
    a.Data = (double*)malloc(L*sizeof(double));;
    a.Cluster   =   1;
    a.Dimension =   L;
    for(int j=0; j<L; j++)
        a.Data[j]=i*j;
    A[i] = a;
}
// Print the first two elements of A (the loop stops at i<2, not N).
for(int i=0; i<2; i++){
    // NOTE(review): %d has no matching argument here.
    printf("nA[%d]={");
    for(int j=0; j<L; j++)
        printf("%.3f",A[i].Data[j]);
    printf("}");
}
printf("nn");
// Allocate device_VQ_VECTOR on the GPU and copy the array of structs from A into it.
cudaDeviceReset();
// NOTE(review): the cudaMalloc status is overwritten by the next line without being checked.
cudaStatus = cudaMalloc((void**)&device_VQ_VECTOR, N*sizeof(VQ_VECTOR));
// Copies N*sizeof(VQ_VECTOR) bytes, i.e. the structs only; the Data fields keep
// the host addresses assigned above.
cudaStatus = cudaMemcpy(device_VQ_VECTOR, A, N*sizeof(VQ_VECTOR), cudaMemcpyHostToDevice);
cudaPrintfInit();
testKernel<<<N,1>>>(device_VQ_VECTOR, N);// launched as N blocks of one thread each
cudaPrintfDisplay(stdout, true);
cudaPrintfEnd();
// The memcpy status stored above is replaced here by the last-error query.
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "n testKernel launch failed: %sn", cudaGetErrorString(cudaStatus));
        return 1;
}
// Copy the structs back over A.
cudaStatus = cudaMemcpy(A, device_VQ_VECTOR, N*sizeof(VQ_VECTOR), cudaMemcpyDeviceToHost);
// NOTE(review): the memcpy result is overwritten before being tested, and the
// message below still says "testKernel launch failed".
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "n testKernel launch failed: %sn", cudaGetErrorString(cudaStatus));
        return 1;
}
// Print the first two elements again after the round trip.
for(int i=0; i<2; i++){
    // NOTE(review): %d has no matching argument here either.
    printf("nA[%d]={");
    for(int j=0; j<L; j++)
        printf("%.3f",A[i].Data[j]);
    printf("}");
}
cudaFree(device_VQ_VECTOR);
 return 0;

}

当我构建并运行这段代码时，它有时什么都不打印，有时又能正常工作。
我的代码出了什么问题？我怀疑问题可能出在下面这两行：

// Allocates the device array of structs and copies the structs into it.
// Only N*sizeof(VQ_VECTOR) bytes are transferred, so each copied Data field
// still holds the address that was stored in A[i].Data on the host.
cudaStatus = cudaMalloc((void**)&device_VQ_VECTOR, N*sizeof(VQ_VECTOR));
cudaStatus = cudaMemcpy(device_VQ_VECTOR, A, N* sizeof(VQ_VECTOR), cudaMemcpyHostToDevice);

请帮忙！

这样做行不通：各个 Data 数组是在主机上单独分配的，而 cudaMemcpy 只复制了结构体本身（其中的 Data 字段仍然是主机指针的值），并没有把这些数组复制到设备内存中。您还需要在设备上为每个数组分配空间，并做一次完整的（深）拷贝。更麻烦的是，主机端无法直接访问设备内存（只能通过 cudaMemcpy 之类的 API），因此您不能写 cudaMalloc(&device_VQ_VECTOR[i].Data, ...)（它会崩溃）。

下面是一个示例代码。为简单起见，它会先释放主机端的 A[i].Data，之后再重新分配它们。这种做法不算优雅，但足以说明问题。

/*
 * One vector record shared by host and device code: a pointer to its double
 * values plus two integer attributes. Both the tag name (struct __Q_VECTOR__)
 * and the alias (VQ_VECTOR) remain available.
 */
typedef struct __Q_VECTOR__ {
    double *Data;       /* pointer to the vector's values; testKernel
                           dereferences it on the device */
    int     Dimension;  /* number of doubles stored behind Data */
    int     Cluster;    /* integer cluster tag -- presumably the cluster this
                           vector is assigned to; confirm against the callers */
} VQ_VECTOR;
/*
 * Debug kernel: each thread prints the values of one VQ_VECTOR via cuPrintf.
 *
 * X  device pointer to an array of N VQ_VECTOR records. Every X[i].Data must
 *    itself be a device pointer to at least X[i].Dimension doubles; a host
 *    pointer here makes the kernel fault.
 * N  number of records in X.
 *
 * Launch layout: 1D grid and 1D blocks; the global thread index selects the
 * record. Threads whose index is >= N return immediately, so the grid may be
 * larger than N. cuPrintf output only appears after the host has called
 * cudaPrintfInit() before the launch and cudaPrintfDisplay() after it.
 */
__global__ void testKernel(VQ_VECTOR *X, int N){
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    // Guard against grids that do not divide N evenly.
    if (i >= N)
        return;
    cuPrintf("\n testKernel entrance by the global threadIdx= %d\n", i);
    for(int k=0; k<X[i].Dimension; k++)
        cuPrintf("%2.2f, ",X[i].Data[k]);
    cuPrintf("\n");
}
/*
 * Report a failed CUDA runtime call on stderr.
 *
 * status  return value of the call being checked.
 * what    short description of the call, used in the message.
 * Returns true when status is cudaSuccess, false otherwise.
 */
static bool cudaOk(cudaError_t status, const char *what){
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "\n %s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Demonstrates passing an array of structs that contain pointers to a kernel.
 *
 * Steps: build N host records with L doubles each, copy the structs to the
 * device, give every device-side record its own device Data buffer (deep
 * copy), run testKernel, then copy everything back and release all memory.
 *
 * Returns 0 on success and 1 as soon as any allocation or CUDA call fails.
 * On the failure paths memory is not released; the process exits right away.
 */
int main(void){
    const int L = 3;    // number of doubles in each Data array
    const int N = 100;  // number of records
    VQ_VECTOR   *A,
                *device_VQ_VECTOR;

    A = (VQ_VECTOR*)malloc(N*sizeof(VQ_VECTOR));
    if (!A) {
        fprintf(stderr, "\n host allocation of A failed\n");
        return 1;
    }
    for(int i=0; i<N; i++){
        VQ_VECTOR a;
        a.Data = (double*)malloc(L*sizeof(double));
        if (!a.Data) {
            fprintf(stderr, "\n host allocation of A[%d].Data failed\n", i);
            return 1;
        }
        a.Cluster   =   1;
        a.Dimension =   L;
        for(int j=0; j<L; j++)
            a.Data[j]=(1+i)*(1+j);
        A[i] = a;
    }
    // Print the first two elements of A.
    for(int i=0; i<2; i++){
        printf("\nA[%d]={", i);
        for(int j=0; j<L; j++)
            printf("%.3f",A[i].Data[j]);
        printf("}\n");
    }
    printf("\n\n");

    // Allocate the device array of structs and copy the structs into it. The
    // Data fields copied here are still host addresses; they are replaced below.
    if (!cudaOk(cudaDeviceReset(), "cudaDeviceReset"))
        return 1;
    if (!cudaOk(cudaMalloc((void**)&device_VQ_VECTOR, N*sizeof(VQ_VECTOR)),
                "cudaMalloc of the struct array"))
        return 1;
    if (!cudaOk(cudaMemcpy(device_VQ_VECTOR, A, N*sizeof(VQ_VECTOR), cudaMemcpyHostToDevice),
                "cudaMemcpy of the struct array to the device"))
        return 1;

    for(int i = 0; i != N; ++i) {
        /* device_VQ_VECTOR[i].Data cannot be written directly from the host,
         * so allocate into a host-side proxy variable and then copy the
         * pointer value itself into the device struct. */
        double *out;
        if (!cudaOk(cudaMalloc((void**)&out, L*sizeof(double)),
                    "cudaMalloc of a Data array"))
            return 1;
        if (!cudaOk(cudaMemcpy(out, A[i].Data, L*sizeof(double), cudaMemcpyHostToDevice),
                    "cudaMemcpy of a Data array to the device"))
            return 1;
        if (!cudaOk(cudaMemcpy(&device_VQ_VECTOR[i].Data, &out, sizeof(out), cudaMemcpyHostToDevice),
                    "cudaMemcpy of a Data pointer to the device"))
            return 1;
        // The host buffer is re-created when the data is copied back.
        free(A[i].Data);
    }

    cudaPrintfInit();
    testKernel<<<N,1>>>(device_VQ_VECTOR, N); // N blocks of one thread each
    // Launch-configuration errors are reported by cudaGetLastError();
    // faults inside the kernel surface at the synchronisation that follows.
    if (!cudaOk(cudaGetLastError(), "testKernel launch"))
        return 1;
    if (!cudaOk(cudaDeviceSynchronize(), "testKernel execution"))
        return 1;
    cudaPrintfDisplay(stdout, true);
    cudaPrintfEnd();

    // Copy the structs back. After this, every A[i].Data holds a DEVICE address.
    if (!cudaOk(cudaMemcpy(A, device_VQ_VECTOR, N*sizeof(VQ_VECTOR), cudaMemcpyDeviceToHost),
                "cudaMemcpy of the struct array to the host"))
        return 1;
    for(int i = 0; i != N; ++i) {
        double *deviceData = A[i].Data;
        double *array = (double*)malloc(L*sizeof(double));
        if (!array) {
            fprintf(stderr, "\n host allocation of A[%d].Data failed\n", i);
            return 1;
        }
        if (!cudaOk(cudaMemcpy(array, deviceData, L*sizeof(double), cudaMemcpyDeviceToHost),
                    "cudaMemcpy of a Data array to the host"))
            return 1;
        // The device buffer is no longer needed once its contents are on the host.
        if (!cudaOk(cudaFree(deviceData), "cudaFree of a Data array"))
            return 1;
        // From here on A[i].Data is a valid host pointer again.
        A[i].Data = array;
    }

    // Optional check of the round trip (left disabled, as in the original):
    // for(int i=0; i<2; i++){
    //     printf("\nA[%d]={", i);
    //     for(int j=0; j<L; j++)
    //         printf("%.3f",A[i].Data[j]);
    //     printf("}\n");
    // }

    if (!cudaOk(cudaFree(device_VQ_VECTOR), "cudaFree of the struct array"))
        return 1;
    // Release A and all of its Data buffers.
    for(int i = 0; i != N; ++i)
        free(A[i].Data);
    free(A);
    return 0;
}

部分输出将是（它很大，我不想发布太多）：

[2, 0]: 3.00, [18, 0]: 19.00, [22, 0]: 23.00, [16, 0]: 17.00,
[24, 0]: 25.00, [19, 0]: 20.00, [4, 0]: 5.00, [23, 0]: 24.00,
[3, 0]: 4.00, [5, 0]: 6.00, [13, 0]: 14.00, [1, 0]: 2.00,
[10, 0]: 11.00, [6, 0]: 7.00, [14, 0]: 15.00, [0, 0]: 1.00, [20, 0]: