简单的 CUDA 测试总是失败并出现错误"an illegal memory access was encountered"

Simple CUDA Test always fails with "an illegal memory access was encountered" error

本文关键字：an 错误 illegal memory encountered was access 测试 CUDA 失败 简单；更新时间：2023-10-16

如果我运行这个程序，就会得到"an illegal memory access was encountered in matrixMulti.cu at line 48"（在 matrixMulti.cu 的第 48 行遇到非法内存访问）这个错误。我搜索并尝试了很多办法，所以希望有人能帮助我。

第48行：HANDLE_ERROR ( cudaMemcpy(array, devarray, N*N*sizeof(int), cudaMemcpyDeviceToHost) );

这个项目只是为了入门 CUDA，我想尝试实现矩阵乘法。

#include <iostream>
#include<cuda.h>
#include <stdio.h>
using namespace std;
#define HANDLE_ERROR( err ) ( HandleError( err, __FILE__, __LINE__ ) )
void printVec(int** a, int n);
// Reports a failed CUDA runtime call and terminates the process.
//
// err:  status returned by the CUDA call being checked.
// file: source file of the call site (supplied by HANDLE_ERROR via __FILE__).
// line: source line of the call site (supplied by HANDLE_ERROR via __LINE__).
//
// Does nothing when err is cudaSuccess. Otherwise prints the runtime's
// description of the error plus the call site to stdout and exits with
// EXIT_FAILURE.
static void HandleError( cudaError_t err, const char *file, int line )
{
    if (err != cudaSuccess)
    {
        // The message must end with a real newline escape ("\n"), not a literal 'n'.
        printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
                file, line );
        exit( EXIT_FAILURE );
    }
}
// Checks the most recent error recorded by the CUDA runtime.
//
// msg: caller-supplied label that is printed in front of the runtime's error
//      description, so the failing step can be identified.
//
// Returns normally when cudaGetLastError() reports cudaSuccess. Otherwise
// writes "Cuda error: <msg>: <description>." to stderr and exits with
// EXIT_FAILURE.
void checkCUDAError(const char *msg)
{
    cudaError_t err = cudaGetLastError();
    if( cudaSuccess != err)
    {
        // Terminate the line with a real newline escape ("\n"), not a literal 'n'.
        fprintf(stderr, "Cuda error: %s: %s.\n", msg,
                              cudaGetErrorString( err) );
        exit(EXIT_FAILURE);
    }
}
// Kernel from the question, kept exactly as posted.
// Stores 4 through a double dereference: b[0] loads a pointer value from the
// buffer b points at, and that loaded value is then dereferenced as a row.
// The parameter a is never used.
// NOTE(review): main() in this listing passes a flat N*N int buffer as b, whose
// bytes were copied from the host row-pointer table, so the value loaded by b[0]
// is not a device address; this store is the illegal memory access.
__global__ void MatrixMulti(int** a, int** b) {
    b[0][0]=4;
}
// Host driver from the question, kept exactly as posted; it reproduces the
// "an illegal memory access was encountered" failure discussed in the text.
int main() {
    int N =10;
    int** array, **devarray;
    // Host matrix: a table of N row pointers, each row a separate new int[N]
    // allocation (not one contiguous N*N block). The elements are never initialized.
    array = new int*[N];
    for(int i = 0; i < N; i++) {
        array[i] = new int[N];  
    }
    
    // Device side: one flat buffer of N*N ints, but stored in an int** variable.
    HANDLE_ERROR ( cudaMalloc((void**)&devarray, N*N*sizeof(int) ) );
    // Copies N*N*sizeof(int) bytes starting at the row-pointer table itself, not at
    // any row's data. The table holds only N pointers, so on a typical 64-bit build
    // this reads past the end of that allocation.
    HANDLE_ERROR ( cudaMemcpy(devarray, array, N*N*sizeof(int), cudaMemcpyHostToDevice) );  
    // `array` is a host pointer passed as a kernel argument. The launch itself is
    // not followed by any error check.
    MatrixMulti<<<1,1>>>(array,devarray);
    // This is the first checked call after the launch, which is why the kernel's
    // fault is reported on this line. The copy would also overwrite the host
    // row-pointer table with device data.
    HANDLE_ERROR ( cudaMemcpy(array, devarray, N*N*sizeof(int), cudaMemcpyDeviceToHost) );
    HANDLE_ERROR ( cudaFree(devarray) );
    printVec(array,N);
    return 0;
}
// Prints the n x n matrix reachable through the row-pointer table a to stdout.
//
// a: table of n row pointers; each row must hold at least n ints.
// n: number of rows and of columns to print.
//
// Every element is followed by a single space; each row is then closed with one
// more space and a flushed newline.
void printVec(int** a , int n) {
    for (int r = 0; r < n; ++r) {
        const int* rowData = a[r];
        for (int c = 0; c < n; ++c) {
            std::cout << rowData[c] << " ";
        }
        std::cout << " " << std::endl;
    }
}

通常来说，您这种分配和复制双下标 C 数组的方法是行不通的。cudaMemcpy 需要的是扁平的、连续分配的、单指针、单下标的数组。

由于这种混乱，传递给内核的指针（int** a, int** b）无法被正确（安全）地解引用两次：

b[0][0]=4;

当您尝试在内核代码中执行上述操作时，就会发生非法内存访问，因为您并没有在设备上正确地建立"指向指针的指针"（pointer-to-pointer）式的分配。

如果您使用cuda-memcheck运行代码，您将在内核代码中得到另一个非法内存访问的指示。

在这些情况下，通常的建议是将2D数组"展平"为一维，并使用适当的指针或索引算法来模拟2D访问。分配2D数组（即双下标、双指针）是可能的，但它相当复杂（部分原因是需要"深度复制"）。如果您想了解更多信息，请在右上角搜索CUDA 2D array。

以下是您的代码的一个版本，它把设备端的数组展平成了一维数组：

$ cat t60.cu
#include <iostream>
#include <cuda.h>
#include <stdio.h>
using namespace std;
#define HANDLE_ERROR( err ) ( HandleError( err, __FILE__, __LINE__ ) )
void printVec(int** a, int n);
// Aborts the program with a diagnostic when a CUDA runtime call has failed.
//
// err:  status code returned by the checked CUDA call.
// file: call-site file name, passed in by the HANDLE_ERROR macro (__FILE__).
// line: call-site line number, passed in by the HANDLE_ERROR macro (__LINE__).
//
// A cudaSuccess status is a no-op. Any other status prints
// "<description> in <file> at line <line>" to stdout and exits with EXIT_FAILURE.
static void HandleError( cudaError_t err, const char *file, int line )
{
    if (err == cudaSuccess)
    {
        return;
    }
    // "\n" restored: the listing had a bare 'n' here, which printed a stray
    // letter and left the message without a line break.
    printf( "%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
    exit( EXIT_FAILURE );
}
// Fails fast if the CUDA runtime has an error pending.
//
// msg: short label for the step being checked; printed before the runtime's own
//      error description.
//
// Queries cudaGetLastError(). On cudaSuccess the function simply returns;
// otherwise it writes "Cuda error: <msg>: <description>." to stderr and exits
// with EXIT_FAILURE.
void checkCUDAError(const char *msg)
{
    const cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
    {
        return;
    }
    // "\n" restored: the listing had a bare 'n' in place of the newline escape.
    fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
    exit(EXIT_FAILURE);
}
// Fills an n x n row-major matrix, stored in the flat buffer b, so that every
// element holds its own column index.
//
// b: device pointer to at least n*n ints.
// n: matrix dimension (number of rows and of columns).
//
// Launch layout: any 1D or 2D grid/block shape is accepted. Rows are strided by
// the total thread count in y and columns by the total thread count in x, so the
// <<<1,1>>> launch used in this listing makes one thread visit every element,
// while larger launches split the work. No shared memory is used.
__global__ void MatrixMulti(int* b, unsigned n) {
    // Unsigned counters match the type of n (no signed/unsigned comparison).
    const unsigned rowStep = gridDim.y * blockDim.y;
    const unsigned colStep = gridDim.x * blockDim.x;
    for (unsigned row = blockIdx.y * blockDim.y + threadIdx.y; row < n; row += rowStep) {
        for (unsigned col = blockIdx.x * blockDim.x + threadIdx.x; col < n; col += colStep) {
            // size_t keeps row*n from wrapping in 32-bit arithmetic for large n.
            b[(size_t)row * n + col] = (int)col;  //simulate 2D access in kernel code
        }
    }
}
// Host driver: builds an N x N host matrix on top of one contiguous allocation,
// round-trips it through a flat device buffer that MatrixMulti fills, prints the
// result and releases all memory.
int main() {
    const int N = 10;
    const size_t bytes = (size_t)N * N * sizeof(int);

    int** array;    // host row-pointer table, gives array[i][j] access
    int* devarray;  // flatten device-side array

    array = new int*[N];
    // Host allocation needs to be contiguous; "()" zero-fills it so the upload
    // below does not copy indeterminate values.
    array[0] = new int[N*N]();
    for (int i = 1; i < N; i++) array[i] = array[i-1]+N; //2D on top of contiguous allocation

    HANDLE_ERROR ( cudaMalloc((void**)&devarray, bytes) );
    HANDLE_ERROR ( cudaMemcpy(devarray, array[0], bytes, cudaMemcpyHostToDevice) );

    MatrixMulti<<<1,1>>>(devarray, N);
    // A kernel launch returns no status: check for launch errors explicitly, then
    // synchronize so an execution fault is reported here instead of at the next copy.
    checkCUDAError("MatrixMulti launch");
    HANDLE_ERROR ( cudaDeviceSynchronize() );

    HANDLE_ERROR ( cudaMemcpy(array[0], devarray, bytes, cudaMemcpyDeviceToHost) );
    HANDLE_ERROR ( cudaFree(devarray) );
    printVec(array,N);

    // Release the host matrix: the contiguous data block first, then the row table.
    delete[] array[0];
    delete[] array;
    return 0;
}
// Writes an n x n integer matrix to stdout, one matrix row per output line.
//
// a: table of n row pointers; a[i] must point at no fewer than n ints.
// n: matrix dimension (rows and columns printed).
//
// Output format: each value is followed by one space, and each line ends with an
// additional space before the newline; the stream is flushed after every row.
void printVec(int** a , int n) {
    int rowIdx = 0;
    while (rowIdx < n) {
        int colIdx = 0;
        while (colIdx < n) {
            std::cout << a[rowIdx][colIdx] << " ";
            ++colIdx;
        }
        std::cout << " " << std::endl;
        ++rowIdx;
    }
}
$ nvcc -arch=sm_20 -o t60 t60.cu
$ ./t60
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
$