CUDA非法访问内存

CUDA illegal memory access

本文关键字：内存访问非法 CUDA 更新时间：2023-10-16

我正试图让这段代码与3D类型的结构一起工作。我正在使用Cuda的2D函数。因此，主机端线性数据("板")的大小为width*height*depth，2D mallocs的大小为width=height*depthy(此处width和height都是DIMxDIM元素)。内核处理从A到B的数据。我在行收到一个非法的内存访问错误(使用内存检查器)

dst[offset] = curr;

如果我将malloc更改为HEIGHT*2，错误就会消失，但大小似乎匹配。我错过了什么？其他批评也很受欢迎，我对C++和CUDA都是新手。

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
typedef signed int sint;
typedef unsigned int uint;
#define DIM 512
#define TPB 32 // Threads per block
#define CLEARANCE 5
#define MAPLAYERS 2
#define WIDTH (sizeof(sint) * DIM)
#define HEIGHT (DIM * MAPLAYERS)
void route(sint *A, size_t &pitchA, sint *B, size_t &pitchB, sint *board, int src, int dest);
__global__ void map(sint *src, size_t pitchSrc, sint *dst, size_t pitchDst, unsigned long *index);
__device__ bool hasClearance(sint* src, sint x, sint y, sint z, size_t pitch);
__device__ bool inBounds(sint x, sint y, sint z, sint xoff, sint yoff, sint zoff);
__device__ inline long long calcOffset(sint x, sint y, sint z, sint xoff, sint yoff, sint zoff, size_t pitch);
dim3 blocks(DIM / TPB, DIM / TPB, MAPLAYERS);
dim3 threads(TPB, TPB);
/** CUDA Error Check */
#define CER(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %dn", cudaGetErrorString(code), file, line);
int tmp;
std::cin >> tmp;
exit(code);
}
}
int main(void) {
sint *A;
sint *B;
size_t pitchA, pitchB;
sint *board = new sint[WIDTH*HEIGHT];
CER(cudaMallocPitch(&A, &pitchA, WIDTH, HEIGHT));
CER(cudaMallocPitch(&B, &pitchB, WIDTH, HEIGHT));
CER(cudaMemset2D(A, pitchA, 0, WIDTH, HEIGHT));
CER(cudaMemset2D(B, pitchA, 0, WIDTH, HEIGHT));
route(A, pitchA, B, pitchB, board, 0, DIM*DIM - 1);
CER(cudaFree(A));
CER(cudaFree(B));
delete[] board;
}
void route(sint *A, size_t &pitchA, sint *B, size_t &pitchB, sint *board, int src, int dest) {
unsigned long *dev_index;
unsigned long index = NULL;
CER(cudaMalloc((void**)&dev_index, sizeof(unsigned long)));
CER(cudaMemcpy(dev_index, &index, sizeof(unsigned long), cudaMemcpyHostToDevice));
CER(cudaMemcpy2D(A, pitchA, board, WIDTH, WIDTH, HEIGHT, cudaMemcpyHostToDevice));
CER(cudaMemcpy2D(B, pitchB, board, WIDTH, WIDTH, HEIGHT, cudaMemcpyHostToDevice));
map << <blocks, threads >> >(B, pitchB, A, pitchA, dev_index);
CER(cudaPeekAtLastError());
CER(cudaMemcpy(&index, dev_index, sizeof(unsigned long), cudaMemcpyDeviceToHost));
if (index != NULL) {
// break condition
}
}
__global__ void map(sint *src, size_t pitchSrc, sint *dst, size_t pitchDst, unsigned long *index) {
unsigned int x = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int y = threadIdx.y + blockIdx.y * blockDim.y;
unsigned int z = blockIdx.z + blockIdx.z * blockDim.z;
unsigned long long offset = calcOffset(x, y, z, 0, 0, 0, pitchDst);
sint curr;
if (!inBounds(x, y, z, 0, 0, 0))
return;
curr = src[calcOffset(x, y, z, 0, 0, 0, pitchSrc)];
if (z % 2 == 0 && curr == 0 && hasClearance(src, x, y, z, pitchSrc)) {
// Processing
}
else
dst[offset] = 1;
return;
}
/** Finds linear offset for a given pixel and offset. */
__device__ inline long long calcOffset(sint x, sint y, sint z, sint xoff, sint yoff, sint zoff, size_t pitch) {
return (x + xoff) + (y + yoff) * pitch + ((z + zoff) * pitch * (HEIGHT / MAPLAYERS));
}

/** Checks if position is valid on the map. */
__device__ bool inBounds(sint x, sint y, sint z, sint xoff, sint yoff, sint zoff) {
if (0 > (x + xoff) || (x + xoff) >= DIM || 0 > (y + yoff) || (y + yoff) >= DIM || 0 > (z + zoff) || (z + zoff) >= MAPLAYERS)
return false;
return true;
}

/** Returns true if a block has clearnace */
__device__ bool hasClearance(sint* src, sint x, sint y, sint z, size_t pitch) {
for (int c = -CLEARANCE; c <= CLEARANCE; c++) {
for (int r = -CLEARANCE; r <= CLEARANCE; r++){
if (inBounds(x, y, z, r, c, 0)){
if (src[calcOffset(x, y, z, r, c, 0, pitch)] == 2 || src[calcOffset(x, y, z, r, c, 0, pitch)] == 1)
return false;
}
else {
return false;
}
}
}
return true;
}

CUDA调试器的输出：

Memory Checker detected 384 access violations.
error = access violation on load (global memory)
gridid = 18
blockIdx = {0,8,0}
threadIdx = {0,4,0}
address = 0x05d08000
accessSize = 4

这看起来不对：

sint *board = new sint[WIDTH*HEIGHT];

我想你的意思是：

sint *board = new sint[DIM*HEIGHT];

这看起来不对：

unsigned int z = blockIdx.z + blockIdx.z * blockDim.z;

我想你的意思是：

unsigned int z = threadIdx.z + blockIdx.z * blockDim.z;

但问题的关键是，在对sint数组中的索引进行计数的算术中，您使用了间距值(计算行宽度的字节)。当你用这种方式计算指数时，你需要用sizeof(sint)来缩放你的音高值。即使这样也不完全正确。正确的做法是转换为unsigned char指针，按行乘以间距(即字节)进行算术运算，然后将行指针的开头从unsigned char转换回sint，然后通过(x+xoff)从那里进行索引。实际上，这意味着calcOffset例程需要重写，并且需要接受底层指针作为参数，并返回指针。

所以这个代码有这些变化：

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
typedef signed int sint;
typedef unsigned int uint;
#define DIM 512
#define TPB 32 // Threads per block
#define CLEARANCE 5
#define MAPLAYERS 2
#define WIDTH (sizeof(sint) * DIM)
#define HEIGHT (DIM * MAPLAYERS)
void route(sint *A, size_t &pitchA, sint *B, size_t &pitchB, sint *board, int src, int dest);
__global__ void map(sint *src, size_t pitchSrc, sint *dst, size_t pitchDst, unsigned long *index);
__device__ bool hasClearance(sint* src, sint x, sint y, sint z, size_t pitch);
__device__ bool inBounds(sint x, sint y, sint z, sint xoff, sint yoff, sint zoff);
__device__ inline sint * calcOffset(sint *ptr, sint x, sint y, sint z, sint xoff, sint yoff, sint zoff, size_t pitch);
dim3 blocks(DIM / TPB, DIM / TPB, MAPLAYERS);
dim3 threads(TPB, TPB);
/** CUDA Error Check */
#define CER(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %dn", cudaGetErrorString(code), file, line);
int tmp;
std::cin >> tmp;
exit(code);
}
}
int main(void) {
sint *A;
sint *B;
size_t pitchA, pitchB;
sint *board = new sint[DIM*HEIGHT];
CER(cudaMallocPitch(&A, &pitchA, WIDTH, HEIGHT));
CER(cudaMallocPitch(&B, &pitchB, WIDTH, HEIGHT));
CER(cudaMemset2D(A, pitchA, 0, WIDTH, HEIGHT));
CER(cudaMemset2D(B, pitchA, 0, WIDTH, HEIGHT));
route(A, pitchA, B, pitchB, board, 0, DIM*DIM - 1);
CER(cudaFree(A));
CER(cudaFree(B));
delete[] board;
}
void route(sint *A, size_t &pitchA, sint *B, size_t &pitchB, sint *board, int src, int dest) {
unsigned long *dev_index;
unsigned long index = 0;
CER(cudaMalloc((void**)&dev_index, sizeof(unsigned long)));
CER(cudaMemcpy(dev_index, &index, sizeof(unsigned long), cudaMemcpyHostToDevice));
CER(cudaMemcpy2D(A, pitchA, board, WIDTH, WIDTH, HEIGHT, cudaMemcpyHostToDevice));
CER(cudaMemcpy2D(B, pitchB, board, WIDTH, WIDTH, HEIGHT, cudaMemcpyHostToDevice));
map << <blocks, threads >> >(B, pitchB, A, pitchA, dev_index);
CER(cudaPeekAtLastError());
CER(cudaMemcpy(&index, dev_index, sizeof(unsigned long), cudaMemcpyDeviceToHost));
if (index != 0) {
// break condition
}
}
__global__ void map(sint *src, size_t pitchSrc, sint *dst, size_t pitchDst, unsigned long *index) {
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int z = threadIdx.z + blockIdx.z * blockDim.z;
sint *dst_offset = calcOffset(dst, x, y, z, 0, 0, 0, pitchDst);
sint curr;
if (!inBounds(x, y, z, 0, 0, 0))
return;
curr = *calcOffset(src, x, y, z, 0, 0, 0, pitchSrc);
if (z % 2 == 0 && curr == 0 && hasClearance(src, x, y, z, pitchSrc)) {
// Processing
}
else
*dst_offset = 1;
return;
}
/** Finds linear offset for a given pixel and offset. */
__device__ sint* calcOffset(sint *ptr, sint x, sint y, sint z, sint xoff, sint yoff, sint zoff, size_t pitch) {
unsigned char *my_ptr = reinterpret_cast<unsigned char *>(ptr);
return (x + xoff) + reinterpret_cast<sint *>(my_ptr + (((y + yoff) * pitch) + ((z + zoff) * pitch * (HEIGHT / MAPLAYERS))));
}

/** Checks if position is valid on the map. */
__device__ bool inBounds(sint x, sint y, sint z, sint xoff, sint yoff, sint zoff) {
if (0 > (x + xoff) || (x + xoff) >= DIM || 0 > (y + yoff) || (y + yoff) >= DIM || 0 > (z + zoff) || (z + zoff) >= MAPLAYERS)
return false;
return true;
}

/** Returns true if a block has clearnace */
__device__ bool hasClearance(sint* src, sint x, sint y, sint z, size_t pitch) {
for (int c = -CLEARANCE; c <= CLEARANCE; c++) {
for (int r = -CLEARANCE; r <= CLEARANCE; r++){
if (inBounds(x, y, z, r, c, 0)){
if ((*calcOffset(src, x, y, z, r, c, 0, pitch) == 2) || (*calcOffset(src, x, y, z, r, c, 0, pitch)) == 1)
return false;
}
else {
return false;
}
}
}
return true;
}

将来，您可能希望使用非音调分配来使代码正常工作。一旦你把事情做好了，你就可以看看添加投球分配是否会给你带来任何性能优势。

我还想到，如果(x+xoff)为负(或者(x+xoff)导致索引到下一行)，即使这样也不会起作用。在倾斜分配中，不能以这种方式从一行向后索引到前一行(或下一行)。首先需要将(x+xoff)解析为引用的实际行，然后为该行建立索引，然后针对该行进行倾斜计算。