CUDA非法访问内存
CUDA illegal memory access
我正试图让这段代码与3D类型的结构一起工作。我正在使用Cuda的2D函数。因此,主机端线性数据("板")的大小为width*height*depth,2D mallocs的大小为width=height*depthy(此处width和height都是DIMxDIM元素)。内核处理从A到B的数据。我在行收到一个非法的内存访问错误(使用内存检查器)
dst[offset] = curr;
如果我将malloc更改为HEIGHT*2,错误就会消失,但大小似乎匹配。我错过了什么?其他批评也很受欢迎,我对C++和CUDA都是新手。
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
typedef signed int sint;
typedef unsigned int uint;
#define DIM 512
#define TPB 32 // Threads per block
#define CLEARANCE 5
#define MAPLAYERS 2
#define WIDTH (sizeof(sint) * DIM)
#define HEIGHT (DIM * MAPLAYERS)
void route(sint *A, size_t &pitchA, sint *B, size_t &pitchB, sint *board, int src, int dest);
__global__ void map(sint *src, size_t pitchSrc, sint *dst, size_t pitchDst, unsigned long *index);
__device__ bool hasClearance(sint* src, sint x, sint y, sint z, size_t pitch);
__device__ bool inBounds(sint x, sint y, sint z, sint xoff, sint yoff, sint zoff);
__device__ inline long long calcOffset(sint x, sint y, sint z, sint xoff, sint yoff, sint zoff, size_t pitch);
dim3 blocks(DIM / TPB, DIM / TPB, MAPLAYERS);
dim3 threads(TPB, TPB);
/** CUDA Error Check */
#define CER(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %dn", cudaGetErrorString(code), file, line);
int tmp;
std::cin >> tmp;
exit(code);
}
}
int main(void) {
sint *A;
sint *B;
size_t pitchA, pitchB;
sint *board = new sint[WIDTH*HEIGHT];
CER(cudaMallocPitch(&A, &pitchA, WIDTH, HEIGHT));
CER(cudaMallocPitch(&B, &pitchB, WIDTH, HEIGHT));
CER(cudaMemset2D(A, pitchA, 0, WIDTH, HEIGHT));
CER(cudaMemset2D(B, pitchA, 0, WIDTH, HEIGHT));
route(A, pitchA, B, pitchB, board, 0, DIM*DIM - 1);
CER(cudaFree(A));
CER(cudaFree(B));
delete[] board;
}
void route(sint *A, size_t &pitchA, sint *B, size_t &pitchB, sint *board, int src, int dest) {
unsigned long *dev_index;
unsigned long index = NULL;
CER(cudaMalloc((void**)&dev_index, sizeof(unsigned long)));
CER(cudaMemcpy(dev_index, &index, sizeof(unsigned long), cudaMemcpyHostToDevice));
CER(cudaMemcpy2D(A, pitchA, board, WIDTH, WIDTH, HEIGHT, cudaMemcpyHostToDevice));
CER(cudaMemcpy2D(B, pitchB, board, WIDTH, WIDTH, HEIGHT, cudaMemcpyHostToDevice));
map << <blocks, threads >> >(B, pitchB, A, pitchA, dev_index);
CER(cudaPeekAtLastError());
CER(cudaMemcpy(&index, dev_index, sizeof(unsigned long), cudaMemcpyDeviceToHost));
if (index != NULL) {
// break condition
}
}
__global__ void map(sint *src, size_t pitchSrc, sint *dst, size_t pitchDst, unsigned long *index) {
unsigned int x = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int y = threadIdx.y + blockIdx.y * blockDim.y;
unsigned int z = blockIdx.z + blockIdx.z * blockDim.z;
unsigned long long offset = calcOffset(x, y, z, 0, 0, 0, pitchDst);
sint curr;
if (!inBounds(x, y, z, 0, 0, 0))
return;
curr = src[calcOffset(x, y, z, 0, 0, 0, pitchSrc)];
if (z % 2 == 0 && curr == 0 && hasClearance(src, x, y, z, pitchSrc)) {
// Processing
}
else
dst[offset] = 1;
return;
}
/** Finds linear offset for a given pixel and offset. */
__device__ inline long long calcOffset(sint x, sint y, sint z, sint xoff, sint yoff, sint zoff, size_t pitch) {
return (x + xoff) + (y + yoff) * pitch + ((z + zoff) * pitch * (HEIGHT / MAPLAYERS));
}
/** Checks if position is valid on the map. */
__device__ bool inBounds(sint x, sint y, sint z, sint xoff, sint yoff, sint zoff) {
if (0 > (x + xoff) || (x + xoff) >= DIM || 0 > (y + yoff) || (y + yoff) >= DIM || 0 > (z + zoff) || (z + zoff) >= MAPLAYERS)
return false;
return true;
}
/** Returns true if a block has clearnace */
__device__ bool hasClearance(sint* src, sint x, sint y, sint z, size_t pitch) {
for (int c = -CLEARANCE; c <= CLEARANCE; c++) {
for (int r = -CLEARANCE; r <= CLEARANCE; r++){
if (inBounds(x, y, z, r, c, 0)){
if (src[calcOffset(x, y, z, r, c, 0, pitch)] == 2 || src[calcOffset(x, y, z, r, c, 0, pitch)] == 1)
return false;
}
else {
return false;
}
}
}
return true;
}
CUDA调试器的输出:
Memory Checker detected 384 access violations.
error = access violation on load (global memory)
gridid = 18
blockIdx = {0,8,0}
threadIdx = {0,4,0}
address = 0x05d08000
accessSize = 4
这看起来不对:
sint *board = new sint[WIDTH*HEIGHT];
我想你的意思是:
sint *board = new sint[DIM*HEIGHT];
这看起来不对:
unsigned int z = blockIdx.z + blockIdx.z * blockDim.z;
我想你的意思是:
unsigned int z = threadIdx.z + blockIdx.z * blockDim.z;
但问题的关键是,在对sint
数组中的索引进行计数的算术中,您使用了间距值(计算行宽度的字节)。当你用这种方式计算指数时,你需要用sizeof(sint)
来缩放你的音高值。即使这样也不完全正确。正确的做法是转换为unsigned char
指针,按行乘以间距(即字节)进行算术运算,然后将行指针的开头从unsigned char
转换回sint
,然后通过(x+xoff)
从那里进行索引。实际上,这意味着calcOffset
例程需要重写,并且需要接受底层指针作为参数,并返回指针。
所以这个代码有这些变化:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
typedef signed int sint;
typedef unsigned int uint;
#define DIM 512
#define TPB 32 // Threads per block
#define CLEARANCE 5
#define MAPLAYERS 2
#define WIDTH (sizeof(sint) * DIM)
#define HEIGHT (DIM * MAPLAYERS)
void route(sint *A, size_t &pitchA, sint *B, size_t &pitchB, sint *board, int src, int dest);
__global__ void map(sint *src, size_t pitchSrc, sint *dst, size_t pitchDst, unsigned long *index);
__device__ bool hasClearance(sint* src, sint x, sint y, sint z, size_t pitch);
__device__ bool inBounds(sint x, sint y, sint z, sint xoff, sint yoff, sint zoff);
__device__ inline sint * calcOffset(sint *ptr, sint x, sint y, sint z, sint xoff, sint yoff, sint zoff, size_t pitch);
dim3 blocks(DIM / TPB, DIM / TPB, MAPLAYERS);
dim3 threads(TPB, TPB);
/** CUDA Error Check */
#define CER(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %dn", cudaGetErrorString(code), file, line);
int tmp;
std::cin >> tmp;
exit(code);
}
}
int main(void) {
sint *A;
sint *B;
size_t pitchA, pitchB;
sint *board = new sint[DIM*HEIGHT];
CER(cudaMallocPitch(&A, &pitchA, WIDTH, HEIGHT));
CER(cudaMallocPitch(&B, &pitchB, WIDTH, HEIGHT));
CER(cudaMemset2D(A, pitchA, 0, WIDTH, HEIGHT));
CER(cudaMemset2D(B, pitchA, 0, WIDTH, HEIGHT));
route(A, pitchA, B, pitchB, board, 0, DIM*DIM - 1);
CER(cudaFree(A));
CER(cudaFree(B));
delete[] board;
}
void route(sint *A, size_t &pitchA, sint *B, size_t &pitchB, sint *board, int src, int dest) {
unsigned long *dev_index;
unsigned long index = 0;
CER(cudaMalloc((void**)&dev_index, sizeof(unsigned long)));
CER(cudaMemcpy(dev_index, &index, sizeof(unsigned long), cudaMemcpyHostToDevice));
CER(cudaMemcpy2D(A, pitchA, board, WIDTH, WIDTH, HEIGHT, cudaMemcpyHostToDevice));
CER(cudaMemcpy2D(B, pitchB, board, WIDTH, WIDTH, HEIGHT, cudaMemcpyHostToDevice));
map << <blocks, threads >> >(B, pitchB, A, pitchA, dev_index);
CER(cudaPeekAtLastError());
CER(cudaMemcpy(&index, dev_index, sizeof(unsigned long), cudaMemcpyDeviceToHost));
if (index != 0) {
// break condition
}
}
__global__ void map(sint *src, size_t pitchSrc, sint *dst, size_t pitchDst, unsigned long *index) {
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int z = threadIdx.z + blockIdx.z * blockDim.z;
sint *dst_offset = calcOffset(dst, x, y, z, 0, 0, 0, pitchDst);
sint curr;
if (!inBounds(x, y, z, 0, 0, 0))
return;
curr = *calcOffset(src, x, y, z, 0, 0, 0, pitchSrc);
if (z % 2 == 0 && curr == 0 && hasClearance(src, x, y, z, pitchSrc)) {
// Processing
}
else
*dst_offset = 1;
return;
}
/** Finds linear offset for a given pixel and offset. */
__device__ sint* calcOffset(sint *ptr, sint x, sint y, sint z, sint xoff, sint yoff, sint zoff, size_t pitch) {
unsigned char *my_ptr = reinterpret_cast<unsigned char *>(ptr);
return (x + xoff) + reinterpret_cast<sint *>(my_ptr + (((y + yoff) * pitch) + ((z + zoff) * pitch * (HEIGHT / MAPLAYERS))));
}
/** Checks if position is valid on the map. */
__device__ bool inBounds(sint x, sint y, sint z, sint xoff, sint yoff, sint zoff) {
if (0 > (x + xoff) || (x + xoff) >= DIM || 0 > (y + yoff) || (y + yoff) >= DIM || 0 > (z + zoff) || (z + zoff) >= MAPLAYERS)
return false;
return true;
}
/** Returns true if a block has clearnace */
__device__ bool hasClearance(sint* src, sint x, sint y, sint z, size_t pitch) {
for (int c = -CLEARANCE; c <= CLEARANCE; c++) {
for (int r = -CLEARANCE; r <= CLEARANCE; r++){
if (inBounds(x, y, z, r, c, 0)){
if ((*calcOffset(src, x, y, z, r, c, 0, pitch) == 2) || (*calcOffset(src, x, y, z, r, c, 0, pitch)) == 1)
return false;
}
else {
return false;
}
}
}
return true;
}
将来,您可能希望使用非音调分配来使代码正常工作。一旦你把事情做好了,你就可以看看添加投球分配是否会给你带来任何性能优势。
我还想到,如果(x+xoff)
为负(或者(x+xoff)
导致索引到下一行),即使这样也不会起作用。在倾斜分配中,不能以这种方式从一行向后索引到前一行(或下一行)。首先需要将(x+xoff)
解析为引用的实际行,然后为该行建立索引,然后针对该行进行倾斜计算。
- C++尝试深度复制唯一指针时出现内存访问冲突
- 如何使用 MPI 的远程内存访问 (RMA) 功能并行化数据聚合?
- CRTP - 危险的内存访问?
- C++ Python 的扩展 - 安全内存访问和内存布局
- 在Visual Studio中查找非法内存访问
- C++内存访问违反内存大块
- 数组中未映射的内存访问从python传递到c++
- 使用加速进程间创建消息队列 - 内存访问冲突
- C 指针转换会导致内存访问冲突
- 为什么代码会抛出非法内存访问错误
- 多线程环境中C++内存访问
- CUDA 中的递归返回非法内存访问
- 为什么创建进程 API 调用会导致内存访问冲突?
- 在 C++ 中遍历链表比在具有类似内存访问的 Go 中慢
- 确定打开进程的内存访问位置
- 存在内存访问异常,但我不确定我的代码中出了什么问题
- 指向结构的指针的 2D 数组.内存访问问题
- GPU 内存访问和使用 (CUDA)
- 在实时程序中是动态内存访问有害的
- 随机 mmaped 内存访问比堆数据访问慢 16%