CUDA:请帮我找出代码中的错误
CUDA: please help me to find error in my code
有代码,使用GPU:
__global__ void gpu_process(float* input, float* weights, float* output, int psize, int size)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
int j = blockIdx.y*blockDim.y + threadIdx.y;
if(i < psize && j < size)
output[j] += input[i] * weights[i * size + j];
}
void process(float* input, float* weights, float* output, size_t psize, size_t size)
{
float* in_d, *w_d, *out_d;
cudaMalloc((void**)&in_d, psize * sizeof(float));
cudaMalloc((void**)&w_d, psize * size * sizeof(float));
cudaMalloc((void**)&out_d, size * sizeof(float));
for(size_t i = 0; i < size; i++)
output[i] = 0;
cudaMemcpy(in_d, input, psize * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(w_d, weights, psize * size * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(out_d, output, size * sizeof(float), cudaMemcpyHostToDevice);
int rx = psize, ry = size, block_x = min((int)psize, 32), block_y = min((int)size, 32);
dim3 dimBlock(block_x, block_y);
dim3 dimGrid(ceil(float(rx) / block_x), ceil(float(ry) / block_y));
gpu_process<<<dimGrid, dimBlock>>>(in_d, w_d, out_d, psize, size);
cudaThreadSynchronize();
cudaMemcpy(output, out_d, size * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(in_d);
cudaFree(out_d);
cudaFree(w_d);
}
有代码,做同样的事情,但只使用CPU:
int blockIdxx, blockIdxy, blockDimx, blockDimy, threadIdxx, threadIdxy;
void cpu_process(float* input, float* weights, float* output, int psize, int size)
{
int i = blockIdxx*blockDimx + threadIdxx;
int j = blockIdxy*blockDimy + threadIdxy;
if(i < psize && j < size)
output[j] += input[i] * weights[i * size + j];
}
void process(float* input, float* weights, float* output, size_t psize, size_t size)
{
for(size_t i = 0; i < size; i++)
output[i] = 0;
int rx = psize, ry = size, block_x = min((int)psize, 32), block_y = min((int)size, 32);
blockDimx = block_x;
blockDimy = block_y;
int gridDimx = ceil(float(rx) / block_x), gridDimy = ceil(float(ry) / block_y);
for(blockIdxx = 0; blockIdxx < gridDimx; blockIdxx++)
for(blockIdxy = 0; blockIdxy < gridDimy; blockIdxy++)
for(threadIdxx = 0; threadIdxx < blockDimx; threadIdxx++)
for(threadIdxy = 0; threadIdxy < blockDimy; threadIdxy++)
cpu_process(input, weights, output, psize, size);
}
为什么CPU变体工作正确,但GPU变体返回垃圾输出?
有什么不同?cuda-toolkit版本:4.0
操作系统:Debian GNU/Linux, cuda从它的仓库安装。
GPU: NVIDIA GeForce GT 525M.
cudaThreadSyncronize已弃用,不应使用,而是使用cudaDeviceSyncronize,检查这些错误代码,因为如果线程失败,它们将返回错误。它们还会阻塞此后的所有代码,直到任务完成为止,因此您还可以在两者之间添加一些计时代码以查找瓶颈。
相关文章:
- 将鼠标悬停在问题上时与预期">"相关的代码错误
- VI工作室代码错误无法打开输出文件主.exe
- 引入参数化构造函数后显示 LNK 2019 未解析外部符号的代码错误
- 有关矩阵的代码错误导致分段错误(内核转储)
- C++ Schannel POST 400 错误代码错误请求
- Java 本机访问代码错误:"Invalid memory access"
- 代码错误修改
- C++代码错误分配
- "_FCbuild"不能用作函数 - C/C++ 代码错误
- 我的代码错误类型警告 1 警告 C4018 和更多错误
- 代码错误(从 1 到 100 找到质数)
- OpenCV VisualStudio,C 代码错误尝试访问像素
- 当我使用的教程完全没有错误时,为什么我的代码错误
- 简单的NOOB C 输入/输出问题:代码错误
- 为什么enumProcessModules返回false值和299代码错误
- WinhttpSetoption()失败设置TLSV1.2,带有错误代码错误_internet_incorrect_ha
- 苹果安全传输代码错误
- C++ - 国际象棋主教移动代码错误
- 导入Python库时,嵌入了Python代码错误
- 编译提升C 代码错误