为什么我在 CUDA 中实施总和减少时得到错误的结果?
Why am I getting wrong results with this implemention of a sum reduction in CUDA?
我正在为使用 CUDA C++ API 实现的vector_reduction算法编写一个教程,我很挣扎,因为我真的不明白我做错了什么,因为结果是(设备:4386.000000 主机:260795.000000)
我使用的代码如下(问题大小固定为 512)。
编辑:不幸的是,问题尚未解决,我仍然得到相同的结果。我已经更新了提供完整代码的代码。目标是相同的,对 512 个元素的浮点数组的所有元素求和。
#define NUM_ELEMENTS 512
__global__ void reduction(float *g_data, int n)
{
__shared__ float s_data[NUM_ELEMENTS];
int tid = threadIdx.x;
int index = tid + blockIdx.x*blockDim.x;
s_data[tid] = 0.0;
if (index < n){
s_data[tid] = g_data[index];
}
__syncthreads();
for (int s = 2; s <= blockDim.x; s = s * 2){
if ((tid%s) == 0){
s_data[tid] += s_data[tid + s / 2];
}
__syncthreads();
}
if (tid == 0){
g_data[blockIdx.x] = s_data[tid];
}
}
// includes, system
#include <cuda_runtime.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <float.h>
// includes, kernels
#include "vector_reduction_kernel.cu"
// For simplicity, just to get the idea in this MP, we're fixing the problem size to 512 elements.
#define NUM_ELEMENTS 512
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest( int argc, char** argv);
float computeOnDevice(float* h_data, int array_mem_size);
extern "C"
void computeGold( float* reference, float* idata, const unsigned int len);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main( int argc, char** argv)
{
cudaSetDevice(0);
runTest( argc, argv);
return EXIT_SUCCESS;
}
////////////////////////////////////////////////////////////////////////////////
//! Run naive scan test
////////////////////////////////////////////////////////////////////////////////
void runTest( int argc, char** argv)
{
int num_elements = NUM_ELEMENTS;
const unsigned int array_mem_size = sizeof( float) * num_elements;
// Allocate host memory to store the input data
float* h_data = (float*) malloc( array_mem_size);
// initialize the input data on the host to be integer values
// between 0 and 1000
for( unsigned int i = 0; i < num_elements; ++i)
h_data[i] = floorf(1000*(rand()/(float)RAND_MAX));
// Function to compute the reference solution on CPU using a C sequential version of the algorithm
// It is written in the file "vector_reduction_gold.cpp". The Makefile compiles this file too.
float reference = 0.0f;
computeGold(&reference , h_data, num_elements);
// Function to compute the solution on GPU using a call to a CUDA kernel (see body below)
// The kernel is written in the file "vector_reduction_kernel.cu". The Makefile also compiles this file.
float result = computeOnDevice(h_data, num_elements);
// We can use an epsilon of 0 since values are integral and in a range that can be exactly represented
float epsilon = 0.0f;
unsigned int result_regtest = (abs(result - reference) <= epsilon);
printf( "Test %sn", (1 == result_regtest) ? "Ok." : "No.");
printf( "device: %f host: %fn", result, reference);
// cleanup memory
free( h_data);
}
// Function to call the CUDA kernel on the GPU.
// Take h_data from host, copies it to device, setup grid and thread
// dimensions, excutes kernel function, and copy result of scan back
// to h_data.
// Note: float* h_data is both the input and the output of this function.
float computeOnDevice(float* h_data, int num_elements)
{
float* d_data = NULL;
float result;
// Memory allocation on device side
cudaMalloc((void**)&d_data, sizeof(float)*num_elements);
// Copy from host memory to device memory
cudaMemcpy((void**)&d_data, h_data, num_elements * sizeof(float), cudaMemcpyHostToDevice );
//int threads = (num_elements/2) + num_elements%2;
int threads = (num_elements);
// Invoke the kernel
reduction<<< 1 ,threads >>>(d_data,num_elements);
// Copy from device memory back to host memory
cudaMemcpy(&result, d_data, sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_data);
cudaDeviceReset();
return result;
}
float computeOnDevice(float* h_data, int num_elements)
{
float* d_data = NULL;
float result;
// Memory allocation on device side
cudaMalloc((void**)&d_data, sizeof(float)*num_elements);
// Copy from host memory to device memory
cudaMemcpy(d_data, h_data, num_elements * sizeof(float), cudaMemcpyHostToDevice );
int threads = (num_elements);
// Invoke the kernel
reduction<<< 1 ,threads >>>(d_data,num_elements);
// Copy from device memory back to host memory
cudaMemcpy(&result, d_data, sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_data);
cudaDeviceReset();
return result;
}
你真的应该为这样的问题提供一个完整的代码。 您还应该使用正确的 CUDA 错误检查并使用cuda-memcheck
运行代码。代码中至少有 2 个错误:
-
我们不会做这样的
cudaMemcpy
:cudaMemcpy((void**)&d_data, h_data, num_elements * sizeof(float), cudaMemcpyHostToDevice );
它应该是:
cudaMemcpy(d_data, h_data, num_elements * sizeof(float), cudaMemcpyHostToDevice );
第一个参数只是一个指针,而不是指针到指针。
cuda-memcheck
或适当的 CUDA 错误检查会将您的注意力集中在这一行上。 -
您没有启动足够的线程。 内核为每个线程加载一个元素。 如果您的问题大小为 512,您将需要 512 个线程,如下所示:
int threads = (num_elements/2) + num_elements%2;
不是让你明白的。 不知道你在那里有什么想法。 但这可以为 512 案例修复它:
int threads = (num_elements);
您的缩减方法需要 2 次幂线程块大小。
这是一个完整的测试用例,请注意cuda-memcheck
的使用:
$ cat t27.cu
#include <stdio.h>
#define NUM_ELEMENTS 512
__global__ void reduction(float *g_data, int n)
{
__shared__ float s_data[NUM_ELEMENTS];
int tid = threadIdx.x;
int index = tid + blockIdx.x*blockDim.x;
s_data[tid] = 0.0;
if (index < n){
s_data[tid] = g_data[index];
}
__syncthreads();
for (int s = 2; s <= blockDim.x; s = s * 2){
if ((tid%s) == 0){
s_data[tid] += s_data[tid + s / 2];
}
__syncthreads();
}
if (tid == 0){
g_data[blockIdx.x] = s_data[tid];
}
}
float computeOnDevice(float* h_data, int num_elements)
{
float* d_data = NULL;
float result;
// Memory allocation on device side
cudaMalloc((void**)&d_data, sizeof(float)*num_elements);
// Copy from host memory to device memory
cudaMemcpy(d_data, h_data, num_elements * sizeof(float), cudaMemcpyHostToDevice );
int threads = (num_elements);
// Invoke the kernel
reduction<<< 1 ,threads >>>(d_data,num_elements);
// Copy from device memory back to host memory
cudaMemcpy(&result, d_data, sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_data);
cudaDeviceReset();
return result;
}
int main(){
float *data = new float[NUM_ELEMENTS];
for (int i = 0; i < NUM_ELEMENTS; i++) data[i] = 1;
float r = computeOnDevice(data, NUM_ELEMENTS);
printf(" result = %fn" , r);
}
$ nvcc -arch=sm_35 -o t27 t27.cu
$ cuda-memcheck ./t27
========= CUDA-MEMCHECK
result = 512.000000
========= ERROR SUMMARY: 0 errors
这是您现在发布的代码的修改版本(以几种新的/不同的方式损坏),它似乎对我来说可以正常运行:
$ cat t30.cu
#define NUM_ELEMENTS 512
__global__ void reduction(float *g_data, int n)
{
__shared__ float s_data[NUM_ELEMENTS];
int tid = threadIdx.x;
int index = tid + blockIdx.x*blockDim.x;
s_data[tid] = 0.0;
if (index < n){
s_data[tid] = g_data[index];
}
__syncthreads();
for (int s = 2; s <= blockDim.x; s = s * 2){
if ((tid%s) == 0){
s_data[tid] += s_data[tid + s / 2];
}
__syncthreads();
}
if (tid == 0){
g_data[blockIdx.x] = s_data[tid];
}
}
// includes, system
#include <cuda_runtime.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <float.h>
// includes, kernels
// For simplicity, just to get the idea in this MP, we're fixing the problem size to 512 elements.
#define NUM_ELEMENTS 512
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest( int argc, char** argv);
float computeOnDevice(float* h_data, int array_mem_size);
extern "C"
void computeGold( float* reference, float* idata, const unsigned int len)
{
for (int i = 0; i<len; i++) *reference += idata[i];
};
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main( int argc, char** argv)
{
cudaSetDevice(0);
runTest( argc, argv);
return EXIT_SUCCESS;
}
////////////////////////////////////////////////////////////////////////////////
//! Run naive scan test
////////////////////////////////////////////////////////////////////////////////
void runTest( int argc, char** argv)
{
int num_elements = NUM_ELEMENTS;
const unsigned int array_mem_size = sizeof( float) * num_elements;
// Allocate host memory to store the input data
float* h_data = (float*) malloc( array_mem_size);
// initialize the input data on the host to be integer values
// between 0 and 1000
for( unsigned int i = 0; i < num_elements; ++i)
h_data[i] = floorf(1000*(rand()/(float)RAND_MAX));
// Function to compute the reference solution on CPU using a C sequential version of the algorithm
// It is written in the file "vector_reduction_gold.cpp". The Makefile compiles this file too.
float reference = 0.0f;
computeGold(&reference , h_data, num_elements);
// Function to compute the solution on GPU using a call to a CUDA kernel (see body below)
// The kernel is written in the file "vector_reduction_kernel.cu". The Makefile also compiles this file.
float result = computeOnDevice(h_data, num_elements);
// We can use an epsilon of 0 since values are integral and in a range that can be exactly represented
float epsilon = 0.0f;
unsigned int result_regtest = (abs(result - reference) <= epsilon);
printf( "Test %sn", (1 == result_regtest) ? "CORRECTO: Coinciden los resultados de la CPU y la GPU" : "INCORRECTO: Los resultados calculados en paralelo en la GPU no coinciden con los obtenidos secuencialmente en la CPU");
printf( "device: %f host: %fn", result, reference);
// cleanup memory
free( h_data);
}
// Function to call the CUDA kernel on the GPU.
// Take h_data from host, copies it to device, setup grid and thread
// dimensions, excutes kernel function, and copy result of scan back
// to h_data.
// Note: float* h_data is both the input and the output of this function.
#if 0
float computeOnDevice(float* h_data, int num_elements)
{
float* d_data = NULL;
float result;
// Memory allocation on device side
cudaMalloc((void**)&d_data, sizeof(float)*num_elements);
// Copy from host memory to device memory
cudaMemcpy((void**)&d_data, h_data, num_elements * sizeof(float), cudaMemcpyHostToDevice );
//int threads = (num_elements/2) + num_elements%2;
int threads = (num_elements);
// Invoke the kernel
reduction<<< 1 ,threads >>>(d_data,num_elements);
// Copy from device memory back to host memory
cudaMemcpy(&result, d_data, sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_data);
cudaDeviceReset();
return result;
}
#endif
float computeOnDevice(float* h_data, int num_elements)
{
float* d_data = NULL;
float result;
// Memory allocation on device side
cudaError_t err = cudaMalloc((void**)&d_data, sizeof(float)*num_elements);
if (err != cudaSuccess) {printf("CUDA error: %sn", cudaGetErrorString(err)); exit(0);}
// Copy from host memory to device memory
cudaMemcpy(d_data, h_data, num_elements * sizeof(float), cudaMemcpyHostToDevice );
int threads = (num_elements);
// Invoke the kernel
reduction<<< 1 ,threads >>>(d_data,num_elements);
// Copy from device memory back to host memory
cudaMemcpy(&result, d_data, sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_data);
err = cudaGetLastError();
if (err != cudaSuccess) {printf("CUDA error: %sn", cudaGetErrorString(err)); exit(0);}
cudaDeviceReset();
return result;
}
$ nvcc -arch=sm_35 -o t30 t30.cu
$ cuda-memcheck ./t30
========= CUDA-MEMCHECK
Test CORRECTO: Coinciden los resultados de la CPU y la GPU
device: 260795.000000 host: 260795.000000
========= ERROR SUMMARY: 0 errors
$
您仍然没有将适当的 CUDA 错误检查添加到您的代码中,因此您完全有可能遇到机器设置问题。 如果您仍然遇到问题,您可能需要运行我上面发布的确切代码,因为我在其中进行了基本的错误检查。
相关文章:
- 如何修复循环中的错误产生的错误结果?
- 添加可变参数函数的错误结果
- 使用特征 3 线性系统求解器的错误结果
- 按位包含 OR 的错误结果
- 如果我将索引变量更改为零,并且合并函数中的k = 0,则获得合并排序的错误结果
- STD :: FPCLASSIFY的错误结果使用Valgrind进行长时间的双重双重结果
- 视觉工作室的错误结果
- 将C#struct传递给C 不受管理的DLL返回错误结果
- 最长的常见子字错误结果
- 华氏度到摄氏度的错误结果
- UE4中简单坐标变换的错误结果
- 记忆错误结果的说明[编译器行为]
- 如何解决错误PRJ0002:错误结果 -1073741515从"cl.exe"返回?
- 错误 PRJ0002:错误结果 -1073741515 从"C:\Program Files\Microsoft Visual Studio 9.0\VC\bin\cl.
- glGetUniformLocation OpenGL ES 2.0(在ipad 3 iOS 7.0.3上返回错误结果)
- 在模板函数中调用std::for_each时出现错误结果
- 使用copy和back_inserter将矢量附加到其自身时出现错误结果
- CUDA 线程在全局内存中的私有位置写入时出现错误结果
- 来自 c++ 代码的错误结果
- 使用浮点值和铸造的算术运算的错误结果 - 差异很大,我想这不是准确值的情况(429497)?