OpenCL - GPU sum and CPU sum not the same
I'm new to OpenCL.
I wrote a program that is supposed to perform a parallel reduction over an array of 1 million elements. In the last part of the code I compare the CPU sum with the GPU sum, and they are not the same; that is the problem. My local size is 64. From index 90 onward, the GPU sums start getting larger than the CPU sums.
Edit: if I sum smaller values (right now I sum 0 to 1M), say all ones, the final sum comes out correct.
Kernel:
__kernel void gpuSumfunc(__global float *vec, __global float *sum, int n)
{
    __local float tempSum[64];
    int i;
    const int globalID = get_global_id(0);  // BLOCK_DIM*BLOCK_IND+THREAD_ID
    const int tid = get_local_id(0);        // THREAD_ID
    const int BlockDIM = get_local_size(0); // BLOCK_DIM=64
    if (globalID < n)
    {
        tempSum[tid] = vec[globalID]; // Copy this work-item's element into local memory
    }
    else
    {
        tempSum[tid] = 0;
    }
    barrier(CLK_LOCAL_MEM_FENCE); // Waiting for all the threads to copy their data
    for (i = BlockDIM / 2; i > 0; i /= 2)
    {
        if (tid < i)
        {
            tempSum[tid] += tempSum[tid + i];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (tid == 0)
    {
        sum[get_group_id(0)] = tempSum[0]; // One partial sum per work-group
    }
}
Main:
//HOST-cpu
float *h_a;//input
float *h_b;//output
float *h_s;
//DEVICE-gpu
cl_mem d_a;//input buffer
cl_mem d_b;//Output
//Kernel File
FILE* fileKernel;
//Memory allocation - cpu input
vector = (float*)malloc(n * sizeof(float));
h_a = (float*)malloc(n * sizeof(float));
h_b = (float*)malloc(n * sizeof(float));
h_s = (float*)malloc(n * sizeof(float));
// Zero all n elements (needs <string.h>); h_s in particular is copied
// into d_b below, so it must be fully initialized.
memset(vector, 0, n * sizeof(float));
memset(h_a, 0, n * sizeof(float));
memset(h_b, 0, n * sizeof(float));
memset(h_s, 0, n * sizeof(float));
//Initializing Data for gpu
for (i = 0; i < n; i++) {
h_a[i] = i;//(float)i;
}
//Initializing Data for cpu
for (i = 0; i < n; i++) {
vector[i] = i;//(float)i;
}
fileKernel = fopen("KernelCode.cl", "r");
if (!fileKernel)
{
printf("Cannot open kernel file!n");
exit(1);
}
// Read kernel code
kernelSource = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread(kernelSource, 1, MAX_SOURCE_SIZE, fileKernel);
fclose(fileKernel);
error = clGetPlatformIDs(2, cp_Platform, NULL); //array with two devices
error = clGetDeviceIDs(cp_Platform[1], CL_DEVICE_TYPE_GPU, 1, &Device_ID, NULL); // cp_platform[1] = Nvidia GPU
context = clCreateContext(NULL, 1, &Device_ID, NULL, NULL, &error); // creating openCL context
queue = clCreateCommandQueue(context, Device_ID, 0, &error); // creating command queue, executing openCL context on device cp_Platform[1]
globalSize = ceil(n / (float)localSize)*localSize;
d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, n * sizeof(float), NULL, NULL);
d_b = clCreateBuffer(context, CL_MEM_WRITE_ONLY, n * sizeof(float), NULL, NULL); // output buffer: the kernel writes the partial sums here, so it must not be read-only
error = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0, n * sizeof(float), h_a, 0, NULL, NULL); //Enqueue commands to write to a buffer object from host memory.
error |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0,n * sizeof(float), h_s, 0, NULL, NULL); //Enqueue commands to write to a buffer object from host memory.
program = clCreateProgramWithSource(context, 1, (const char **)& kernelSource, (const size_t *)&source_size, &error); //this function creates a program object for this specific openCL context
error = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); //compiles and links a program executable from the program source
kernel = clCreateKernel(program, "gpuSumfunc", &error); //creating kernel object
error = clGetKernelWorkGroupInfo(kernel, Device_ID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void*)&workGroupSize, NULL);
error = clGetKernelWorkGroupInfo(kernel, Device_ID, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), (void*)&pWorkGroupSize, NULL);
error = clGetDeviceInfo(Device_ID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(NumOfCU), &NumOfCU, NULL);
error |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a); //Used to set the argument value for a specific argument of a kernel.
error |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
error |= clSetKernelArg(kernel, 2, sizeof(int), &n);
error |= clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL); // Enqueues a command to execute a kernel on a device.
clFinish(queue);
clEnqueueReadBuffer(queue, d_b, CL_TRUE, 0, n * sizeof(float), h_b, 0, NULL, NULL); // Reading the per-group partial sums from the device (d_b) back to the host (h_b)
clock_t end = clock();
for (i = 0; i < (n+localSize-1)/localSize; i++)
{
gpuSum += h_b[i];
cpuSum = cpuSumfunc(vector, 64*(i+1));
if ((gpuSum - cpuSum) > Tolerance)
{
printf("nfailed! for index:%d",i);
printf("nCPU sum = %f", cpuSum);
printf("nGPU sum = %fn", gpuSum);
}
else
{
printf("nPassed! for index:%d",i);
printf("nCPU sum: %.2f", cpuSum);
printf("nGPU sum: %.2fn", gpuSum);
}
}
// cpu
time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
//printf("nTotal program's running time is: %.2fn", time_spent);
free(h_a);
free(h_b);
free(h_s);
free(vector);
//free(kernelSource);
clReleaseProgram(program);
clReleaseContext(context);
clReleaseKernel(kernel);
clReleaseCommandQueue(queue);
}
float cpuSumfunc(float *vec, int n)
{
    float sum = 0;
    int i;
    for (i = 0; i < n; i++)
    {
        sum += vec[i];
    }
    return sum;
}
Float32 values are not precise enough for this summation and accumulate rounding errors, which differ between the CPU and the GPU device. 16956560 needs 25 bits to be represented exactly, while Float32 provides only 23 bits of mantissa. This means that, computed in Float32, 16956560 + 1 = 16956560.
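To see the cutoff concretely, here is a minimal self-contained C check (my illustration, not part of the program above) showing a float32 running sum drifting once it passes 2^24, while the same sum in float64 stays exact:

#include <stdio.h>

int main(void)
{
    /* 2^24 is the last point where float32 can still count by 1:
     * 16777217 lies exactly halfway between two representable floats,
     * and round-to-nearest-even falls back to 16777216. */
    float big = 16777216.0f;
    printf("16777216 + 1 = %.1f in float32\n", big + 1.0f);

    /* Summing 0..999999 the way the question does: */
    float fsum = 0.0f;
    double dsum = 0.0;
    for (int i = 0; i < 1000000; i++) {
        fsum += (float)i;
        dsum += (double)i;
    }
    printf("float32 sum: %.1f\n", fsum); /* drifts after crossing 2^24 */
    printf("float64 sum: %.1f\n", dsum); /* 499999500000.0, exact */
    return 0;
}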
The two devices differ in:
- Ordering: the CPU and the GPU add the values in a different order and therefore accumulate different rounding errors (a host-only sketch of this effect follows the list below).
- Accuracy: most CPUs (x86 and the like) do floating-point math internally in extended precision (the x87 FPU uses 80-bit registers) and only then store the result back as 32 bits, while the GPU does all of the math in plain 32-bit.
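Here is that sketch (my code, not the question's): it sums the identical array once sequentially and once in 64-element blocks, the way the work-groups do, and gets two different float32 results:

#include <stdio.h>

#define N 1000000
#define BLOCK 64

static float vec[N];

int main(void)
{
    for (int i = 0; i < N; i++)
        vec[i] = (float)i;

    /* Sequential order, like cpuSumfunc. */
    float seq = 0.0f;
    for (int i = 0; i < N; i++)
        seq += vec[i];

    /* Block order, like the GPU: one 64-element partial sum per
     * work-group, then the partials added up, as main() does with h_b. */
    float blocked = 0.0f;
    for (int b = 0; b < N; b += BLOCK) {
        float partial = 0.0f;
        for (int i = b; i < b + BLOCK; i++)
            partial += vec[i];
        blocked += partial;
    }

    printf("sequential float32 sum: %.1f\n", seq);
    printf("blocked    float32 sum: %.1f\n", blocked);
    /* Both differ from the exact 499999500000; the blocked order
     * typically lands closer because it adds values of similar size. */
    return 0;
}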
You can fix it by using Float64 (double), or by using integers (int64_t, i.e. long long).
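For this particular program the smallest change is on the host: keep the kernel as-is and do both final accumulations in double. A sketch under that assumption (the names cpuSumfunc64 and gpuFinalSum64 are mine, not from the question):

/* Reference sum in double: each float converts exactly, and the running
 * total stays far inside double's 53-bit significand. */
double cpuSumfunc64(const float *vec, int n)
{
    double sum = 0.0;
    for (int i = 0; i < n; i++)
        sum += (double)vec[i];
    return sum;
}

/* Host-side accumulation of the per-work-group partial sums (h_b),
 * also in double. */
double gpuFinalSum64(const float *partials, int numGroups)
{
    double sum = 0.0;
    for (int i = 0; i < numGroups; i++)
        sum += (double)partials[i];
    return sum;
}

With both final accumulations widened, the remaining difference comes only from the float32 rounding inside each 64-element block, which a sensible tolerance check should absorb. Doing the whole reduction in double inside the kernel also works, but requires a device that supports the cl_khr_fp64 extension.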
Note: your GPU sum is actually more accurate than the CPU sum, because it first packs small neighboring values together and only then adds those larger partial sums into the final total.