OpenCL实现的算法比正常循环慢
OpenCL implemented algorithms slower than normal loop
我是并行计算和OpenCL的新手。我学习了《OpenCL编程指南》一书。在卷积实现部分。
我的主.cpp:
#include <iostream>
#include <sstream>
#include <fstream>
#include <string>
#include <OpenCL/OpenCL.h>
using namespace std;
const unsigned int inputSignalWidth = 8;
const unsigned int inputSignalHeight = 8;
cl_uint inputSignal[inputSignalWidth][inputSignalHeight] =
{
{3, 1, 1, 4, 8, 2, 1, 3},
{4, 2, 1, 1, 2, 1, 2, 3},
{4, 4, 4, 4, 3, 2, 2, 2},
{9, 8, 3, 8, 9, 0, 0, 0},
{9, 3, 3, 9, 0, 0, 0, 0},
{0, 9, 0, 8, 0, 0, 0, 0},
{3, 0, 8, 8, 9, 4, 4, 4},
{5, 9, 8 ,1 ,8, 1, 1, 1}
};
const unsigned int outputSignalWidth = 6;
const unsigned int outputSignalHeight = 6;
cl_uint outputSignal[outputSignalWidth][outputSignalHeight];
const unsigned int maskWidth = 3;
const unsigned int maskHeight = 3;
cl_uint mask[maskWidth][maskHeight] =
{
{1, 1, 1}, {1, 0, 1}, {1, 1, 1}
};
inline void checkErr(cl_int err, const char* name)
{
if (err != CL_SUCCESS)
{
cerr << "Error: " << name << endl;
exit(EXIT_FAILURE);
}
}
void CL_CALLBACK contextCallback(const char * errInfo,
const void * private_info,
size_t cb,
void * user_data)
{
cout << "Error occurred during contxt use: " << errInfo << endl;
exit(EXIT_FAILURE);
}
int main(int argc, const char * argv[])
{
cl_int errNum;
cl_uint numPlatforms;
cl_uint numDevices;
cl_platform_id * platformIDs;
cl_device_id * deviceIDs;
cl_context context = NULL;
cl_command_queue queue;
cl_program program;
cl_kernel kernel;
cl_mem inputSignalBuffer;
cl_mem outputSignalBuffer;
cl_mem maskBuffer;
errNum = clGetPlatformIDs(0, NULL, &numPlatforms);
checkErr((errNum != CL_SUCCESS)? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs");
platformIDs = (cl_platform_id*) alloca(sizeof(cl_platform_id) * numPlatforms);
errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL);
checkErr((errNum != CL_SUCCESS)? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatFormIDs");
deviceIDs = NULL;
cl_uint i;
for (i = 0; i < numPlatforms; i++)
{
errNum = clGetDeviceIDs(platformIDs[i], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND)
{
checkErr(errNum, "clGetDeviceIDs");
} else if (numDevices > 0)
{
deviceIDs = (cl_device_id *) alloca(sizeof(cl_device_id) * numDevices);
errNum = clGetDeviceIDs(platformIDs[i], CL_DEVICE_TYPE_GPU, numDevices, &deviceIDs[0], NULL);
checkErr(errNum, "clGetDeviceIDs");
break;
}
}
if (deviceIDs == NULL)
{
cout << "No CPU devices found." << endl;
exit(-1);
}
cl_context_properties contextProperties[] =
{
CL_CONTEXT_PLATFORM, (cl_context_properties) platformIDs[i], 0
};
context = clCreateContext(contextProperties, numDevices, deviceIDs, &contextCallback, NULL, &errNum);
checkErr(errNum, "clCreateContext");
ifstream srcFile("Convolution.cl");
checkErr(srcFile.is_open()?CL_SUCCESS:-1, "reading Convolution.cl");
string srcProg(istreambuf_iterator<char>(srcFile),
(istreambuf_iterator<char>()));
const char* src = srcProg.c_str();
size_t length = srcProg.length();
program = clCreateProgramWithSource(context, 1, &src, &length, &errNum);
checkErr(errNum, "clCreateProgramWithSource");
cout << "Device count: " << sizeof(deviceIDs)/sizeof(cl_device_id) << endl;
errNum = clBuildProgram(program, numDevices, deviceIDs, NULL, NULL, NULL);
checkErr(errNum, "clBuildProgram");
kernel = clCreateKernel(program, "convolve", &errNum);
checkErr(errNum, "clCreateKernel");
inputSignalBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, sizeof(cl_uint) * inputSignalHeight*inputSignalWidth, static_cast<void*>(inputSignal), &errNum);
checkErr(errNum, "clCreateBuffer(inputSignal)");
maskBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, sizeof(cl_uint) * maskHeight * maskWidth, static_cast<void*>(mask), &errNum);
checkErr(errNum, "clCreateBuffer(mask)");
outputSignalBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uint) * outputSignalHeight * outputSignalWidth, NULL, &errNum);
checkErr(errNum, "clCreateBuffer(outputSignal)");
queue = clCreateCommandQueue(context, deviceIDs[0], 0, &errNum);
checkErr(errNum, "clCreateCommandQueue");
errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputSignalBuffer);
errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &maskBuffer);
errNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &outputSignalBuffer);
errNum |= clSetKernelArg(kernel, 3, sizeof(cl_uint), &inputSignalWidth);
errNum |= clSetKernelArg(kernel, 4, sizeof(cl_uint), &maskWidth);
checkErr(errNum, "clSetKernelArg");
const size_t globalWorkSize[1] =
{
outputSignalWidth * outputSignalWidth
};
const size_t localWorkSize[1] =
{
1
};
clock_t start, end;
clFinish(queue);
start = clock();
errNum = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
clFinish(queue);
end = clock();
cout << "time for calculation: " << (float)(end - start) << endl;
checkErr(errNum, "clEnequeueNDRangeKernel");
errNum = clEnqueueReadBuffer(queue, outputSignalBuffer, CL_TRUE, 0, sizeof(cl_uint) * outputSignalHeight * outputSignalWidth, outputSignal, 0, NULL, NULL);
checkErr(errNum, "clEnqueueReadBuffer");
clFinish(queue);
start = clock();
for (int y = 0; y < outputSignalHeight; y++)
{
for (int x = 0; x < outputSignalHeight; x++)
{
uint sum = 0;
for (int r = 0; r < maskWidth; r++)
{
for (int c =0; c < maskWidth; c++)
{
sum += inputSignal[y+r][x+c]*mask[r][c];
}
}
outputSignal[y][x] = sum;
}
}
end = clock();
cout << "Loop version time: " << (float)(end - start) << endl;
return 0;
}
和卷积.cl:
__kernel void convolve(const __global uint * const input,
__constant uint * const mask,
__global uint * const output,
const int inputWidth,
const int maskWidth)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
uint sum = 0;
for (int r = 0; r < maskWidth; r++)
{
const int idxIntmp = (y + r) * inputWidth + x;
for (int c =0; c < maskWidth; c++)
{
sum+= mask[r * maskWidth + c] * input[idxIntmp + c];
}
}
output[y * get_global_id(0) + x] = sum;
}
该平台是MacOS 10.9和AMD 6750M。CL版本比循环版本慢很多(大约慢10倍),这是没有道理的。你们能帮我指出代码出了什么问题吗?
有两个主要问题:
const size_t globalWorkSize[1] = { outputSignalWidth * outputSignalWidth };
首先,正如Basile Starynkevitch在评论中指出的那样,您的数据集非常小。太小了,无法从任何GPU加速中获益。在这里,你只运行了36个工作项:这个工作项小得离谱,在一个计算单元上只能容纳一半的波前。
您应该运行数千个的工作项,以正确利用GPU的功能。因此,OpenCL的开销使GPU版本比CPU版本慢。尝试使用大得多的数据集,您应该会注意到性能的显著提高。
另外:
const size_t localWorkSize[1] = { 1 };
您正在运行每个1
工作项的outputSignalWidth * outputSignalWidth
工作组。这是一个很大的问题。
在AMD GPU上,波前大小为64。这意味着,如果您想充分利用硬件,您应该安排的工作组至少64个工作项(理想情况下是64的倍数)。您目前正在浪费64个硬件线程中的63个,即98.4%的GPU无所事事!
要么调整代码以使用更大的工作组(并相应地更改全局工作大小),要么让OpenCL驱动程序通过传递NULL
而不是localWorkSize
来为您选择最佳大小。
简言之,您使用的是一台大型反铲,只需移动一个微小的砾石。
相关文章:
- 嵌套的 for 循环不循环通过第二个数组
- 带有开关语句的 do-while 循环 -- 无穷循环错误
- 编写按初始值循环的循环的更好方法是什么
- 如何在重复循环中循环遍历 std::vector
- 对于循环不循环和检测字符数组 [指针和字符数组]
- 自定义对象构造函数在循环外部循环
- 为什么在我的程序中的第二个循环时,当它在循环之前循环时不起作用
- 而循环比循环更有效.可能是什么原因
- 如何在循环中循环访问两个向量
- C++ for 循环继续循环,即使它不应该循环
- 当使用文件中的getline时,循环仅循环一次
- C 数组,如果循环.如果循环忽略值并在输入值时给出错误的行
- 对于循环(内部循环时)被忽略
- 对于循环,循环次数比它必须的次数多一次
- 我的循环保持循环,并导致程序崩溃
- 数组循环在循环结束后输出垃圾
- C++ do-while 循环不循环
- C++ AMP,用于循环parallel_for_each循环
- 避免在C++循环中循环中复杂对象时最小化范围效率低下的技术
- 设置 for 循环以循环,直到检测到空字节