OpenCL: for循环中的CL_OUT_OF_RESOURCES
OpenCL: CL_OUT_OF_RESOURCES in a for loop
我试图执行一个OpenCL,但它给了我一个CL_OUT_OF_RESOURCES。情况如下:
我正在用100个工作项进行测试,所以我将global_sizes和local_sizes都设置为100。我创建了一个大小为100 * 128的只写缓冲区,因为每个工作项要处理128个值。我执行内核,但在读取结果缓冲区时得到了这个错误。
内核代码如下:
// Test kernel: every work-item stores the constant 23 into 128 consecutive
// ints of `debug`, starting at element 128 * get_global_id(0).
// The caller must supply a buffer of at least 128 ints per work-item.
__kernel void k2(__global int* debug) {
// First element of this work-item's 128-int slice.
uint idx = 128 * get_global_id(0);
uint i, k;
// The outer loop repeats the very same stores 128000 times; it only adds
// work, the values written never change between iterations.
for (i = 0; i < 128000; ++i) {
for (k = 0; k < 128; ++k) {
debug[idx+k] = 23;
}
}
}
我把每个工作项的起始索引存入变量idx。然后,我执行一个128000次的循环(我知道这样做很蠢,但这只是为了测试!),并将值23赋给缓冲区中该工作项对应的每个元素。
启动代码如下:
// Host-side state shared by the whole launch script.
cl_int status;
cl_uint num_platforms;
cl_platform_id* platforms;
cl_uint* num_devices;    // GPU device count, one entry per platform
cl_device_id** devices;  // GPU device list per platform (NULL if none)
cl_platform_id platform;
cl_device_id device;
cl_context context;
cl_command_queue queue;
cl_kernel kernel;
cl_program program;
cl_ulong max_mem_size;
cl_ulong max_work_group_size;
size_t max_work_item_size[3];

// Discover and populate the platforms.
// NOTE(review): chk_err is defined elsewhere; passing `true` as its third
// argument presumably makes a failure fatal - confirm against its definition.
status = clGetPlatformIDs(0, NULL, &num_platforms);
chk_err(status, "Getting platform IDs", true);
if (num_platforms == 0) {
    // If no platforms are available, we shouldn't continue
    fprintf(stderr, "No OpenCL platforms found!\n");
    exit(-1);
}

// Get all the platforms
platforms = new cl_platform_id[num_platforms];
status = clGetPlatformIDs(num_platforms, platforms, NULL);
chk_err(status, "Getting platform IDs", true);

// Allocate space for the device lists and lengths.
// The list array is value-initialized so that entries belonging to platforms
// without GPU devices are NULL: the cleanup code runs delete[] on every
// entry, which is only valid for NULL or a pointer obtained from new[].
num_devices = new cl_uint[num_platforms];
devices = new cl_device_id*[num_platforms]();

// Traverse the platforms array, querying platform information and
// populating the device lists
for (cl_uint i = 0; i < num_platforms; ++i) {
    // Query some platform info (printing is currently disabled)
    char* name = get_platform_info(platforms[i], CL_PLATFORM_NAME,
                                   "Getting platform name");
    char* vendor = get_platform_info(platforms[i], CL_PLATFORM_VENDOR,
                                     "Getting platform vendor");
    //printf("Platform: %s\nVendor: %s\n", name, vendor);
    delete[] name;
    delete[] vendor;

    // First call only asks how many GPU devices the platform exposes
    status = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL,
                            &num_devices[i]);
    if (chk_err(status, "Getting device IDs")) {
        printf("This is a known NVIDIA bug (if platform == AMD then die)\n");
        printf("Setting number of devices to 0 and continuing\n");
        num_devices[i] = 0;
    }
    //printf("Devices: %d\n", num_devices[i]);

    // Populate OpenCL devices if any exist
    if (num_devices[i] != 0) {
        // Allocate an array sized for this platform's device count
        devices[i] = new cl_device_id[num_devices[i]];
        // Second call fills the array with the device IDs
        status = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU,
                                num_devices[i], devices[i], NULL);
        chk_err(status, "Getting device IDs", true);
    }
}
// Hard-coded choice: first device of the first platform.
cl_uint chosen_platform = 0;
cl_uint chosen_device = 0;

// Do a sanity check of platform/device selection
if (chosen_platform >= num_platforms ||
    chosen_device >= num_devices[chosen_platform]) {
    fprintf(stderr, "Invalid platform/device combination\n");
    exit(-1);
}

// Set the selected platform and device
platform = platforms[chosen_platform];
device = devices[chosen_platform][chosen_device];

// Get some device info
char* name = get_device_name(device);
char* vendor = get_device_vendor(device);
max_mem_size = get_device_max_mem_size(device);
max_work_group_size = get_device_max_work_group_size(device);
get_device_max_work_item_size(device, max_work_item_size);

// cl_ulong and size_t are not guaranteed to be `unsigned long long`, so each
// value is cast explicitly to the type that %llu expects.
printf("Device: %s\n", name);
printf("Vendor: %s\n", vendor);
// NOTE(review): the value is divided by 1024 but labelled "Mb"; the unit
// returned by get_device_max_mem_size is not visible here - confirm.
printf("Max mem size: %llu Mb\n", (unsigned long long)(max_mem_size / 1024));
printf("Max work group size: %llu\n", (unsigned long long)max_work_group_size);
printf("Max work item size: %llu, %llu, %llu\n",
       (unsigned long long)max_work_item_size[0],
       (unsigned long long)max_work_item_size[1],
       (unsigned long long)max_work_item_size[2]);
delete[] name;
delete[] vendor;
// Create the context for the selected platform/device pair
cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM,
                                (cl_context_properties)(platform), 0};
context = clCreateContext(cps, 1, &device, NULL, NULL, &status);
chk_err(status, "Creating context", true);

// Create the command queue (properties = 0)
queue = clCreateCommandQueue(context, device, 0, &status);
chk_err(status, "creating command queue", true);

// Load kernel source
char* source = load_kernel_source("vpm2.cl");
size_t source_size[] = { strlen(source) };

// Create the program object; the source text is no longer needed afterwards
program = clCreateProgramWithSource(context, 1, (const char**)&source,
                                    source_size, &status);
chk_err(status, "Creating program", true);
delete[] source;

// Try to compile the program
const char options[] = "-D ENABLE_DOUBLE -Werror -cl-nv-verbose";
status = clBuildProgram(program, 1, &device, options, NULL, NULL);
if (chk_err(status, "Building program")) {
    // The build failed: fetch the build status and dump the compiler log
    cl_build_status build_status;
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_STATUS,
                          sizeof(cl_build_status), &build_status, NULL);

    // First query returns the log length, second one copies the log
    size_t size;
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0,
                          NULL, &size);
    char* build_log = new char[size+1];
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                          size+1, build_log, NULL);
    // Guarantee NUL termination before handing the buffer to printf("%s")
    build_log[size] = '\0';
    printf("Build log:\n%s\nEnd log\n", build_log);
    // Free the log before the check below, which is requested as fatal
    delete[] build_log;
    chk_err(build_status, "Getting build info", true);
}
// Create the kernel
kernel = clCreateKernel(program, "k2", &status);
chk_err(status, "Creating kernel", true);
// Create the buffer
uint num_workitems = 100;
uint buf_size = num_workitems * 128;
cl_mem mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, buf_size * sizeof(int), NULL, &status);
chk_err(status, "Error creating const mem buffer", true);
// Add arguments
status = clSetKernelArg(kernel, 0, sizeof(mem), &mem);
chk_err(status, "Setting kernel arg", true);
// Execute kernel
size_t global_sizes[1] = {num_workitems};
size_t local_sizes[1] = {num_workitems};
status = clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
global_sizes, local_sizes, 0, NULL, NULL);
chk_err(status, "Executing kernel", true);
// Read the results
int* res = new int[buf_size];
status = clEnqueueReadBuffer(queue, mem, CL_TRUE, 0,
buf_size * sizeof(int), (void*)res, 0, NULL, NULL);
chk_err(status, "Reading buffer", true);
// Release OpenCL objects, reporting (but not aborting on) any failure
status = clReleaseProgram(program);
chk_err(status, "Releasing program");
status = clReleaseKernel(kernel);
chk_err(status, "Releasing kernel");
status = clReleaseMemObject(mem);
chk_err(status, "Releasing mem object");
status = clReleaseCommandQueue(queue);
chk_err(status, "Releasing command queue");
status = clReleaseContext(context);
chk_err(status, "Releasing context");

// Free the host-side bookkeeping arrays
for (cl_uint i = 0; i < num_platforms; ++i) {
    delete[] devices[i];
}
delete[] devices;
delete[] num_devices;
delete[] platforms;
// res was allocated with new int[...], so it must be freed with delete[]
delete[] res;
起初我以为我跑出了idx+k索引的范围,但事实并非如此。
这个错误真的很奇怪,因为如果我把idx+k改成idx+127,它就可以工作了。如果我把数字128000改成较小的数字(例如56000),它也可以工作(!),这就排除了内核创建/执行过程本身出错的可能。很神奇,不是吗?我开始怀疑问题出在本地内存管理或类似的地方。有什么想法吗?
顺便说一下……我在NVIDIA Quadro 2000上运行代码。
非常感谢!
最可能的情况是,您的内核发生了段错误(segfault),于是得到CL_OUT_OF_RESOURCES——这是NVIDIA平台上内核发生段错误时返回的通用错误。不过,由于clEnqueueNDRangeKernel在将内核加入队列时无法检测到这个错误,所以错误要到读取缓冲区时才会返回。
原因可以是:
- 你正在运行的项目比你想象的要多(我们可以看看你是如何运行内核的吗?)
- 为debug变量创建的内存少于所需的内存,或者内存标志不正确(例如被设为只读),或者存在其他类似问题。
附注:如果您只运行了100个工作项,那么我最初的假设就不成立。
您的错误的另一种可能解释是:您只用一个工作组向一块120kB的区域写入了6GB的数据,这造成了巨大的瓶颈,使内核运行时间过长,以至于被驱动程序终止,并返回CL_OUT_OF_RESOURCES。
减少循环次数可以解决这个问题;将k设为固定值会让编译器在优化阶段把循环消除掉(因此同样解决了问题)。您可以尝试使用更多的工作组来解决这个问题。
你是否经历过2秒的屏幕冻结?
- 为什么我的向量::擦除调用会抛出"vector subscript out of range"?
- 解决方案在第 25 行执行错误'out of bounds'
- C++ 向量加减抛出"expression: vector subscript out of range."错误
- 从 MSVC14 切换到 MSVC16 会导致"compiler is out of heap space (C1060)"错误
- 庞大的初始化列表,如何修复"fatal error C1060: compiler is out of heap space"
- 如何解决'vector subscript out of range'错误?
- 在析构函数中调用"delete"运算符时"compiler is out of heap space"编译器错误
- 空集"Out of bound iterator"
- 在 std::unordered_map 中插入新的键/值对会导致"out of range"异常
- 注册 Clang 检查器时出错:"out-of-line-definition of register"
- QList::operator[]中的断言失败<T>:QJoysticks 中的"index out of range"
- 尝试获取矩阵的正确对角线会导致"vector out of range"错误
- 运算符方法和返回对象"out-of-scope"?
- 如何使用 mpi 分散修复"vector subscript out of range"?
- 如何修复邻接列表中的"Debug Assertion Failed" "vector subscript out of range"
- "Vector subscript out of range",在返回声明?
- QList<T>::operator[]: "index out of range" 中的断言失败
- 多次调用存储过程时C++连接器"Commands out of sync" mySQL
- 如何处理C++'index out of bounds error'?
- MySQL 异常"connection lost during query"、"MySQL server has gone away"和"command out of sync"