cl::Event::waitForEvents returns -7 (CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST)
我正在尝试在同一上下文中的两个 GPU 设备上同时运行同一个内核。
我遇到了一个问题:在尝试对事件对象进行性能分析(profiling)时,第二个命令队列返回 -7(事件对象不可用)。
当我等待事件时,调用出错并返回 -7。这似乎只发生在命令队列 2 上。
知道这是为什么吗?非常感谢任何帮助。
代码如下。
void *bytes;
float *zeropad;
float *output_f;
void *outputbytes;
int ret;
ret = posix_memalign(&bytes, total_alignment_requirement, cshape[level][1]*(size+2)*(size+2)*sizeof(float));
zeropad = (float *)bytes;
//float *output_f = (float *)calloc(cshape[level][0]*size*size,sizeof(float));
//SR assigning aligned memory
ret = posix_memalign(&outputbytes, total_alignment_requirement, cshape[level][1]*(size+2)*(size+2)*sizeof(float));
output_f = (float *)outputbytes;
unsigned int total=0;
//prepare matrix for OpenCL
padding_input(matrix,zeropad,size,in_depth);
cl::Buffer zeropad_buf(openclObjects.context,CL_MEM_READ_ONLY| CL_MEM_COPY_HOST_PTR,(size+2)*(size+2)*cshape[level][1]*sizeof(float),zeropad);
cl::Buffer output_buf(openclObjects.context,CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR ,cshape[level][0]*size*size*sizeof(float),output_f);
cl::Buffer bs(openclObjects.context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,cshape[level][0]*sizeof(float),bc[level]);
// SR using sub buffers only zeropad_buf and output_bf and to chunk up the buffer and submit the kernels twice...once to each device
//Creating sub_buffers for zeropad_buf
size_t zeropad_buf_size = (size+2)*(size+2)*cshape[level][1]*sizeof(float);
size_t output_buf_size = cshape[level][0]*size*size*sizeof(float);
size_t zeropad_split_pos = zeropad_buf_size / 2;
zeropad_split_pos -= zeropad_split_pos % total_alignment_requirement;
cl_buffer_region zero_rgn_4core = {0, zeropad_split_pos};
cl_buffer_region zero_rgn_2core = {zeropad_split_pos, zeropad_buf_size - zeropad_split_pos};
/*
cl_buffer_region zero_rgn_4core = {0, zeropad_buf_size/2};
cl_buffer_region zero_rgn_2core = {zeropad_buf_size/2, zeropad_buf_size/2};
*/
cl_buffer_region output_rgn_4core = {0, output_buf_size/2};
cl_buffer_region output_rgn_2core = {output_buf_size/2, output_buf_size/2};
cl::Buffer zeropad_buf_4Core = zeropad_buf.createSubBuffer(CL_MEM_READ_ONLY,CL_BUFFER_CREATE_TYPE_REGION, &zero_rgn_4core);
std::cout<<"zero_pad sub-buffer region 1 created"<<std::endl;
cl::Buffer zeropad_buf_2Core = zeropad_buf.createSubBuffer(CL_MEM_READ_ONLY,CL_BUFFER_CREATE_TYPE_REGION, &zero_rgn_2core);
std::cout<<"zero_pad sub-buffer region 2 created"<<std::endl;
cl::Buffer output_buf_4Core = output_buf.createSubBuffer(CL_MEM_READ_WRITE,CL_BUFFER_CREATE_TYPE_REGION, &output_rgn_4core);
cl::Buffer output_buf_2Core = output_buf.createSubBuffer(CL_MEM_READ_WRITE,CL_BUFFER_CREATE_TYPE_REGION, &output_rgn_2core);
cl::NDRange global(global_x, global_y, global_y);
cl::NDRange local(1, group_size, group_size);
//cl::Event evt[2];//SR
//SR use a vector events
std::vector<cl::Event> events;
cl::Event evt1, evt2;
//SR Kernel after sub buffering - 4 core
openclObjects.conv_gpu.setArg<cl::Memory>(0, zeropad_buf_4Core);
openclObjects.conv_gpu.setArg<cl::Memory>(1, conv_weights[level]);
openclObjects.conv_gpu.setArg<cl::Memory>(2, output_buf_4Core);
openclObjects.conv_gpu.setArg<cl::Memory>(3, bs);
openclObjects.conv_gpu.setArg<int>(4, size+2);
openclObjects.conv_gpu.setArg<int>(5, cshape[level][1]);
openclObjects.conv_gpu.setArg<int>(6, size);
openclObjects.conv_gpu.setArg<int>(7, cshape[level][0]);
openclObjects.conv_gpu.setArg<int>(8, CONV_SIZE);
cl_int err=openclObjects.queue[0].enqueueNDRangeKernel( openclObjects.conv_gpu, cl::NullRange, global, local, NULL, &evt1); //SR
events.push_back(evt1);
cl_int err=openclObjects.queue.enqueueNDRangeKernel( openclObjects.conv_gpu, cl::NullRange, global, local, NULL(
//SR Kernel after sub buffering - 2 core
openclObjects.conv_gpu.setArg<cl::Memory>(0, zeropad_buf_2Core);
openclObjects.conv_gpu.setArg<cl::Memory>(1, conv_weights[level]);
openclObjects.conv_gpu.setArg<cl::Memory>(2, output_buf_2Core);
openclObjects.conv_gpu.setArg<cl::Memory>(3, bs);
openclObjects.conv_gpu.setArg<int>(4, size+2);
openclObjects.conv_gpu.setArg<int>(5, cshape[level][1]);
openclObjects.conv_gpu.setArg<int>(6, size);
openclObjects.conv_gpu.setArg<int>(7, cshape[level][0]);
openclObjects.conv_gpu.setArg<int>(8, CONV_SIZE);
//SR Added for CQ2 (2 Core GPU)
err=openclObjects.queue[1].enqueueNDRangeKernel( openclObjects.conv_gpu, cl::NullRange, global, local, NULL, &evt2);
events.push_back(evt2);
std::cout<<"Enqueue CQ2"<<std::endl;
//get event info
cl::CommandQueue CQ;
cl::Device CQ_device;
evt2.getInfo(CL_EVENT_COMMAND_QUEUE,&CQ);
CQ.getInfo(CL_QUEUE_DEVICE, &CQ_device);
std::cout<<"New Code"<<std::endl;
std::cout<<"Event attached to COmmand Q2"<<std::endl;
std::cout<<"Device Name in Command Queue 1: "<<CQ_device.getInfo<CL_DEVICE_NAME>()<<std::endl;
std::cout<<"Device Vendor: "<<CQ_device.getInfo<CL_DEVICE_VENDOR>()<<std::endl;
std::cout<<"Device max CU: "<<CQ_device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>()<<std::endl;
cl::Event::waitForEvents(events);
//openclObjects.queue[0].finish(); //SR
std::cout<<"Command Queue 1 complete"<<std::endl;
//openclObjects.queue[1].finish();//SR added for CQ2
std::cout<<"Command Queue 2 complete"<<std::endl;
printf("global_x, global_y, global_y, error: %d %d %d %d",global_x, global_y, global_y, err(; printf("%d",err(;
cl_ulong elapsed=0;
cl_ulong elapsed1=0; //SR calculate elapse per command queue
cl_ulong elapsed0=0; //SR calculate elapse per command queue
elapsed0 =evt1.getProfilingInfo<CL_PROFILING_COMMAND_END>()-evt1.getProfilingInfo<CL_PROFILING_COMMAND_START>(); //SR
std::cout<<"Profile Info: Command Queue 1"<<std::endl;
elapsed1 =evt2.getProfilingInfo<CL_PROFILING_COMMAND_END>()-evt2.getProfilingInfo<CL_PROFILING_COMMAND_START>(); //SR
std::cout<<"Profile Info: Command Queue 2"<<std::endl;
//std::cout<<"elapsed CQ0"<<elapsed0<<std::endl; //SR
//std::cout<<"elapsed CQ1"<<elapsed1<<std::endl; //SR
elapsed = elapsed0+elapsed1;
尝试取消注释 openclObjects.queue[0].finish();
和 openclObjects.queue[1].finish();
您也可以使用 flush() 代替 finish()。
相关文章:
- OpenSSL C API:如何在程序exec()之后恢复TLS连接?
- Qt/SQL - 从 QSqlQuery exec Stored Procedure 获取列类型和名称?
- Python 的 exec() 函数的 C++ 版本
- 为什么使用 exec() 重新启动程序不能正常工作?
- 如何将参数包装在 C 或 C++ 中并将它们传递给系统或 exec*
- 使用MSVC编译的Qt程序在app.exec()上崩溃
- -bash:/a.out:无法执行二进制文件:Exec格式错误
- 无法使用管道将数据发送到我通过exec(C++)启动的第二个应用程序
- 我在C++中收到错误" [Error] ld returned 1 exit status".帮我解决这个问题
- 为什么错误"permission denied","id returned 1 exit status"仅在 IM 使用 C++ 中的头文件 fstream 时才出现
- 在C++中可能等同于Python"exec"
- PHP 网站在传递图像作为参数时未运行 exec 函数
- wifi.status(),在AP_Mode运行时返回WL_Disconnected(6)
- [虚幻引擎4]与2个Exec一起制作节点
- 如何修复张量流中的"Non-OK-status: Not found: Op type not registered 'NoOp' in binary running"
- Android 从 Runtime.exec() 或 Java.Process() 执行本机库
- 在 c++ 和程序中使用循环遍历数组说"exit status -1"?
- 使用 exec 在 c++ 中执行 shell 命令
- 将 PHP 中的 $_POST 变量传递给由 exec() 函数运行的C++程序
- 当常量引用参数绑定到右值时,右值是否保持其"status"?