cl::Event::waitForEvents returns -7 (CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST)

cl::Event::waitForEvents returns -7 (CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST)

本文关键字:EXEC STATUS ERROR CL Event waitForEvents returns cl      更新时间:2023-10-16

我正在尝试在同一上下文中的两个GPU设备上同时运行相同的内核。

我遇到了一个障碍，当尝试分析第二个命令队列的事件对象时，得到一个 -7（事件对象不可用）。

当我等待事件时,它出错并显示 -7。这似乎只发生在命令队列 2 中。

知道为什么吗?任何帮助都会得到很多赞赏。

附加代码。

// Host-side staging buffers for the convolution input (zero-padded) and output.
void *bytes;
float *zeropad;
float *output_f;
void *outputbytes;
int ret;
// Aligned allocation for the padded input: depth * (size+2)^2 floats.
// NOTE(review): ret is never checked — posix_memalign returns non-zero on
// failure and leaves the pointer undefined; worth verifying.
ret = posix_memalign(&bytes, total_alignment_requirement, cshape[level][1]*(size+2)*(size+2)*sizeof(float));
zeropad = (float *)bytes;

//float *output_f = (float *)calloc(cshape[level][0]*size*size,sizeof(float));  
//SR assigning aligned memory
// NOTE(review): this allocates cshape[level][1]*(size+2)*(size+2) floats but the
// output buffer below is sized cshape[level][0]*size*size — over-allocation is
// harmless, but confirm the two sizes were not meant to match.
ret = posix_memalign(&outputbytes, total_alignment_requirement, cshape[level][1]*(size+2)*(size+2)*sizeof(float));
output_f = (float *)outputbytes;
unsigned int total=0;
//prepare matrix for OpenCL    
padding_input(matrix,zeropad,size,in_depth);
// Device buffers: input is copied at creation (COPY_HOST_PTR); the output buffer
// aliases host memory (USE_HOST_PTR) — output_f must stay alive and aligned for
// the buffer's lifetime.
cl::Buffer zeropad_buf(openclObjects.context,CL_MEM_READ_ONLY| CL_MEM_COPY_HOST_PTR,(size+2)*(size+2)*cshape[level][1]*sizeof(float),zeropad);
cl::Buffer output_buf(openclObjects.context,CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR  ,cshape[level][0]*size*size*sizeof(float),output_f);
cl::Buffer bs(openclObjects.context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,cshape[level][0]*sizeof(float),bc[level]);

// SR using sub buffers only zeropad_buf and output_bf and to chunk up the buffer and submit the kernels twice...once to each device
//Creating sub_buffers for zeropad_buf
size_t zeropad_buf_size = (size+2)*(size+2)*cshape[level][1]*sizeof(float);
size_t output_buf_size = cshape[level][0]*size*size*sizeof(float);

// Split the input roughly in half, then round the split point DOWN to the
// device alignment: clCreateSubBuffer fails with
// CL_MISALIGNED_SUB_BUFFER_OFFSET when the region origin is not a multiple of
// CL_DEVICE_MEM_BASE_ADDR_ALIGN (here approximated by
// total_alignment_requirement — TODO confirm it matches the device value).
size_t zeropad_split_pos = zeropad_buf_size / 2;
zeropad_split_pos -= zeropad_split_pos % total_alignment_requirement;
cl_buffer_region zero_rgn_4core = {0, zeropad_split_pos};
cl_buffer_region zero_rgn_2core = {zeropad_split_pos, zeropad_buf_size - zeropad_split_pos};

/*
cl_buffer_region zero_rgn_4core = {0, zeropad_buf_size/2};
cl_buffer_region zero_rgn_2core = {zeropad_buf_size/2, zeropad_buf_size/2};
*/

// NOTE(review): the OUTPUT split is NOT alignment-rounded like the input split.
// If output_buf_size/2 is not a multiple of the device's base address
// alignment, createSubBuffer for the second region will fail — check this.
cl_buffer_region output_rgn_4core = {0, output_buf_size/2};
cl_buffer_region output_rgn_2core = {output_buf_size/2, output_buf_size/2};

cl::Buffer zeropad_buf_4Core = zeropad_buf.createSubBuffer(CL_MEM_READ_ONLY,CL_BUFFER_CREATE_TYPE_REGION, &zero_rgn_4core);
std::cout<<"zero_pad sub-buffer region 1 created"<<std::endl;
cl::Buffer zeropad_buf_2Core = zeropad_buf.createSubBuffer(CL_MEM_READ_ONLY,CL_BUFFER_CREATE_TYPE_REGION, &zero_rgn_2core); 
std::cout<<"zero_pad sub-buffer region 2 created"<<std::endl;
cl::Buffer output_buf_4Core = output_buf.createSubBuffer(CL_MEM_READ_WRITE,CL_BUFFER_CREATE_TYPE_REGION, &output_rgn_4core);
cl::Buffer output_buf_2Core = output_buf.createSubBuffer(CL_MEM_READ_WRITE,CL_BUFFER_CREATE_TYPE_REGION, &output_rgn_2core);
// NOTE(review): global is (global_x, global_y, global_y) — the third dimension
// reuses global_y; confirm this is intentional (the debug printf below prints
// the same triple).
cl::NDRange global(global_x, global_y, global_y);
cl::NDRange local(1, group_size, group_size);
//cl::Event evt[2];//SR
//SR use a vector events
// One event per enqueued kernel so both queues can be waited on together.
std::vector<cl::Event> events;
cl::Event evt1, evt2;

//SR Kernel after sub buffering - 4 core
// First half of the work: queue[0]. Argument values are captured by
// clEnqueueNDRangeKernel at enqueue time, so the same kernel object can be
// re-used with different args for the second queue afterwards.
openclObjects.conv_gpu.setArg<cl::Memory>(0, zeropad_buf_4Core);
openclObjects.conv_gpu.setArg<cl::Memory>(1, conv_weights[level]);
openclObjects.conv_gpu.setArg<cl::Memory>(2, output_buf_4Core);
openclObjects.conv_gpu.setArg<cl::Memory>(3, bs);
openclObjects.conv_gpu.setArg<int>(4, size+2);
openclObjects.conv_gpu.setArg<int>(5, cshape[level][1]);
openclObjects.conv_gpu.setArg<int>(6, size);
openclObjects.conv_gpu.setArg<int>(7, cshape[level][0]);
openclObjects.conv_gpu.setArg<int>(8, CONV_SIZE);

cl_int err=openclObjects.queue[0].enqueueNDRangeKernel( openclObjects.conv_gpu, cl::NullRange, global, local, NULL, &evt1);  //SR
events.push_back(evt1);
// (removed: a garbled duplicate of the enqueue line that redeclared `err`
// and did not compile)
//SR Kernel after sub buffering - 2 core
// Second half of the work: same kernel object, args re-pointed at the second
// sub-buffer pair, enqueued on queue[1]. Safe because the previous enqueue
// already captured its argument values.
openclObjects.conv_gpu.setArg<cl::Memory>(0, zeropad_buf_2Core);
openclObjects.conv_gpu.setArg<cl::Memory>(1, conv_weights[level]);
openclObjects.conv_gpu.setArg<cl::Memory>(2, output_buf_2Core);
openclObjects.conv_gpu.setArg<cl::Memory>(3, bs);
openclObjects.conv_gpu.setArg<int>(4, size+2);
openclObjects.conv_gpu.setArg<int>(5, cshape[level][1]);
openclObjects.conv_gpu.setArg<int>(6, size);
openclObjects.conv_gpu.setArg<int>(7, cshape[level][0]);
openclObjects.conv_gpu.setArg<int>(8, CONV_SIZE);

//SR Added for CQ2 (2 Core GPU)
// NOTE(review): the same global/local NDRanges are used for HALF the data —
// confirm global_x/global_y were computed for the half-size region.
err=openclObjects.queue[1].enqueueNDRangeKernel( openclObjects.conv_gpu, cl::NullRange, global, local, NULL, &evt2);
events.push_back(evt2);
std::cout<<"Enqueue CQ2"<<std::endl;
//get event info
cl::CommandQueue CQ;
cl::Device CQ_device;
evt2.getInfo(CL_EVENT_COMMAND_QUEUE,&CQ);

CQ.getInfo(CL_QUEUE_DEVICE, &CQ_device);
std::cout<<"New Code"<<std::endl;
std::cout<<"Event attached to COmmand Q2"<<std::endl;
std::cout<<"Device Name in Command Queue 1: "<<CQ_device.getInfo<CL_DEVICE_NAME>()<<std::endl;
std::cout<<"Device Vendor: "<<CQ_device.getInfo<CL_DEVICE_VENDOR>()<<std::endl;
std::cout<<"Device max CU: "<<CQ_device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>()<<std::endl;

cl::Event::waitForEvents(events);

//openclObjects.queue[0].finish(); //SR
std::cout<<"Command Queue 1 complete"<<std::endl;
//openclObjects.queue[1].finish();//SR added for CQ2
std::cout<<"Command Queue 2 complete"<<std::endl;
printf("global_x, global_y, global_y, error: %d %d

%d %d",global_x, global_y, global_y, err(; printf("%d",err(;

// Per-queue kernel execution times from event profiling. Valid only after the
// events have completed (waitForEvents above) and only if both queues were
// created with CL_QUEUE_PROFILING_ENABLE — TODO confirm; otherwise
// getProfilingInfo fails with CL_PROFILING_INFO_NOT_AVAILABLE.
cl_ulong elapsed=0;
cl_ulong elapsed1=0; //SR calculate elapse per command queue
cl_ulong elapsed0=0; //SR calculate elapse per command queue
elapsed0 =evt1.getProfilingInfo<CL_PROFILING_COMMAND_END>()-evt1.getProfilingInfo<CL_PROFILING_COMMAND_START>(); //SR
std::cout<<"Profile Info: Command Queue 1"<<std::endl;
elapsed1 =evt2.getProfilingInfo<CL_PROFILING_COMMAND_END>()-evt2.getProfilingInfo<CL_PROFILING_COMMAND_START>(); //SR
std::cout<<"Profile Info: Command Queue 2"<<std::endl;
//std::cout<<"elapsed CQ0"<<elapsed0<<std::endl; //SR
//std::cout<<"elapsed CQ1"<<elapsed1<<std::endl; //SR
// Sum of both devices' kernel times (nanoseconds).
elapsed = elapsed0+elapsed1;
尝试取消注释 openclObjects.queue[0].finish();

和 openclObjects.queue[1].finish();

您也可以使用 flush() 而不是 finish()。