2d循环OpenCl程序不工作
2d loop OpenCl program is not working
这个程序是一个简单的并行程序,它添加了2个数组的元素。程序已成功编译,但结果不正确。
程序从2个文件中读取数组,然后添加它们的元素。
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <iomanip>
#include <array>
#include <fstream>
#include <sstream>
#include <string>
#include <algorithm>
#include <iterator>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#include <time.h>
#endif
const int number_of_points = 12; // number of points in Both A and B files (number of rows)
const int number_of_axis = 3; // number of points axis in Both A and B files (number of Columns)
using namespace std;
int main(int argc, char *argv[]) {
clock_t tStart = clock();
// Create the two input vectors
// working variables
int i,j;
ifstream input_fileA, input_fileB; // input files
string line; // transfer row from file to array
float x; // transfer word from file to array
int row = 0; // number of rows of file A,B (= array)
int col = 0; // number of rows of file A,B (= array)
// working arrays
int mem_size_InoutA = number_of_points * number_of_axis;
int mem_size_InoutB = number_of_points * number_of_axis;
int mem_size_Output = number_of_points * number_of_axis;
float inputAArray[number_of_points][number_of_axis]={{0}}; // array contains file A data
float inputBArray[number_of_points][number_of_axis]={{0}}; // array contains file B data
float outputArray[number_of_points][number_of_axis]={{0}}; // array contains file B data
// import input files
input_fileA.open(argv[1]);
input_fileB.open(argv[2]);
// transfer input files data to array
// input file A to arrayA
row = 0;
while (getline(input_fileA, line))
{
istringstream streamA(line);
col = 0;
while(streamA >> x){
inputAArray[row][col] = x;
col++;
}
row++;
}
// input file B to arrayB
row = 0;
while (getline(input_fileB, line))
{
istringstream streamB(line);
col = 0;
while(streamB >> x){
inputBArray[row][col] = x;
col++;
}
row++;
}
// switch columns of B array
for(int row_of_arrayB = 0; row_of_arrayB < number_of_points; row_of_arrayB++ )
{
float temporary = inputBArray[row_of_arrayB][2];
inputBArray[row_of_arrayB][2] = inputBArray[row_of_arrayB][1];
inputBArray[row_of_arrayB][1] = temporary;
}
// close input files
input_fileA.close();
input_fileB.close();
// Load the kernel source code into the array source_str
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("calculate_bottom_SNM_kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.n");
exit(1);
}
source_str = (char*)malloc(number_of_points);
source_size = fread( source_str, 1, number_of_points, fp);
fclose( fp );
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_ALL, 1,
&device_id, &ret_num_devices);
// Create an OpenCL context
cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
cl_mem inputa_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
mem_size_InoutA , NULL, &ret);
cl_mem inputb_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
mem_size_InoutB, NULL, &ret);
cl_mem output_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
mem_size_Output, NULL, &ret);
// Copy the lists A and B to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, inputa_mem_obj, CL_TRUE, 0,
mem_size_InoutA, inputAArray, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, inputb_mem_obj, CL_TRUE, 0,
mem_size_InoutB, inputBArray, 0, NULL, NULL);
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "calculate_bottom_SNM", &ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputa_mem_obj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&inputb_mem_obj);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&output_mem_obj);
// Execute the OpenCL kernel on the list
size_t global_item_size[2], local_item_size[2];
global_item_size[0] = number_of_points; // Process the entire lists
global_item_size[1] = number_of_points; // Process the entire lists
local_item_size[0] = 3; // Process in groups of 64
local_item_size[1] = 3; // Process in groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL,
global_item_size, local_item_size, 0, NULL, NULL);
// Read the memory buffer C on the device to the local variable C
// int *C = (int*)malloc(sizeof(int)*number_of_points);
// float *C = (float*)malloc(sizeof(float)*number_of_points);
ret = clEnqueueReadBuffer(command_queue, output_mem_obj, CL_TRUE, 0,
number_of_points * sizeof(float), outputArray, 0, NULL, NULL);
// Display the result to the screen
float buttomSNM = 0;
for(i = 0; i < number_of_points; i++)
{
for(j= 0; j < number_of_axis; j++)
{
printf("%f + %f = %fn", inputAArray[i][j], inputBArray[i][j], outputArray[i][j]);
}
}
// Clean up
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(inputa_mem_obj);
ret = clReleaseMemObject(inputb_mem_obj);
ret = clReleaseMemObject(output_mem_obj);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
printf("ALL Time taken: %.2fsn", (double)(clock() - tStart)/CLOCKS_PER_SEC);
return 0;
}
内核文件是:
__kernel void calculate_bottom_SNM(__global float *inputAArray, __global float *inputBArray,
__global float *outputArray) {
// Get the index of the current element
int i = get_global_id(0);
int j = get_global_id(1);
outputArray[i][j] = inputAArray[i][j] + inputBArray[i][j];
}
第一个文件(第一个阵列)
0 0.000000e+00 9.998994e-01
1 1.000000e-03 9.998981e-01
2 2.000000e-03 9.998967e-01
3 3.000000e-03 9.998953e-01
4 4.000000e-03 9.998939e-01
5 5.000000e-03 9.998925e-01
6 6.000000e-03 9.998911e-01
7 7.000000e-03 9.998896e-01
8 8.000000e-03 9.998881e-01
9 9.000000e-03 9.998865e-01
10 1.000000e-02 9.998850e-01
11 1.100000e-02 9.998834e-01
第二文件(第二阵列)
0 0.000000e+00 9.998966e-01
1 1.000000e-03 9.998953e-01
2 2.000000e-03 9.998939e-01
3 3.000000e-03 9.998925e-01
4 4.000000e-03 9.998911e-01
5 5.000000e-03 9.998896e-01
6 6.000000e-03 9.998881e-01
7 7.000000e-03 9.998866e-01
8 8.000000e-03 9.998850e-01
9 9.000000e-03 9.998834e-01
10 1.000000e-02 9.998818e-01
结果:
0.000000 + 0.000000 = 0.000000
0.000000 + 0.999897 = 0.000000
0.999899 + 0.000000 = 0.000000
1.000000 + 1.000000 = 0.000000
0.001000 + 0.999895 = 0.000000
0.999898 + 0.001000 = 0.000000
2.000000 + 2.000000 = 0.000000
0.002000 + 0.999894 = 0.000000
0.999897 + 0.002000 = 0.000000
3.000000 + 3.000000 = 0.000000
0.003000 + 0.999892 = 0.000000
0.999895 + 0.003000 = 0.000000
4.000000 + 4.000000 = 0.000000
0.004000 + 0.999891 = 0.000000
0.999894 + 0.004000 = 0.000000
5.000000 + 5.000000 = 0.000000
0.005000 + 0.999890 = 0.000000
0.999892 + 0.005000 = 0.000000
6.000000 + 6.000000 = 0.000000
0.006000 + 0.999888 = 0.000000
0.999891 + 0.006000 = 0.000000
7.000000 + 7.000000 = 0.000000
0.007000 + 0.999887 = 0.000000
0.999890 + 0.007000 = 0.000000
8.000000 + 8.000000 = 0.000000
0.008000 + 0.999885 = 0.000000
0.999888 + 0.008000 = 0.000000
9.000000 + 9.000000 = 0.000000
0.009000 + 0.999883 = 0.000000
0.999887 + 0.009000 = 0.000000
10.000000 + 10.000000 = 0.000000
0.010000 + 0.999882 = 0.000000
0.999885 + 0.010000 = 0.000000
11.000000 + 0.000000 = 0.000000
0.011000 + 0.000000 = 0.000000
0.999883 + 0.000000 = 0.000000
ALL Time taken: 0.06s
当然,结果是不对的,正确的结果是元素的总和。谢谢,
再一次,您无法检查OpenCL API调用的返回代码。如果你不这样做,你就不可能知道什么时候会出现问题。每次调用OpenCL函数时,都应该执行以下操作:
ret = clDoSomething(...);
if (ret != CL_SUCCESS)
{
printf("Failed on function clDoSomething: %dn", ret);
exit(1); // Or do whatever cleanup needs to be done before exiting
}
您可以通过定义一个简单的实用程序函数来简化这一点:
void checkError(cl_int err, const char *operation)
{
if (err != CL_SUCCESS)
{
fprintf(stderr, "Error during operation '%s': %dn", operation, err);
exit(1);
}
}
...
ret = clDoSomething(...);
checkError(ret, "calling clDoSomething");
这一次,问题似乎来自clBuildProgram
调用(它返回-54
,对应于CL_BUILD_PROGRAM_FAILURE
)。在这种情况下,您还需要获取构建日志以查看完整的错误:
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
if (ret == CL_BUILD_PROGRAM_FAILURE)
{
// Get size of build log
size_t logSize;
ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
0, NULL, &logSize);
checkError(ret, "getting build log size");
// Get build log
char log[logSize];
ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
logSize, log, NULL);
checkError(ret, "getting build log");
printf("OpenCL program build log:n%sn", log);
exit(1);
}
如果你把它添加到你的代码中,你会得到一个构建日志,看起来像这样:
input.cl:1:10: error: unknown type name 'voi'; did you mean 'void'?
__kernel voi
^~~
void
input.cl:1:13: error: expected identifier or '('
__kernel voi
^
这看起来很奇怪,但表明你的程序可能在几个字符后就被切断了。如果你看看你为从文件中读取OpenCL程序而写的代码,你会发现:
source_str = (char*)malloc(number_of_points);
source_size = fread( source_str, 1, number_of_points, fp);
所以,你只阅读了程序的前12个字符!您可以使用fseek
和ftell
来获得文件的实际长度:
fseek(fp, 0, SEEK_END);
size_t programLength = ftell(fp);
rewind(fp);
source_str = (char*)malloc(programLength+1);
source_size = fread( source_str, 1, programLength, fp);
source_str[programLength] = ' ';
如果你这样做,你会得到一个不同的程序构建错误:
input.cl:8:17: error: subscripted value is not an array, pointer, or vector
outputArray[i][j] = inputAArray[i][j] + inputBArray[i][j];
~~~~~~~~~~~~~~^~
input.cl:8:37: error: subscripted value is not an array, pointer, or vector
outputArray[i][j] = inputAArray[i][j] + inputBArray[i][j];
~~~~~~~~~~~~~~^~
input.cl:8:57: error: subscripted value is not an array, pointer, or vector
outputArray[i][j] = inputAArray[i][j] + inputBArray[i][j];
~~~~~~~~~~~~~~^~
这是因为您试图将数组索引为二维数组,而实际上它们只是一维数组(就像所有OpenCL缓冲区一样)。您需要手动计算1D阵列中的偏移量来解决此问题,例如:
outputArray[i + j*number_of_points] = inputAArray[i + j*number_of_points] + inputBArray[i + j*number_of_points];
(这需要将number_of_points
作为参数传递给内核)。
最后,还有几个其他错误:
正如另一个答案所指出的,内存对象的大小需要乘以
sizeof(cl_float)
(而clEnqueueReadBuffer
调用需要使用这个)。你的全球工作规模可能应该是这样的:
global_item_size[0] = number_of_points;
global_item_size[1] = number_of_axis;
从这个答案中得出的主要结论是,您真的,真的需要检查每个OpenCL API函数调用返回的错误代码,否则您将永远无法调试这些问题。
int mem_size_InoutA = number_of_points * number_of_axis * sizeof(cl_float);
int mem_size_InoutB = number_of_points * number_of_axis * sizeof(cl_float);
int mem_size_Output = number_of_points * number_of_axis * sizeof(cl_float);
和
clEnqueueReadBuffer(command_queue, output_mem_obj, CL_TRUE, 0,
mem_size_Output, outputArray, 0, NULL, NULL);
- 以螺旋方式打印矩阵的程序.(工作不好)
- 我的评分程序无法正常工作
- 如何解决在负数的情况下程序以相同方式工作的问题?
- 如何使实例化在我的 OpenGL 程序中工作?
- 我的 SDL2 程序需要哪些二进制文件,以便它在另一台未安装 SDL2 的计算机中工作
- 有没有办法在RCPP程序无法正常工作时阻止RGui崩溃?
- cout 在我的程序上无法正常工作,有人可以帮助我吗?
- C++程序工作,但 Windows 显示"程序.exe已停止工作"
- 程序在CLion IDE中工作,但exe不起作用
- 适用于 macOS 的 Xcode 应用程序。这就是我设置从USB麦克风输入获取音频的方式。一年前工作,现在没有了。为什么
- 为什么使用 exec() 重新启动程序不能正常工作?
- 使用程序生成来创建磁盘,但纹理无法正常工作
- 我无法让我的程序工作,我一直得到未定义的符号:C
- 所有Visual Studio安装程序崩溃,可视化构建工具也无法正常工作
- 名为DLL的C++windows服务程序工作不正常
- .exe应用程序在windows10中创建新模块时抛出错误,但在windows7中工作正常
- O2优化水平中断程序工作
- 从书本中学习C++无法使该程序与类一起工作
- r-工作程序中对自定义函数的未定义引用(C++和RcppParallel)
- 带有 Allegro 5 的工作程序不再起作用