什么原因可能导致我的向量化内核出现如此荒谬量级的减速?
What could be causing my vectorized kernels to slow down by such an absurd magnitude?
我已经编写了一个程序,旨在测试在OS上注册的每个计算设备的性能。图形卡是AMD Radeon HD5450。特别是在我的计算机上,这些设备是:
-
平台: AMD加速并行处理
- 名称: Cedar
- 类型: gpu
-
平台: AMD加速并行处理
- 名称: Cedar
- 类型: gpu
- (显然,图形卡本身已列出了两次?我不知道,我没有构建此东西...)
-
平台: AMD加速并行处理
- 名称: intel(r)core(tm)i7-2600 cpu @ 3.40GHz
- 类型: cpu
-
平台:实验opencl 2.0仅CPU平台
- 名称: intel(r)core(tm)i7-2600 cpu @ 3.40GHz
- 类型: cpu
在设备0、1和3上执行下面列出的内核时,它们的整体速度都有很大差异,但名义上都在直接比较方面的预期差异范围内。我已经将Intel CPU的结果放在Pastebin的Intel平台上。
但是在设备2上,内核的执行不仅比其他设备慢(我要补充一句:仅对整数类型如此!),而且慢了一个荒谬的数量级——尽管它据称使用的是与Intel平台相同的设备(Intel CPU),而Intel平台并没有这类问题。请参阅此粘贴。
那些异常离谱的耗时(造成大规模减速的部分)全部出现在代码的向量化版本中,而非向量化版本则没有问题。AMD平台驱动Intel CPU这一组合似乎存在严重的不兼容。
有人知道发生了什么事吗?我在下面包含了完整的完整代码,以防它与基本问题有关。
executor.hpp
#pragma once
#define CL_HPP_ENABLE_EXCEPTIONS
#pragma warning(disable : 4996)
#include<CLcl2.hpp>
// Owns the four device-side buffers (three read-only inputs a/b/c and one
// write-only output) shared by every kernel of a given element type.
// Each buffer holds size * 16 scalars so that even the 16-wide vector
// kernels can read one Vector16 per work-item without going out of bounds.
class buffers {
	cl::Buffer a, b, c;
	cl::Buffer output;
	size_t size; // global work size (work-items), NOT the scalar element count

	// Private: use make_buffer<T>() instead. The unused T parameter exists
	// only to drive template argument deduction from the factory.
	template<typename T>
	buffers(cl::Context const& context, size_t size, T t) :
		size(size) {
		std::vector<T> values;
		values.resize(size * 16);
		// BUG FIX: the original loop ran only to `size`, leaving the
		// remaining 15/16ths of every buffer zero-initialized. Integer
		// kernels then evaluate a / b with b == 0, which is undefined
		// behaviour in OpenCL C and can distort or invalidate the timings.
		for (size_t i = 0; i < size * 16; i++)
			values[i] = T(i);
		// NOTE(review): values[0] is still 0, so a / a remains undefined for
		// the first work-item of the integer scalar kernels — confirm intent.
		a = cl::Buffer( context, values.begin(), values.end(), true );
		for (auto & val : values)
			val *= 3;
		b = cl::Buffer( context, values.begin(), values.end(), true );
		for (auto & val : values)
			val /= 10; // integer types truncate here; first few elements become 0
		c = cl::Buffer( context, values.begin(), values.end(), true );
		// Output is never read back by the host; it only gives kernels a
		// destination so the computation cannot be optimized away.
		output = cl::Buffer( context, CL_MEM_WRITE_ONLY, size * 16 * sizeof(T) );
	}
public:
	// Factory: build the buffer set for element type T (e.g. cl_int).
	template<typename T>
	static buffers make_buffer(cl::Context const& context, size_t size) {
		return buffers(context, size, T(0));
	}
	cl::Buffer get_a() const {
		return a;
	}
	cl::Buffer get_b() const {
		return b;
	}
	cl::Buffer get_c() const {
		return c;
	}
	cl::Buffer get_output() const {
		return output;
	}
	// Global work size used when enqueueing kernels against these buffers.
	size_t get_size() const {
		return size;
	}
};
// Binds one kernel to its buffer set and command queue, and knows how to
// enqueue itself and time the resulting event.
class task {
cl::CommandQueue queue;
cl::Kernel kernel;
buffers * b; // non-owning; the buffer set outlives the task (owned by test_device)
std::string type_name; // element type label ("char", "float", ...) for reporting
public:
// Matches kernel arguments to buffers BY NAME via clGetKernelArgInfo, so
// kernels with any subset of {a, b, c, output} parameters bind correctly.
// NOTE(review): arg-name info is only guaranteed when the program was built
// with -cl-kernel-arg-info; this appears to rely on the AMD/Intel runtimes
// providing it anyway — confirm on each platform.
task(cl::CommandQueue queue, cl::Kernel kernel, buffers * b, std::string const& type_name) :
queue(queue),
kernel(kernel),
b(b),
type_name(type_name){
int argc = kernel.getInfo<CL_KERNEL_NUM_ARGS>();
for (int i = 0; i < argc; i++) {
std::string something = kernel.getArgInfo<CL_KERNEL_ARG_NAME>(i);
if(something == "a")
kernel.setArg(i, b->get_a());
else if(something == "b")
kernel.setArg(i, b->get_b());
else if(something == "c")
kernel.setArg(i, b->get_c());
else if(something == "output")
kernel.setArg(i, b->get_output());
}
}
// Enqueue one 1-D NDRange of b->get_size() work-items; local size is left
// to the runtime. Returns the event used later for profiling.
cl::Event enqueue() {
cl::Event event;
queue.enqueueNDRangeKernel(kernel, {}, cl::NDRange{ b->get_size() }, {}, nullptr, &event);
return event;
}
cl::Kernel get_kernel() const {
return kernel;
}
std::string get_type_name() const {
return type_name;
}
// Blocks until the event completes, then returns device-side execution time
// (COMMAND_END - COMMAND_START). Requires a queue created with
// CL_QUEUE_PROFILING_ENABLE.
static std::chrono::nanoseconds time_event(cl::Event event) {
event.wait();
return std::chrono::nanoseconds{ event.getProfilingInfo<CL_PROFILING_COMMAND_END>() - event.getProfilingInfo<CL_PROFILING_COMMAND_START>() };
}
};
taskgenerator.hpp
#pragma once
#include "Executor.hpp"
#include<iostream>
#include<fstream>
// Assembles OpenCL source strings from stringified .cl snippets (each
// #include below expands to a raw string literal) and compiles them into
// kernel objects. A "* Transform.cl" file supplies the *_TRANSFORM_PROCESS
// macros; Transformations.cl and Transform Kernels.cl supply the generic
// functions/kernels written in terms of those macros; a *_defines.cl file
// maps Scalar/VectorN onto a concrete element type.
class kernel_generator {
public:
	static std::string get_primary_kernels() {
		return ""
#include "Primary Transform.cl"
#include "Transformations.cl"
#include "Transform Kernels.cl"
		;
	}
	static std::string get_trigonometric_kernels() {
		return ""
#include "Trigonometry Transform.cl"
#include "Transformations.cl"
#include "Transform Kernels.cl"
		;
	}
	static std::string get_utility_kernels() {
		return ""
#include "Utility Transform.cl"
#include "Transformations.cl"
#include "Transform Kernels.cl"
		;
	}
private:
	// Build `src` for the context's device and return every kernel it
	// contains. On a build failure the log is written to err_log and an
	// empty vector is returned so callers can continue with other devices.
	static std::vector<cl::Kernel> get_kernels(std::string src, cl::Context context, cl::Device device, std::ostream & err_log) {
		try {
			cl::Program program{ context, src, false };
			program.build();
			std::vector<cl::Kernel> kernels;
			program.createKernels(&kernels);
			return kernels;
		}
		catch (cl::BuildError const& e) {
			std::cerr << "Unable to build kernels for " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
			// BUG FIX: the original wrote a literal 'n' instead of the "\n"
			// escape (lost in a copy/paste), producing an unreadable log.
			err_log << "Build Log:\n";
			auto log = e.getBuildLog();
			for (auto const& log_p : log) {
				err_log << log_p.second << "\n";
			}
			return{};
		}
	}
	// Compile the primary, utility, and trigonometric kernel sets against
	// one defines block and return them concatenated. Shared by the float
	// and double overloads, whose bodies were previously copy-paste
	// duplicates of each other.
	static std::vector<cl::Kernel> get_full_kernel_set(std::string const& defines, cl::Context context, cl::Device device, std::ostream & err_log) {
		std::vector<cl::Kernel> final_kernels;
		for (auto const& body : { get_primary_kernels(), get_utility_kernels(), get_trigonometric_kernels() }) {
			auto kernels = get_kernels(defines + body, context, device, err_log);
			final_kernels.insert(final_kernels.end(), kernels.begin(), kernels.end());
		}
		return final_kernels;
	}
public:
	// Integer types only get the primary (arithmetic) kernels; the utility
	// and trigonometric transforms use floating-point-only builtins.
	static std::vector<cl::Kernel> get_char_kernels(cl::Context context, cl::Device device, std::ostream & err_log) {
		std::string src = ""
#include "char_defines.cl"
		+ get_primary_kernels();
		return get_kernels(src, context, device, err_log);
	}
	static std::vector<cl::Kernel> get_short_kernels(cl::Context context, cl::Device device, std::ostream & err_log) {
		std::string src = ""
#include "short_defines.cl"
		+ get_primary_kernels();
		return get_kernels(src, context, device, err_log);
	}
	static std::vector<cl::Kernel> get_int_kernels(cl::Context context, cl::Device device, std::ostream & err_log) {
		std::string src = ""
#include "int_defines.cl"
		+ get_primary_kernels();
		return get_kernels(src, context, device, err_log);
	}
	static std::vector<cl::Kernel> get_long_kernels(cl::Context context, cl::Device device, std::ostream & err_log) {
		std::string src = ""
#include "long_defines.cl"
		+ get_primary_kernels();
		return get_kernels(src, context, device, err_log);
	}
	static std::vector<cl::Kernel> get_float_kernels(cl::Context context, cl::Device device, std::ostream & err_log) {
		std::string defines = ""
#include "float_defines.cl"
		;
		return get_full_kernel_set(defines, context, device, err_log);
	}
	static std::vector<cl::Kernel> get_double_kernels(cl::Context context, cl::Device device, std::ostream & err_log) {
		std::string defines = ""
#include "double_defines.cl"
		;
		return get_full_kernel_set(defines, context, device, err_log);
	}
};
内核testing.cpp(main)
#define CL_HPP_ENABLE_EXCEPTIONS
#define CL_HPP_TARGET_OPENCL_VERSION 120
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#pragma warning(disable : 4996)
#include<CLcl2.hpp>
#include<iostream>
#include<iomanip>
#include<chrono>
#include<fstream>
#include<sstream>
#include<filesystem>
#include "TaskGenerator.hpp"
namespace filesystem = std::experimental::filesystem;
// Writes a short header (platform, device name, memory, type) to `out`.
// Anything that is not a GPU is reported as "CPU" (accelerators included).
void print_device_info(std::ostream & out, cl::Platform platform, cl::Device device) {
	// BUG FIX: the original printed literal 'n' characters instead of "\n"
	// (the backslashes were lost in a copy/paste), gluing all fields onto
	// one line.
	out << std::setw(20) << std::left << "Platform: " << platform.getInfo<CL_PLATFORM_NAME>() << "\n";
	out << std::setw(20) << std::left << "Name: " << device.getInfo<CL_DEVICE_NAME>() << "\n";
	out << std::setw(20) << std::left << "Memory Size: " << device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>() << "\n";
	out << std::setw(20) << std::left << "Device Type: " << ((device.getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_GPU) ? "GPU" : "CPU") << "\n";
}
// Builds every kernel set for one device, enqueues each kernel once over a
// 100'000-item NDRange, and writes per-kernel device execution times (from
// event profiling) to output_file_path.
void test_device(cl::Platform platform, cl::Device device, filesystem::path output_file_path) {
	std::ofstream out(output_file_path);
	print_device_info(out, platform, device);
	cl::Context context(device);
	// Profiling must be enabled on the queue or task::time_event's
	// getProfilingInfo queries are invalid.
	cl::CommandQueue queue{ context, CL_QUEUE_PROFILING_ENABLE };
	size_t size = 100'000;
	// One buffer set per element type (each holds size * 16 scalars so the
	// 16-wide vector kernels stay in bounds).
	buffers char_buffers = buffers::make_buffer<cl_char>(context, size);
	buffers short_buffers = buffers::make_buffer<cl_short>(context, size);
	buffers int_buffers = buffers::make_buffer<cl_int>(context, size);
	buffers long_buffers = buffers::make_buffer<cl_long>(context, size);
	buffers float_buffers = buffers::make_buffer<cl_float>(context, size);
	buffers double_buffers = buffers::make_buffer<cl_double>(context, size);
	auto char_kernels = kernel_generator::get_char_kernels(context, device, out);
	auto short_kernels = kernel_generator::get_short_kernels(context, device, out);
	auto int_kernels = kernel_generator::get_int_kernels(context, device, out);
	auto long_kernels = kernel_generator::get_long_kernels(context, device, out);
	auto float_kernels = kernel_generator::get_float_kernels(context, device, out);
	std::vector<cl::Kernel> double_kernels = kernel_generator::get_double_kernels(context, device, out);
	std::vector<task> tasks;
	for (auto & kernel : char_kernels) {
		tasks.emplace_back(queue, kernel, &char_buffers, "char");
	}
	for (auto & kernel : short_kernels) {
		tasks.emplace_back(queue, kernel, &short_buffers, "short");
	}
	for (auto & kernel : int_kernels) {
		tasks.emplace_back(queue, kernel, &int_buffers, "int");
	}
	for (auto & kernel : long_kernels) {
		tasks.emplace_back(queue, kernel, &long_buffers, "long");
	}
	for (auto & kernel : float_kernels) {
		tasks.emplace_back(queue, kernel, &float_buffers, "float");
	}
	for (auto & kernel : double_kernels) {
		tasks.emplace_back(queue, kernel, &double_buffers, "double");
	}
	std::vector<cl::Event> events;
	for (auto & task : tasks) {
		std::cout << "Enqueueing " << task.get_kernel().getInfo<CL_KERNEL_FUNCTION_NAME>() << "(" << task.get_type_name() << ")" << "\n";
		// BUG FIX: the original enqueued every task TWICE (the second event
		// was discarded), so each kernel ran two times and the profiled run
		// shared the device with an unmeasured duplicate.
		events.emplace_back(task.enqueue());
	}
	// BUG FIX: restored the "\n" escapes that had degraded to literal 'n'
	// characters in the section header and per-line output below.
	out << "==========================================\n\nProfiling Results:\n\n";
	for (size_t i = 0; i < events.size(); i++) {
		// time_event waits on the event itself, so no separate wait() needed.
		auto duration = task::time_event(events[i]);
		std::cout << "Task " << (i + 1) << " of " << events.size() << " complete.\r";
		std::string task_name = tasks[i].get_kernel().getInfo<CL_KERNEL_FUNCTION_NAME>();
		task_name.append("(" + tasks[i].get_type_name() + ")");
		out << " " << std::setw(40) << std::right << task_name;
		out << ": " << std::setw(12) << std::right << duration.count() << "ns\n";
	}
}
// Entry point: enumerate every device of every OpenCL platform and run the
// full benchmark on each, writing results to "Device <N>.txt" where N is a
// running counter across all platforms.
int main() {
	std::vector<cl::Platform> platforms;
	cl::Platform::get(&platforms);
	int device_index = 0;
	for (auto & platform : platforms) {
		std::vector<cl::Device> devices;
		platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
		for (auto & device : devices) {
			auto const output_name = "Device " + std::to_string(device_index) + ".txt";
			test_device(platform, device, output_name);
			++device_index;
		}
	}
	// Windows-only: keep the console window open until a key is pressed.
	system("pause");
	return 0;
}
char_defines.cl,short_defines.cl,int_defines.cl,long_defines.cl,float_defines.cl,double_defines.cl
R"D(
// Type aliases consumed by Transformations.cl / Transform Kernels.cl.
// Each *_defines.cl file maps Scalar/VectorN onto one element type so the
// same generic kernel source can be recompiled once per type.
typedef char Scalar;
typedef char2 Vector2;
typedef char4 Vector4;
typedef char8 Vector8;
typedef char16 Vector16;
)D"
//All the other defines files are identical, but with their respective types swapped in.
//Only exception is double_defines.cl, which also has '#pragma OPENCL EXTENSION cl_khr_fp64 : enable' added.
主变换。cl
R"D(
// Plain arithmetic transform bodies (multiply / add / divide), usable for
// both integer and floating-point Scalar/Vector types.
// NOTE(review): the macro arguments are not parenthesized, and for integer
// element types a / a (or a / b) divides by zero whenever the operand is 0,
// which is undefined behaviour in OpenCL C — confirm the host buffers never
// contain zeros where they are used as divisors.
#define UNARY_TRANSFORM_PROCESS(a) a * a + a / a
#define BINARY_TRANSFORM_PROCESS(a, b) a * b + a / b
#define TERNARY_TRANSFORM_PROCESS(a, b, c) a * b + c / a
)D"
三角转换。cl
R"D(
// Trigonometric transform bodies; sin/cos/tan are floating-point builtins,
// so these are only compiled for the float and double kernel sets.
#define UNARY_TRANSFORM_PROCESS(a) sin(a) + cos(a) + tan(a)
#define BINARY_TRANSFORM_PROCESS(a, b) sin(a) + cos(b) + tan(a)
#define TERNARY_TRANSFORM_PROCESS(a, b, c) sin(a) + cos(b) + tan(c)
)D"
实用程序transform.cl
R"D(
// "Utility" math transform bodies (log / hypot / tgamma); floating-point
// builtins, so only compiled for the float and double kernel sets.
#define UNARY_TRANSFORM_PROCESS(a) log(a) + hypot(a, a) + tgamma(a)
#define BINARY_TRANSFORM_PROCESS(a, b) log(a) + hypot(b, a) + tgamma(b)
#define TERNARY_TRANSFORM_PROCESS(a, b, c) log(a) + hypot(b, c) + tgamma(a)
)D"
transformations.cl
R"D(
// Generic transform functions, one per arity (unary/binary/ternary) and
// vector width (1/2/4/8/16). The body of each is supplied by whichever
// *_TRANSFORM_PROCESS macros were prepended (Primary / Trigonometry /
// Utility Transform.cl), and the types by a *_defines.cl block.
Scalar Unary_Transform1(Scalar a) {
return UNARY_TRANSFORM_PROCESS(a);
}
Vector2 Unary_Transform2(Vector2 a) {
return UNARY_TRANSFORM_PROCESS(a);
}
Vector4 Unary_Transform4(Vector4 a) {
return UNARY_TRANSFORM_PROCESS(a);
}
Vector8 Unary_Transform8(Vector8 a) {
return UNARY_TRANSFORM_PROCESS(a);
}
Vector16 Unary_Transform16(Vector16 a) {
return UNARY_TRANSFORM_PROCESS(a);
}
Scalar Binary_Transform1(Scalar a, Scalar b) {
return BINARY_TRANSFORM_PROCESS(a, b);
}
Vector2 Binary_Transform2(Vector2 a, Vector2 b) {
return BINARY_TRANSFORM_PROCESS(a, b);
}
Vector4 Binary_Transform4(Vector4 a, Vector4 b) {
return BINARY_TRANSFORM_PROCESS(a, b);
}
Vector8 Binary_Transform8(Vector8 a, Vector8 b) {
return BINARY_TRANSFORM_PROCESS(a, b);
}
Vector16 Binary_Transform16(Vector16 a, Vector16 b) {
return BINARY_TRANSFORM_PROCESS(a, b);
}
Scalar Ternary_Transform1(Scalar a, Scalar b, Scalar c) {
return TERNARY_TRANSFORM_PROCESS(a, b, c);
}
Vector2 Ternary_Transform2(Vector2 a, Vector2 b, Vector2 c) {
return TERNARY_TRANSFORM_PROCESS(a, b, c);
}
Vector4 Ternary_Transform4(Vector4 a, Vector4 b, Vector4 c) {
return TERNARY_TRANSFORM_PROCESS(a, b, c);
}
Vector8 Ternary_Transform8(Vector8 a, Vector8 b, Vector8 c) {
return TERNARY_TRANSFORM_PROCESS(a, b, c);
}
Vector16 Ternary_Transform16(Vector16 a, Vector16 b, Vector16 c) {
return TERNARY_TRANSFORM_PROCESS(a, b, c);
}
)D"
变换内核
R"D(
// Kernel entry points: each work-item loads one element (scalar or vector)
// per input buffer at its global id, applies the matching Transform
// function, and stores the result. The host enqueues these with a global
// size equal to the per-buffer element count for that width, so each
// element is touched exactly once.
kernel void unary_transform_scalar(global Scalar * a, global Scalar * output) {
size_t id = get_global_id(0);
output[id] = Unary_Transform1(a[id]);
}
kernel void binary_transform_scalar(global Scalar * a, global Scalar * b, global Scalar * output) {
size_t id = get_global_id(0);
output[id] = Binary_Transform1(a[id], b[id]);
}
kernel void ternary_transform_scalar(global Scalar * a, global Scalar * b, global Scalar * c, global Scalar * output) {
size_t id = get_global_id(0);
output[id] = Ternary_Transform1(a[id], b[id], c[id]);
}
kernel void unary_transform_vector2(global Vector2 * a, global Vector2 * output) {
size_t id = get_global_id(0);
output[id] = Unary_Transform2(a[id]);
}
kernel void binary_transform_vector2(global Vector2 * a, global Vector2 * b, global Vector2 * output) {
size_t id = get_global_id(0);
output[id] = Binary_Transform2(a[id], b[id]);
}
kernel void ternary_transform_vector2(global Vector2 * a, global Vector2 * b, global Vector2 * c, global Vector2 * output) {
size_t id = get_global_id(0);
output[id] = Ternary_Transform2(a[id], b[id], c[id]);
}
kernel void unary_transform_vector4(global Vector4 * a, global Vector4 * output) {
size_t id = get_global_id(0);
output[id] = Unary_Transform4(a[id]);
}
/* For the sake of brevity, I've cut the rest. It should be pretty clear what the
rest look like.*/
)D"
据我所知,AMD Radeon HD 5450
不是双GPU,可能被列出了两次,因为安装了2个不同版本的AMD OpenCL平台。我记得当时有OpenCL 1.2和实验性OpenCL 2.0的情况。检查平台版本。
当涉及CPU时,我认为Experimental OpenCL 2.0 CPU Only Platform
是Intel的实现,针对Intel CPU做了良好的优化。AMD OpenCL SDK虽然也能在Intel CPU上运行,但性能很差——我过去也遇到过类似的问题。
总结 - 您不必在所有设备上使用所有可用的OpenCL平台。通常,GPU的最新OpenCL平台版本可提供体面的性能,并始终在Intel CPU上使用Intel Opencl。
- 我的神经网络不起作用 [XOR 问题]
- C++我的数学有什么问题,为什么我的代码不能正确循环
- 我的字符计数代码计算错误.为什么
- 为什么我的C#代码在调用回C++COM直到Task时会暂停.等待/线程.加入
- cmake在我的项目中所需的所有静态库都不成功
- 为什么我的代码在输出中增加了93天
- 我的简单if-else语句是如何无法访问的代码
- 为什么我的for循环不能正确获取argv
- 我的项目不会像"undefined reference to `grpc::g_core_codegen_interface'"那样使用未定义的引用错误进行编译
- 0-1背包代码中的错误.我的代码中有什么错误
- 当我的阵列太大时出现分段错误
- 如何确认我的constexpr表达式实际上已经在编译时执行
- 为什么二进制搜索在我的测试中不起作用
- 如何指定我希望我的LIB链接到的DLL文件?-Visual Studio 2019
- 我的代码中有错误吗?使用BGI图形的C++代码对我不起作用
- 当我在main中声明了我的2d数组时,为什么我的程序会退出
- 可能导致我的矢量核如此荒谬的幅度减速
- SPOJ 问题 ADDREV - 添加反转数字(考虑到我的代码,我得到了一个错误的答案,这很荒谬......
- 当用gdb调试我的c++代码时,函数调用中的变量值是荒谬的
- 欧拉#8项目,我不知道为什么我的代码给出了荒谬的高值