将 2D 推力::d evice_vector 复矩阵传递给 CUDA 内核函数

Pass 2D thrust::device_vector Complex Matrix to CUDA kernel function

本文关键字：CUDA 函数内核推力 2D evice vector 更新时间：2023-10-16

我是 Cuda 的新手，我正在尝试使用 Cuda 将我现有的项目移动到 GPU。我的代码基于复杂矩阵和复杂缓冲区。

第一步，我尝试将嵌套的 For 循环代码移动到 Cuda(其余的将类似(：

typedef thrust::complex<double> smp_t;
uint8_t *binbuffer = (uint8_t*) malloc(8 * bufsize * sizeof(uint8_t));
smp_t *sgbuf = (smp_t*) malloc(8 * bufsize * sizeof(smp_t));
smp_t *cnbuf = (smp_t*) malloc(8 * bufsize * sizeof(smp_t));
// Create matrix.
thrust::complex<double> i_unit(0.0, 1.0);
thrust::host_vector<thrust::host_vector<smp_t>> tw(decfactor);
// Fill the Matrix
for (size_t row = 0; row < 8; row++) {
for (size_t col = 0; col < 8; col++) {
std::complex<double> tmp =
exp(-i_unit * 2.0*M_PI * ((double) col*row) / (double)8);
tw[row].push_back(tmp);
}
}
/* The Code To Move to the GPU processing */
for (unsigned int i = 0; i < bufsize; i++) {
for (size_t ch = 0; ch < 8; ch++)
for (size_t k = 0; k < 8; k++)
cnbuf[ch*bufsize + i] += sgbuf[k*bufsize+i] * tw[ch].at(k);
}

这是来自 .cu 文件的代码，它将替换当前的嵌套 for 循环：

__global__ void kernel_func(cuDoubleComplex *cnbuf, cuDoubleComplex *sgbuf, smp_t *tw, size_t block_size) {
unsigned int ch = threadIdx.x;
unsigned int k = blockIdx.x;
for (int x = 0; x < block_size; ++x) {
unsigned int sig_index = k*block_size+x;
unsigned int tw_index = ch*k;
unsigned int cn_index = ch*block_size+x;

cuDoubleComplex temp = cuCmul(sgbuf[sig_index], make_cuDoubleComplex(tw[tw_index].real(), tw[tw_index].imag()));
cnbuf[cn_index] = cuCadd(temp, cnbuf[cn_index]);
}
}
void kernel_wrap(
smp_t *cnbuf,
smp_t *sgbuf,
thrust::host_vector<thrust::host_vector<smp_t>>tw,
size_t buffer_size) {
smp_t *d_sgbuf;
smp_t *d_cnbuf;
thrust::device_vector<smp_t> d_tw(8*8);
thrust::copy(&tw[0][0], &tw[7][7], d_tw.begin());
cudaMalloc((void **)&d_sgbuf, buffer_size);
cudaMalloc((void **)&d_cnbuf, buffer_size);
cudaMemcpy(d_sgbuf, sgbuf, buffer_size, cudaMemcpyDeviceToHost);
cudaMemcpy(d_cnbuf, cnbuf, buffer_size, cudaMemcpyDeviceToHost);
thrust::raw_pointer_cast(d_tw.data());
kernel_func<<<8, 8>>>(
reinterpret_cast<cuDoubleComplex*>(d_cnbuf),
reinterpret_cast<cuDoubleComplex*>(d_sgbuf),
thrust::raw_pointer_cast(d_tw.data()),
buffer_size
);
cudaError_t varCudaError1 = cudaGetLastError();
if (varCudaError1 != cudaSuccess)
{
std::cout << "Failed to launch subDelimiterExamine kernel (error code: " << cudaGetErrorString(varCudaError1) << ")!" << std::endl;
exit(EXIT_FAILURE);
}
cudaMemcpy(sgbuf, d_sgbuf, buffer_size, cudaMemcpyHostToDevice);
cudaMemcpy(cnbuf, d_cnbuf, buffer_size, cudaMemcpyHostToDevice);

}

当我运行代码时，我收到错误：

Failed to launch subDelimiterExamine kernel (error code: invalid argument)!

我认为造成麻烦的论点是"d_tw"。所以，我的问题是：

我做错了什么 <thrust：：host_vector><thrust：：host_vector>> 到 <thrust：:d>>(从 2d 矩阵到一个扁平的 arr(？
在 CUDA 中使用 2D 复数有更好的乳清吗？
Cuda 中关于复杂数组的文档非常糟糕，我在哪里可以阅读大量使用 Cuda 复杂矩阵的工作？

谢谢！！！！

有各种各样的问题。我将列出一些，可能会错过一些。因此，请参阅我给出的示例代码以了解其他差异。

最直接的问题就在这里：

thrust::copy(&tw[0][0], &tw[7][7], d_tw.begin());

这就是导致您看到的无效参数错误的原因。在引擎盖下，推力将尝试为此使用cudaMemcpyAsync操作，因为这本质上是从主机到设备的副本。我们将通过将其替换为普通的cudaMemcpy操作来解决此问题，但要了解如何构造它，有必要了解第 2 项。

您似乎认为向量的向量意味着连续存储。它没有，而且该声明不是特定于推力的。由于向量thrust::host_vector(甚至std::vector向量(并不意味着连续存储，因此我们无法轻松构造单个操作，例如cudaMemcpy或thrust::copy来复制此数据。因此，有必要显式展平它。
您在cudaMemcpy操作上的复制方向普遍是落后的。你应该有cudaMemcpyHostToDevice的地方你有cudaMemcpyDeviceToHost，反之亦然。
CUDAcuComplex.h头文件早于 thrust，并且提供了一种快速的 C 样式方法来处理复数。它没有文档 - 您必须阅读文件本身并弄清楚如何使用它，就像似乎已经做的那样。但是，由于您无论如何都在使用thrust::complex<>，因此使用该编码范例并编写设备代码以使其看起来几乎与您的主机代码完全相同要简单得多。
您的各种传输大小都错误。cudaMemcpy需要以字节为单位的大小来传输。

下面是一个例子，从您展示的部分拼凑而成，带有各种"修复"。我并不是声称它以任何方式完美或正确，但它避免了我上面概述的问题。此外，根据您使用或定义-DUSE_KERNEL进行编译的方式，它将运行您的"原始"主机代码并显示输出，或者运行内核代码并显示输出。根据我的测试，输出匹配。

$ cat t1751.cu
#include <thrust/complex.h>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <iostream>
#include <cstdint>
#include <cuComplex.h>
typedef thrust::complex<double> smp_t;
__global__ void kernel_func_old(cuDoubleComplex *cnbuf, cuDoubleComplex *sgbuf, smp_t *tw, size_t block_size) {
unsigned int ch = threadIdx.x;
unsigned int k = blockIdx.x;
for (int x = 0; x < block_size; ++x) {
unsigned int sig_index = k*block_size+x;
unsigned int tw_index = ch*k;
unsigned int cn_index = ch*block_size+x;

cuDoubleComplex temp = cuCmul(sgbuf[sig_index], make_cuDoubleComplex(tw[tw_index].real(), tw[tw_index].imag()));
cnbuf[cn_index] = cuCadd(temp, cnbuf[cn_index]);
}
}
__global__ void kernel_func(smp_t *cnbuf, smp_t *sgbuf, smp_t *tw, size_t block_size) {
unsigned row = blockIdx.x;
unsigned col = threadIdx.x;
unsigned idx = row*block_size+col;
for (int k = 0; k < 8; k++)
cnbuf[idx] += sgbuf[k*block_size+col] * tw[row*block_size+k];
}
void kernel_wrap(
smp_t *cnbuf,
smp_t *sgbuf,
thrust::host_vector<thrust::host_vector<smp_t>>tw,
size_t buffer_size) {
smp_t *d_sgbuf;
smp_t *d_cnbuf;
thrust::device_vector<smp_t> d_tw(8*8);
//    thrust::copy(&tw[0][0], &tw[7][7], d_tw.begin());
thrust::host_vector<smp_t> htw(buffer_size*buffer_size);
for (int i = 0; i < buffer_size; i++)
for (int j = 0; j < buffer_size; j++)
htw[i*buffer_size + j] = tw[i][j];
cudaMemcpy(thrust::raw_pointer_cast(d_tw.data()), &htw[0], 8*8*sizeof(smp_t), cudaMemcpyHostToDevice);
cudaMalloc((void **)&d_sgbuf, buffer_size*buffer_size*sizeof(smp_t));
cudaMalloc((void **)&d_cnbuf, buffer_size*buffer_size*sizeof(smp_t));
cudaMemcpy(d_sgbuf, sgbuf, buffer_size*buffer_size*sizeof(smp_t), cudaMemcpyHostToDevice);
cudaMemcpy(d_cnbuf, cnbuf, buffer_size*buffer_size*sizeof(smp_t), cudaMemcpyHostToDevice);
thrust::raw_pointer_cast(d_tw.data());
kernel_func<<<8, 8>>>(d_cnbuf,d_sgbuf,thrust::raw_pointer_cast(d_tw.data()),buffer_size);
cudaError_t varCudaError1 = cudaGetLastError();
if (varCudaError1 != cudaSuccess)
{
std::cout << "Failed to launch subDelimiterExamine kernel (error code: " << cudaGetErrorString(varCudaError1) << ")!" << std::endl;
exit(EXIT_FAILURE);
}
//    cudaMemcpy(sgbuf, d_sgbuf, buffer_size*buffer_size*sizeof(smp_t), cudaMemcpyDeviceToHost);
cudaMemcpy(cnbuf, d_cnbuf, buffer_size*buffer_size*sizeof(smp_t), cudaMemcpyDeviceToHost);
for (int i = 0; i < 8; i++)
for (int j = 0; j < 8; j++)
std::cout << cnbuf[i*8+j].real() << "," << cnbuf[i*8+j].imag() << std::endl;
}
int main(){
const int bufsize = 8;
const int decfactor = 8;
uint8_t *binbuffer = (uint8_t*) malloc(8 * bufsize * sizeof(uint8_t));
smp_t *sgbuf = (smp_t*) malloc(8 * bufsize * sizeof(smp_t));
smp_t *cnbuf = (smp_t*) malloc(8 * bufsize * sizeof(smp_t));
memset(cnbuf, 0, 8*bufsize*sizeof(smp_t));
// Create matrix.
thrust::complex<double> i_unit(0.0, 1.0);
#ifndef USE_KERNEL
std::vector<std::vector<smp_t> > tw(decfactor);
#else
thrust::host_vector<thrust::host_vector<smp_t>> tw(decfactor);
#endif
// Fill the Matrix
for (size_t row = 0; row < 8; row++) {
for (size_t col = 0; col < 8; col++) {
std::complex<double> tmp = exp(-i_unit * 2.0*M_PI * ((double) col*row) / (double)8);
tw[row].push_back(tmp);
}
}
thrust::complex<double> test(1.0, 1.0);
for (int i = 0; i < 8*8; i++) sgbuf[i]  = test;
#ifndef USE_KERNEL
/* The Code To Move to the GPU processing */
for (unsigned int i = 0; i < bufsize; i++) {
for (size_t ch = 0; ch < 8; ch++)
for (size_t k = 0; k < 8; k++)
cnbuf[ch*bufsize + i] += sgbuf[k*bufsize+i] * tw[ch].at(k);
}
for (int i = 0; i < 8; i++)
for (int j = 0; j < 8; j++)
std::cout << cnbuf[i*8+j].real() << "," << cnbuf[i*8+j].imag() << std::endl;
#else
kernel_wrap(cnbuf,sgbuf,tw,bufsize);
#endif
}
$ nvcc -o t1751 t1751.cu -std=c++11
$ ./t1751 >out_host.txt
$ nvcc -o t1751 t1751.cu -std=c++11 -DUSE_KERNEL
$ ./t1751 >out_device.txt
$ diff out_host.txt out_device.txt
$

请记住，这主要是您的代码，我并不是说它是正确的，或无缺陷的，或适合任何特定目的。使用它的风险由您自己承担。