如何在多个 GPU 上同时执行 cufftXt 和 CUDA 内核
how do i execute both cufftXt and CUDA kernels on multiple GPUs?
我想使用两个 GPU 来执行一个内核,然后使用 cufftXt 执行单个 FFT。数据的大小可能为数 GB。我对在 2 个 GPU 上为内核分配内存的理解是:应该将主机数组分成两半,把前一半发送到 GPU0,后一半发送到 GPU1。以下示例演示如何执行此操作。
#include <iostream>
#define _USE_MATH_DEFINES
#include <math.h>
#include <ctime>
#include <fstream>
#include <sstream>
#include <cstdlib>
#include <string>
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>
using namespace std;
// Wraps a CUDA runtime call so that a failure is reported together with the
// source file and line of the call site.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

/*
 * Reports a failed CUDA runtime call on stderr.
 *
 * code  - status returned by the CUDA runtime call being checked
 * file  - source file of the call site (supplied by gpuErrchk)
 * line  - source line of the call site (supplied by gpuErrchk)
 * abort - when true (the default) the process exits with `code` as its
 *         exit status after the message has been printed
 *
 * Does nothing when code == cudaSuccess.
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code == cudaSuccess) {
        return;
    }
    // The message must end with a real newline escape; "%dn" would print a
    // literal 'n' and glue the next diagnostic onto the same line.
    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) exit(code);
}
/*
 * Element-wise cube of a real buffer, one thread per element.
 *
 * data      - input buffer of at least real_size elements
 * data3     - output buffer of at least real_size elements
 * N         - elements whose index modulo (N+2) is below N are cubed; the
 *             remaining two slots of every (N+2)-wide group are written as 0
 *             (presumably the padding of an in-place R2C layout - confirm)
 * real_size - number of elements to process in this buffer
 *
 * Expects a 1D launch with at least real_size threads in total; threads past
 * real_size do nothing. The group test uses the index local to this buffer,
 * so a buffer that is a slice of a larger array must start on a multiple of
 * (N+2).
 */
__global__ void Cube (cufftReal *data, cufftReal *data3, int N, int real_size) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < real_size) {
        // Position inside the (N+2)-wide group; kept as an integer because it
        // is only ever compared against N.
        const int col = i % (N + 2);
        data3[i] = (col < N) ? powf(data[i], 3.0f) : 0.0f;
    }
    // No block barrier here: threads never exchange data, so the former
    // trailing __syncthreads() only added latency.
}
/*
 * Splits a padded real array in two halves, cubes each half on its own GPU
 * (devices 0 and 1) with the Cube kernel and prints both result halves.
 *
 * Every CUDA runtime call is checked through gpuErrchk, so a missing second
 * GPU is reported instead of silently running everything on device 0.
 * Returns 0 on success; exits with a non-zero status on any failure.
 */
int main (int argc, char **argv) {
    const int nDevices = 2;
    const int N = 8;
    const int cplx_size = N * (N/2 + 1);
    const int real_size = 2 * cplx_size;
    const int half_real_size = real_size/2;
    // Byte counts are kept in size_t so that multi-GB arrays do not overflow.
    const size_t mem_size = sizeof(cufftReal) * (size_t)real_size;
    const size_t half_mem_size = mem_size/2;

    // One full-size input and one full-size output buffer on the host; each
    // GPU works on the half that starts at dev * half_real_size. Cube tests
    // the index local to its half, which is consistent with the global index
    // here because half_real_size is a multiple of N+2 for N = 8.
    cufftReal *h_data = (cufftReal*)malloc(mem_size);
    cufftReal *h_data3 = (cufftReal*)malloc(mem_size);
    if (h_data == NULL || h_data3 == NULL) {
        fprintf(stderr, "host allocation failed\n");
        free(h_data);
        free(h_data3);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < real_size; i++) {
        h_data[i] = ((i % (N+2)) < N) ? 2 : 0;
    }

    cufftReal *d_data[nDevices];
    cufftReal *d_data3[nDevices];
    for (int dev = 0; dev < nDevices; dev++) {
        gpuErrchk(cudaSetDevice(dev));
        gpuErrchk(cudaMalloc((void**)&d_data[dev], half_mem_size));
        gpuErrchk(cudaMalloc((void**)&d_data3[dev], half_mem_size));
    }
    cout <<"device memory allocated" <<endl;

    const int threadsPerBlock = (N>1024)?1024:N;
    // Round up so a size that is not a multiple of the block size is still
    // fully covered; Cube guards the surplus threads itself.
    const int numBlocks = (half_real_size + threadsPerBlock - 1)/threadsPerBlock;

    for (int dev = 0; dev < nDevices; dev++) {
        gpuErrchk(cudaSetDevice(dev));
        gpuErrchk(cudaMemcpy(d_data[dev], h_data + dev*half_real_size, half_mem_size, cudaMemcpyHostToDevice));
    }
    cout <<"mem copied to devices" <<endl;

    // Launch on every device first and only then wait, so the GPUs can run
    // concurrently instead of one after the other.
    for (int dev = 0; dev < nDevices; dev++) {
        gpuErrchk(cudaSetDevice(dev));
        Cube <<<numBlocks, threadsPerBlock>>> (d_data[dev], d_data3[dev], N, half_real_size);
        gpuErrchk( cudaGetLastError() );
    }
    for (int dev = 0; dev < nDevices; dev++) {
        gpuErrchk(cudaSetDevice(dev));
        gpuErrchk( cudaDeviceSynchronize() );
    }

    for (int dev = 0; dev < nDevices; dev++) {
        gpuErrchk(cudaSetDevice(dev));
        gpuErrchk(cudaMemcpy(h_data3 + dev*half_real_size, d_data3[dev], half_mem_size, cudaMemcpyDeviceToHost));
    }

    // One output line per device half, each preceded by a line break.
    for (int dev = 0; dev < nDevices; dev++) {
        cout <<endl;
        for (int i = 0; i < half_real_size; i++) {
            cout <<h_data3[dev*half_real_size + i] <<" ";
        }
    }

    //clean up
    for (int dev = 0; dev < nDevices; dev++) {
        gpuErrchk(cudaSetDevice(dev));
        gpuErrchk(cudaFree(d_data[dev]));
        gpuErrchk(cudaFree(d_data3[dev]));
    }
    free(h_data);
    free(h_data3);
    return 0;
}
但是,我看不出这种方法如何与 cufftXt 兼容。看来我应该使用辅助函数 cufftXtMemcpy 自动将数据拆分到各个设备上。但是如果我这样做,那么上面显示的多 GPU 内核方法就无法使用,除非我为 cufftXt 和内核分别分配设备内存。有没有办法在不重复分配设备内存的情况下同时运行 cufftXt 和内核?
以下是我按照工具包中的 simpleCUFFT_2d_MGPU 代码示例进行操作的方法。我不确定它是否完全正确。在 2 个 GPU 上比仅使用 1 个 GPU 慢 50%。我在 Tesla K40 GPU 上测试了这段代码(而不是使用 R2C 和 C2R FFT 的另一个代码)。
#include <iostream>
#define _USE_MATH_DEFINES
#include <math.h>
#include <ctime>
#include <fstream>
#include <sstream>
#include <cstdlib>
#include <string>
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>
using namespace std;
/*
 * Cubes the real part of every element and clears the imaginary part,
 * one thread per element.
 *
 * data  - input buffer of at least n elements
 * data3 - output buffer of at least n elements
 * N     - unused; kept so existing launch sites keep compiling
 * n     - number of elements to process in the buffer passed in; when the
 *         buffer is one GPU's slice of a larger array the caller must pass
 *         the slice length, not the total
 * nGPUs - unused; kept so existing launch sites keep compiling
 *
 * Expects a 1D launch with at least n threads in total; threads past n do
 * nothing.
 */
__global__ void Cube (cufftComplex *data, cufftComplex *data3, int N, int n, int nGPUs) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        data3[i].x = powf(data[i].x, 3.0f);
        data3[i].y = 0.0f;
    }
    // No block barrier here: threads never exchange data, so the former
    // trailing __syncthreads() only added latency.
}
/*
 * Divides one GPU's slice of a complex array by the total element count n,
 * one thread per element. Both the real and the imaginary part are scaled.
 *
 * data  - this GPU's slice of the array
 * N     - unused; kept so existing launch sites keep compiling
 * n     - TOTAL number of elements across all GPUs; used as the divisor
 * nGPUs - number of GPUs the array is split across
 *
 * Only the first n / nGPUs elements are touched, so a launch sized for the
 * whole array can no longer write past the end of a per-GPU slice.
 * NOTE(review): this assumes the array is split evenly across the GPUs -
 * confirm against the slice sizes reported by the allocation descriptor.
 *
 * Expects a 1D launch with at least n / nGPUs threads in total.
 */
__global__ void Normalize (cufftComplex *data, int N, int n, int nGPUs){
    // Fall back to the full count if nGPUs is not a usable divisor.
    const int sliceLen = (nGPUs > 0) ? n / nGPUs : n;
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < sliceLen) {
        const float total = (float)n;
        data[i].x /= total;
        data[i].y /= total;
    }
    // No block barrier here: threads never exchange data, so the former
    // trailing __syncthreads() only added latency.
}
// Prints `what` and returns false when a cuFFT call did not succeed.
static bool cufftOk (cufftResult status, const char *what) {
    if (status == CUFFT_SUCCESS) {
        return true;
    }
    cout <<what <<endl;
    return false;
}

// Prints `what` plus the CUDA error string and returns false on failure.
static bool cudaOk (cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    cout <<what <<": " <<cudaGetErrorString(status) <<endl;
    return false;
}

/*
 * Times tmax forward + inverse 2D C2C transforms of an N x N array that
 * cufftXt spreads over every visible GPU, rescaling each GPU's slice with
 * the Normalize kernel after every round trip.
 *
 * Returns 0 on success. Any failing CUDA or cuFFT call prints a message and
 * ends the program with EXIT_FAILURE; outstanding allocations are then left
 * to be reclaimed at process exit.
 */
int main (int argc, char **argv) {
    int x, y;
    const int N = 8192;
    const int n = N*N;
    //int cplx_size = N * (N/2 + 1);
    //int real_size = 2 * cplx_size;
    // Byte count kept in size_t so larger N cannot overflow an int.
    const size_t mem_size = sizeof(cufftComplex) * (size_t)n;
    const int threadsPerBlock = (N>1024)?1024:N;
    // Blocks needed to cover the whole array (rounded up); the per-GPU
    // launches below use a grid sized for each slice instead.
    const int numBlocks = (n + threadsPerBlock - 1)/threadsPerBlock;
    cout <<"numBlocks " <<numBlocks <<endl;

    cufftComplex *h_data = (cufftComplex*)malloc(mem_size);
    if (h_data == NULL) {
        cout <<"host malloc failed" <<endl;
        return EXIT_FAILURE;
    }
    cout <<"host data allocated" <<endl;

    int index;
    const float lambda = N*.1;
    for(y=0; y<N; y++){
        for(x=0; x<N; x++){
            //cout <<x <<" " <<y <<endl;
            index = x + y*N;
            h_data[index].x = cos(2*M_PI*(x+y)/lambda);
            h_data[index].y = 0;
        }
    }
    cout <<"host data values set" <<endl;

    cufftResult res;
    int device;
    int nGPUs = 0;
    if (!cudaOk(cudaGetDeviceCount(&nGPUs), "device count query failed")) { return EXIT_FAILURE; }
    cout <<nGPUs <<" CUDA devices" <<endl;
    if (nGPUs < 1) { return EXIT_FAILURE; }

    size_t total_mem, free_mem;
    for(int i=0; i<nGPUs; i++){
        // cudaMemGetInfo reports on the current device, so select device i
        // first; otherwise every iteration reports the same GPU.
        if (!cudaOk(cudaSetDevice(i), "set device failed")) { return EXIT_FAILURE; }
        if (!cudaOk(cudaMemGetInfo(&free_mem, &total_mem), "mem info failed")) { return EXIT_FAILURE; }
        cout <<"GPU" <<i <<" used memory " <<(total_mem-free_mem)/1e9;
    }

    // Heap array instead of a variable-length array, which is not standard C++.
    int *whichGPUs = (int*)malloc(sizeof(int) * nGPUs);
    if (whichGPUs == NULL) {
        cout <<"host malloc failed" <<endl;
        return EXIT_FAILURE;
    }
    for(int i=0; i<nGPUs; i++){
        whichGPUs[i]=i;
    }
    cout <<"whichgpus set" <<endl;

    size_t *worksize = (size_t*)malloc(sizeof(size_t) * nGPUs);
    if (worksize == NULL) {
        cout <<"host malloc failed" <<endl;
        return EXIT_FAILURE;
    }
    cout <<"worksize set" <<endl;

    cufftHandle plan_complex;
    res = cufftCreate(&plan_complex);
    if (!cufftOk(res, "create plan failed")) { return EXIT_FAILURE; }
    res = cufftXtSetGPUs(plan_complex, nGPUs, whichGPUs);
    if (!cufftOk(res, "setgpus forward failed")) { return EXIT_FAILURE; }
    cout <<"set gpus" <<endl;
    res = cufftMakePlan2d(plan_complex, N, N, CUFFT_C2C, worksize);
    if (!cufftOk(res, "make plan forward failed")) { return EXIT_FAILURE; }
    cout <<"plan created" <<endl;

    cudaLibXtDesc *d_data;
    // d_data3 is only consumed by the disabled Cube step further down.
    cudaLibXtDesc *d_data3;
    res = cufftXtMalloc(plan_complex, (cudaLibXtDesc **)&d_data, CUFFT_XT_FORMAT_INPLACE);
    if (!cufftOk(res, "data malloc failed")) { return EXIT_FAILURE; }
    res = cufftXtMalloc(plan_complex, (cudaLibXtDesc **)&d_data3, CUFFT_XT_FORMAT_INPLACE);
    if (!cufftOk(res, "data3 malloc failed")) { return EXIT_FAILURE; }
    cout <<"xtmalloc done" <<endl;
    res = cufftXtMemcpy (plan_complex, d_data, h_data, CUFFT_COPY_HOST_TO_DEVICE);
    if (!cufftOk(res, "memcpy to device failed")) { return EXIT_FAILURE; }
    cout <<"memcpy h to d" <<endl;

    const int tmax = 10000;
    const int start = time(0);
    for(int tau=0; tau<tmax; tau++){
        res = cufftXtExecDescriptorC2C(plan_complex, d_data, d_data, CUFFT_FORWARD);
        if (!cufftOk(res, "cufftXtExec failed")) { return EXIT_FAILURE; }
        res = cufftXtExecDescriptorC2C(plan_complex, d_data, d_data, CUFFT_INVERSE);
        if (!cufftOk(res, "cufftXtExec failed")) { return EXIT_FAILURE; }

        for(int i=0; i<nGPUs; i++){
            device = d_data->descriptor->GPUs[i];
            if (!cudaOk(cudaSetDevice(device), "set device failed")) { return EXIT_FAILURE; }
            // Size the grid from the bytes this GPU actually holds, so the
            // kernel never runs past the end of its slice. Normalize still
            // receives the total n because that is the scaling divisor.
            const size_t sliceLen = d_data->descriptor->size[i] / sizeof(cufftComplex);
            const int sliceBlocks = (int)((sliceLen + threadsPerBlock - 1) / threadsPerBlock);
            Normalize <<<sliceBlocks, threadsPerBlock>>> ((cufftComplex*) d_data->descriptor->data[i], N, n, nGPUs);
            if (!cudaOk(cudaGetLastError(), "Normalize launch failed")) { return EXIT_FAILURE; }
        }
        // cudaDeviceSynchronize only waits for the current device, so every
        // GPU is visited before the next transform reads the data.
        for(int i=0; i<nGPUs; i++){
            device = d_data->descriptor->GPUs[i];
            if (!cudaOk(cudaSetDevice(device), "set device failed")) { return EXIT_FAILURE; }
            if (!cudaOk(cudaDeviceSynchronize(), "device synchronize failed")) { return EXIT_FAILURE; }
        }
    }
    const int stop = time(0);
    cout <<tmax <<" timesteps" <<endl <<(stop-start) <<" seconds"<<endl;
    /*
    for(int i=0; i<nGPUs; i++){
    device = d_data->descriptor->GPUs[i];
    cudaSetDevice(device);
    Cube <<<numBlocks, threadsPerBlock>>> ((cufftComplex*) d_data->descriptor->data[i], (cufftComplex*) d_data3->descriptor->data[i], N, real_size);
    }
    */
    /*
    cudaDeviceSynchronize();
    res = cufftXtMemcpy (plan_complex, h_data, d_data, CUFFT_COPY_DEVICE_TO_HOST);
    if (res != CUFFT_SUCCESS){cout <<"memcpy to host failed" <<endl;}
    cout <<"memcpy d to h" <<endl;
    ofstream fout;
    ostringstream outstr;
    outstr.precision(4);
    outstr <<time(0) <<".dat";
    string filename=outstr.str();
    fout.open(filename.c_str());
    fout.precision(4);
    for (int i = 0; i < n; i++) {
    x = (i % (N));
    y = (i /(N))%N;
    fout <<x <<" " <<y <<" " <<h_data[i].x <<endl;
    }
    fout.close();
    */
    //clean up
    res = cufftXtFree(d_data);
    if (res != CUFFT_SUCCESS){cout <<"free data failed" <<endl;}
    res = cufftXtFree(d_data3);
    if (res != CUFFT_SUCCESS){cout <<"free data3 failed" <<endl;}
    cufftDestroy(plan_complex);
    free(worksize);
    free(whichGPUs);
    free(h_data);
    return 0;
}
相关文章:
- 在执行其他功能的同时播放动画(LED矩阵和Arduino/ESP8266)
- C++,系统无法执行指定的程序
- 使用C++中的模板和运算符重载执行矩阵运算
- 创建一个函数以在输入为负数或零时输出字符串.第一次执行用户定义的函数
- 执行函数时导致崩溃的变量
- 无论条件是否为true,if总是在c++中执行
- 当函数模板参数是具有默认参数的类模板时,函数模板参数的推导如何执行
- 在C++中对T*类型执行std::move的意外行为
- 使用QProcess执行命令,并将结果存储在QStringList中
- 如何在没有信号的情况下从C++执行QML插槽
- 如何确认我的constexpr表达式实际上已经在编译时执行
- C++17中的并行执行策略
- QML按钮点击功能执行顺序
- 程序在执行程序的其余部分之前退出
- 为什么catch中的代码没有被执行
- C++从其他 constexpr 创建 lambda 不能按顺序执行 Constexpr
- 将执行、作业和WinAPI相乘
- 对字符数组中的元素执行逐位操作
- 为什么g++在未执行的代码处标记强制转换错误
- 如何在多个 GPU 上同时执行 cufftXt 和 CUDA 内核