如何在多个 GPU 上同时执行 cufftXt 和 CUDA 内核

how do i execute both cufftXt and CUDA kernels on multiple GPUs?

本文关键字：执行 cufftXt 内核 CUDA GPU 更新时间：2023-10-16

我想使用两个GPU来执行一个内核，然后使用cufftXt执行单个FFT。数据的大小可能为数 GB。我对在 2 个 GPU 上为内核分配内存的理解是，您应该将主机阵列分成两半，并将前半发送到 GPU0，另一半发送到 GPU1。以下示例演示如何执行此操作。

#include <iostream>
#define _USE_MATH_DEFINES
#include <math.h>
#include <ctime>
#include <fstream>
#include <sstream>
#include <cstdlib>
#include <string>
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>
using namespace std;
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %dn", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}
__global__ void Cube (cufftReal *data, cufftReal *data3, int N, int real_size) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i<real_size){
    float x = (i % (N+2));
    if(x < N){
            data3[i] = pow(data[i], 3.0f);  
    }
    else{
            data3[i] = 0.0f;        
    }
    }
    __syncthreads();
}

int main (int argc, char **argv) {
    int x;
    int N = 8;
        int cplx_size = N * (N/2 + 1);
        int real_size = 2 * cplx_size;
    int mem_size = sizeof(cufftReal)*real_size;
    int half_real_size = real_size/2;
    int half_mem_size = mem_size/2;
    cufftReal *h_data = (cufftReal*)malloc(mem_size);
    cufftReal *h_data3 = (cufftReal*)malloc(mem_size);
    cufftReal *h0_data = (cufftReal*)malloc(half_mem_size);
    cufftReal *h0_data3 = (cufftReal*)malloc(half_mem_size);
    cufftReal *h1_data = (cufftReal*)malloc(half_mem_size);
    cufftReal *h1_data3 = (cufftReal*)malloc(half_mem_size);
    for(int i=0; i<real_size; i++){
            x = (i % (N+2));
        if(x < N){h_data[i] = 2;}
        else{h_data[i] = 0;}
    }
    for(int i=0; i<half_real_size; i++){
        h0_data[i] = h_data[i];
        h1_data[i] = h_data[i+half_real_size];
    }
    cufftReal *d0_data;
    cufftReal *d0_data3;
    cufftReal *d1_data;
    cufftReal *d1_data3;
    cudaSetDevice(0);
    gpuErrchk(cudaMalloc((void**)&d0_data, half_mem_size));
    gpuErrchk(cudaMalloc((void**)&d0_data3, half_mem_size));
    cudaSetDevice(1);
    gpuErrchk(cudaMalloc((void**)&d1_data, half_mem_size));
    gpuErrchk(cudaMalloc((void**)&d1_data3, half_mem_size));
cout <<"device memory allocated" <<endl;
    int maxThreads=(N>1024)?1024:N;
    int threadsPerBlock = maxThreads;
    int numBlocks = (half_real_size)/threadsPerBlock;
    cudaSetDevice(0);
    gpuErrchk(cudaMemcpy(d0_data, h0_data, half_mem_size, cudaMemcpyHostToDevice));
    cudaSetDevice(1);
    gpuErrchk(cudaMemcpy(d1_data, h1_data, half_mem_size, cudaMemcpyHostToDevice));
cout <<"mem copied to devices" <<endl;
        cudaSetDevice(0);
        Cube <<<numBlocks, threadsPerBlock>>> (d0_data, d0_data3, N, half_real_size);
        gpuErrchk( cudaPeekAtLastError() );
        gpuErrchk( cudaDeviceSynchronize() );
        cudaSetDevice(1);
        Cube <<<numBlocks, threadsPerBlock>>> (d1_data, d1_data3, N, half_real_size);
        gpuErrchk( cudaPeekAtLastError() );
        gpuErrchk( cudaDeviceSynchronize() );
    cudaSetDevice(0);
    gpuErrchk(cudaMemcpy(h0_data3, d0_data3, half_mem_size, cudaMemcpyDeviceToHost));
    cudaSetDevice(1);
    gpuErrchk(cudaMemcpy(h1_data3, d1_data3, half_mem_size, cudaMemcpyDeviceToHost));   
    cout <<endl;
    for(int i = 0; i<half_real_size; i++){
        cout <<h0_data3[i] <<" ";
    }
    cout <<endl;
    for(int i = 0; i<half_real_size; i++){
        cout <<h1_data3[i] <<" ";
    }
    //clean up
    cudaFree(d0_data);
    cudaFree(d0_data3);
    cudaFree(d1_data);
    cudaFree(d1_data3);   
    return 0;
}

但是，我看不出这种方法如何与 cufftXt 兼容。看来我应该使用辅助功能cufftXtMemcpy自动将数据拆分到设备上。但是如果我这样做，那么上面显示的多 GPU 内核方法就无法使用，除非我为 cufftXt 和内核分配单独的设备内存。有没有办法在不双重分配设备内存的情况下同时运行 cufftXt 和内核？

以下是我按照工具包中的simpleCUFFT_2d_MGPU代码示例进行操作的方法。我不确定它是否完全正确。在 2 个 GPU 上比仅使用 1 个 GPU 慢 50%。我在Tesla K40 GPU上测试了这段代码(而不是使用R2C和C2R FFT的另一个代码(。

#include <iostream>
#define _USE_MATH_DEFINES
#include <math.h>
#include <ctime>
#include <fstream>
#include <sstream>
#include <cstdlib>
#include <string>
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>
using namespace std;
__global__ void Cube (cufftComplex *data, cufftComplex *data3, int N, int n, int nGPUs) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;   
    if (i<n){
        data3[i].x = pow(data[i].x, 3.0f);  
    data3[i].y = 0;
    }
    __syncthreads();
}
__global__ void Normalize (cufftComplex *data, int N, int n, int nGPUs){
    int i = blockIdx.x * blockDim.x + threadIdx.x;   
    if (i<n){
    data[i].x /= n;
    }
    __syncthreads();
}
int main (int argc, char **argv) {
    int x, y;
    int N = 8192;
    int n = N*N;
        //int cplx_size = N * (N/2 + 1);
        //int real_size = 2 * cplx_size;
    int mem_size = sizeof(cufftComplex)*n;
    int maxThreads=(N>1024)?1024:N;
    int threadsPerBlock = maxThreads;
    int numBlocks = (n)/threadsPerBlock;
    cout <<"numBlocks " <<numBlocks <<endl;
    cufftComplex *h_data; 
    h_data = (cufftComplex*)malloc(mem_size);
    cufftComplex *h_data3 = (cufftComplex*)malloc(mem_size);
cout <<"host data allocated" <<endl;
    int index;
    float lambda = N*.1;
    for(y=0; y<N; y++){
    for(x=0; x<N; x++){
        //cout <<x <<" " <<y <<endl;
        index = x + y*N;
        h_data[index].x = cos(2*M_PI*(x+y)/lambda);
        h_data[index].y = 0;
    }
    }
cout <<"host data values set" <<endl;
    cufftResult res;
    int  device;
    int nGPUs;
    cudaGetDeviceCount(&nGPUs);
    cout <<nGPUs <<" CUDA devices" <<endl;
    size_t total_mem, free_mem;
    for(int i=0; i<nGPUs; i++){
        cudaMemGetInfo(&free_mem, &total_mem);
        cout <<"GPU" <<i <<" used memory " <<(total_mem-free_mem)/pow(10,9);
    }
    int whichGPUs[nGPUs];
    for(int i=0; i<nGPUs; i++){
        whichGPUs[i]=i;
    }
cout <<"whichgpus set" <<endl;
        size_t* worksize;
        worksize =(size_t*)malloc(sizeof(size_t) * nGPUs);
cout <<"worksize set" <<endl;
    cufftHandle plan_complex;
    res = cufftCreate(&plan_complex);
        if (res != CUFFT_SUCCESS){cout <<"create plan failed" <<endl;}
    res = cufftXtSetGPUs(plan_complex, nGPUs, whichGPUs);
        if (res != CUFFT_SUCCESS){cout <<"setgpus forward failed" <<endl;}
cout <<"set gpus" <<endl;
    res = cufftMakePlan2d(plan_complex, N, N, CUFFT_C2C, worksize);
        if (res != CUFFT_SUCCESS){cout <<"make plan forward failed" <<endl;}
cout <<"plan created" <<endl;
    cudaLibXtDesc *d_data; 
    cudaLibXtDesc *d_data3;
    res = cufftXtMalloc(plan_complex, (cudaLibXtDesc **)&d_data, CUFFT_XT_FORMAT_INPLACE);
        if (res != CUFFT_SUCCESS){cout <<"data malloc failed" <<endl;}
    res = cufftXtMalloc(plan_complex, (cudaLibXtDesc **)&d_data3, CUFFT_XT_FORMAT_INPLACE);
        if (res != CUFFT_SUCCESS){cout <<"data3 malloc failed" <<endl;}
cout <<"xtmalloc done" <<endl;
    res = cufftXtMemcpy (plan_complex, d_data, h_data, CUFFT_COPY_HOST_TO_DEVICE);
        if (res != CUFFT_SUCCESS){cout <<"memcpy to device failed" <<endl;}
cout <<"memcpy h to d" <<endl;
int tmax = 10000;
int start = time(0);
for(int tau=0; tau<tmax; tau++){
    res = cufftXtExecDescriptorC2C(plan_complex, d_data, d_data, CUFFT_FORWARD);
        if (res != CUFFT_SUCCESS){cout <<"cufftXtExec failed" <<endl; return 0;}
    res = cufftXtExecDescriptorC2C(plan_complex, d_data, d_data, CUFFT_INVERSE);
        if (res != CUFFT_SUCCESS){cout <<"cufftXtExec failed" <<endl; return 0;}
    for(int i=0; i<nGPUs; i++){
        device = d_data->descriptor->GPUs[i];
        cudaSetDevice(device);
        Normalize <<<numBlocks, threadsPerBlock>>> ((cufftComplex*) d_data->descriptor->data[i], N, n, nGPUs);
    }
    cudaDeviceSynchronize();
}
int stop = time(0);
cout <<tmax <<" timesteps" <<endl <<(stop-start) <<" seconds"<<endl;
/*
    for(int i=0; i<nGPUs; i++){
        device = d_data->descriptor->GPUs[i];
        cudaSetDevice(device);
        Cube <<<numBlocks, threadsPerBlock>>> ((cufftComplex*) d_data->descriptor->data[i], (cufftComplex*) d_data3->descriptor->data[i], N, real_size);
    }
*/
/*
    cudaDeviceSynchronize();
    res = cufftXtMemcpy (plan_complex, h_data, d_data, CUFFT_COPY_DEVICE_TO_HOST);
        if (res != CUFFT_SUCCESS){cout <<"memcpy to host failed" <<endl;}
cout <<"memcpy d to h" <<endl;
    ofstream fout;
    ostringstream outstr;
    outstr.precision(4);
    outstr <<time(0) <<".dat";
    string filename=outstr.str();
    fout.open(filename.c_str());
    fout.precision(4);
    for (int i = 0; i < n; i++) {
        x = (i % (N));
            y = (i /(N))%N; 
            fout <<x <<" " <<y <<" " <<h_data[i].x <<endl;
    }
    fout.close();  
*/
    //clean up
    res = cufftXtFree(d_data);
        if (res != CUFFT_SUCCESS){cout <<"free data failed" <<endl;}
    res = cufftXtFree(d_data3);
        if (res != CUFFT_SUCCESS){cout <<"free data3 failed" <<endl;}
    cufftDestroy(plan_complex);
    return 0;
}