如何将类的非静态成员函数传递给CUDA内核函数(__global__函数)

How to pass non-static member function of a class into CUDA kernel function (__global__ function)

本文关键字:函数 CUDA 内核 global 静态成员      更新时间:2023-10-16

最后,我已经能够在CUDA内核函数(__global__函数)中传递一个主机函数作为函数指针。感谢Robert Crovella和njuffa给出的答案。我已经能够将类成员函数(cpu函数)作为指向CUDA内核的函数指针传递。但是,主要的问题是,我只能传递静态类成员函数。我不能传递未声明为静态的函数。

我的问题是:如何将非静态成员函数传递到CUDA内核

例如:

__host__ __device__ 
static int CellfunPtr(void*ptr, int a);

上述函数之所以有效,是因为该成员函数被声明为静态成员函数。如果我没有将这个成员函数声明为静态成员,

__host__ __device__ 
in CellfunPtr(void*ptr, int a);

则不工作

完整的代码有四个文件。


fundef.h

typedef int (*pFunc_t)(void* ptr, int N);

solver.h文件

class CalcVars {
   int eqnCount;
   int numCell;                      
   int numTri;
   int numTet;
public:
   double* cellVel; 
   double* cellPre;
/** Constructor */
CalcVars(
    const int eqnCount_,             
    const int numCell_,          
    const int numTri_,             
    const int numTet_                
);
/** Destructor */
~CalcVars(void);
public:
  void 
      CalcAdv();

  __host__ __device__ 
  static int 
      CellfunPtr(
      void*ptr, int a
);
};

solver.cu

#include "solver.h"
#include "fundef.h"
#include <stdio.h>
__device__ pFunc_t pF1_d = CalcVars::CellfunPtr;
pFunc_t pF1_h ;

__global__ void kernel(int*a, pFunc_t func, void* thisPtr_){
    int tid = threadIdx.x;
    a[tid] = (*func)(thisPtr_, a[tid]); 
};
/* Constructor */
CalcVars::CalcVars(
    const int eqnCount_,             
    const int numCell_,          
    const int numTri_,             
    const int numTet_   
)
{
    this->eqnCount = eqnCount_;
    this->numCell = numCell_;
    this->numTri = numTri_;
    this->cellVel = (double*) calloc((size_t) eqnCount, sizeof(double)); 
    this->cellPre = (double*) calloc((size_t) eqnCount, sizeof(double)); 
}
/* Destructor */
CalcVars::~CalcVars(void)
{
   free(this->cellVel);
   free(this->cellPre);
}

void 
CalcVars::CalcAdv(
){
    /*int b1 = 0;
    b1 = CellfunPtr(this, 1);*/
   int Num = 50;
   int *a1, *a1_dev;
    a1 = (int *)malloc(Num*sizeof(int));
    cudaMalloc((void**)&a1_dev, Num*sizeof(int));
    for(int i = 0; i <Num; i++){
        a1[i] = i;
    }
    cudaMemcpy(a1_dev, a1, Num*sizeof(int), cudaMemcpyHostToDevice);
    //copy addresses of device functions to host 
    cudaMemcpyFromSymbol(&pF1_h, pF1_d, sizeof(pFunc_t));

    kernel<<<1,42>>>(a1_dev, pF1_h, this);
    cudaDeviceSynchronize();
    cudaMemcpy(a1, a1_dev, Num*sizeof(int), cudaMemcpyDeviceToHost);

};

int 
CalcVars::CellfunPtr(
    void* ptr, int a
){
    //CalcVars* ClsPtr = (CalcVars*)ptr;
    printf("Printing from CPU functionn");
    //int eqn_size = ClsPtr->eqnCount;
    //printf("The number is %d",eqn_size);
    return a-1;
};

main.cpp文件

#include "solver.h"
int main(){
    int n_Eqn, n_cell, n_tri, n_tetra;
    n_Eqn = 100;
    n_cell = 200;
    n_tri = 300;
    n_tetra = 400;
   CalcVars* calcvars;
   calcvars = new CalcVars(n_Eqn, n_cell, n_tri, n_tetra );
   calcvars->CalcAdv();
   system("pause");
}

成员函数的类型不同:

typedef int (CalcVars::*MethodPtr)(int N);
__device__ MethodPtr pF1_d = &CalcVars::CellfunPtr;

你可以这样调用:

__global__ void kernel(int*a, MethodPtr func, void* thisPtr_)
{
    int tid = threadIdx.x;
    CalcVars* c = ((CalcVars*)thisPtr_);
    a[tid] = (c->*func)(a[tid]);
};

但是传递给内核的this指针是一个主机指针:

kernel<<<1,42>>>(a1_dev, pF1_h, this);

这个导致内核中无效的内存访问。

你必须将CalcVars实例的设备指针传递给内核,以使其工作。


根据要求,一个完整的可编译示例,它是您的示例的浓缩版本,仍然存在我上面写的this指针问题。

demo.cu

#include <stdio.h>
struct CalcVars
{
  void  CalcAdv();
  __host__ __device__
   int  CellfunPtr(int a);
};
typedef int (CalcVars::*MethodPtr)(int N);
__device__ MethodPtr pF1_d = &CalcVars::CellfunPtr;
MethodPtr pF1_h;
__global__ void kernel(int* a, MethodPtr func, void* thisPtr_)
{
    int tid = threadIdx.x;
    CalcVars* c = ((CalcVars*)thisPtr_);
    a[tid] = (c->*func)(a[tid]);
};
voidCalcVars::CalcAdv()
{
   int Num = 50;
   int *a1, *a1_dev;
    a1 = (int *)malloc(Num*sizeof(int));
    cudaMalloc((void**)&a1_dev, Num*sizeof(int));
    for (int i = 0; i <Num; i++)
    {
        a1[i] = i;
    }
    cudaMemcpy(a1_dev, a1, Num*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpyFromSymbol(&pF1_h, pF1_d, sizeof(MethodPtr));
    // DON'T pass the host this pointer here in real code
    kernel<<<1,42>>>(a1_dev, pF1_h, this);
    cudaDeviceSynchronize();
    cudaMemcpy(a1, a1_dev, Num*sizeof(int), cudaMemcpyDeviceToHost);
};
int CalcVars::CellfunPtr(int a)
{
    printf("Printing from CPU functionn");
    return a-1;
};
int main()
{
   CalcVars calcvars;
   calcvars.CalcAdv();
}