CUDA 内核调用/参数传递中的编译错误

Compilation error in cuda kernel calling/ passing parameters

本文关键字：编译错误参数内核调用 cuda 更新时间：2023-10-16

在实际代码中，我的目的是将输入数组与一个标量进行比较，从而得到输出数组，即 output = input > scalar。

如下所示的简单示例主机端代码按预期工作。

    // Host-side example: compare every element of a float array against a
    // scalar on the GPU. W1, H1, W2, outw, outh and gpuErrchk are defined
    // outside this excerpt.
    // Host buffers: W1*H1 input floats and H1*W2 result flags.
    // NOTE(review): the result buffers are sized H1*W2 while the input holds
    // W1*H1 elements; confirm the two counts match, since the kernel writes
    // one output element per launched thread.
    float *h_data1 = (float *)malloc(W1*H1 * sizeof(float));
    bool *h_result = (bool *)malloc(H1*W2 * sizeof(bool));
    // Matching device buffers.
    float *d_data1;      gpuErrchk(cudaMalloc(&d_data1, W1*H1 * sizeof(float)));
    bool *d_result;    gpuErrchk(cudaMalloc(&d_result, H1*W2 * sizeof(bool)));
    // Fill the input with 0, 1, 2, ... and upload it to the device.
    for (int i = 0; i < W1*H1; i++) h_data1[i] = (float)i;
    gpuErrchk(cudaMemcpy(d_data1, h_data1, W1*H1 * sizeof(float), cudaMemcpyHostToDevice));
    float scalar = 2;
    // Launch outw blocks of outh threads; each thread compares one element.
    compGraterRetOut<float, bool><< <outw, outh >> > (d_data1, d_result, scalar);
    // cudaPeekAtLastError reports a launch error without clearing it;
    // cudaDeviceSynchronize waits for the kernel and surfaces execution errors.
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

设备端代码为

// Element-wise "greater than scalar" comparison kernel.
//
// Expects a 1D launch. Each thread handles the element at its flat global
// index i and stores dataOut[i] = (dataIn[i] > scalar); the bool result is
// converted to TTypeOut.
//
// Parameters:
//   dataIn  - device-accessible input array, read at index i.
//   dataOut - device-accessible output array, written at index i.
//   scalar  - threshold every element is compared against.
//   count   - optional element count; threads with i >= count do nothing.
//             The default (largest size_t) disables the guard, so existing
//             three-argument launches behave as before and must still
//             launch exactly one thread per element.
template<typename TType, typename TTypeOut>
__global__  void compGraterRetOut(TType *dataIn, TTypeOut *dataOut, const TType scalar,
                                  const size_t count = static_cast<size_t>(-1))
{
    // Widen before multiplying: blockIdx.x * blockDim.x is 32-bit unsigned
    // arithmetic, and storing the result in an int turned indices at or
    // above 2^31 into negative offsets.
    const size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (i < count)
    {
        dataOut[i] = (dataIn[i] > scalar);
    }
}

回到实际代码，我有一个如下所示的 Image 类（仅显示该类的部分内容）。

// Image with a width x height x depth element buffer that is allocated by
// the constructor. Only part of the class is shown in this excerpt.
//
// Template parameters:
//   TType      - element type stored in the buffer.
//   TImageType - value of the project enum ImageType (its meaning is not
//                visible in this excerpt).
template<typename TType, ImageType TImageType>
class Image
{
public:
    // Creates a width x height x depth image; depth defaults to 1.
    Image(uint32_t width, uint32_t height, uint32_t depth = 1);
private:
    // Default member initializers: the constructor returns early for empty
    // images, which would otherwise leave m_data and m_bufferSize
    // indeterminate.
    TType* m_data = nullptr;      // element buffer; null until allocated
    uint32_t m_width = 0;
    uint32_t m_height = 0;
    uint32_t m_depth = 0;
    uint32_t m_bufferSize = 0;    // number of elements (not bytes) in m_data
};
// Constructs a width x height x depth image and allocates its element buffer
// with cudaMalloc.
//
// Parameters:
//   width, height - image dimensions; if either is 0 nothing is allocated.
//   depth         - third dimension of the buffer.
//
// After construction m_data is either a successfully allocated buffer of
// m_bufferSize elements, or null with m_bufferSize == 0 (empty image or
// failed allocation).
template<typename TType, ImageType TImageType>
Image<TType, TImageType>::Image(uint32_t width, uint32_t height, uint32_t depth)
    : m_data(nullptr), m_width(width), m_height(height), m_depth(depth), m_bufferSize(0)
{
    // Empty image: keep the null buffer and zero size set above instead of
    // leaving those members uninitialized.
    if (width == 0 || height == 0)
        return;
    cudaError_t cudaStatus;
    // The buffer used to be a host allocation (new TType[...]); it is now
    // obtained from cudaMalloc. sizeof() comes first so the byte count is
    // computed in size_t.
    gpuErrchk(cudaStatus = cudaMalloc(&m_data, sizeof(TType) * m_width * m_height * m_depth));
    if (cudaStatus == cudaSuccess)
    {
        m_bufferSize = m_width * m_height * m_depth;
    }
    else
    {
        // Do not keep whatever the failed call may have left in the pointer.
        m_data = nullptr;
        std::cout << "Error malloc function failed [" << cudaStatus << "]" << std::endl;
    }
}

为了实现 out = 输入 > 标量 的目标，我按如下方式重载了 operator>。这段代码在编译时报错：

member "Image<TType, TImageType>::m_data [with TType=float_t, TImageType=ImageType::WHD]" is inaccessible（即：该私有成员不可访问）

代码如下所示。

// First attempt at operator>: builds the result image through the regular
// (width, height, depth) constructor and lets the kernel write straight into
// ret.m_data.
// This version does NOT compile: ret is an Image<uint32_t, TImageType>, which
// is a different class from Image<TType, TImageType> whenever TType is not
// uint32_t, and m_data is private, so ret.m_data is inaccessible from here.
// NOTE(review): the template<...> header line is missing from this excerpt.
inline Image<uint32_t, TImageType> Image<TType, TImageType>::operator>(TType scalar) const
{
        Image<uint32_t, TImageType> ret(m_width, m_height, m_depth);
        // The access to ret.m_data below is what triggers the compile error.
        compGraterRetOut<TType, uint32_t> << <m_width * 4, (m_height * m_depth/4) >> > (m_data, ret.m_data, scalar);
        gpuErrchk(cudaGetLastError());
        gpuErrchk(cudaDeviceSynchronize());
        return std::move(ret);
}

为了修复编译错误，我修改了 operator> 函数：CUDA 内存改为在该函数内部分配，而不是在类的构造函数内部分配。

// Element-wise comparison of this image against a scalar.
//
// Allocates a uint32_t buffer with one entry per element of this image,
// fills it on the GPU with (element > scalar) via compGraterRetOut, and
// returns it wrapped in an Image<uint32_t, TImageType> with the same
// width/height/depth.
//
// The output buffer is allocated here and handed to the result through its
// (data, width, height, depth) constructor instead of reaching into
// ret.m_data: Image<uint32_t, ...> is a different class from
// Image<TType, ...>, so its private members are not accessible from this
// member function.
// NOTE(review): presumably that constructor takes ownership of dataout; it
// is not visible in this excerpt - confirm, otherwise the buffer leaks.
template<class TType, ImageType TImageType>
inline Image<uint32_t, TImageType> Image<TType, TImageType>::operator>(TType scalar) const
{
        // Widen before multiplying so the element count cannot wrap in 32 bits.
        const size_t total = static_cast<size_t>(m_width) * m_height * m_depth;
        uint32_t *dataout = nullptr;
        gpuErrchk(cudaMalloc(&dataout, total * sizeof(uint32_t)));
        Image<uint32_t, TImageType> ret(dataout, m_width, m_height, m_depth);

        // The kernel indexes by flat thread id and no element count is passed
        // here, so the launches must create exactly `total` threads: full
        // 256-thread blocks first, then one partial block for the remainder.
        // The previous <<<m_width * 4, m_height * m_depth / 4>>> configuration
        // skipped tail elements when m_height * m_depth was not a multiple of
        // 4 and could request more threads per block than a device allows.
        const size_t threadsPerBlock = 256;
        const size_t fullBlocks = total / threadsPerBlock;
        const size_t tailThreads = total % threadsPerBlock;
        const size_t tailOffset = fullBlocks * threadsPerBlock;
        if (fullBlocks > 0)
        {
                compGraterRetOut<TType, uint32_t><<<static_cast<unsigned int>(fullBlocks),
                                                    static_cast<unsigned int>(threadsPerBlock)>>>(
                        m_data, dataout, scalar);
                gpuErrchk(cudaGetLastError());
        }
        if (tailThreads > 0)
        {
                compGraterRetOut<TType, uint32_t><<<1, static_cast<unsigned int>(tailThreads)>>>(
                        m_data + tailOffset, dataout + tailOffset, scalar);
                gpuErrchk(cudaGetLastError());
        }
        gpuErrchk(cudaDeviceSynchronize());
        // Return by name: a local is already treated as an rvalue here, and
        // std::move would only disable copy elision.
        return ret;
}

最后，

我的问题是：为什么最后这个版本可以无错误地编译，而之前的版本却不行？

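这个问题与 CUDA 无关，而是模板与面向对象（OOP）访问控制的问题。类模板的成员函数访问同一实例化类型（模板参数完全相同）的私有成员是允许的；但 Image<float_t, …> 与 Image<uint32_t, …> 是两个不同的类，前者的成员函数访问后者的私有成员 m_data 就违反了访问控制，所以第一个版本无法编译。第二个版本只通过公有构造函数把 dataout 传给结果对象，没有直接访问 ret.m_data，因此可以编译。这就是答案。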