NaN problems with cuFFT

本文关键字：cuFFT with problems NaN 更新时间：2023-10-16

我正在使用cuFFT为C++和Cuda的学校作业编写一个频率滤波应用程序，但我无法使其工作。您可以在此处找到整个Visual Studio 2010解决方案。（需要GLUT。）

以下是我认为相关的部分：（fourierUtils.cu/194）

//////////////////////////////////////////////////////////////////////////////
// Function to help invoking the kernel, creates the parameters and gets 
// the result.
//
// For each of the channels 0..2 of in_img this copies the channel into a
// float buffer, uploads it, runs a 1D real-to-complex FFT over all pixels,
// launches the Filter kernel on the frequency data, runs the complex-to-real
// inverse FFT, downloads the result and stores it in out_img. Channel 3
// (alpha) is copied unchanged.
//
// NOTE(review): this is the non-working version discussed in the thread; the
// defects are marked inline and intentionally left in place.
__host__
void Process(
        BitmapStruct& in_img, // these contain an image in an rgba byte array
        BitmapStruct& out_img, 
        MaskGenerator maskGenerator, // this is a pointer to a device function
        float param1, // mask parameters
        float param2)
{    
    // Declare and allocate variables
    cufftHandle plan;
    cufftReal* img;              // host staging buffer, one float per pixel
    cufftReal* dev_img;          // device copy of the current channel
    cufftComplex* dev_freq_img;  // device frequency-domain data
    int imgsize = in_img.image_size();
    int pixelcount = imgsize / 4; // 4 bytes (r, g, b, a) per pixel
    img = new float[pixelcount];
    checkResult(
        cudaMalloc(&dev_img, sizeof(cufftReal) * pixelcount));
    // NOTE(review): cufftExecR2C only writes pixelcount/2+1 complex outputs;
    // the rest of this pixelcount-sized buffer is never initialized.
    checkResult(
        cudaMalloc(&dev_freq_img, sizeof(cufftComplex) * pixelcount));
    // Optimize execution
    cudaFuncAttributes attrs;
    checkResult(
        cudaFuncGetAttributes(&attrs, &Filter));
    // params.first / params.second are used below as the grid and block
    // dimensions of the Filter launch.
    std::pair<dim3, dim3> params 
        = Optimizer::GetOptimalParameters(pixelcount, attrs);
    // Process r, g, b channels
    for(int chan = 0; chan <= 2; chan++)
    {
        // Init: de-interleave the current channel into the float buffer
        for(int i = 0; i < pixelcount; i++)
        {
            img[i] = in_img.pixels[4 * i + chan];
        }
        // NOTE(review): BUG - the third argument of cudaMemcpy is a size in
        // bytes, but an element count is passed, so only the first
        // pixelcount bytes (a quarter of the floats) are uploaded. It should
        // be pixelcount * sizeof(cufftReal).
        checkResult(
            cudaMemcpy(dev_img, img, pixelcount, cudaMemcpyHostToDevice));
        // Create frequency image
        checkResult(
            cufftPlan1d(&plan, pixelcount, CUFFT_R2C, 1));
        checkResult(
            cufftExecR2C(plan, dev_img, dev_freq_img));
        checkResult(
            cudaThreadSynchronize());
        checkResult(
            cufftDestroy(plan));
        // Mask frequency image
        Filter<<<params.first, params.second>>>(
            dev_freq_img, in_img.x, in_img.y, maskGenerator, param1, param2);
        getLastCudaError("Filtering the image failed.");
        // Get result
        checkResult(
            cufftPlan1d(&plan, pixelcount, CUFFT_C2R, 1));
        checkResult(
            cufftExecC2R(plan, dev_freq_img, dev_img));
        checkResult(
            cudaThreadSynchronize());
        checkResult(
            cufftDestroy(plan));
        // NOTE(review): BUG - same byte-count mistake as the upload above;
        // only pixelcount bytes are copied back.
        checkResult(
            cudaMemcpy(img, dev_img, pixelcount, cudaMemcpyDeviceToHost));
        // NOTE(review): cuFFT transforms are unnormalized, so a forward +
        // inverse pair scales the data by pixelcount; no division is done
        // here before writing the pixels back.
        for(int i = 0; i < pixelcount; i++)
        {
            out_img.pixels[4 * i + chan] = img[i];
        }
    }
    // Copy alpha channel
    for(int i = 0; i < pixelcount; i++)
    {
        out_img.pixels[4 * i + 3] = in_img.pixels[4 * i + 3];
    }
    // Free memory
    checkResult(
        cudaFree(dev_freq_img));
    checkResult(
        cudaFree(dev_img));
    // NOTE(review): img was allocated with new[], so this should be delete[].
    delete img;
    getLastCudaError("An error occured during processing the image.");
}

与我看到的官方示例相比，我看不到任何实际的差异，但当我使用Nsight对其进行调试时，我的内核接收到的所有cufftComplex值都是NaN，输入图像和结果图像之间的唯一差异是，无论我使用哪个过滤掩码和什么参数，结果底部都有一个黑条。所有Cuda和cuFFT调用都返回成功，并且在内核调用后也没有错误报告。

我做错了什么？

我试着用复数数组替换img和dev_img，改用C2C变换，也试过就地（in-place）变换，但这只改变了结果图像上黑条的大小。

谢谢你的帮助。

编辑：这里有一个精简版，不需要GLUT，也应该可以在linux上编译。

我还没有编译并运行您的精简版，但我认为问题在于dev_img和dev_freq_img的大小。

请考虑CUFFT库用户指南第4.2节中的示例。它执行从实到复的就地转换，这与您首先执行的步骤相同。

// In-place real-to-complex transform of length NX.
// (BATCH is not defined in this excerpt.)
#define NX 256
cufftHandle plan;
cufftComplex *data;
// Only NX/2+1 complex elements per batch are allocated, because
// cufftExecR2C fills just NX/2+1 output elements.
cudaMalloc((void**)&data, sizeof(cufftComplex)*(NX/2+1)*BATCH);
cufftPlan1d(&plan, NX, CUFFT_R2C, BATCH);
// The same buffer is passed as the real input and the complex output.
cufftExecR2C(plan, (cufftReal*)data, data);

由于变换的对称性，cufftExecR2C只填充NX/2+1的输出元素，其中NX是输入数组的大小。

在您的情况下，您正在执行以下操作：

cufftHandle plan;
cufftReal* dev_img;
cufftComplex* dev_freq_img;
// Real input buffer: pixelcount elements.
cudaMalloc(&dev_img, sizeof(cufftReal) * pixelcount);
// Complex output buffer: the same element count (pixelcount) as the input.
cudaMalloc(&dev_freq_img, sizeof(cufftComplex) * pixelcount);

所以您正在分配一个大小相同的cufftReal数组和一个cufftComplex数组。当你使用

// 1D real-to-complex plan of length pixelcount, batch count 1.
cufftPlan1d(&plan, pixelcount, CUFFT_R2C, 1);
// Out-of-place transform: reads dev_img, writes dev_freq_img.
cufftExecR2C(plan, dev_img, dev_freq_img);

则dev_freq_img中只有前pixelcount/2+1个元素（约一半）被cufftExecR2C填充，剩余部分包含垃圾数据。如果在Filter这个__global__函数中使用了dev_freq_img的全部范围，这可能就是出现NaN的原因。

我的错误是忘记在一些cudaMemcpy调用中将元素数量乘以元素大小（字节数），因此传给cuFFT的向量末尾由NaN组成。修复这些调用后，问题就解决了。

我还用cufftComplex数组替换了cufftReal数组，因为C2C变换似乎更可预测，并对结果值做了归一化。

所以最后的工作方法是：

///////////////////////////////////////////////////////////////////////////////
// Function to help invoking the kernel, creates the parameters and gets 
// the result.
//
// For each of the channels 0..2 (r, g, b) of in_img:
//   1. the channel is copied into the real part of a complex host buffer
//      (imaginary part 0) and uploaded to the device,
//   2. a forward 1D complex-to-complex FFT over all pixels is executed,
//   3. the Filter kernel is launched on the frequency data,
//   4. the inverse FFT is executed and the result is downloaded,
//   5. the real part, divided by pixelcount, is written to out_img.
// Channel 3 (alpha) is copied from in_img to out_img unchanged.
//
// Parameters:
//   in_img        - source image; pixels is read as interleaved 4-byte rgba
//   out_img       - destination image; written with the same 4 * i + chan
//                   indexing as in_img is read (same pixel count assumed -
//                   TODO confirm against callers)
//   maskGenerator - forwarded unchanged to the Filter kernel
//   param1/param2 - mask parameters, forwarded unchanged to the Filter kernel
__host__
void Process(
        BitmapStruct& in_img, 
        BitmapStruct& out_img, 
        MaskGenerator maskGenerator, 
        float param1, 
        float param2)
{    
    // Declare and allocate variables
    const int imgsize = in_img.image_size();
    const int pixelcount = imgsize / 4; // 4 bytes (r, g, b, a) per pixel
    // cudaMemcpy/cudaMalloc take sizes in BYTES, not in elements.
    const size_t bufferBytes = sizeof(cufftComplex) * pixelcount;

    cufftComplex* img = new cufftComplex[pixelcount];
    cufftComplex* dev_img;
    cufftComplex* dev_freq_img;
    checkResult(
        cudaMalloc(&dev_img, bufferBytes));
    checkResult(
        cudaMalloc(&dev_freq_img, bufferBytes));

    // A C2C plan does not encode the transform direction (it is passed to
    // cufftExecC2C), so one plan serves the forward and inverse transforms
    // of every channel instead of being rebuilt six times.
    cufftHandle plan;
    checkResult(
        cufftPlan1d(&plan, pixelcount, CUFFT_C2C, 1));

    // Optimize execution
    cudaFuncAttributes attrs;
    checkResult(
        cudaFuncGetAttributes(&attrs, &Filter));
    // params.first / params.second are the grid and block dimensions used
    // for the Filter launch below.
    std::pair<dim3, dim3> params = 
            Optimizer::GetOptimalParameters(pixelcount, attrs);

    // Process r, g, b channels
    for(int chan = 0; chan <= 2; chan++)
    {
        // Init: de-interleave the channel into a purely real complex signal
        for(int i = 0; i < pixelcount; i++)
        {
            img[i].x = in_img.pixels[4 * i + chan];
            img[i].y = 0;
        }
        checkResult(
            cudaMemcpy(
                dev_img, 
                img, 
                bufferBytes, 
                cudaMemcpyHostToDevice));

        // Create frequency image
        checkResult(
            cufftExecC2C(plan, dev_img, dev_freq_img, CUFFT_FORWARD));
        // Synchronize so that asynchronous execution errors surface here.
        checkResult(
            cudaDeviceSynchronize());

        // Mask frequency image
        Filter<<<params.first, params.second>>>(
            dev_freq_img, 
            in_img.x, 
            in_img.y, 
            maskGenerator, 
            param1, 
            param2);
        getLastCudaError("Filtering the image failed.");

        // Get result
        checkResult(
            cufftExecC2C(plan, dev_freq_img, dev_img, CUFFT_INVERSE));
        checkResult(
            cudaDeviceSynchronize());
        checkResult(
            cudaMemcpy(
                img, 
                dev_img, 
                bufferBytes, 
                cudaMemcpyDeviceToHost));

        // cuFFT transforms are unnormalized: forward followed by inverse
        // scales the signal by pixelcount, hence the division.
        // NOTE(review): the filtered value is not clamped before being
        // stored; confirm whether the pixel type needs a [0, 255] clamp.
        for(int i = 0; i < pixelcount; i++)
        {
            out_img.pixels[4 * i + chan] = img[i].x / pixelcount;
        }
    }

    // Copy alpha channel
    for(int i = 0; i < pixelcount; i++)
    {
        out_img.pixels[4 * i + 3] = in_img.pixels[4 * i + 3];
    }

    // Free memory
    checkResult(
        cufftDestroy(plan));
    checkResult(
        cudaFree(dev_freq_img));
    checkResult(
        cudaFree(dev_img));
    // img came from new[], so it must be released with delete[].
    delete[] img;
    getLastCudaError("An error occured during processing the image.");
}

谢谢你的帮助。