NaN problems with cuFFT
NaN problems with cuFFT
我正在使用cuFFT为学校作业编写一个基于C++和CUDA的频率滤波应用程序,但无法让它正常工作。您可以在此处找到完整的Visual Studio 2010解决方案。(需要glut。)
以下是我认为相关的部分:(fourierUtils.cu/194)
//////////////////////////////////////////////////////////////////////////////
// Function to help invoking the kernel, creates the parameters and gets
// the result
//
// For each of the r, g, b channels: copies the channel into a host float
// buffer, uploads it, runs a 1D real-to-complex FFT over all pixels,
// launches the Filter kernel on the frequency data, runs the
// complex-to-real FFT and stores the result in out_img. The alpha channel
// is copied from in_img to out_img unchanged.
//
// NOTE(review): this is the version that shows the NaN / black-bar symptom.
// The NOTE(review) markers below flag the statements that look wrong.
__host__
void Process(
BitmapStruct& in_img, // these contain an image in an rgba byte array
BitmapStruct& out_img,
MaskGenerator maskGenerator, // this is a pointer to a device function
float param1, // mask parameters
float param2)
{
// Declare and allocate variables
cufftHandle plan;
cufftReal* img;
cufftReal* dev_img;
cufftComplex* dev_freq_img;
int imgsize = in_img.image_size();
// Four interleaved channels per pixel (see the 4 * i + chan indexing below).
int pixelcount = imgsize / 4;
// Host staging buffer: one float per pixel for the channel being processed.
img = new float[pixelcount];
checkResult(
cudaMalloc(&dev_img, sizeof(cufftReal) * pixelcount));
checkResult(
cudaMalloc(&dev_freq_img, sizeof(cufftComplex) * pixelcount));
// Optimize execution
// Launch configuration is derived from the Filter kernel's attributes and
// the number of pixels; the pair is used as <<<first, second>>> below.
cudaFuncAttributes attrs;
checkResult(
cudaFuncGetAttributes(&attrs, &Filter));
std::pair<dim3, dim3> params
= Optimizer::GetOptimalParameters(pixelcount, attrs);
// Process r, g, b channels
for(int chan = 0; chan <= 2; chan++)
{
// Init
// De-interleave: pick channel `chan` of every pixel and widen it to float.
for(int i = 0; i < pixelcount; i++)
{
img[i] = in_img.pixels[4 * i + chan];
}
// NOTE(review): cudaMemcpy takes a size in BYTES. Passing pixelcount copies
// only pixelcount bytes instead of pixelcount * sizeof(cufftReal), so most
// of dev_img is never written and holds whatever cudaMalloc returned.
checkResult(
cudaMemcpy(dev_img, img, pixelcount, cudaMemcpyHostToDevice));
// Create frequency image
// The plan is created and destroyed again on every channel iteration.
checkResult(
cufftPlan1d(&plan, pixelcount, CUFFT_R2C, 1));
// NOTE(review): per the cuFFT documentation an R2C transform of N reals
// produces N/2+1 complex values, so the upper part of dev_freq_img is not
// written by this call -- confirm what range Filter reads.
checkResult(
cufftExecR2C(plan, dev_img, dev_freq_img));
// NOTE(review): cudaThreadSynchronize is deprecated in favour of
// cudaDeviceSynchronize.
checkResult(
cudaThreadSynchronize());
checkResult(
cufftDestroy(plan));
// Mask frequency image
Filter<<<params.first, params.second>>>(
dev_freq_img, in_img.x, in_img.y, maskGenerator, param1, param2);
getLastCudaError("Filtering the image failed.");
// Get result
checkResult(
cufftPlan1d(&plan, pixelcount, CUFFT_C2R, 1));
checkResult(
cufftExecC2R(plan, dev_freq_img, dev_img));
checkResult(
cudaThreadSynchronize());
checkResult(
cufftDestroy(plan));
// NOTE(review): same byte-count problem as the upload above -- only
// pixelcount bytes are copied back into img.
checkResult(
cudaMemcpy(img, dev_img, pixelcount, cudaMemcpyDeviceToHost));
// NOTE(review): cuFFT transforms are unnormalized; nothing here divides the
// round-trip result by pixelcount before it is stored.
for(int i = 0; i < pixelcount; i++)
{
out_img.pixels[4 * i + chan] = img[i];
}
}
// Copy alpha channel
for(int i = 0; i < pixelcount; i++)
{
out_img.pixels[4 * i + 3] = in_img.pixels[4 * i + 3];
}
// Free memory
checkResult(
cudaFree(dev_freq_img));
checkResult(
cudaFree(dev_img));
// NOTE(review): img was allocated with new[]; it should be released with
// delete[] rather than delete.
delete img;
getLastCudaError("An error occured during processing the image.");
}
与我看过的官方示例相比,我看不出任何实质性的差异。但当我用Nsight调试时,内核接收到的所有cufftComplex值都是NaN;而且无论使用哪个滤波掩码和什么参数,输出图像与输入图像的唯一差异就是底部多了一个黑条。所有CUDA和cuFFT调用都返回成功,内核调用之后也没有报告任何错误。
我做错了什么?
我尝试过把img和dev_img换成复数数组并使用C2C变换,也尝试过就地(in-place)变换,但这只改变了结果图像上黑条的大小。
谢谢你的帮助。
编辑:这里有一个精简版,它不需要glut,也应该可以在Linux上编译。
我还没有编译并运行您的精简版,但我认为问题出在dev_img和dev_freq_img的大小上。
请参考CUFFT库用户指南第4.2节中的示例。它执行实数到复数(R2C)的就地变换,这与您首先执行的步骤相同。
// Excerpt: in-place real-to-complex 1D transform.
// NOTE(review): BATCH is used below but not defined in this excerpt.
#define NX 256
cufftHandle plan;
cufftComplex *data;
// The buffer holds NX/2+1 complex elements per batch entry.
cudaMalloc((void**)&data, sizeof(cufftComplex)*(NX/2+1)*BATCH);
cufftPlan1d(&plan, NX, CUFFT_R2C, BATCH);
// In-place: the same buffer is passed as the real input and the complex output.
cufftExecR2C(plan, (cufftReal*)data, data);
由于变换结果具有共轭对称性,cufftExecR2C只填充NX/2+1个输出元素,其中NX是输入数组的大小。
在您的情况下,您正在执行以下操作:
cufftHandle plan;
cufftReal* dev_img;
cufftComplex* dev_freq_img;
// Both buffers are sized for pixelcount elements of their own element type.
cudaMalloc(&dev_img, sizeof(cufftReal) * pixelcount);
cudaMalloc(&dev_freq_img, sizeof(cufftComplex) * pixelcount);
也就是说,您分配了元素个数相同的一个cufftReal数组和一个cufftComplex数组。当您调用
// Out-of-place real-to-complex transform of pixelcount real samples.
cufftPlan1d(&plan, pixelcount, CUFFT_R2C, 1);
cufftExecR2C(plan, dev_img, dev_freq_img);
时,cufftExecR2C只会填充dev_freq_img的前一半(pixelcount/2+1个元素),其余部分是未初始化的垃圾数据。如果Filter这个__global__函数使用了dev_freq_img的全部范围,这就可能是出现NaN的原因。
我的错误是在几处cudaMemcpy调用中忘了把元素个数乘以元素大小(该参数的单位是字节),因此送入cuFFT的向量只有开头一部分被拷贝,末尾部分由NaN组成。修正这一点之后问题就解决了。
我还用cufftComplex数组替换了cufftReal数组,因为C2C变换的行为似乎更容易预测,并且对结果值做了归一化。
所以最后的工作方法是:
///////////////////////////////////////////////////////////////////////////////
// Host-side driver: filters an RGBA image in the frequency domain.
//
// For each of the r, g, b channels the channel is widened into a complex
// buffer (imaginary part 0), uploaded, transformed with a forward 1D C2C FFT
// over all pixels, masked in place by the Filter kernel, transformed back
// and written to out_img divided by pixelcount (cuFFT transforms are
// unnormalized). The alpha channel is copied through unchanged.
//
// Parameters:
//   in_img         source image; pixels is read as 4 interleaved channels
//   out_img        destination image
//                  NOTE(review): presumably already sized like in_img --
//                  confirm at the call site
//   maskGenerator  forwarded to the Filter kernel
//   param1/param2  mask parameters, forwarded to the Filter kernel
//
// Failures are reported through checkResult / getLastCudaError.
__host__
void Process(
    BitmapStruct& in_img,
    BitmapStruct& out_img,
    MaskGenerator maskGenerator,
    float param1,
    float param2)
{
    // Four interleaved channels per pixel.
    const int imgsize = in_img.image_size();
    const int pixelcount = imgsize / 4;
    // All transfers and allocations are expressed in BYTES, not elements.
    const size_t bufferBytes =
        sizeof(cufftComplex) * static_cast<size_t>(pixelcount);

    // Host staging buffer and the two device buffers (spatial / frequency).
    cufftComplex* img = new cufftComplex[pixelcount];
    cufftComplex* dev_img;
    cufftComplex* dev_freq_img;
    checkResult(
        cudaMalloc(&dev_img, bufferBytes));
    checkResult(
        cudaMalloc(&dev_freq_img, bufferBytes));

    // A C2C plan does not encode the direction, so a single plan serves the
    // forward and the inverse transform of every channel. Creating it once
    // avoids six plan create/destroy round trips.
    cufftHandle plan;
    checkResult(
        cufftPlan1d(&plan, pixelcount, CUFFT_C2C, 1));

    // Pick the launch configuration for the Filter kernel.
    cudaFuncAttributes attrs;
    checkResult(
        cudaFuncGetAttributes(&attrs, &Filter));
    std::pair<dim3, dim3> params =
        Optimizer::GetOptimalParameters(pixelcount, attrs);

    // Process r, g, b channels
    for(int chan = 0; chan <= 2; chan++)
    {
        // De-interleave the channel into a purely real complex signal.
        for(int i = 0; i < pixelcount; i++)
        {
            img[i].x = in_img.pixels[4 * i + chan];
            img[i].y = 0;
        }
        checkResult(
            cudaMemcpy(dev_img, img, bufferBytes, cudaMemcpyHostToDevice));

        // Create frequency image
        checkResult(
            cufftExecC2C(plan, dev_img, dev_freq_img, CUFFT_FORWARD));
        // Synchronize so a failure of the transform is reported here and not
        // attributed to the kernel launch below.
        checkResult(
            cudaDeviceSynchronize());

        // Mask frequency image
        Filter<<<params.first, params.second>>>(
            dev_freq_img,
            in_img.x,
            in_img.y,
            maskGenerator,
            param1,
            param2);
        getLastCudaError("Filtering the image failed.");

        // Get result
        checkResult(
            cufftExecC2C(plan, dev_freq_img, dev_img, CUFFT_INVERSE));
        checkResult(
            cudaDeviceSynchronize());
        checkResult(
            cudaMemcpy(img, dev_img, bufferBytes, cudaMemcpyDeviceToHost));

        // Forward + inverse scales the data by pixelcount; undo that here.
        // NOTE(review): the value is not clamped before it is stored; if
        // pixels is a byte array, filtered values outside the byte range
        // need clamping -- confirm the element type of BitmapStruct::pixels.
        for(int i = 0; i < pixelcount; i++)
        {
            out_img.pixels[4 * i + chan] = img[i].x / pixelcount;
        }
    }

    // Copy alpha channel
    for(int i = 0; i < pixelcount; i++)
    {
        out_img.pixels[4 * i + 3] = in_img.pixels[4 * i + 3];
    }

    // Free resources
    checkResult(
        cufftDestroy(plan));
    checkResult(
        cudaFree(dev_freq_img));
    checkResult(
        cudaFree(dev_img));
    // img came from new[], so it must be released with delete[].
    delete[] img;
    getLastCudaError("An error occured during processing the image.");
}
谢谢你的帮助。
- Problems with std::cin.fail()
- 应用程序崩溃并显示"symbol _ZdlPvm, version Qt_5 not defined in file libQt5Core.so.5 with link time reference"
- 这对"With a stackless coroutine, only the top-level routine may be suspended."意味着什么
- Boost.TEST with CLion: "Test framework quit unexpectedly"
- 避免碎片化的ClientHellos with OpenSSL (DTLS)
- Issues with Win32 ReadProcessMemory API
- Qt with WinAPI MouseProc
- [[maybe_unused]] with structured_binding?
- Issue with WriteProcessMemory
- OpenCV RTP-Stream with FFMPEG
- "Unable to start debugging. No process is associated with this object." - 在Visual Studio Code中使用GDB
- std::adjacent_difference with std::chrono time_point
- DLL Made with CMake 使程序崩溃
- QtCreator with C 库中的链接器问题
- SHBrowseForFolder with BIF_BROWSEFORCOMPUTER and SHGetPathFr
- specialized std::default_delete with QQmlComponent
- VS2019 - Sudo Remote Debugging on Linux with Cmake project
- Inference pytorch C++ with alexnet and cv::imread image
- Pybind11: init<> with lambda
- NaN problems with cuFFT