Cuda - 内核执行后的设备值 0

Cuda - Device values 0 after kernel execution

本文关键字：内核执行 Cuda 更新时间：2023-10-16

由于某种原因，当我执行程序时，设备变量的值为零。就在我执行 cuda 内核之前，设备变量具有正确的值。输出图像只是原始图像大小的黑色。所有内存分配和复制到主机和从主机复制似乎都是正确的。

感谢您的任何帮助！

    // Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#ifdef _WIN32
#  define WINDOWS_LEAN_AND_MEAN
#  define NOMINMAX
#  include <windows.h>
#endif
#define Image_Size 512
#define Kernel_Size 3
// Includes CUDA
#include <cuda_runtime.h>
// Utilities and timing functions
#include "./inc/helper_functions.h"    // includes cuda.h and cuda_runtime_api.h
// CUDA helper functions
#include "./inc/helper_cuda.h"         // helper functions for CUDA error check
const char *imageFilename = "lena_bw.pgm";
const char *sampleName = "simpleTexture";
#define C_PI 3.141592653589793238462643383279502884197169399375
void __global__ SwirlCu(int width, int height, int stride, float *pRawBitmapOrig, float *pBitmapCopy, double factor)
{
    // This function effectively swirls an image
    // This CUDA kernel is basically the exact same code as the CPU-only, except it has a slightly different setup
    // Each thread on the GPU will process exactly one pixel
    // Before doing anything, we need to determine the current pixel we are calculating in this thread
    // Original code used i as y, and j as x. We will do the same so we can just re-use CPU code in the CUDA kernel
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    // Test to see if we're testing a valid pixel
    if (i >= height || j >= width) return;  // Don't bother doing the calculation. We're not in a valid pixel location
    double cX = (double)width/2.0f;
    double cY = (double)height/2.0f;
    double relY = cY-i;
    double relX = j-cX;
    // relX and relY are points in our UV space
    // Calculate the angle our points are relative to UV origin. Everything is in radians.
    double originalAngle;
    if (relX != 0)
    {
        originalAngle = atan(abs(relY)/abs(relX));
        if ( relX > 0 && relY < 0) originalAngle = 2.0f*C_PI - originalAngle;
        else if (relX <= 0 && relY >=0) originalAngle = C_PI-originalAngle;
        else if (relX <=0 && relY <0) originalAngle += C_PI;
    }
    else
    {
        // Take care of rare special case
        if (relY >= 0) originalAngle = 0.5f * C_PI;
        else originalAngle = 1.5f * C_PI;
    }
    // Calculate the distance from the center of the UV using pythagorean distance
    double radius = sqrt(relX*relX + relY*relY);
    // Use any equation we want to determine how much to rotate image by
    //double newAngle = originalAngle + factor*radius;  // a progressive twist
    double newAngle = originalAngle + 1/(factor*radius+(4.0f/C_PI));
    // Transform source UV coordinates back into bitmap coordinates
    int srcX = (int)(floor(radius * cos(newAngle)+0.5f));
    int srcY = (int)(floor(radius * sin(newAngle)+0.5f));
    srcX += cX;
    srcY += cY;
    srcY = height - srcY;
    // Clamp the source to legal image pixel
    if (srcX < 0) srcX = 0;
    else if (srcX >= width) srcX = width-1;
    if (srcY < 0) srcY = 0;
    else if (srcY >= height) srcY = height-1;
    // Set the pixel color
    // Since each thread writes to exactly 1 unique pixel, we don't have to do anything special here
    pRawBitmapOrig[i*stride/4 + j] = pBitmapCopy[srcY*stride/4 + srcX];
}


////////////////////////////////////////////////////////////////////////////////
// Declaration, forward
void runTest(int argc, char **argv);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("%s starting...n", sampleName);
    // Process command-line arguments
    if (argc > 1)
    {
        if (checkCmdLineFlag(argc, (const char **) argv, "input"))
        {
            getCmdLineArgumentString(argc,(const char **) argv,"input",(char **) &imageFilename);
        }
        else if (checkCmdLineFlag(argc, (const char **) argv, "reference"))
        {
            printf("-reference flag should be used with -input flag");
            exit(EXIT_FAILURE);
        }
    }
    runTest(argc, argv);
    cudaDeviceReset();
    printf("%s completed",
           sampleName);
    //exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    int devID = findCudaDevice(argc, (const char **) argv);
    unsigned int kernel_bytes = Kernel_Size * Kernel_Size * sizeof(float);
    // load image from disk
    float *hData = NULL;
    float *host_array_kernel = 0;
    float *device_array_Image = 0;
    float *device_array_kernel = 0;
    float *device_array_Result = 0;

    unsigned int width, height;
    char *imagePath = sdkFindFilePath(imageFilename, argv[0]);
    if (imagePath == NULL)
    {
        printf("Unable to source image file: %sn", imageFilename);
        exit(EXIT_FAILURE);
    }
    sdkLoadPGM(imagePath, &hData, &width, &height);
    unsigned int size = width * height * sizeof(float);
    printf("Loaded '%s', %d x %d pixelsn", imageFilename, width, height);
    // Allocation of device arrays using CudaMalloc
    cudaMalloc((void**)&device_array_Image, size);
    cudaMalloc((void**)&device_array_kernel, kernel_bytes);
    cudaMalloc((void**)&device_array_Result, size);

    host_array_kernel = (float*)malloc(kernel_bytes); // kernel

   // Allocate mem for the result on host side
   float *hOutputDataSharp = (float *) malloc(size);
    GenerateKernel (host_array_kernel);

// copy arrays and kernel from host to device
    checkCudaErrors(cudaMemcpy(device_array_Image, hData, size, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(device_array_kernel, host_array_kernel, kernel_bytes, cudaMemcpyHostToDevice));

    dim3 dimBlock(16, 16, 1);
    dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
    //Do the Convolution
    printf("DImage : '%.8f'n",device_array_Image);
    printf("DKernel : '%.8f'n",device_array_kernel);
    //serialConvolution(hData, host_array_kernel ,hOutputDataSharp);

    SwirlCu<<<512, 512>>>(width, height, width*4, device_array_Image,device_array_Result, 0.005f);
    printf("DResult : '%.8f'n",device_array_Result);
    checkCudaErrors(cudaDeviceSynchronize());
    cudaMemcpy(hOutputDataSharp,device_array_Result, size, cudaMemcpyDeviceToHost);
    printf("HResult : '%.8f'n",hOutputDataSharp);
    // Write result to file
    char outputSharp[1024];
    strcpy(outputSharp, imagePath);
    strcpy(outputSharp, "data/serial_sharptest.pgm");
    sdkSavePGM(outputSharp, hOutputDataSharp, width, height);
    cudaFree(device_array_Result);
    cudaFree(device_array_Image);
    cudaFree(device_array_kernel);
    free(hData);
    free(imagePath);
    //free(host_array_Image);
    free(host_array_kernel);
    free(hOutputDataSharp);
    //free(hOutputImage);
    //free(hOutputKernel);
}

您的代码正在源图像中写入：

pRawBitmapOrig[i*stride/4 + j] = pBitmapCopy[srcY*stride/4 + srcX];

它写入device_array_Image哪个是源，而不是您期望结果的目标。

此外，我对printf("DResult : '%.8f'n",device_array_Result);的输出非常好奇，因为device_array_Result在GPU地址空间中并分配了cudaMalloc。您在哪个设备上运行？