任何 CUDA 调用都返回 cudaErrorUnknown（code=30）

CudaErrorUnknown code=30 on any cuda call

本文关键字：任何 cuda 调用未知代码 CudaError 更新时间：2023-10-16

我已经安装了 CUDA 工具包，并且可以毫无问题地运行自带的示例。现在，我想在自己的项目中使用 CUDA，而这个项目是用 cmake 构建的。为了演示我的问题，我创建了一个简单的例子。它包含 3 个文件：主文件 "teste.cpp"、CUDA 文件 "hello_world.cu" 以及它的头文件 "hello_world.h"。我的 main 函数只做一件事，就是调用 hello_world.cu 中的一个函数，如下所示：

#include <iostream>
#include "hello_world.h"
using namespace std;
// Program entry point: hands the command-line arguments to teste()
// (declared in hello_world.h) and uses its result as the process exit code.
int main(int argc, char** argv)
{
    // Forward teste()'s status instead of discarding it, so a failure
    // reported by the CUDA side is visible to the calling shell.
    return teste(argc, argv);
}

我的 hello_world.cu 是 CUDA 自带示例 "clock" 的精确副本，所以它看起来像这样：

// CUDA runtime
#include </usr/local/cuda-9.0/include/cuda_runtime.h>
// helper functions and utilities to work with CUDA
#include </usr/local/cuda-9.0/samples/common/inc/helper_functions.h>
#include </usr/local/cuda-9.0/samples/common/inc/helper_cuda.h>
#define NUM_BLOCKS    64
#define NUM_THREADS   256
// Per-block minimum reduction bracketed by clock() timestamps.
//
// Requirements visible from the body:
//   - dynamic shared memory large enough for 2 * blockDim.x floats
//     (passed as the third launch parameter);
//   - input readable for indices [0, 2 * blockDim.x); every block reads
//     this same range, since indexing uses threadIdx.x only;
//   - output writable for gridDim.x floats;
//   - timer writable for 2 * gridDim.x entries: timer[b] gets block b's
//     start stamp and timer[b + gridDim.x] its end stamp.
__global__ static void timedReduction(const float *input, float *output, clock_t *timer)
{
    extern __shared__ float scratch[];

    const int lane = threadIdx.x;
    const int block = blockIdx.x;

    // Thread 0 of each block records the start timestamp.
    if (lane == 0)
    {
        timer[block] = clock();
    }

    // Stage two input elements per thread into shared memory.
    scratch[lane] = input[lane];
    scratch[lane + blockDim.x] = input[lane + blockDim.x];

    // Halve the active span each pass, keeping the smaller value of each pair.
    int span = blockDim.x;
    while (span > 0)
    {
        // Barrier sits outside the conditional so every thread reaches it.
        __syncthreads();

        if (lane < span)
        {
            const float lhs = scratch[lane];
            const float rhs = scratch[lane + span];

            if (rhs < lhs)
            {
                scratch[lane] = rhs;
            }
        }

        span /= 2;
    }

    // Thread 0 publishes the block's minimum.
    if (lane == 0)
    {
        output[block] = scratch[0];
    }

    __syncthreads();

    // Thread 0 records the end timestamp in the second half of timer.
    if (lane == 0)
    {
        timer[block + gridDim.x] = clock();
    }
}
// Host driver for timedReduction.
//
// Selects a CUDA device, uploads NUM_THREADS * 2 floats, launches
// timedReduction on NUM_BLOCKS blocks of NUM_THREADS threads, downloads the
// per-block clock stamps and prints the average elapsed clocks per block.
//
// argc/argv are forwarded to findCudaDevice() for device selection.
// Returns EXIT_SUCCESS; any failing CUDA call is reported through
// checkCudaErrors().
int teste(int argc, char** argv) {
    printf("CUDA Clock sample\n");

    // This will pick the best possible CUDA capable device. The returned
    // device id is not needed afterwards, so it is not stored.
    findCudaDevice(argc, (const char **)argv);

    float *dinput = NULL;
    float *doutput = NULL;
    clock_t *dtimer = NULL;

    clock_t timer[NUM_BLOCKS * 2];
    float input[NUM_THREADS * 2];

    for (int i = 0; i < NUM_THREADS * 2; i++)
    {
        input[i] = (float)i;
    }

    // Allocate each device buffer exactly once; allocating dinput twice
    // would overwrite the pointer and leak the first allocation.
    checkCudaErrors(cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
    checkCudaErrors(cudaMalloc((void **)&doutput, sizeof(float) * NUM_BLOCKS));
    checkCudaErrors(cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));

    checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2, cudaMemcpyHostToDevice));

    // Third launch parameter: dynamic shared memory for 2 * NUM_THREADS floats.
    timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 *NUM_THREADS>>>(dinput, doutput, dtimer);
    // A kernel launch returns no status itself; query it explicitly so a
    // launch failure is reported here rather than at a later call.
    checkCudaErrors(cudaGetLastError());

    checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2, cudaMemcpyDeviceToHost));

    checkCudaErrors(cudaFree(dinput));
    checkCudaErrors(cudaFree(doutput));
    checkCudaErrors(cudaFree(dtimer));

    // timer[i] is block i's start stamp, timer[i + NUM_BLOCKS] its end stamp.
    long double avgElapsedClocks = 0;

    for (int i = 0; i < NUM_BLOCKS; i++)
    {
        avgElapsedClocks += (long double) (timer[i + NUM_BLOCKS] - timer[i]);
    }

    avgElapsedClocks = avgElapsedClocks/NUM_BLOCKS;
    printf("Average clocks/block = %Lf\n", avgElapsedClocks);

    return EXIT_SUCCESS;
}

我的CMakeLists.txt看起来像这样：

# Build script for the "helloworld" executable, which mixes a C++ source
# (teste.cpp) with a CUDA source (hello_world.cu).
cmake_minimum_required(VERSION 2.8)
# Host compiler nvcc should use; set before find_package(CUDA) is called.
set(CUDA_HOST_COMPILER /usr/bin/g++-4.9)
# Locate the CUDA toolkit: QUIET suppresses informational output, REQUIRED
# aborts configuration when the toolkit is not found.
find_package(CUDA QUIET REQUIRED)
# Pass options to NVCC
# The existing flags are kept and -O3, -std=c++11 and -g are appended.
# NOTE(review): no -arch / -gencode flag appears in this list, so the target
# GPU architecture is left to nvcc's default — confirm that this matches the
# compute capability of the GPU the program runs on.
set(
    CUDA_NVCC_FLAGS
    ${CUDA_NVCC_FLAGS};
    -O3 -std=c++11 -g 
    )
# For compilation ...
# Specify target & source files to compile it from
# cuda_add_executable builds the target "helloworld" from the listed sources.
cuda_add_executable(
    helloworld
    teste.cpp
    hello_world.cu
)

代码可以编译通过，但当我运行它时，得到以下输出：

CUDA Clock sample
GPU Device 0: "GeForce GTX 950M" with compute capability 5.0
CUDA error at /home/cesar/Documents/cuda_teste/hello_world.cu:69 code=30(cudaErrorUnknown) "cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2)"

为什么使用 cmake 构建时会出现这个错误？如果我进入示例目录直接运行 "clock" 示例，一切正常。那么问题是否出在我的 CMakeLists.txt 上？

我设法找到了解决方案。

在我的 CMakeLists.txt 中，我需要给 NVCC 添加 "-arch=sm_50" 标志。之所以是 sm_50，是因为我的显卡计算能力为 5.0；如果有人遇到同样的错误并想尝试这个方法，必须先查清自己显卡的计算能力，再使用对应的 sm_XX 值。