cudaDeviceSynchronize(）错误代码77:cudaErrorIllegalAddress

cudaDeviceSynchronize() error code 77: cudaErrorIllegalAddress

本文关键字：cudaErrorIllegalAddress 错误代码 cudaDeviceSynchronize 更新时间：2023-10-16

非常感谢您阅读我的线程。

我正在做CUDA的工作，但一直得到cudaDeviceSynchronize（）错误代码77:cudaErrorIllegalAddress，不知道为什么。我搜索了代码和函数，令人惊讶的是，只出现了几条记录。很奇怪。

我基本上总结了图像的所有像素。为了让我的问题有尽可能多的参考，我在这里展示了我所有的CUDA代码：

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "thorcalgpu.h"
#include <stdio.h>
#include "math.h"
#include <vector>
#include <algorithm>
#include <stdlib.h>
#include <stdio.h>
#include <vector>
#include <numeric>
#include <iostream>
using namespace std;
float random_float(void)
{
  return static_cast<float>(rand()) / RAND_MAX;
}

__global__ void reduceSum(unsigned short *input,
                          unsigned long long *per_block_results,
                          const int n)
{
    extern __shared__ unsigned long long sdata[];
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    // load input into __shared__ memory
    unsigned short x = 0;
    if(i < n)
    {
        x = input[i];
    }
    sdata[threadIdx.x] = x;
    __syncthreads();
    // contiguous range pattern
    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
    {
        if(threadIdx.x < offset)
        {
            // add a partial sum upstream to our own
            sdata[threadIdx.x] += sdata[threadIdx.x + offset];
        }
        // wait until all threads in the block have
        // updated their partial sums
        __syncthreads();
    }
    // thread 0 writes the final result
    if(threadIdx.x == 0)
    {
        per_block_results[blockIdx.x] = sdata[0];
    }
}
// Helper function for using CUDA to add vectors in parallel.
//template <class T>
cudaError_t gpuWrapper(float *mean,  int N,  vector<string> filelist)
{
    int size = N*N;
    unsigned long long* dev_sum = 0;
    unsigned short* dev_img = 0;
    cudaError_t cudaStatus;
    const int block_size = 512;
    const int num_blocks = (size/block_size) + ((size%block_size) ? 1 : 0);
    int L = filelist.size();
    // Choose which GPU to run on, change this on a multi-GPU system.
    double totalgpuinittime = 0;
    StartCounter(7);
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) 
    {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }
    // Allocate GPU buffers for three vectors (two input, one output)    .
    cudaStatus = cudaMalloc((void**)&dev_img, size * sizeof(unsigned short));
    if (cudaStatus != cudaSuccess) 
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_sum, num_blocks*sizeof(unsigned long long));
    if (cudaStatus != cudaSuccess) 
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    totalgpuinittime = GetCounter(7);
    unsigned short* img;
    unsigned short* pimg;
    unsigned long long* sum = new unsigned long long[num_blocks];
    unsigned long long* psum = sum;
    cout<<endl;
    cout << "gpu looping starts, and in progress ..." << endl;
    StartCounter(6);
    double totalfileiotime = 0;
    double totalh2dcpytime = 0;
    double totalkerneltime = 0;
    double totald2hcpytime = 0;
    double totalcpusumtime = 0;
    double totalloopingtime = 0;
    for (int k = 0; k < L; k++)
    {
        StartCounter(1);
        img = (unsigned short*)LoadTIFF(filelist[k].c_str());
        totalfileiotime += GetCounter(1);
        psum = sum;
        pimg = img;
        float gpumean = 0;
        memset(psum, 0, sizeof(unsigned long long)*num_blocks);
        StartCounter(2);
        // Copy input vectors from host memory to GPU buffers.
        cudaStatus = cudaMemcpy(dev_img, pimg, size * sizeof(unsigned short), cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess) 
        {
            fprintf(stderr, "cudaMemcpy failed!");
            goto Error;
        }
        cudaStatus = cudaMemcpy(dev_sum, psum, num_blocks*sizeof(unsigned long long), cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess) 
        {
            fprintf(stderr, "cudaMemcpy failed!");
            goto Error;
        }   
        totalh2dcpytime += GetCounter(2);
        StartCounter(3);
        //reduceSum<<<num_blocks,block_size,num_blocks * sizeof(unsigned long long)>>>(dev_img, dev_sum, size);
         //reduceSum<<<num_blocks,block_size,block_size * sizeof(unsigned short)>>>(dev_img, dev_sum, size);
          reduceSum<<<num_blocks,block_size>>>(dev_img, dev_sum, size);
        totalkerneltime += GetCounter(3);
      // Check for any errors launching the kernel
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess) 
        {
            fprintf(stderr, "reduction Kernel launch failed: %sn", cudaGetErrorString(cudaStatus));
            goto Error;
        }
        // cudaDeviceSynchronize waits for the kernel to finish, and returns
        // any errors encountered during the launch.
                // !!!!!! following is where the code 77 error occurs!!!!!!!
        cudaStatus = cudaDeviceSynchronize();
        if (cudaStatus != cudaSuccess) 
        {
            fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!n", cudaStatus);
            goto Error;
        }
        // Copy output vector from GPU buffer to host memory.
        StartCounter(4);
        cudaStatus = cudaMemcpy(psum, dev_sum, num_blocks * sizeof(unsigned long long ), cudaMemcpyDeviceToHost);
        if (cudaStatus != cudaSuccess) 
        {
            fprintf(stderr, "cudaMemcpy failed!");
            goto Error;
        }
        totald2hcpytime += GetCounter(4);
        StartCounter(5);
        for (int i = 0; i < num_blocks; i++)
        {
            gpumean += *psum;
            psum++;
        }
        gpumean /= N*N;
        totalcpusumtime += GetCounter(5);
        delete img; 
        img = NULL;
        cout<<gpumean<<endl;
    }
    int S = 1e+6;
    int F = filelist.size();
    float R = S/F;
    totalloopingtime = GetCounter(6);
    cout<<"gpu looping ends."<<endl<<endl;
    cout<< "analysis:"<<endl;
    cout<<"gpu initialization time: "<<totalgpuinittime<<" sec"<<endl<<endl;
    cout<<"file I/O time: "<<endl;
    cout<<" total "<<totalfileiotime<<" sec | average "<<totalfileiotime*R<<" usec/frame"<<endl<<endl;
    cout<<"host-to-device copy time: "<<endl;
    cout<<" total "<<totalh2dcpytime<<" sec | average "<<totalh2dcpytime*R<<" usec/frame"<<endl<<endl;
    cout<<"pure gpu kerneling time: "<<endl;
    cout<<" total "<<totalkerneltime<<" sec | average "<<totalkerneltime*R<<" usec/frame"<<endl<<endl;
    cout<<"device-to-host copy time: "<<endl;
    cout<<" total "<<totald2hcpytime<<" sec | average "<<totald2hcpytime*R<<" usec/frame"<<endl<<endl;
    /*cout<<"cpu summing time: "<<endl;
    cout<<" total: "<<totalcpusumtime<<" sec | average: "<<totalcpusumtime*R<<" usec/frame"<<endl<<endl;;*/
    /*cout <<"gpu looping time: " << endl;
    cout<<" total: "<<totalloopingtime<<" sec | average: "<<totalloopingtime*R<<" usec/frame"<<endl;*/

Error:
    cudaFree(dev_sum);
    cudaFree(dev_img);
    delete sum;
    sum = NULL;
    return cudaStatus;
}
void kernel(float* &mean, int N, vector<string> filelist)
{
    // wrapper and kernel
    cudaError_t cudaStatus = gpuWrapper(mean, N,  filelist);
    if (cudaStatus != cudaSuccess) 
    {
        fprintf(stderr, "gpuWapper failed!");
    }
   // printf("mean is: %fn", mean);
    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    StartCounter(8);
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) 
    {
        fprintf(stderr, "cudaDeviceReset failed!");
    }
    cout<<"gpu reset time: "<<GetCounter(8)<<" sec"<<endl<<endl;
    //return *mean;
}

我已经为主机和设备内存分配了足够且等效的内存空间。欢迎发表任何意见。

虽然这可能不是代码中唯一的错误源，但您没有为reduction内核分配任何动态共享内存，从而导致您看到的非法寻址错误。正确的内核启动应该类似

size_t shm_size = block_size * sizeof(unsigned long long);
reduceSum<<<num_blocks,block_size,shm_size>>>(dev_img, dev_sum, size);

这会为reduction内核中运行的每个线程分配相当于一个无符号长的线程，这（通过我对代码的粗略阅读）应该会使共享内存数组sdata成为内核运行的正确大小，而不会对该数组进行越界访问。

没有找到相关文章