与内存有关的莫名其妙的CUDA行为

inexplicable cuda behavior related to memory

本文关键字：莫名其妙行为 CUDA 内存更新时间：2023-10-16

因此，基本上我将C 代码（正常工作）重写为CUDA（我没有CUDA的经验）。代码（solve（）方法）的一部分无法正常工作，我真的不知道为什么。

所以我的问题是在cudamcpy期间的"未指定的启动失败"错误是什么，为什么在我的代码中发生。

我的第二个问题是为什么变量backup_ans和ANS计算同一件事时会有所不同？

#include "stdio.h"
#include <algorithm>
__device__ unsigned int primes[1024];
__device__ long long n = 1ll<<32; // #unsigned_integers


__device__ int hashh(long long x) {
      return (x>>1)%1024;
}
// compute (x^e)%n
__device__ unsigned long long mulmod(unsigned long long x,unsigned long long e,unsigned long long n) {
    unsigned long long ans = 1;
    while(e>0) {
        if(e&1) ans = (ans*x)%n;
        x = (x*x)%n;
        e>>=1;
    }
    return ans;
}
// determine whether n is strong probable prime base a or not.
// n is ODD
__device__ int is_SPRP(unsigned long long a,unsigned long long n) {
  int d=0;
  unsigned long long t = n-1;
  while(t%2==0) {
      ++d;
      t>>=1;
  }
  unsigned long long x = mulmod(a,t,n);
  if(x==1) return 1; 
  for(int i=0;i<d;++i) {
      if(x==n-1) return 1;
      x=(x*x)%n;
  }
  return 0;
}
__device__ int prime(long long x) {
        return is_SPRP((unsigned long long)primes[(((long long)0xAFF7B4*x)>>7)%1024],(unsigned long long)x);
}
// copy all unsigned COMPOSITE ingeters which are not congruent to zero modulo 2,3,5,7 and their hashh value = 0; 
// count of those elements store in c
// 335545 is just magic constant to distribute all integers equally on all 400*32 threads
__global__ void find(unsigned int *out,unsigned int *c) {
    unsigned int buff[4096];
    int local_c = 0;
    long long b = 121+(threadIdx.x+blockIdx.x*blockDim.x)*335545;
    long long e = b+335545;
    if(b%2==0) ++b;
    for(long long i=b;i<e && i<n;i+=2) {
        if(i%3==0 || i%5==0 || i%7==0 || prime(i)) continue;
        if(hashh(i)==0) {
            buff[local_c++]=(unsigned int)i;
            if(local_c==4096) {
                int start = atomicAdd(c,local_c);
                for(int i=0;i<local_c;++i) out[i+start]=buff[i];
                local_c=0;
            }
        }
    }
    int start = atomicAdd(c,local_c);
    for(int i=0;i<local_c;++i) out[i+start]=buff[i];
}
// find base for which all elements in input are NOT SPRP. base is from {2,..,34} stored in 32bit uint
__global__ void solve(unsigned int *input, unsigned int *count,unsigned int *backup, unsigned int *ans) {
      __shared__ unsigned int s[32];
    unsigned int dif = (*count)/(blockDim.x*gridDim.x) +1;
    unsigned int b = (threadIdx.x+blockIdx.x*blockDim.x)*dif;
    unsigned int e = b+dif>(*count)?(*count):b+dif;
    unsigned int mysol = 0;
    for(long long i = 2; i<33; ++i) {
          int sol = 1;
          // each thread doing its part
          for(unsigned int j = b; j<e ; ++j) {
              //is some element is sprp base i break
              if(is_SPRP((unsigned long long)i,(unsigned long long)input[j])!=0) {
              sol=0;
              break;
              }
          }
          // if all elements passed store base to mysol
          if(sol==1) mysol|=1<<(i-2);
    }
    s[threadIdx.x] = mysol;
    // save thread_result
    backup[threadIdx.x+blockDim.x*blockIdx.x] = mysol;
    __syncthreads();
    // compute global resulte and store it to ans
    if(threadIdx.x==0) {
          unsigned int global_sol = ~0;
          for(int i=0;i<blockDim.x;++i) global_sol&=s[i];
          atomicAnd(ans,global_sol);
    }
}

int main(void) {
// number of blocks & thread for solve
const int blocks = 400;
const int threads = 32;
unsigned int prms[] = { 17, 11, 6, 60, 7, 13, 11, 34, 13, 2, 3, 37, 13, 11, 38, 2, 7, 105, 2, 7, 42, 11, 7, 3, 6, 15, 53, 44, 6, 6, 5, 15, 54, 7, 35, 10, 10, 15, 10, 10, 17, 17, 11, 10, 15, 43, 7, 5, 5, 3, 7, 43, 34, 2, 34, 2, 68, 53, 39, 10, 7, 6, 11, 2, 5, 2, 7, 2, 6, 5, 15, 40, 3, 5, 5, 2, 2, 10, 47, 13, 7, 43, 6, 7, 5, 6, 6, 13, 6, 35, 6, 15, 6, 13, 40, 10, 11, 2, 7, 2, 2, 3, 13, 3, 11, 15, 10, 5, 11, 14, 7, 11, 47, 5, 2, 2, 6, 2, 5, 55, 6, 5, 7, 2, 6, 58, 35, 11, 5, 12, 17, 6, 10, 12, 6, 6, 2, 53, 2, 2, 13, 5, 14, 7, 15, 6, 13, 62, 10, 6, 3, 7, 7, 3, 14, 5, 14, 73, 15, 11, 11, 6, 5, 17, 10, 5, 3, 37, 51, 10, 7, 5, 38, 12, 5, 11, 5, 7, 6, 5, 6, 40, 43, 57, 10, 13, 7, 15, 2, 10, 34, 7, 39, 10, 5, 3, 6, 13, 11, 5, 10, 43, 10, 5, 3, 14, 5, 2, 5, 41, 5, 39, 46, 2, 10, 2, 5, 12, 3, 2, 2, 5, 15, 43, 17, 41, 2, 13, 15, 38, 11, 11, 3, 34, 5, 6, 3, 7, 2, 37, 5, 6, 10, 17, 35, 2, 15, 6, 7, 5, 3, 13, 13, 12, 34, 2, 12, 10, 15, 13, 2, 2, 34, 6, 6, 5, 2, 7, 13, 3, 6, 11, 39, 42, 7, 2, 6, 39, 47, 3, 17, 5, 13, 7, 2, 47, 3, 7, 6, 11, 17, 37, 48, 7, 37, 11, 7, 10, 3, 14, 39, 14, 15, 43, 17, 2, 12, 7, 13, 5, 3, 6, 34, 37, 3, 17, 13, 2, 5, 10, 10, 44, 37, 2, 2, 10, 10, 7, 3, 7, 2, 7, 5, 43, 43, 11, 15, 51, 13, 17, 10, 11, 2, 5, 34, 17, 2, 2, 42, 6, 6, 5, 47, 15, 2, 12, 7, 3, 10, 15, 3, 7, 12, 12, 15, 43, 14, 7, 58, 13, 10, 6, 6, 38, 34, 5, 5, 13, 38, 6, 11, 10, 6, 7, 2, 55, 2, 13, 5, 11, 44, 15, 17, 2, 40, 2, 15, 13, 6, 2, 3, 3, 3, 3, 6, 39, 5, 11, 17, 37, 5, 7, 6, 10, 6, 12, 7, 5, 14, 10, 12, 71, 10, 35, 6, 11, 3, 2, 38, 3, 2, 34, 10, 17, 42, 2, 12, 6, 6, 11, 40, 12, 10, 6, 10, 2, 3, 3, 56, 11, 7, 42, 2, 38, 12, 2, 2, 13, 40, 12, 6, 5, 5, 59, 15, 38, 5, 5, 5, 7, 2, 10, 7, 2, 17, 10, 11, 6, 6, 6, 2, 10, 6, 54, 2, 82, 3, 34, 14, 15, 44, 5, 46, 2, 13, 5, 12, 13, 11, 10, 39, 5, 40, 3, 60, 3, 42, 11, 3, 46, 17, 3, 2, 37, 6, 42, 12, 14, 3, 12, 66, 13, 34, 7, 3, 13, 3, 11, 2, 13, 12, 38, 34, 5, 40, 10, 14, 6, 14, 11, 38, 58, 2, 48, 5, 15, 5, 73, 3, 37, 5, 11, 10, 5, 5, 13, 2, 10, 13, 34, 17, 3, 7, 47, 2, 2, 10, 15, 3, 3, 13, 6, 34, 13, 10, 13, 3, 6, 41, 10, 6, 2, 6, 2, 6, 2, 6, 6, 37, 10, 44, 35, 13, 51, 2, 7, 53, 5, 40, 5, 2, 37, 11, 15, 11, 13, 2, 5, 2, 6, 10, 17, 15, 43, 39, 17, 2, 12, 10, 15, 17, 7, 13, 3, 7, 15, 37, 5, 15, 7, 6, 10, 51, 2, 2, 40, 61, 2, 13, 13, 11, 2, 5, 34, 5, 5, 7, 2, 2, 2, 11, 3, 6, 13, 6, 17, 11, 10, 7, 46, 15, 7, 14, 35, 11, 7, 10, 6, 11, 40, 11, 2, 39, 7, 6, 66, 5, 3, 6, 5, 11, 10, 2, 10, 7, 13, 2, 45, 34, 6, 35, 2, 11, 5, 59, 75, 10, 17, 14, 17, 17, 17, 2, 11, 7, 10, 6, 11, 6, 56, 34, 35, 11, 14, 12, 41, 40, 17, 40, 3, 11, 7, 37, 14, 7, 13, 7, 5, 2, 10, 6, 39, 2, 7, 37, 35, 10, 5, 15, 2, 7, 38, 34, 11, 17, 5, 6, 10, 3, 6, 7, 7, 43, 14, 2, 43, 3, 2, 47, 7, 35, 7, 3, 53, 2, 10, 10, 10, 60, 10, 6, 2, 6, 10, 5, 7, 57, 53, 13, 3, 35, 38, 15, 42, 3, 3, 12, 2, 10, 3, 38, 54, 13, 10, 11, 7, 13, 7, 2, 12, 39, 10, 54, 2, 12, 38, 10, 12, 12, 5, 15, 6, 10, 13, 5, 15, 10, 13, 6, 41, 40, 14, 12, 10, 11, 40, 5, 11, 10, 2, 5, 2, 13, 6, 2, 13, 5, 2, 10, 15, 5, 5, 10, 34, 13, 2, 5, 14, 5, 6, 5, 13, 3, 43, 6, 13, 11, 50, 3, 6, 6, 12, 15, 11, 37, 7, 69, 11, 14, 14, 7, 43, 5, 35, 11, 35, 11, 11, 34, 34, 39, 14, 11, 2, 10, 53, 6, 11, 2, 11, 60, 39, 11, 6, 15, 40, 17, 47, 34, 50, 7, 59, 47, 5, 13, 39, 5, 6, 53, 10, 14, 5, 51, 5, 7, 5, 6, 77, 7, 12, 7, 42, 2, 5, 2, 6, 60, 10, 13, 10, 6, 47, 6, 15, 17, 10, 11, 10, 12, 7, 7, 10, 17, 34, 5, 10, 7, 7, 2, 6, 10, 38, 2, 15, 6, 13, 7, 13, 2, 3, 13, 5, 3, 17, 2, 5, 15, 11, 39, 7, 39, 10, 10, 2, 6, 13, 3, 5, 17, 6, 14, 10, 37, 44, 3, 34, 5, 11, 7, 12, 2, 5, 3, 12, 3, 2, 3, 133, 12, 2, 2, 2, 3, 34, 14, 41, 2, 37, 11, 2, 6, 11, 6, 7, 15, 11, 35, 13, 6, 5, 2, 14, 7, 2 };
printf("primes_copy: %sn",cudaGetErrorString(cudaMemcpyToSymbol(primes,prms,1024*4)));
/*-----*/
// allocate buffers
unsigned int *dev_input,*dev_count;
printf("alloc_input: %sn",cudaGetErrorString(cudaMalloc((void**)&dev_input,sizeof(int)*(1<<23))));
printf("alloc_count: %sn",cudaGetErrorString(cudaMalloc((void**)&dev_count,4)));
printf("memset_count: %sn",cudaGetErrorString(cudaMemset(dev_count,0,4)));
find<<<400,32>>>(dev_input,dev_count);
cudaDeviceSynchronize();
unsigned int count;
printf("copy_count: %sn",cudaGetErrorString(cudaMemcpy(&count,dev_count,4,cudaMemcpyDeviceToHost)));
// sort found elements just to make debbug easier, it is not necessary
unsigned int *backup_numbers = new unsigned int[1000000];
printf("copy_backup: %sn",cudaGetErrorString(cudaMemcpy(backup_numbers,dev_input,4*count,cudaMemcpyDeviceToHost)));
std::sort(backup_numbers,backup_numbers+count);
printf("copy_S_backup: %sn",cudaGetErrorString(cudaMemcpy(dev_input,backup_numbers,4*count,cudaMemcpyHostToDevice)));
delete[] backup_numbers;
printf("nsize: %un",count);
// allocate buffers
unsigned int *dev_backup, *dev_ans;
printf("malloc_backup: %sn",cudaGetErrorString(cudaMalloc((void**)&dev_backup,sizeof(int)*blocks*threads)));
printf("malloc_ans: %sn",cudaGetErrorString(cudaMalloc((void**)&dev_ans,4)));
printf("memset_ans: %sn",cudaGetErrorString(cudaMemset(dev_ans,0xFF,4)));
solve<<<blocks,threads>>>(dev_input,dev_count,dev_backup,dev_ans);
cudaDeviceSynchronize();
unsigned int ans,*backup;
printf("memcpy_ans: %sn",cudaGetErrorString(cudaMemcpy(&ans,dev_ans,4,cudaMemcpyDeviceToHost)));
backup = new unsigned int[400*32];
printf("memcpy_backup: %sn",cudaGetErrorString(cudaMemcpy(backup,dev_backup,4*blocks*threads,cudaMemcpyDeviceToHost)));
unsigned int backup_ans = ~0;
// compute global result using backuped thread_results
// notice backup_ans and ans MUST be the same, but they are NOT (WHY!)
for(int i=0;i<threads*blocks;++i) backup_ans&=backup[i];
printf("answers: %unbackup_ans %un",ans,backup_ans);
printf("%un",backup[48]);
delete[] backup;
cudaFree(dev_ans);
cudaFree(dev_backup);
cudaFree(dev_count);
cudaFree(dev_input);
}

除了solve（）方法外，所有代码都按预期工作。solve（）方法只能计算废话（因为backup_ans和ans不同），它也给我带来了最后两个cudamcpy的"未指定启动失败"错误。当我运行求解时，我得到了

ans：134816642 backup_ans 432501552

但是，当我运行时解决＆lt;＆lt;＆lt; 400,32>>>（...）它给了我

ans：134816642 backup_ans 0（正确答案应为0）

在所有情况下，都应计算backup_ans = ans = 0

任何我做错的建议都会有所帮助。

生成Primes.bin的代码

#include <cstdlib>
#include <stdio.h>
using namespace std;
const unsigned long long n = 1ll<<32;
const int buffer_size = 2000000;
typedef unsigned char uch;
typedef unsigned int uint;
typedef unsigned long long ull;
uch *primes;
int prime(long long x) {
if(x==2) return 1;
if(x%2==0) return 0;
long long pos = x/16;
long long index = (x&15)>>1;
return (1<<index)&(~(primes[pos]));
}
void eratosten_sieve(void) {
  long long pos;
  long long index;
  for(long long i=3;i*i<n;++i) {
      if(!prime(i)) continue;
      for(long long j=i*i;j<n;j+=(i<<1)) {
      pos = j/16;
      index = ((j&15)>>1);
      primes[pos]|=(1<<index);
      }
  }
}
int main(void) {
primes = new uch[(n/16)+1];
for(long long i=0;i<(n/16)+1;++i) primes[i]=0;
printf("generatingn");
eratosten_sieve(); 
int l = n/16 +1;
printf("writingn");
FILE *f = fopen("primes.bin","wb");
fwrite(primes,1,l,f);
fclose(f);
printf("donen");
delete[] primes;
}

ps：我正在编辑NVCC -arch Compute_11

CUDA Driver Version / Runtime Version          5.5 / 5.5
CUDA Capability Major/Minor version number:    1.1
Total amount of global memory:                 1023 MBytes (1073020928 bytes)
(14) Multiprocessors, (  8) CUDA Cores/MP:     112 CUDA Cores
GPU Clock rate:                                1500 MHz (1.50 GHz)
Memory Clock rate:                             900 Mhz
Memory Bus Width:                              256-bit
Maximum Texture Dimension Size (x,y,z)         1D=(8192), 2D=(65536, 32768), 3D=(2048, 2048, 2048)
Maximum Layered 1D Texture Size, (num) layers  1D=(8192), 512 layers
Maximum Layered 2D Texture Size, (num) layers  2D=(8192, 8192), 512 layers
Total amount of constant memory:               65536 bytes
Total amount of shared memory per block:       16384 bytes
Total number of registers available per block: 8192
Warp size:                                     32
Maximum number of threads per multiprocessor:  768
Maximum number of threads per block:           512
Max dimension size of a thread block (x,y,z): (512, 512, 64)
Max dimension size of a grid size    (x,y,z): (65535, 65535, 1)
Maximum memory pitch:                          2147483647 bytes
Texture alignment:                             256 bytes
Concurrent copy and kernel execution:          Yes with 1 copy engine(s)
Run time limit on kernels:                     Yes
Integrated GPU sharing Host Memory:            No
Support host page-locked memory mapping:       Yes
Alignment requirement for Surfaces:            Yes
Device has ECC support:                        Disabled
Device supports Unified Addressing (UVA):      No
Device PCI Bus ID / PCI location ID:           1 / 0
Compute Mode:
  < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 5.5, CUDA Runtime Version = 5.5, NumDevs = 1, Device0 = GeForce 9800 GT
Result = PASS

好吧，您不在记忆中。我花了一段时间才弄清楚，因为我没有考虑大型静态分配：

__device__ unsigned char primes[(1<<28)+1];

通常，当人们不在记忆中时，他们会在cudaMalloc操作中发现它。就您而言，您的GPU具有1GB的内存，我猜您还在其中托管一个显示（您没有回答这个问题）。看看nvidia-smi -a输出中有多少免费内存，它看起来像这样：

FB Memory Usage
    Total                       : 1535 MiB
    Used                        : 3 MiB
    Free                        : 1532 MiB

您的数字会较小 - 自由线就是我们关心的。

您的动态分配（即cudaMalloc）分配了约350MB。但是内核发射将静态分配带入了播放，然后您的总足迹上升到700MB以上（2^28超过250MB）。如果您在该GPU上运行显示显示，它将消耗一些1GB的内存，使您不足以运行需要700MB的内核。

如果您想在该GPU上运行，请查看是否可以以某种方式将问题大小降低。

进行适当的CUDA错误检查总是很好的，但是除此问题外，您的代码似乎在具有更多内存的设备上没有错误。