CUDA:用全局线程索引访问共享内存数组不起作用

CUDA: Using a global thread index with shared memory won't work

本文关键字:全局 线程 索引 不起作用 内存 共享 CUDA      更新时间:2023-10-16

有人能解释一下,为什么当我的共享内存指针数组 TMS 访问第 0 个以外的索引时(发生在最后一行),我的内核就无法工作吗?如果在最后一行使用 TMS[0],一切都按预期工作;而当我把 TMS[0] 改成任何其他索引时,就会得到一个 CUDA 意外错误。假设每个块有 64 个线程。

#include <stdio.h>
#include <stdlib.h>
#include "cuda.h"
#include <cuda_runtime.h>
#include <curand_kernel.h>
// Reduced reproduction from the question: fills a shared array, builds a
// shared table of pointers into it, then copies the value behind TMS[1] to
// masterForces[globalIdx].
// NOTE(review): as posted this snippet does not compile -- the globalIdx
// initializer is elided ("...") and the first store below spells the index
// `globalIDx` (capital D), which does not match the declared `globalIdx`.
__global__ void myKern(float *masterForces)
{
    int globalIdx = ...// set global thread id
    // 64-entry scratch array in shared memory.
    volatile __shared__ float uniques[64];
    {
        // NOTE(review): if globalIdx is a grid-wide index (its definition is
        // elided above -- confirm), every thread with an index >= 64 stores
        // past the end of this 64-entry array.
        uniques[globalIDx] = 0;
    }
    // Barrier between the stores above and the pointer-table setup below.
    __syncthreads();

    // 64-entry shared table of pointers to volatile float.
    volatile __shared__ float *TMS[64]; 
    {
       // Same indexing concern as above: both the table slot and the
       // address taken are selected by globalIdx.
       TMS[globalIdx] = (&uniques[globalIdx]);
    }
    // Barrier before any thread dereferences a slot written by another thread.
    __syncthreads();
    // Every thread reads slot 1 of the table, not its own slot.
    masterForces[globalIdx] = *TMS[1];
}

如果你感兴趣,下面是我的原始代码:(解决我的问题其实并不需要看这部分)

#include <stdio.h>
#include <stdlib.h>
#include "cuda.h"
#include "curand.h"
#include <cuda_runtime.h>
#include "math.h"
#include <curand_kernel.h>
#include <time.h>

// Full kernel from the question.
//
// Parameters, as used by the visible code:
//   transMatrix   - read at 51 fixed offsets (0 .. 2056) to fill `uniques`.
//   pointerMatrix - read at indices 0 .. 2591; pointerMatrix[t] selects which
//                   `uniques` slot TMS[t] points at.
//   masterForces  - output; written at globalIdx + r_max * i for each i in
//                   [0, 160000).
//   rands         - rands[globalIdx] is passed as the first (seed) argument
//                   of curand_init.
//   r_max         - multiplier applied to the iteration counter when
//                   indexing masterForces (presumably the total number of
//                   threads -- confirm against the launch code).
//
// NOTE(review): brace structure. The `{` that opens the `for t` loop is not
// closed before the __syncthreads() that follows the inner `for m` loop; it
// is only closed by the second-to-last `}` of this block. As written, that
// __syncthreads(), all declarations after it and the whole 160000-iteration
// loop are therefore inside the body of the `for t` loop. On the first pass
// (t == 0) only TMS[0] has been assigned, yet the code below already
// dereferences other TMS slots (e.g. *TMS[1]).
__global__ void myKern(const float *transMatrix, const int *pointerMatrix, float *masterForces, const double *rands, const int r_max)
{


// Flattened thread id: (flattened 2D block index) * (threads per 2D block)
// + (flattened 2D thread index inside the block).
int globalIdx = ((blockIdx.x + (blockIdx.y * gridDim.x)) * (blockDim.x * blockDim.y)) + (threadIdx.x + (threadIdx.y * blockDim.x));
// 51-entry shared table of values gathered from transMatrix. There is no
// thread-index guard on the stores below, so every thread that runs this
// code writes all 51 entries with the same values.
volatile __shared__ float uniques[51];
uniques[0] = transMatrix[0]; uniques[1] = transMatrix[1]; uniques[2] = transMatrix[2]; // 1
uniques[3] = transMatrix[3]; uniques[4] = transMatrix[4]; uniques[5] = transMatrix[12]; // 2
uniques[6] = transMatrix[14]; uniques[7] = transMatrix[15]; uniques[8] = transMatrix[24]; // 3
uniques[9] = transMatrix[26]; uniques[10] = transMatrix[27]; uniques[11] = transMatrix[28]; // 4
uniques[12] = transMatrix[40]; uniques[13] = transMatrix[50]; uniques[14] = transMatrix[60]; // 5
uniques[15] = transMatrix[62]; uniques[16] = transMatrix[146]; uniques[17] = transMatrix[156]; // 6
uniques[18] = transMatrix[158]; uniques[19] = transMatrix[168]; uniques[20] = transMatrix[170]; // 7
uniques[21] = transMatrix[172]; uniques[22] = transMatrix[184]; uniques[23] = transMatrix[290]; // 8
uniques[24] = transMatrix[300]; uniques[25] = transMatrix[302]; uniques[26] = transMatrix[312]; // 9
uniques[27] = transMatrix[314]; uniques[28] = transMatrix[316]; uniques[29] = transMatrix[328]; // 10
uniques[30] = transMatrix[1010]; uniques[31] = transMatrix[1020]; uniques[32] = transMatrix[1022]; // 11
uniques[33] = transMatrix[1032]; uniques[34] = transMatrix[1034]; uniques[35] = transMatrix[1036]; // 12
uniques[36] = transMatrix[1048]; uniques[37] = transMatrix[1154]; uniques[38] = transMatrix[1164]; // 13
uniques[39] = transMatrix[1166]; uniques[40] = transMatrix[1176]; uniques[41] = transMatrix[1178]; // 14
uniques[42] = transMatrix[1180]; uniques[43] = transMatrix[1192]; uniques[44] = transMatrix[2018]; // 15
uniques[45] = transMatrix[2028]; uniques[46] = transMatrix[2030]; uniques[47] = transMatrix[2040]; // 16
uniques[48] = transMatrix[2042]; uniques[49] = transMatrix[2044]; uniques[50] = transMatrix[2056]; // 17
__syncthreads();

// 2592-entry shared table of pointers into `uniques`. Each TMS[t] is set to
// &uniques[m] for the m in [0, 51) that equals pointerMatrix[t]; if no m
// matches, TMS[t] is left unassigned. Every thread runs the full search.
volatile __shared__ float *TMS[2592]; 
  for (int t=0; t<2592; t++)    
  {
    for (int m=0; m< 51; m++){
       if (pointerMatrix[t] == m)
       {
        TMS[t] = (&uniques[m]);
       }
  }
// NOTE(review): the `}` just above closes the `for m` loop, not the `for t`
// loop, so this barrier and everything below still belong to the `for t` body.
__syncthreads();

// 24 integer state variables b0 .. y0, all starting at 0.
int b0 = 0;
int c0 = 0;
int d0 = 0;
int e0 = 0;
int f0 = 0;
int g0 = 0;
int h0 = 0;
int i0 = 0;
int j0 = 0;
int k0 = 0;
int l0 = 0;
int m0 = 0;
int n0 = 0;
int o0 = 0;
int p0 = 0;
int q0 = 0;
int r0 = 0;
int s0 = 0;
int t0 = 0;
int u0 = 0;
int v0 = 0;
int w0 = 0;
int x0 = 0;
int y0 = 0;


int index = 0;
float r = 0.0;
float temp = 0;
// RUsnapshot is assigned in every section below but never read in this kernel.
int RUsnapshot = 0; 
int leftsnap = 0;
// Per-thread cuRAND state; rands[globalIdx] is the seed argument, the
// sequence and offset arguments are both 0.
curandState s;
curand_init (rands[globalIdx] , 0, 0, &s);

for (int i =0; i < 160000; i++) //@@@depends on iterations @@@@@
{
        // Each of the 24 sections below (b0 .. y0) has the same shape:
        //   1. draw r with curand_uniform;
        //   2. index = (((left * 6 + right) * 6 + self) * 2) * 6, where
        //      `left` is leftsnap (the variable updated by the previous
        //      section; the literal 0 for b0), `right` is the next variable
        //      (the literal 0 for y0) and `self` is the variable being updated;
        //   3. add to the variable a sum of three terms chosen by comparing r
        //      against *TMS[index], *TMS[index + 2] and *TMS[index + 4], each
        //      term multiplied by *TMS[index + 1], + 3 and + 5 respectively;
        //   4. store the updated variable in leftsnap for the next section.
        // The right-hand side is float arithmetic; the result is converted
        // back to int when it is stored in the int variable.
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);
            RUsnapshot = b0;
            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((0 * 6 + c0) * 6  + b0) * 2) * 6) ;
            b0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;
            leftsnap = b0;
        ///////////////////////////////////////////////////////////    
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);
            RUsnapshot = c0;
            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + d0) * 6  + c0) * 2) * 6) ;
            c0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;
            leftsnap = c0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);
            RUsnapshot = d0;
            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + e0) * 6  + d0) * 2) * 6) ;
            d0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;
            leftsnap = d0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);
            RUsnapshot = e0;
            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + f0) * 6  + e0) * 2) * 6) ;
            e0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;
            leftsnap = e0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);
            RUsnapshot = f0;
            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + g0) * 6  + f0) * 2) * 6) ;
            f0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;
            leftsnap = f0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);
            RUsnapshot = g0;
            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + h0) * 6  + g0) * 2) * 6) ;
            g0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;
            leftsnap = g0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);
            RUsnapshot = h0;
            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + i0) * 6  + h0) * 2) * 6) ;
            h0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;
            leftsnap = h0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);
            RUsnapshot = i0;
            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + j0) * 6  + i0) * 2) * 6) ;
            i0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;
            leftsnap = i0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);
            RUsnapshot = j0;
            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + k0) * 6  + j0) * 2) * 6) ;
            j0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;
            leftsnap = j0;
        ///////////////////////////////////////////////////////////    
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);
            RUsnapshot = k0;
            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + l0) * 6  + k0) * 2) * 6) ;
            k0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;
            leftsnap = k0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);
            RUsnapshot = l0;
            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + m0) * 6  + l0) * 2) * 6) ;
            l0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;
            leftsnap = l0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);
            RUsnapshot = m0;
            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + n0) * 6  + m0) * 2) * 6) ;
            m0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;
            leftsnap = m0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);
            RUsnapshot = n0;
            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + o0) * 6  + n0) * 2) * 6) ;
            n0 += ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;
            leftsnap = n0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);
            RUsnapshot = o0;
            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + p0) * 6  + o0) * 2) * 6) ;
            o0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;
            leftsnap = o0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);
            RUsnapshot = p0;
            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + q0) * 6  + p0) * 2) * 6) ;
            p0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;
            leftsnap = p0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);
            RUsnapshot = q0;
            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + r0) * 6  + q0) * 2) * 6) ;
            q0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;
            leftsnap = q0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);
            RUsnapshot = r0;
            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + s0) * 6  + r0) * 2) * 6) ;
            r0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;
            leftsnap = r0;
        ///////////////////////////////////////////////////////////    
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);
            RUsnapshot = s0;
            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + t0) * 6  + s0) * 2) * 6) ;
            s0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;
            leftsnap = s0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);
            RUsnapshot = t0;
            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + u0) * 6  + t0) * 2) * 6) ;
            t0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;
            leftsnap = t0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);
            RUsnapshot =u0;
            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + v0) * 6  + u0) * 2) * 6) ;
            u0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;
            leftsnap =u0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);
            RUsnapshot = v0;
            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + w0) * 6  + v0) * 2) * 6) ;
            v0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;
            leftsnap = v0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);
            RUsnapshot = w0;
            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + x0) * 6  + w0) * 2) * 6) ;
            w0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;
            leftsnap = w0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);
            RUsnapshot = x0;
            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + y0) * 6  + x0) * 2) * 6) ;
            x0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;
            leftsnap = x0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);
            RUsnapshot = y0;
            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + 0) * 6  + y0) * 2) * 6) ;
            y0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

        ///////////////////////////////////////////////////////////   



            // temp = how many of the 24 state variables currently equal 4 or 5.
            temp = (b0 ==4) + (b0 ==5) + (c0 ==4) + (c0 ==5) + (d0 ==4) + (d0 ==5) + (e0 ==4) + (e0 ==5) + (f0 ==4) + (f0 ==5) + 
                   (g0 ==4) + (g0 ==5) + (h0 ==4) + (h0 ==5) + (i0 ==4) + (i0 ==5) + (j0 ==4) + (j0 ==5) + (k0 ==4) + (k0 ==5) + 
                   (l0 ==4) + (l0 ==5) + (m0 ==4) + (m0 ==5) + (n0 ==4) + (n0 ==5) + (o0 ==4) + (o0 ==5) + (p0 ==4) + (p0 ==5) + 
                   (q0 ==4) + (q0 ==5) + (r0 ==4) + (r0 ==5) + (s0 ==4) + (s0 ==5) + (t0 ==4) + (t0 ==5) + (u0 ==4) + (u0 ==5) + 
                   (v0 ==4) + (v0 ==5) + (w0 ==4) + (w0 ==5) + (x0 ==4) + (x0 ==5) + (y0 ==4) + (y0 ==5);

        // NOTE(review): the value stored is *TMS[1], not temp; temp is reset
        // right after without being written anywhere. This is the line the
        // question refers to as failing for any TMS index other than 0.
        masterForces[globalIdx + (r_max * i)] = *TMS[1]; 
        temp = 0.0;
}

// Closes the `for t` loop opened before the first use of TMS (see header note).
}
}

假设您的全局线程索引(globalIdx)实际上是一个全局线程索引,例如:

int globalIdx = threadIdx.x+blockDim.x*blockIdx.x; // e.g. for 1D grid/threadblock

那么当你的全局线程索引超过63时,你不能索引到64项的共享内存数组:

    uniques[globalIDx] = 0;

   TMS[globalIdx] = (&uniques[globalIdx]);

如果只使用线程索引:

    uniques[threadIdx.x] = 0;

并对代码中其他类似的地方做同样的修改,那么在每个块有 64 个线程的前提下,对共享内存数组的索引至少不会越界。

下面是一个完整的例子:

$ cat t463.cu
#include <stdio.h>
#define DSIZE 128
#define nTPB 64
// Working version of the reduced kernel: the shared arrays are indexed with
// the block-local thread index, the output with the grid-wide index.
//
// Launch layout: 1D grid of 1D blocks. Both shared arrays hold nTPB entries
// and are indexed by threadIdx.x, so blockDim.x must not exceed nTPB. Every
// thread reads TMS[1], which is written by the thread with threadIdx.x == 1,
// so a block needs at least two threads.
// masterForces must hold at least gridDim.x * blockDim.x floats.
__global__ void myKern(float *masterForces)
{
    // Block-local scratch values and a table of pointers into them.
    volatile __shared__ float uniques[nTPB];
    volatile __shared__ float *TMS[nTPB];

    const int localIdx = threadIdx.x;
    const int globalIdx = blockDim.x * blockIdx.x + localIdx;

    // Each thread clears its own slot ...
    uniques[localIdx] = 0;
    __syncthreads();

    // ... then publishes the address of that slot in the pointer table.
    TMS[localIdx] = uniques + localIdx;
    // All table entries must be visible before another thread's entry is read.
    __syncthreads();

    masterForces[globalIdx] = *TMS[1];
}
// Returns true when `err` is cudaSuccess; otherwise reports which step
// (`what`) failed on stderr and returns false.
static bool cudaOk(cudaError_t err, const char *what)
{
  if (err == cudaSuccess) return true;
  fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
  return false;
}

// Fills the host buffer with 1.0f, runs myKern over the device buffer and
// checks that every element came back as 0.0f.
// Returns 0 on success, 1 on any CUDA error or data mismatch.
static int runCheck(float *h_data, float *d_data, size_t bytes)
{
  for (int i = 0; i < DSIZE; i++)
    h_data[i] = 1.0f;
  if (!cudaOk(cudaMemcpy(d_data, h_data, bytes, cudaMemcpyHostToDevice), "cudaMemcpy (host to device)"))
    return 1;

  // DSIZE/nTPB blocks of nTPB threads: one thread per element.
  myKern<<<DSIZE/nTPB, nTPB>>>(d_data);
  // A kernel launch returns nothing; launch errors are picked up here.
  if (!cudaOk(cudaGetLastError(), "myKern launch"))
    return 1;
  // The blocking copy also surfaces errors raised while the kernel ran.
  if (!cudaOk(cudaMemcpy(h_data, d_data, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy (device to host)"))
    return 1;

  for (int i = 0; i < DSIZE; i++)
    if (h_data[i] != 0.0f) {printf("mismatch at %d, was: %f should be: %f\n", i, h_data[i], 0.0f); return 1;}
  printf("Success\n");
  return 0;
}

// Host driver: allocates the host and device buffers, runs the check and
// releases both buffers on every path.
int main(){
  const size_t bytes = DSIZE*sizeof(float);
  float *d_data = NULL;
  float *h_data = (float *)malloc(bytes);
  if (h_data == NULL) {
    fprintf(stderr, "malloc failed\n");
    return 1;
  }

  int status = 1;
  if (cudaOk(cudaMalloc(&d_data, bytes), "cudaMalloc")) {
    status = runCheck(h_data, d_data, bytes);
    cudaFree(d_data);
  }
  free(h_data);
  return status;
}
$ nvcc -arch=sm_20 -o t463 t463.cu
$ cuda-memcheck ./t463
========= CUDA-MEMCHECK
Success
========= ERROR SUMMARY: 0 errors
$

今后提这类问题时,请提供一个能够复现问题的完整、可运行的示例。这是 SO 的要求;如果不提供,问题可能会因此被关闭。

正如 Robert Crovella 所演示的那样,您的示例代码本身不应该产生任何错误。我看了您的原始代码,发现第一个 "for" 循环的花括号没有配对:外层 for 循环的 "{" 在其后的 __syncthreads() 之前没有闭合,因此后面的全部代码都落在了这个循环体内。祝你好运…