内核中的"while"/"for"循环导致 CUDA 内存不足错误?

"while"/"for" loop in kernel causing CUDA out of memory error?

本文关键字:CUDA 内存不足 错误 for while 内核 循环      更新时间:2023-10-16

如果我改变while循环(见下面的内核,这是一个可怕的循环,你不会错过它)只迭代一次,它使用的GPU内存可以忽略不计。然而,当允许循环迭代50,000次时,GPU立即占用2.5 GB。即使使用"for"循环,这个问题仍然存在。有人可以提供一个解释,也许是一个解决方案,以防止内核使用这么多的内存?在我看来,这种行为非常不寻常。提前感谢!

#include <stdio.h>
#include <stdlib.h>
#include "cuda.h"
#include "curand.h"
#include <cuda_runtime.h>
#include "math.h"
#include <curand_kernel.h>
#include <time.h>
    __global__ void myKern(const float *transMatrix, float *masterForces, const double *rands, const int r_max)
    {

    const int iterationsx = 50000;
    const int RUsizex = 26;
    int threadsPerBlock = blockDim.x * blockDim.y;
    int blockId = blockIdx.x + (blockIdx.y * gridDim.x);
    int threadId = threadIdx.x + (threadIdx.y * blockDim.x); 
    int globalIdx = (blockId * threadsPerBlock) + threadId;
    int RU[RUsizex] = {0};
    int index = 0;
    float r = 0.0;
    double temp = 0;
    float forces[iterationsx] = {0.0};
    int left[RUsizex - 2] = {0};
    int right[RUsizex - 2] = {0};
    curandState s;
    curand_init (rands[globalIdx] , 0, 0, &s);
    int i= 0;
    while( i < iterationsx)
    {
            for(int k = 0; k < RUsizex - 2; k++)
            {
            left[k] = RU[k];
            right[k] = RU[k+2];
            }
            for(int j = 0; j < RUsizex -2; j++)
            {
                r = curand_uniform(&s);

                index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;

                RU[j + 1]= (RU[j + 1]) + ( r < transMatrix[index]) * (transMatrix[index + 1]) +
                    (! (r < transMatrix[index])) * ( r < transMatrix[index + 2]) * (transMatrix[index + 3]) +
                    (! ( r < transMatrix[index + 2])) * (r < transMatrix[index + 4]) * (transMatrix[index + 5]) ;

            }

            for(int z = 1; z < RUsizex - 1; z++)
            {
                temp = temp + (RU[z] ==4) + (RU[z] ==5);
            }
            forces[i] = temp/(24.0);
            temp = 0.0;
    i++;
    }

    for(int y = 0; y < iterationsx; y++)
    {
        masterForces[globalIdx + (r_max * y)] = forces[y]; 
    }


    }

变量float forces[iterationsx]是全局函数中的堆栈变量。这需要每个线程的堆栈预留大于200000b。CUDA驱动程序必须根据使用公式SmCount * MaxTheadsPerSm * (LocalMemoryPerThread + StackPerThread)的最大常驻线程分配本地内存。对于完整的GK110,这将是15 * 2048 * ~51KiB = 1.5 GiB。