在多个Nvidia GPU上使用以下代码的分段故障

Segmentation fault using the following code on more than one Nvidia GPU

本文关键字：代码故障分段 Nvidia GPU 更新时间：2023-10-16

当配置为在多个卡上运行时，以下代码会在一段时间后可靠地导致分段故障（我有一个4 GPU系统（NVIDIA Titan的））。这是我以前试图解决的问题的一个最小的例子（请参阅此处：从libcuda.so.1调用的__pthread_getspecific中的分段错误）

代码功能描述：

首先，它在每张卡上分配大量内存（在前N张卡上，其中N为1-4），其想法是尽可能地强调卡。然后它继续启动16个线程，每个线程在一个随机卡上运行几个内核。它重复几次，每次随机挑选一张牌，然后退出。主线程等待所有16个线程完成，然后无限期地重复该过程，或者直到其中一个线程由于某种原因失败。

在主线程的几次（通常在20次左右，但可能或多或少）迭代之后，它会导致分段错误。堆栈通常如下所示：

#0  0x00007f164a71f43c in ?? () from /usr/lib/libcuda.so
#1  0x00007f164a6bd1b5 in ?? () from /usr/lib/libcuda.so
#2  0x00007f164a5dff1a in ?? () from /usr/lib/libcuda.so
#3  0x00007f164a6c0b34 in ?? () from /usr/lib/libcuda.so
#4  0x00007f164a6c0c92 in ?? () from /usr/lib/libcuda.so
#5  0x00007f164a5e009f in ?? () from /usr/lib/libcuda.so
#6  0x00007f164a5d03c0 in ?? () from /usr/lib/libcuda.so
#7  0x00007f164a5c43bf in ?? () from /usr/lib/libcuda.so
#8  0x00007f164c131c39 in ?? () from /usr/local/cuda-5.5/lib64/libcudart.so.5.5
#9  0x00007f164c152879 in cudaDeviceSynchronize () from /usr/local/cuda-5.5/lib64/libcudart.so.5.5
#10 0x0000000000401911 in TestCUDA(int, unsigned int, unsigned int, unsigned int const*, unsigned short*) ()
#11 0x00000000004012b0 in main (argc=0, argv=0x100000200) at main.cpp:208

完整的源代码：

main.cpp：

#include <stdint.h>
#include <cstdlib>
#include <cstdio>
#include <pthread.h>
#include <string.h>
#include <math.h>
#include <assert.h>
#include <cuda_runtime.h>
class CriticalSection
{
    pthread_mutex_t cs;
public:
    CriticalSection();
    ~CriticalSection();
    void Lock( void );
    void Unlock( void );
};
CriticalSection::CriticalSection()
{
    assert( pthread_mutex_init( &cs, NULL ) == 0 );
}
CriticalSection::~CriticalSection()
{
    assert( pthread_mutex_destroy( &cs ) == 0 );
}
void CriticalSection::Lock( void )
{
    assert( pthread_mutex_lock( &cs ) == 0 );
}
void CriticalSection::Unlock( void )
{
    assert( pthread_mutex_unlock( &cs ) == 0 );
}
class DeviceWrapper
{
protected:
    CriticalSection m_cs;
public:
    int32_t     m_i32DeviceId;
    uint32_t*   m_pdu32Data;
    uint16_t*   m_pdu16Res;
    uint32_t    m_u32Count;
    DeviceWrapper();
    ~DeviceWrapper();
    void Lock( void );
    void Unlock( void );
    bool Init( const int32_t i32DevId, const uint32_t u32Count );
    bool Free();    
};
DeviceWrapper::DeviceWrapper()
{
    m_i32DeviceId = 0;
    m_pdu32Data = NULL;
    m_pdu16Res = NULL;
    m_u32Count = 0;
}
DeviceWrapper::~DeviceWrapper()
{
}
void DeviceWrapper::Lock( void )
{
    m_cs.Lock();
}
void DeviceWrapper::Unlock( void )
{
    m_cs.Unlock();
}
bool DeviceWrapper::Init( const int32_t i32DevId, const uint32_t u32Count )
{
    if ( cudaSetDevice( i32DevId ) != cudaSuccess )
    {
        printf( "DeviceWrapper::Init: Failed to set device %dn", i32DevId );
        return false;
    }
    if ( cudaMalloc( &m_pdu32Data, sizeof( uint32_t ) * u32Count ) != cudaSuccess )
    {
        printf( "DeviceWrapper::Init: Failed to allocate %u unsigned int's on device %dn", u32Count, i32DevId );
        return false;
    }
    if ( cudaMalloc( &m_pdu16Res, sizeof( uint16_t ) * u32Count ) != cudaSuccess )
    {
        printf( "DeviceWrapper::Init: Failed to allocate %u unsigned short's on device %dn", u32Count, i32DevId );
        return false;
    }
    m_u32Count = u32Count;
    m_i32DeviceId = i32DevId;   
    return true;
}
bool DeviceWrapper::Free()
{
    if ( cudaSetDevice( m_i32DeviceId ) != cudaSuccess )
    {
        printf( "DeviceWrapper::Free: Failed to set device %dn", m_i32DeviceId );
        return false;
    }
    if ( cudaFree( m_pdu32Data ) != cudaSuccess )
    {
        printf( "DeviceWrapper::Free: Failed to free pdu32Mem on device %dn", m_i32DeviceId );
        return false;
    }
    if ( cudaFree( m_pdu16Res ) != cudaSuccess )
    {
        printf( "DeviceWrapper::Free: Failed to free pdu16Mem on device %dn", m_i32DeviceId );
        return false;
    }
    m_pdu32Data = NULL;
    m_pdu16Res = NULL;
    m_u32Count = 0;
    m_i32DeviceId = 0;
    return true;
}
bool TestCUDA( const int32_t i32DeviceId, const uint32_t u32Iterations, const uint32_t u32Count, const uint32_t* pdu32Data, uint16_t* pdu16Res );
void* DoWork( void* pArg );
static bool bRun = true;
static DeviceWrapper devices[4];
int main( int argc, char* argv[] )
{
    if ( argc != 2 )
    {
        printf( "Usage: %s <number of cards to use>n", argv[0] );
        return 1;
    }
    uint32_t u32CardsToUse = strtoul( argv[1], NULL, 0 );
    if ( !u32CardsToUse || u32CardsToUse > 4 )
    {
        printf( "Invalid argument, must be in range 1-4n" );
        return 2;
    }
    for ( int32_t i = 0; i < u32CardsToUse; i++ )
    {
        if ( !devices[i].Init( i, 0x20000000 ) )
        {
            for ( uint32_t j = 0; j < i; j++ )
            {
                devices[j].Free();
            }
            printf( "Failed to init device %dn", i );
            return 3;
        }
    }
    uint32_t u32IterationsCompleted = 0;
    while ( bRun )
    {
        pthread_t pWorkers[ 16 ];
        memset( pWorkers, 0, 16 * sizeof( pthread_t ) );
        for ( uint32_t i = 0; i < 16; i++ )
        {       
            int iReturnValue = pthread_create( &pWorkers[i], NULL, &DoWork, (void*)u32CardsToUse );
            if ( iReturnValue != 0 )
            {
                printf( "Error calling pthread_create: %dn", iReturnValue );
                return 4;
            }
        }
        for ( uint32_t i = 0; i < 16; i++ )
        {
            pthread_join( pWorkers[i], NULL );
        }
        printf( "Iterations completed: %un", ++u32IterationsCompleted );
    }
    printf( "Finishedn" );
    fflush( stdout );
    return 0;
}
void* DoWork( void* pArg )
{
    uint32_t u32CardsToUse = uint32_t( pArg );
    uint32_t u32TestCount = (rand() % 4) + 4;
    for ( uint32_t i = 0; i < u32TestCount; i++ )
    {
        int32_t i32DeviceId = int32_t( rand() % u32CardsToUse );
        devices[ i32DeviceId ].Lock();
        if ( !TestCUDA( i32DeviceId, 1, devices[i32DeviceId].m_u32Count, devices[i32DeviceId].m_pdu32Data, devices[i32DeviceId].m_pdu16Res ) )
        {
            printf( "DoWork: Failure in executing TestCUDA for device %d (test number %u)n", i32DeviceId, i );
            bRun = false;
            devices[ i32DeviceId ].Unlock();
            return NULL;
        }
        devices[ i32DeviceId ].Unlock();
    }
    return NULL;
}

cuda_test.cu:

#include <stdint.h>
#include <cstdlib>
#include <cstdio>
#include <cuda_runtime.h>
__global__ void HammingU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
    uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
    while ( gidx < u32Count )
    {
        pu16Results[ gidx ] += __popc( pu32Data[gidx] ^ gidx );
        gidx += blockDim.x * gridDim.x;
    }
}
__global__ void EqualU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
    uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
    while ( gidx < u32Count )
    {
        if ( pu32Data[gidx] != gidx ) pu16Results[ gidx ]++;
        gidx += blockDim.x * gridDim.x;
    }
}
__global__ void EqualByteU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
    uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
    while ( gidx < u32Count )
    {
        if ( pu32Data[gidx] != gidx ) pu16Results[ gidx ] += 4;
        gidx += blockDim.x * gridDim.x;
    }
}
__global__ void EqualBitU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
    uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
    while ( gidx < u32Count )
    {
        if ( pu32Data[gidx] != gidx ) pu16Results[ gidx ] += 32;
        gidx += blockDim.x * gridDim.x;
    }
}
__global__ void OrderU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
    uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
    while ( gidx < u32Count )
    {
        uint32_t u32File = pu32Data[gidx];                  // 32-bit value to find the log2 of 
        uint32_t u32FileLog = 0;                            // result of log2 will go here
        uint32_t u32Shift = 0;
        u32FileLog = (u32File > 0xFFFF) << 4;
        u32File >>= u32FileLog;
        u32Shift = (u32File > 0xFF) << 3;
        u32File >>= u32Shift;
        u32FileLog |= u32Shift;
        u32Shift = (u32File > 0xF) << 2;
        u32File >>= u32Shift;
        u32FileLog |= u32Shift;
        u32Shift = (u32File > 0x3) << 1;
        u32File >>= u32Shift;
        u32FileLog |= u32Shift;
        u32FileLog |= (u32File >> 1);
        uint32_t u32Other = gidx;           // 32-bit value to find the log2 of 
        uint32_t u32OtherLog = 0;                           // result of log2 will go here
        u32Shift = 0;
        u32OtherLog = (u32Other > 0xFFFF) << 4;
        u32Other >>= u32OtherLog;
        u32Shift = (u32Other > 0xFF) << 3;
        u32Other >>= u32Shift;
        u32OtherLog |= u32Shift;
        u32Shift = (u32Other > 0xF) << 2;
        u32Other >>= u32Shift;
        u32OtherLog |= u32Shift;
        u32Shift = (u32Other > 0x3) << 1;
        u32Other >>= u32Shift;
        u32OtherLog |= u32Shift;
        u32OtherLog |= (u32Other >> 1);
        if ( u32FileLog >= u32OtherLog )
        {
            pu16Results[ gidx ] += uint16_t( u32FileLog - u32OtherLog );
        }
        else
        {
            pu16Results[ gidx ] += uint16_t( u32OtherLog - u32FileLog );
        }
        gidx += blockDim.x * gridDim.x;
    }
}
__global__ void LogU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
    uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
    while ( gidx < u32Count )
    {
        uint32_t u32Value = 0;
        if ( pu32Data[gidx] >= gidx )
        {
            u32Value = pu32Data[gidx] - gidx;
        }
        else
        {
            u32Value = gidx - pu32Data[gidx];
        }
        uint32_t u32Log = 0;        // result of log2 will go here
        uint32_t u32Shift = 0;
        u32Log = (u32Value > 0xFFFF) << 4;
        u32Value >>= u32Log;
        u32Shift = (u32Value > 0xFF) << 3;
        u32Value >>= u32Shift;
        u32Log |= u32Shift;
        u32Shift = (u32Value > 0xF) << 2;
        u32Value >>= u32Shift;
        u32Log |= u32Shift;
        u32Shift = (u32Value > 0x3) << 1;
        u32Value >>= u32Shift;
        u32Log |= u32Shift;
        u32Log |= (u32Value >> 1);
        pu16Results[ gidx ] += (uint16_t)u32Log;
        gidx += blockDim.x * gridDim.x;
    }
}
__global__ void EqualRetU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
    uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
    while ( gidx < u32Count )
    {
        if ( pu32Data[gidx] != gidx ) pu16Results[ gidx ] += 32;
        gidx += blockDim.x * gridDim.x;
    }
}
__global__ void HammingMulU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
    uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
    while ( gidx < u32Count )
    {
        pu16Results[ gidx ] += __popc( pu32Data[gidx] ^ gidx ) << 5;
        gidx += blockDim.x * gridDim.x;
    }
}
bool TestCUDA( const int32_t i32DeviceId, const uint32_t u32Iterations, const uint32_t u32Count, const uint32_t* pdu32Data, uint16_t* pdu16Res )
{   
    for ( uint32_t i = 0; i < u32Iterations; i++ )
    {
        if ( cudaSetDevice( i32DeviceId ) != cudaSuccess )
        {
            return false;
        }
        if ( cudaMemset( pdu16Res, 0, u32Count * sizeof( uint16_t ) ) != cudaSuccess )
        {
            return false;
        }
        for ( uint32_t j = 0; j < 3; j++ )
        {
            HammingU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
            EqualU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
            EqualByteU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
            EqualBitU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
            OrderU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
            LogU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
            EqualRetU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
            HammingMulU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
        }
        cudaDeviceSynchronize();
    }
    return true;
}

生成文件：

IDIR_CUDA = -I. -I/usr/local/cuda-5.5/include
CC_CUDA = g++
CFLAGS_CUDA = -g $(IDIR_CUDA)
LIBS_CUDA = -lz -lpthread -lrt -ldl -L/usr/local/cuda-5.5/lib64 -lcudart -lcuda
all:
    nvcc -I/usr/include -arch=compute_35 -code=sm_35 --machine 64 --compile cuda_test.cu
    $(CC_CUDA) -fpermissive *.cpp *.o -o test_cuda.out $(CFLAGS_CUDA) $(LIBS_CUDA)
    @echo DONE TEST_CUDA BUILD
clean:
    rm -f *.o test_cuda.out

我使用CUDA 5.5和最新的驱动程序（319.32），运行Ubuntu Linux（64位）。

我的问题是：代码中是否存在可能导致这种行为的错误？为什么我只使用一张卡时没有看到崩溃？只是撞车的可能性要小得多吗？（似乎使用更多的牌会比使用更少的牌更快地出现崩溃）额外奖励：是否有其他人使用此代码看到崩溃

我找到了一个解决崩溃的解决方法：

其想法是，每个使用的GPU恰好有一个线程，其他工作线程将工作提交给这些线程以在GPU上运行。我能够在机器上运行测试应用程序一天，没有任何问题（完成了6000多次迭代）。