Strange Segmentation Fault from MPI

I wrote a simple MPI program to practice with MPI's user-defined datatype functions. Below is the version that throws a segfault.

    #include <mpi.h>
    #include <iostream>
    using namespace std;
    int main( int argc , char ** argv )
    {
        int rank;
        MPI_Datatype newtype;
        MPI_Datatype newertype;
        MPI_Init(&argc,&argv);
        MPI_Comm_rank(MPI_COMM_WORLD,&rank);
        MPI_Type_contiguous(2,MPI_INT,&newtype);
        MPI_Type_commit(&newtype);
        MPI_Type_vector(3,2,3,newtype,&newertype);
        MPI_Type_commit(&newertype);    

        int * buffer = new int[16];
        for( int i=0 ; i<16 ; i++ )
        {
            buffer[i] = 0;
        }
        if(rank==0)
        {
            for( int i=0 ; i<16 ; i++ )
            {
                buffer[i] = 9;
            }
            MPI_Send(buffer,3,newertype,1,0,MPI_COMM_WORLD);        
        }else if(rank==1)
        {
            MPI_Recv(buffer,3,newertype,0,0,MPI_COMM_WORLD,MPI_STATUS_IGNORE);
            for( int i=0 ; i<16 ; i++ )
            {
                cout << buffer[i] << " ";
            }
            cout << endl;
        }
        MPI_Type_free(&newertype);
        MPI_Type_free(&newtype);
        MPI_Finalize();
        return 0;
    }

However, if the array allocation is moved before the MPI_Init call (see the commented lines in the version below), everything works fine.

    #include <mpi.h>
    #include <iostream>
    using namespace std;
    int main( int argc , char ** argv )
    {
        int rank;
        // buffer is now allocated and initialized before MPI_Init
        int * buffer = new int[16];
        for( int i=0 ; i<16 ; i++ )
        {
            buffer[i] = 0;
        }
        MPI_Datatype newtype;
        MPI_Datatype newertype;
        MPI_Init(&argc,&argv);
        MPI_Comm_rank(MPI_COMM_WORLD,&rank);
        MPI_Type_contiguous(2,MPI_INT,&newtype);
        MPI_Type_commit(&newtype);
        MPI_Type_vector(3,2,3,newtype,&newertype);
        MPI_Type_commit(&newertype);
        if(rank==0)
        {
            for( int i=0 ; i<16 ; i++ )
            {
                buffer[i] = 9;
            }
            MPI_Send(buffer,3,newertype,1,0,MPI_COMM_WORLD);
        }else if(rank==1)
        {
            MPI_Recv(buffer,3,newertype,0,0,MPI_COMM_WORLD,MPI_STATUS_IGNORE);
            for( int i=0 ; i<16 ; i++ )
            {
                cout << buffer[i] << " ";
            }
            cout << endl;
        }
        MPI_Type_free(&newertype);
        MPI_Type_free(&newtype);
        MPI_Finalize();
        return 0;
    }

Can someone explain what is wrong with declaring the array after the MPI_Init call?

For your information, below is the error message:

    9 9 9 9 0 0 9 9 9 9 0 0 9 9 9 9
    [linuxscc003:10019] *** Process received signal ***
    [linuxscc003:10019] Signal: Segmentation fault (11)
    [linuxscc003:10019] Signal code: Address not mapped (1)
    [linuxscc003:10019] Failing at address: 0x7fa00d0b36c8
    [linuxscc003:10019] [ 0] /lib64/libpthread.so.0() [0x3abf80f500]
    [linuxscc003:10019] [ 1] /opt/MPI/openmpi-1.5.3/linux/gcc/lib/libmpi.so.1(opal_memory_ptmalloc2_int_free+0x299) [0x7f980ce46509]
    [linuxscc003:10019] [ 2] /opt/MPI/openmpi-1.5.3/linux/gcc/lib/libmpi.so.1(+0xe7b2b) [0x7f980ce46b2b]
    [linuxscc003:10019] [ 3] /opt/MPI/openmpi-1.5.3/linux/gcc/lib/libmpi.so.1(+0xf0a60) [0x7f980ce4fa60]
    [linuxscc003:10019] [ 4] /opt/MPI/openmpi-1.5.3/linux/gcc/lib/libmpi.so.1(mca_base_param_finalize+0x41) [0x7f980ce4f731]
    [linuxscc003:10019] [ 5] /opt/MPI/openmpi-1.5.3/linux/gcc/lib/libmpi.so.1(opal_finalize_util+0x1b) [0x7f980ce3f53b]
    [linuxscc003:10019] [ 6] /opt/MPI/openmpi-1.5.3/linux/gcc/lib/libmpi.so.1(+0x4ce35) [0x7f980cdabe35]
    [linuxscc003:10019] [ 7] type_contiguous(main+0x1aa) [0x408f2e]
    [linuxscc003:10019] [ 8] /lib64/libc.so.6(__libc_start_main+0xfd) [0x3abec1ecdd]
    [linuxscc003:10019] [ 9] type_contiguous() [0x408cc9]
    [linuxscc003:10019] *** End of error message ***
    --------------------------------------------------------------------------
    mpiexec noticed that process rank 1 with PID 10019 on node linuxscc003 exited on signal 11 (Segmentation fault).
    --------------------------------------------------------------------------
    Failure executing command /opt/MPI/openmpi-1.5.3/linux/gcc/bin/mpiexec -x  LD_LIBRARY_PATH -x  PATH -x  OMP_NUM_THREADS -x  MPI_NAME --hostfile /tmp/hostfile-9252 -np 2 type_contiguous

The newertype has 3 segments of 2 newtype elements each, with a stride of 3. You are sending 3 elements of that type. This means that the span in memory from the first to the last element accessed during the send or receive operation is 3*3*3 - 1, i.e. 26 elements of type newtype (3 elements of newertype, each covering 3 segments at a stride of 3 newtype elements). Each newtype is two consecutive MPI_INT elements, so your send and receive buffers should be at least 52 integers, yet you allocate only 16. MPI_Recv in rank 1 therefore writes past the end of the allocated buffer, most likely overwriting heap control structures.

Moving the allocation before the MPI_Init call changes the order of those structures in memory, and your code now overwrites something different but non-critical. The code is still incorrect; you are simply lucky that it does not crash. Use larger buffers (at least 52 integers).
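
As a complement, here is a minimal sketch (not the poster's original code, just an illustration reusing the same newtype/newertype definitions) of how the buffers can be sized from the extent that MPI reports for the committed datatype, so the allocation automatically covers the full span touched by count elements of newertype instead of a hard-coded 16 integers.

    #include <mpi.h>
    #include <iostream>
    #include <vector>
    using namespace std;
    int main( int argc , char ** argv )
    {
        int rank;
        MPI_Datatype newtype, newertype;
        MPI_Init(&argc,&argv);
        MPI_Comm_rank(MPI_COMM_WORLD,&rank);
        MPI_Type_contiguous(2,MPI_INT,&newtype);
        MPI_Type_commit(&newtype);
        MPI_Type_vector(3,2,3,newtype,&newertype);
        MPI_Type_commit(&newertype);

        // Ask MPI how much memory one element of newertype spans; consecutive
        // elements in a single send/receive are placed one extent apart.
        const int count = 3;
        MPI_Aint lb, extent;
        MPI_Type_get_extent(newertype, &lb, &extent);
        size_t nints = (size_t)count * extent / sizeof(int);   // lb is 0 for this type

        vector<int> buffer(nints, 0);                          // large enough by construction
        if(rank==0)
        {
            for( size_t i=0 ; i<nints ; i++ ) buffer[i] = 9;
            MPI_Send(buffer.data(),count,newertype,1,0,MPI_COMM_WORLD);
        }else if(rank==1)
        {
            MPI_Recv(buffer.data(),count,newertype,0,0,MPI_COMM_WORLD,MPI_STATUS_IGNORE);
            for( size_t i=0 ; i<nints ; i++ ) cout << buffer[i] << " ";
            cout << endl;
        }
        MPI_Type_free(&newertype);
        MPI_Type_free(&newtype);
        MPI_Finalize();
        return 0;
    }

MPI_Type_get_extent is a standard MPI call; any buffer at least as large as the span it yields (including the 52-integer figure suggested above) keeps the receive inside allocated memory.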