大规模套接字操作的分段故障

segmentation fault of massive sockets operation

本文关键字:分段 故障 操作 套接字 大规模      更新时间:2023-10-16

这个问题困扰了我几个星期,我在网上找不到任何解决方案。所以我必须向你们大师们提出一个新问题。

我试图在大量套接字上读/写,请参阅下面的测试代码。当套接字数低于 1500 时,它表现正常。当套接字数超过 1500 时,程序将意外崩溃。我知道我应该使用命令ulimit -n 32768来增加打开的文件数量限制。但是程序仍然无法正常运行。

#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <stdint.h>
#include <netdb.h>
#include <errno.h>
#include <malloc.h>
#include <string.h>
int main(int argc, char* argv[])
{
    if (argc!=2)
    {
        printf("usage: test <number of sockets>n");
        return -1;
    }
    int socketsNum=atoi(argv[1]);
    if (socketsNum<=0)
    {
        printf("error: invalid sockets numbern");
        return -1;
    }
    int *socketHandles=(int*)malloc(sizeof(int)*socketsNum);
    if (socketHandles==NULL)
    {
        printf("error: failed to alloc socket handle memoryn");
        return -1;
    }
    for (int i=0;i<socketsNum;i++)
    {
        socketHandles[i]=-1;
    }

    printf("creating %d sockets ...n",socketsNum);
    int createdSocketsNum=0;
    for (int i=0;i<socketsNum;i++)
    {
        int socketHandle=socket(AF_INET,SOCK_DGRAM,IPPROTO_UDP);
        if (socketHandle==-1)
        {
            int lastError=errno;
            printf("warning: socket() failed: index: %d, error: %dn",i+1,lastError);
            continue;
        }
        sockaddr_in sockAddr; // 0.0.0.0:0
        memset(&sockAddr,0,sizeof(sockAddr));
        sockAddr.sin_family = AF_INET;
        sockAddr.sin_addr.s_addr = htonl(INADDR_ANY);
        sockAddr.sin_port = htons(0);
        if (bind( socketHandle, (sockaddr*) &sockAddr, sizeof(sockAddr)) == -1)
        {
            int lastError=errno;
            printf("warning: bind() failed: index: %d, error: %dn",i+1,lastError);
            close(socketHandle);
            continue;
        }
        socketHandles[i]=socketHandle;
        createdSocketsNum++;
    }
    printf("created %d sockets.n",createdSocketsNum);
    //test reading;
    printf("testing reading ...n");
    int readableNumber=0;
    int unreadableNumber=0;
    int readingSkippedNumber=0;
    for (int i=0;i<socketsNum;i++)
    {
        int socketHandle=socketHandles[i];
        if (socketHandle==-1)
        {
            readingSkippedNumber++;
            continue;
        }
        fd_set rset;
        FD_ZERO(&rset);
        FD_SET(socketHandle, &rset);
        struct timeval timeout = {0, 0};
        int retCode=select(socketHandle + 1, &rset, NULL, NULL, &timeout);
        if (retCode==-1)
        {
            int lastError=errno;
            printf("warning: select() failed: index: %d, error: %dn",i+1,lastError);
        }
        else if (retCode==0)
        {
            unreadableNumber++;
        }
        else
        {
            readableNumber++;
        }
    }
    printf("readable: %d, unreadable: %d, skipped: %d, total: %dn",readableNumber,unreadableNumber,readingSkippedNumber,socketsNum);
    //test writing
    printf("testing writing ...n");
    int writableNumber=0;
    int unwritableNumber=0;
    int writingSkippedNumber=0;
    for (int i=0;i<socketsNum;i++)
    {
        int socketHandle=socketHandles[i];
        if (socketHandle==-1)
        {
            writingSkippedNumber++;
            continue;
        }
        fd_set wset;
        FD_ZERO(&wset);
        FD_SET(socketHandle, &wset);
        struct timeval timeout = {0, 0};
        int retCode=select(socketHandle + 1, NULL, &wset, NULL, &timeout);
        if (retCode==-1)
        {
            int lastError=errno;
            printf("warning: select() failed: index: %d, error: %dn",i+1,lastError);
        }
        else if (retCode==0)
        {
            unwritableNumber++;
        }
        else
        {
            writableNumber++;
        }
    }
    printf("writable: %d, unwritable: %d, skipped: %d, total: %dn",writableNumber,unwritableNumber,writingSkippedNumber,socketsNum);
    printf("closing ...n");
    for (int i=0;i<socketsNum;i++)
    {
        int socketHandle=socketHandles[i];
        if (socketHandle==-1)
        {
            continue;
        }
        close(socketHandle);
    }
    free(socketHandles);
    printf("completed!n");
    return 0;
}

编译:

g++ TestSockets.cpp -ldl -g -ggdb -o TestSockets

配置:

ulimit -n 32768

一些典型结果:

  1. ./TestSockets 1500的良好结果:

    creating 1500 sockets ...
    created 1500 sockets.
    testing reading ...
    readable: 0, unreadable: 1500, skipped: 0, total: 1500
    testing writing ...
    writable: 1372, unwritable: 128, skipped: 0, total: 1500
    closing ...
    completed!
    
  2. ./TestSockets 1900的不良结果:

    creating 1900 sockets ...
    created 1900 sockets.
    testing reading ...
    warning: select() failed: index: 1797, error: 9
    ...(more lines trimmed)
    warning: select() failed: index: 1820, error: 9
    warning: select() failed: index: 1821, error: 22
    readable: 0, unreadable: 1878, skipped: 0, total: 1900
    testing writing ...
    warning: select() failed: index: 1641, error: 9
    ...(more lines trimmed)
    warning: select() failed: index: 1660, error: 9
    warning: select() failed: index: 1661, error: 22
    writable: 1751, unwritable: 128, skipped: 0, total: 1900
    closing ...
    completed!
    

    评论:因为1900>1751+128,似乎堆栈损坏了。

  3. ./TestSockets 2000的不良结果:

    creating 2000 sockets ...
    created 2000 sockets.
    testing reading ...
    Segmentation fault
    

更多调查:

根据 gdb 信息。似乎堆栈内存在运行过程中损坏:

    creating 2000 sockets ...
    created 2000 sockets.
    testing reading ...
    Program received signal SIGSEGV, Segmentation fault.
    0x08048b79 in main (argc=2, argv=0xffffd3b4) at TestSockets.cpp:78
    78          int socketHandle=socketHandles[i];
    (gdb) print socketHandles
    $1 = (int *) 0x0
    (gdb) info local
    socketHandle = 0
    rset = {fds_bits = {0 <repeats 32 times>}}
    timeout = {tv_sec = 0, tv_usec = 0}
    retCode = 0
    i = 1601
    socketsNum = 2000
    unreadableNumber = 1601
    unwritableNumber = 134514249
    socketHandles = 0x0
    createdSocketsNum = 2000
    readableNumber = 0
    readingSkippedNumber = 0
    writableNumber = -136436764
    writingSkippedNumber = 0
    (gdb) info stack
    #0  0x08048b79 in main (argc=2, argv=0xffffd3b4) at TestSockets.cpp:78

fd_set受文件描述符的最大值(而不是同时设置的文件描述符数量)的限制。通常是 1024。

因此,如果您的套接字大于 1023,则根本无法对其使用 select

我知道的操作系统不支持重新定义FD_SETSIZE。您也许能够在程序中成功重新定义fd_set,但select最多只能FD_SETSIZE .

我已经解决了这个令人头疼的问题。Windows和Linux上的fd_set完全不同。在 Linux 上,如果套接字句柄 VALUE 大于 FD_SETSIZE,则在 Linux 版本FD_SET宏上会出现溢出问题。我做了一个解决方法来分配足够的缓冲区用于在 Linux 上fd_set。如

char rsetBuffer[10240];
memset(rsetBuffer,0,10240);
fd_set& rset=(fd_set&)rsetBuffer;
FD_ZERO(&rset);
FD_SET(socketHandle, &rset);

p.s. Windows和Linux上fd_set结构和FD_SET宏的定义:

在窗户上:

typedef struct fd_set {
    u_int fd_count;               /* how many are SET? */
    SOCKET  fd_array[FD_SETSIZE];   /* an array of SOCKETs */
} fd_set;

#define FD_SET(fd, set) do { 
    u_int __i; 
    for (__i = 0; __i < ((fd_set FAR *)(set))->fd_count; __i++) { 
    if (((fd_set FAR *)(set))->fd_array[__i] == (fd)) { 
        break; 
    } 
    } 
    if (__i == ((fd_set FAR *)(set))->fd_count) { 
    if (((fd_set FAR *)(set))->fd_count < FD_SETSIZE) { 
        ((fd_set FAR *)(set))->fd_array[__i] = (fd); 
        ((fd_set FAR *)(set))->fd_count++; 
    } 
    } 
} while(0)

在 Linux 上:

/* fd_set for select and pselect.  */
typedef struct
  {
    /* XPG4.2 requires this member name.  Otherwise avoid the name
       from the global namespace.  */
#ifdef __USE_XOPEN
    __fd_mask fds_bits[__FD_SETSIZE / __NFDBITS];
# define __FDS_BITS(set) ((set)->fds_bits)
#else
    __fd_mask __fds_bits[__FD_SETSIZE / __NFDBITS];
# define __FDS_BITS(set) ((set)->__fds_bits)
#endif
  } fd_set;
#define __FD_SET(d, set) 
  ((void) (__FDS_BITS (set)[__FD_ELT (d)] |= __FD_MASK (d)))
#define __FD_CLR(d, set) 
  ((void) (__FDS_BITS (set)[__FD_ELT (d)] &= ~__FD_MASK (d)))
#define __FD_ISSET(d, set) 
  ((__FDS_BITS (set)[__FD_ELT (d)] & __FD_MASK (d)) != 0)