将 MPI 等级分配给 GPU 的

Assigning MPI ranks to gpu's

本文关键字：GPU 分配 MPI 更新时间：2023-10-16

我想在节点上的MPI等级之间创建一个上下文，并为多个节点做到这一点。

我在这里找到了一个代码。

C代码如下：

 #include <mpi.h>
 #include <string.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <cuda_runtime.h>

int stringCmp( const void *a, const void *b)
{
   return strcmp(a,b);
}
void  assignDeviceToProcess(int *p2myrank)
{
   char     host_name[MPI_MAX_PROCESSOR_NAME];
   char (*host_names)[MPI_MAX_PROCESSOR_NAME];
   MPI_Comm nodeComm;

   int i, n, namelen, color, rank, nprocs, myrank,gpu_per_node;
   size_t bytes;
   int dev, err1;
   struct cudaDeviceProp deviceProp;
   /* Check if the device has been alreasy assigned */
   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
   MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
   MPI_Get_processor_name(host_name,&namelen);
   bytes = nprocs * sizeof(char[MPI_MAX_PROCESSOR_NAME]);
   host_names = (char (*)[MPI_MAX_PROCESSOR_NAME]) malloc(bytes);
   strcpy(host_names[rank], host_name);
   for (n=0; n<nprocs; n++)
   {
    MPI_Bcast(&(host_names[n]),MPI_MAX_PROCESSOR_NAME, MPI_CHAR, n, MPI_COMM_WORLD);
   }

   qsort(host_names, nprocs,  sizeof(char[MPI_MAX_PROCESSOR_NAME]), stringCmp);
   color = 0;
   for (n=0; n<nprocs; n++)
   {
     if(n>0&&strcmp(host_names[n-1], host_names[n])) color++;
     if(strcmp(host_name, host_names[n]) == 0) break;
   }
   MPI_Comm_split(MPI_COMM_WORLD, color, 0, &nodeComm);
   MPI_Comm_rank(nodeComm, &myrank);
   MPI_Comm_size(nodeComm, &gpu_per_node);
   p2myrank[0]=myrank;
   return;
    /* Find out how many DP capable GPUs are in the system and their device number */
   int deviceCount,slot=0;
   int *devloc;
   cudaGetDeviceCount(&deviceCount);
   devloc=(int *)malloc(deviceCount*sizeof(int));
   devloc[0]=999;
   for (dev = 0; dev < deviceCount; ++dev)
    {
    cudaGetDeviceProperties(&deviceProp, dev);
    if(deviceProp.major>1)
      {
       devloc[slot]=dev;
       slot++;
      };
    }
   //printf ("Assigning device %d  to process on node %s rank %d n",devloc[myrank],  host_name, rank );
   /* Assign device to MPI process and probe device properties */
   cudaSetDevice(devloc[myrank]);
   cudaGetDevice(&dev);
   cudaGetDeviceProperties(&deviceProp, dev);
   size_t free_bytes, total_bytes;
   cudaMemGetInfo(&free_bytes, &total_bytes);
   printf("Host: %s Rank=%d Device= %d (%s)  ECC=%s  Free = %lu, Total = %lun",host_name,rank, devloc[myrank],deviceProp.name, deviceProp.ECCEnabled ? "Enabled " : "Disabled", (unsigned long)free_bytes, (unsigned long)total_bytes);
}

我的fortran代码是：

subroutine MPI_to_gpu_assign(comm,nprocs)
  use cudafor
  use sort
  implicit none
  include "mpif.h"
  integer:: max_len, rank, code, comm,i,size, ierr,totaldev,n, namelen, color, nprocs
  integer:: nodeComm,first_time ,myrank, proc_len
  character::    host_name(MPI_MAX_PROCESSOR_NAME)
  character:: host_names(nprocs*MPI_MAX_PROCESSOR_NAME)
  proc_len = MPI_MAX_PROCESSOR_NAME
  !Check if the device has been assigned already
  if(first_time) then
      first_time=0
    call MPI_Comm_rank(comm, rank,code)
    call MPI_Get_processor_name(host_name,namelen,code)
    host_names((rank-1)*proc_len+1:rank*proc_len) =  host_name
    do n=1,nprocs
      if (n.gt.1) then
          call MPI_Bcast(host_names((n-1)*proc_len+1:n*proc_len),MPI_MAX_PROCESSOR_NAME, MPI_CHARACTER, n-1, comm,code)
      else
        call MPI_Bcast(host_names(1:proc_len),MPI_MAX_PROCESSOR_NAME, MPI_CHARACTER, n-1, comm,code)
      end if
    end do
    call a_sort(host_names,my_compare)
    color = 0
    DO n = 1,nprocs
      if((n.gt.1)) then
          if((my_compare( host_names(((n-2)*proc_len+1):(n-1)*proc_len),  host_names(((n-1)*proc_len+1):n*proc_len) )) == 1) then !!line 1!! 
            color = color+1
        end if
          if(my_compare(host_name, host_names((n-1)*proc_len+1:n*proc_len)) == 1) then !!line 2!! 
          exit
        end if
      else
          if(my_compare(host_name, host_names(1:proc_len)) == 1) then !!line 3!! 
            exit
        end if
      end if
    END DO
    call MPI_Comm_split(comm, color, 0, nodeComm,code)
    CALL MPI_Comm_rank(nodeComm, myrank,code)
    write(*,*) 'Assigning device', myrank, 'to process on node', host_name,' on rank', rank,''
    ! Assign device to MPI process
    ierr = cudaSetDevice(myrank)
    if (ierr.ne.0) then
      print *, cudaGetErrorString(ierr)
      stop
    endif
 end if
 end subroutine MPI_to_gpu_assign

将排序和其他必需的功能定义为（我从此处使用）：

module sort
  implicit none
  contains
    subroutine To_lower(str)
       character(len=*), intent(in out) :: str
       integer :: i
       do i = 1, len(str)
         select case(str(i:i))
           case("A":"Z")
             str(i:i) = achar(iachar(str(i:i))+32)
         end select
       end do
     end subroutine To_Lower
    integer function my_compare(a, b)
      character(*), intent(in) :: a, b
      character(len=max(len(a),len(b))) :: a1, b1
      a1 = a
      b1 = b
      call to_lower(b1)
      call to_lower(a1)
      if ( len(trim(a)) > len(trim(b)) ) then
         my_compare = -1
      elseif ( len(trim(a)) == len(trim(b)) ) then
         if ( a1 > b1 ) then
            my_compare = 1
         else
            my_compare = -1
         end if
      else
         my_compare = 1
      end if
    end function my_compare
  subroutine a_sort(a, cc)
    character(len=*), dimension(:), intent(inout) :: a
    interface
       integer function cc(a, b)
         character(len=*), intent(in) :: a, b
       end function cc
    end interface
    integer :: i, j, increment
    character(len=max(len(a), 10)) :: temp
    increment = size(a) / 2
    do while ( increment > 0 )
       do i = increment+1, size(a)
          j = i
          temp = a(i)
          do while ( j >= increment+1 .and. cc(a(j-increment), temp) > 0)
             a(j) = a(j-increment)
             j = j - increment
          end do
          a(j) = temp
       end do
       if ( increment == 2 ) then
          increment = 1
       else
          increment = increment * 5 / 11
       end if
    end do
  end subroutine a_sort
end module Sort

，但这似乎不起作用，给我以下错误：

PGF90-S-0446-Argument number 1 to my_compare: rank mismatch (line 1)
PGF90-S-0446-Argument number 2 to my_compare: rank mismatch (line 1)
PGF90-S-0446-Argument number 1 to my_compare: rank mismatch (line 2)
PGF90-S-0446-Argument number 2 to my_compare: rank mismatch (line 2)
PGF90-S-0446-Argument number 1 to my_compare: rank mismatch (line 3)
PGF90-S-0446-Argument number 2 to my_compare: rank mismatch (line 3)

谁能帮助我为什么发生此错误？

您的错误与CUDA无关，这是一个基本的fortran错误。您声明在调用代码中具有长度为1个字符的数组：

 character::    host_name(MPI_MAX_PROCESSOR_NAME)
 character:: host_names(nprocs*MPI_MAX_PROCESSOR_NAME)

您从它们中进行数组部分，然后将它们传递给my_compare：

my_compare( host_names(((n-2)*proc_len+1):(n-1)*proc_len),  host_names(((n-1)*proc_len+1):n*proc_len) )

但是my_compare期望标量字符：

integer function my_compare(a, b)
  character(*), intent(in) :: a, b

这不兼容。您也应该在主代码中使用标量字符：

 character(MPI_MAX_PROCESSOR_NAME) ::    host_name
 character(nprocs*MPI_MAX_PROCESSOR_NAME) :: host_names

实际上，在Fortran中最好的是使用一系列主机名

 character(MPI_MAX_PROCESSOR_NAME) :: host_names(nprocs)

但是您必须更改来自C的大量代码。但这将是简化的。