不同数量的处理器最终会得到不同的结果

Different number of processors end up with different results?

本文关键字：结果处理器更新时间：2023-10-16

我是MPI的新手。试图近似求解一个PDE.一个1000乘1000的数组。除了第一行和最后一行之外，在每次迭代时，每个元素都被更新为其8个邻居的平均值。

我的代码运行，但在小数点后三位的结果与使用不同数量的处理器略有不同。我想我的通信正在失去准确性？由于C++逐行存储数组，所以我按行对大数组进行分区。

这是代码。

#include <iostream>
#include <mpi.h>
#include <cmath>
#include <stdio.h>
#include <stdlib.h>
int main(int argc, const char * argv[])
{
    // Initialize the MPI environment
    MPI_Init(NULL,NULL);
    int p;
    MPI_Comm_size(MPI_COMM_WORLD, &p);
    int id;
    MPI_Comm_rank(MPI_COMM_WORLD, &id);
    const double pi = 3.1415926;
    int n;
    n=atoi(argv[argc-1]);
    //calculate the starting and ending column indices
    int m=floor((n-2)/p);
    int r=n-2-m*p;
    //dividing row 1 to row n-2 by rows since in c/c++ arrays are stored in row-wise
    //therefore n-2=(m+1)*r+m*(p-r)
    //the first r processors get m+1 rows, the rest get m rows
    //starting row, ending row in the original A, width of matrix
    int start_row, end_row, width;
    if (id<=r-1){
        start_row=1+id*(m+1);
        end_row=start_row+m;
        width=m+1;
    }
    else {
        start_row=1+r*(m+1)+(id-r)*m;
        end_row=start_row+m-1;
        width=m;
    }
    //printf("mpi debug 1");
    printf("on processor %d, starting row is %d, ending row is %d n",id,start_row,end_row);
    //id of the processor before and after
    //id_before is not significant for id==0
    //id_after is not
    int id_before, id_after;
    if (id==0)
        id_before=p-1;
    else
        id_before=id-1;
    if (id==p-1)
        id_after=0;
    else
        id_after=id+1;
    //printf("debug000");

    //initialize the local matrix
    //**** better way to initialize?
    double a[width][n], b[width][n];
    for (int i=0; i<width; i++)
        for (int j=0; j<n; j++)
            a[i][j]=0.5;
    //two 1d arrays to store the halo cells
    double halo_before[n], halo_after[n];
    if (id==0){
        for (int j=0; j<n; j++){
            halo_before[j]=0.0;
        }
    }
    if (id==p-1){
        for (int j=0; j<n; j++) {
            halo_after[j]=5*sin(M_PI*((double)j/n)*((double)j/n));
        }
    }
    MPI_Barrier(MPI_COMM_WORLD);
    //std::cout << " the sin function is" << 5*sin(M_PI*((double)1/1)*((double)1/2)) << "n" <<std::endl;
    //set id=0 to be the root processor and call
    double start_time, end_time;
    if (id==0)
        start_time=MPI_Wtime();
    MPI_Status status_before, status_after;
    MPI_Request request_before, request_after;
    /////////////////////////////////////////////////////////
    //to dubug, print out arrays
    //std::cout<< " the array on processor " << id <<" to start is n" << std::endl;
    //for (int i=0; i<width; i++){
    //    for (int j=0; j<n; j++){
    //        std::cout << a[i][j] << " ";
    //        if (j==n-1)
    //            std::cout << "n" <<std::endl;
    //    }
    //}
    //to debug print out halos
    //std::cout << "halo_before on processor " << id << " to start with isn" << std::endl;
    //for (int i=0;i<n;i++){
    //    std::cout << halo_before[i] << " ";
    //    if (i==n-1)
    //        std::cout <<"n" <<std::endl;
    //}
    //std::cout << "halo_after on processor " << id << " to start with isn" << std::endl;
    //for (int i=0;i<n;i++){
    //    std::cout << halo_after[i] << " ";
    //    if (i==n-1)
    //        std::cout <<"n" <<std::endl;
    //}
    //////////////////////////////////////////////////////
    //begin iteration
    for (int t=0; t<500; t++){
        //unblocking send
        //send first row to id_before:
        //how should I use tag?
        if (id>0){
            MPI_Isend(&a[0][0], n, MPI_DOUBLE, id_before, t , MPI_COMM_WORLD, &request_before);
        }
        if (id<p-1){
            //send the last row to id_after
            MPI_Isend(&a[width-1][0], n, MPI_DOUBLE, id_after, t, MPI_COMM_WORLD, &request_after);
        }
        //printf("dubug0");
        //update the entries that do not depend on halos
        //local row=1 to row=width-2
        //add if (width>3)??
        int j_b, j_a;
        for (int i=1; i<width-1; i++){
            for (int j=0; j<n; j++){
                j_b=(n+j-1)%n;
                j_a=(j+1)%n;
                b[i][j]=(a[i-1][j_b]+a[i-1][j]+a[i-1][j_a]+a[i][j_b]+a[i][j_a]+a[i+1][j_b]+a[i+1][j]+a[i+1][j_a])/8;
            }
        }
        //printf("dubug1");
        //blocking receive
        //may consider unblocking receive
        //receive from id_before and store in halo_before
        //not sure about status
        if (id>0){
            MPI_Recv(&halo_before[0], n, MPI_DOUBLE, id_before ,t , MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        }
        //receive from id_after and store in halo_after
        if (id<p-1){
            MPI_Recv(&halo_after[0], n, MPI_DOUBLE, id_after,t , MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        }
        //to debug print out halos
        //std::cout << "halo_before on processor " << id << " at iteration " << t<< " isn" <<std::endl;
        //for (int i=0;i<n;i++){
        //    std::cout << halo_before[i] << " ";
        //    if (i==n-1)
        //        std::cout <<"n" <<std::endl;
        //}
        //std::cout << "halo_after on processor " << id << " at iteration " << t<< " isn" <<std::endl;
        //for (int i=0;i<n;i++){
        //    std::cout << halo_after[i] << " ";
        //    if (i==n-1)
        //        std::cout <<"n" <<std::endl;
        //}

        //update entries that depend on halos
        //bugs here, what if width==1???
        if (width==1){
            for (int j=0; j<n; j++){
                j_a=(n+j-1)%n;
                j_b=(j+1)%n;
                b[0][j]=(halo_before[j_b]+halo_before[j]+halo_before[j_a]+a[0][j_b]+a[0][j_a]+halo_after[j_b]+halo_after[j]+halo_after[j_a])/8;
            }
        }
        else{
        for (int j=0; j<n; j++){
            j_a=(n+j-1)%n;
            j_b=(j+1)%n;
            b[0][j]=(halo_before[j_b]+halo_before[j]+halo_before[j_a]+a[0][j_b]+a[0][j_a]+a[1][j_b]+a[1][j]+a[1][j_a])/8;
            b[width-1][j]=(a[width-2][j_b]+a[width-2][j]+a[width-2][j_b]+a[width-1][j_b]+a[width-1][j_a]+halo_after[j_a]+halo_after[j]+halo_after[j_b])/8;
        }
        }
        //copy to b
        //but make sure the send have been completed
        if (id>0)
            MPI_Wait(&request_before,MPI_STATUS_IGNORE);
        if (id<p-1)
            MPI_Wait(&request_after,MPI_STATUS_IGNORE);

        for (int i=0; i<width; i++)
            for (int j=0; j<n; j++)
                a[i][j]=b[i][j];
        //to dubug, print out arrays
        //std::cout<< " the array on processor " << id <<" at iteration " << t <<" is n"<< std::endl;
        //for (int i=0; i<width; i++){
        //    for (int j=0; j<n; j++){
        //    std::cout << a[i][j] << " ";
        //    if (j==n-1)
        //        std::cout << "n" <<std::endl;
        //    }
        //}

    }
    //calculate the sum
    double sum=0.0;
    for (int i=0; i<width; i++)
        sum += a[i][i+start_row];
    double total_sum;
    //send to root processor
    MPI_Reduce(&sum, &total_sum,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
    MPI_Barrier(MPI_COMM_WORLD);
    if (id==0){
        end_time=MPI_Wtime();
        //double sum_receive[p];
        //double sum_calc;
        //for (int i=0; i<p; i++){
        //    MPI_Recv(&sum_receive[i], 1, MPI_DOUBLE, i, i, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        //    sum_calc += sum_receive[i];
        //}
        printf("time elapse is %f n", end_time-start_time);
        printf("at root processor %d, the calculated sum is %f, n", id, total_sum+5*sin(M_PI*((double)(n-1)/n)*((double)(n-1)/n)));
    }

    MPI_Finalize();
    return 0;

}

您的一行代码中有一个简单的拼写错误。此：

        b[width-1][j]=(a[width-2][j_b]+a[width-2][j]+a[width-2][j_b]+a[width-1][j_b]+a[width-1][j_a]+halo_after[j_a]+halo_after[j]+halo_after[j_b])/8;

应该是这样的（注意第4个术语；j_a，而不是第二个j_b）：

        b[width-1][j]=(a[width-2][j_b]+a[width-2][j]+a[width-2][j_a]+a[width-1][j_b]+a[width-1][j_a]+halo_after[j_a]+halo_after[j]+halo_after[j_b])/8;

因为这发生在每个域的最后一行，所以导致的确切错误量将取决于域边界，例如，您有多少处理器。

现在，在您发布的代码中出现这种错误基本上是不可避免的原因是，经过一些修改，相同的计算（从a到b）发生不少于3次。这是一颗滴答作响的定时炸弹；最终一个会被更新，而其他的会变得与第一个不一致，或者对其他的更新最终会出现一些错误。

这里有很多方法可以减少复制量，既可以澄清代码，也可以避免这类错误。最好是将光晕合并到a和b阵列中，在数据之前和之后添加一行额外的行以包括数据-这样，您就不必担心宽度是否为1，并单独处理结束行。此外，定义一个基于a更新b的行或元素的函数，并使用该函数而不是重复代码。

下面是一个清理代码的例子，它将位封装到例程中，将n和宽度与包含的光晕区域等保持一致。

#include <mpi.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
int min2i(int a, int b) {
    int result = a;
    if (b < a) result = b;
    return result;
}
void decomposition(const int n, const int nprocs, const int id,
                   int *start_row, int *width, int *id_before, int *id_after) {
    const int nrows = n;
    const int m = nrows/nprocs;
    const int r = nrows % nprocs;
    *width = m;
    if (id < r) (*width)++;
    *start_row = 1 + id*m + min2i(id,r);
    *id_before = (id > 0 ? id-1 : MPI_PROC_NULL);
    *id_after  = (id < nprocs-1 ? id+1 : MPI_PROC_NULL);
}
void startBC(const int n, const int width, double a[][n+2], double b[][n+2],
             const int id_before, const int id_after, const int t, MPI_Request *req) {
    MPI_Isend(&a[1][1],     n, MPI_DOUBLE, id_before, 2*t  , MPI_COMM_WORLD, &req[0]);
    MPI_Isend(&a[width][1], n, MPI_DOUBLE, id_after , 2*t+1, MPI_COMM_WORLD, &req[1]);
}
void finishBC(const int n, const int width, double a[][n+2], double b[][n+2],
              const int id_before, const int id_after, const int t, MPI_Request *req) {
    MPI_Status stats[2];
    MPI_Recv(&a[0][1],       n, MPI_DOUBLE, id_before, 2*t+1, MPI_COMM_WORLD, &stats[0]);
    MPI_Recv(&a[width+1][1], n, MPI_DOUBLE, id_after,  2*t  , MPI_COMM_WORLD, &stats[1]);
    for (int i=0; i<width+2; i++) {
        a[i][0]   = a[i][n];
        a[i][n+1] = a[i][1];
    }
    MPI_Waitall(2, req, stats);
}
void updateRow(const int n, const int width, double a[][n+2], double b[][n+2], const int row) {
    for (int j=1; j<=n; j++)
        b[row][j]=( a[row-1][j-1] + a[row-1][j] + a[row-1][j+1]
                   +a[ row ][j-1]               + a[ row ][j+1]
                   +a[row+1][j-1] + a[row+1][j] + a[row+1][j+1])/8;
}
int main(int argc, const char * argv[])
{
    int p, id;
    MPI_Init(NULL,NULL);
    MPI_Comm_size(MPI_COMM_WORLD, &p);
    MPI_Comm_rank(MPI_COMM_WORLD, &id);
    const double pi = 3.1415926;
    int n=atoi(argv[argc-1]);
    int width, start_row, id_before, id_after;
    decomposition(n, p, id, &start_row, &width, &id_before, &id_after);
    double a[width+2][n+2], b[width+2][n+2];
    for (int i=0; i<width+2; i++)
        for (int j=0; j<n+2; j++)
            a[i][j]=0.5;
    if (id==p-1)
        for (int j=0; j<n+2; j++)
            a[width+1][j]=5*sin(pi*((double)(j-1)/n)*((double)(j-1)/n));
    if (id==0)
        for (int j=0; j<n+2; j++)
            a[0][j]=0.;
    double start_time, end_time;
    if (id==0)
        start_time=MPI_Wtime();
    MPI_Request reqs[2];
    //begin iteration
    for (int t=0; t<2; t++){
        startBC(n, width, a, b, id_before, id_after, t, reqs);
        /* interior rows */
        for (int row=2; row<width; row++)
            updateRow(n, width, a, b, row);
        finishBC(n, width, a, b, id_before, id_after, t, reqs);
        /* boundary rows */
        updateRow(n, width, a, b, 1);
        updateRow(n, width, a, b, width);
        for (int i=1; i<width+1; i++)
            for (int j=1; j<n+1; j++)
                a[i][j]=b[i][j];
    }
    //calculate the sum
    double sum=0.0;
    for (int i=1; i<width+1; i++)
        sum += a[i][i+start_row-1];
    double total_sum;
    MPI_Reduce(&sum, &total_sum,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
    if (id==0){
        end_time=MPI_Wtime();
        printf("time elapse is %f n", end_time-start_time);
        printf("at root processor %d, the calculated sum is %f, n", id, total_sum+5*sin(pi*((double)(n-1)/n)*((double)(n-1)/n)));
    }

    MPI_Finalize();
    return 0;
}