不同数量的处理器最终会得到不同的结果
Different number of processors end up with different results?
我是MPI的新手。试图近似求解一个PDE.一个1000乘1000的数组。除了第一行和最后一行之外,在每次迭代时,每个元素都被更新为其8个邻居的平均值。
我的代码运行,但在小数点后三位的结果与使用不同数量的处理器略有不同。我想我的通信正在失去准确性?由于C++逐行存储数组,所以我按行对大数组进行分区。
这是代码。
#include <iostream>
#include <mpi.h>
#include <cmath>
#include <stdio.h>
#include <stdlib.h>
int main(int argc, const char * argv[])
{
// Initialize the MPI environment
MPI_Init(NULL,NULL);
int p;
MPI_Comm_size(MPI_COMM_WORLD, &p);
int id;
MPI_Comm_rank(MPI_COMM_WORLD, &id);
const double pi = 3.1415926;
int n;
n=atoi(argv[argc-1]);
//calculate the starting and ending column indices
int m=floor((n-2)/p);
int r=n-2-m*p;
//dividing row 1 to row n-2 by rows since in c/c++ arrays are stored in row-wise
//therefore n-2=(m+1)*r+m*(p-r)
//the first r processors get m+1 rows, the rest get m rows
//starting row, ending row in the original A, width of matrix
int start_row, end_row, width;
if (id<=r-1){
start_row=1+id*(m+1);
end_row=start_row+m;
width=m+1;
}
else {
start_row=1+r*(m+1)+(id-r)*m;
end_row=start_row+m-1;
width=m;
}
//printf("mpi debug 1");
printf("on processor %d, starting row is %d, ending row is %d n",id,start_row,end_row);
//id of the processor before and after
//id_before is not significant for id==0
//id_after is not
int id_before, id_after;
if (id==0)
id_before=p-1;
else
id_before=id-1;
if (id==p-1)
id_after=0;
else
id_after=id+1;
//printf("debug000");
//initialize the local matrix
//**** better way to initialize?
double a[width][n], b[width][n];
for (int i=0; i<width; i++)
for (int j=0; j<n; j++)
a[i][j]=0.5;
//two 1d arrays to store the halo cells
double halo_before[n], halo_after[n];
if (id==0){
for (int j=0; j<n; j++){
halo_before[j]=0.0;
}
}
if (id==p-1){
for (int j=0; j<n; j++) {
halo_after[j]=5*sin(M_PI*((double)j/n)*((double)j/n));
}
}
MPI_Barrier(MPI_COMM_WORLD);
//std::cout << " the sin function is" << 5*sin(M_PI*((double)1/1)*((double)1/2)) << "n" <<std::endl;
//set id=0 to be the root processor and call
double start_time, end_time;
if (id==0)
start_time=MPI_Wtime();
MPI_Status status_before, status_after;
MPI_Request request_before, request_after;
/////////////////////////////////////////////////////////
//to dubug, print out arrays
//std::cout<< " the array on processor " << id <<" to start is n" << std::endl;
//for (int i=0; i<width; i++){
// for (int j=0; j<n; j++){
// std::cout << a[i][j] << " ";
// if (j==n-1)
// std::cout << "n" <<std::endl;
// }
//}
//to debug print out halos
//std::cout << "halo_before on processor " << id << " to start with isn" << std::endl;
//for (int i=0;i<n;i++){
// std::cout << halo_before[i] << " ";
// if (i==n-1)
// std::cout <<"n" <<std::endl;
//}
//std::cout << "halo_after on processor " << id << " to start with isn" << std::endl;
//for (int i=0;i<n;i++){
// std::cout << halo_after[i] << " ";
// if (i==n-1)
// std::cout <<"n" <<std::endl;
//}
//////////////////////////////////////////////////////
//begin iteration
for (int t=0; t<500; t++){
//unblocking send
//send first row to id_before:
//how should I use tag?
if (id>0){
MPI_Isend(&a[0][0], n, MPI_DOUBLE, id_before, t , MPI_COMM_WORLD, &request_before);
}
if (id<p-1){
//send the last row to id_after
MPI_Isend(&a[width-1][0], n, MPI_DOUBLE, id_after, t, MPI_COMM_WORLD, &request_after);
}
//printf("dubug0");
//update the entries that do not depend on halos
//local row=1 to row=width-2
//add if (width>3)??
int j_b, j_a;
for (int i=1; i<width-1; i++){
for (int j=0; j<n; j++){
j_b=(n+j-1)%n;
j_a=(j+1)%n;
b[i][j]=(a[i-1][j_b]+a[i-1][j]+a[i-1][j_a]+a[i][j_b]+a[i][j_a]+a[i+1][j_b]+a[i+1][j]+a[i+1][j_a])/8;
}
}
//printf("dubug1");
//blocking receive
//may consider unblocking receive
//receive from id_before and store in halo_before
//not sure about status
if (id>0){
MPI_Recv(&halo_before[0], n, MPI_DOUBLE, id_before ,t , MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
//receive from id_after and store in halo_after
if (id<p-1){
MPI_Recv(&halo_after[0], n, MPI_DOUBLE, id_after,t , MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
//to debug print out halos
//std::cout << "halo_before on processor " << id << " at iteration " << t<< " isn" <<std::endl;
//for (int i=0;i<n;i++){
// std::cout << halo_before[i] << " ";
// if (i==n-1)
// std::cout <<"n" <<std::endl;
//}
//std::cout << "halo_after on processor " << id << " at iteration " << t<< " isn" <<std::endl;
//for (int i=0;i<n;i++){
// std::cout << halo_after[i] << " ";
// if (i==n-1)
// std::cout <<"n" <<std::endl;
//}
//update entries that depend on halos
//bugs here, what if width==1???
if (width==1){
for (int j=0; j<n; j++){
j_a=(n+j-1)%n;
j_b=(j+1)%n;
b[0][j]=(halo_before[j_b]+halo_before[j]+halo_before[j_a]+a[0][j_b]+a[0][j_a]+halo_after[j_b]+halo_after[j]+halo_after[j_a])/8;
}
}
else{
for (int j=0; j<n; j++){
j_a=(n+j-1)%n;
j_b=(j+1)%n;
b[0][j]=(halo_before[j_b]+halo_before[j]+halo_before[j_a]+a[0][j_b]+a[0][j_a]+a[1][j_b]+a[1][j]+a[1][j_a])/8;
b[width-1][j]=(a[width-2][j_b]+a[width-2][j]+a[width-2][j_b]+a[width-1][j_b]+a[width-1][j_a]+halo_after[j_a]+halo_after[j]+halo_after[j_b])/8;
}
}
//copy to b
//but make sure the send have been completed
if (id>0)
MPI_Wait(&request_before,MPI_STATUS_IGNORE);
if (id<p-1)
MPI_Wait(&request_after,MPI_STATUS_IGNORE);
for (int i=0; i<width; i++)
for (int j=0; j<n; j++)
a[i][j]=b[i][j];
//to dubug, print out arrays
//std::cout<< " the array on processor " << id <<" at iteration " << t <<" is n"<< std::endl;
//for (int i=0; i<width; i++){
// for (int j=0; j<n; j++){
// std::cout << a[i][j] << " ";
// if (j==n-1)
// std::cout << "n" <<std::endl;
// }
//}
}
//calculate the sum
double sum=0.0;
for (int i=0; i<width; i++)
sum += a[i][i+start_row];
double total_sum;
//send to root processor
MPI_Reduce(&sum, &total_sum,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);
if (id==0){
end_time=MPI_Wtime();
//double sum_receive[p];
//double sum_calc;
//for (int i=0; i<p; i++){
// MPI_Recv(&sum_receive[i], 1, MPI_DOUBLE, i, i, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
// sum_calc += sum_receive[i];
//}
printf("time elapse is %f n", end_time-start_time);
printf("at root processor %d, the calculated sum is %f, n", id, total_sum+5*sin(M_PI*((double)(n-1)/n)*((double)(n-1)/n)));
}
MPI_Finalize();
return 0;
}
您的一行代码中有一个简单的拼写错误。此:
b[width-1][j]=(a[width-2][j_b]+a[width-2][j]+a[width-2][j_b]+a[width-1][j_b]+a[width-1][j_a]+halo_after[j_a]+halo_after[j]+halo_after[j_b])/8;
应该是这样的(注意第4个术语;j_a
,而不是第二个j_b
):
b[width-1][j]=(a[width-2][j_b]+a[width-2][j]+a[width-2][j_a]+a[width-1][j_b]+a[width-1][j_a]+halo_after[j_a]+halo_after[j]+halo_after[j_b])/8;
因为这发生在每个域的最后一行,所以导致的确切错误量将取决于域边界,例如,您有多少处理器。
现在,在您发布的代码中出现这种错误基本上是不可避免的原因是,经过一些修改,相同的计算(从a到b)发生不少于3次。这是一颗滴答作响的定时炸弹;最终一个会被更新,而其他的会变得与第一个不一致,或者对其他的更新最终会出现一些错误。
这里有很多方法可以减少复制量,既可以澄清代码,也可以避免这类错误。最好是将光晕合并到a和b阵列中,在数据之前和之后添加一行额外的行以包括数据-这样,您就不必担心宽度是否为1,并单独处理结束行。此外,定义一个基于a更新b的行或元素的函数,并使用该函数而不是重复代码。
下面是一个清理代码的例子,它将位封装到例程中,将n和宽度与包含的光晕区域等保持一致。
#include <mpi.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
int min2i(int a, int b) {
int result = a;
if (b < a) result = b;
return result;
}
void decomposition(const int n, const int nprocs, const int id,
int *start_row, int *width, int *id_before, int *id_after) {
const int nrows = n;
const int m = nrows/nprocs;
const int r = nrows % nprocs;
*width = m;
if (id < r) (*width)++;
*start_row = 1 + id*m + min2i(id,r);
*id_before = (id > 0 ? id-1 : MPI_PROC_NULL);
*id_after = (id < nprocs-1 ? id+1 : MPI_PROC_NULL);
}
void startBC(const int n, const int width, double a[][n+2], double b[][n+2],
const int id_before, const int id_after, const int t, MPI_Request *req) {
MPI_Isend(&a[1][1], n, MPI_DOUBLE, id_before, 2*t , MPI_COMM_WORLD, &req[0]);
MPI_Isend(&a[width][1], n, MPI_DOUBLE, id_after , 2*t+1, MPI_COMM_WORLD, &req[1]);
}
void finishBC(const int n, const int width, double a[][n+2], double b[][n+2],
const int id_before, const int id_after, const int t, MPI_Request *req) {
MPI_Status stats[2];
MPI_Recv(&a[0][1], n, MPI_DOUBLE, id_before, 2*t+1, MPI_COMM_WORLD, &stats[0]);
MPI_Recv(&a[width+1][1], n, MPI_DOUBLE, id_after, 2*t , MPI_COMM_WORLD, &stats[1]);
for (int i=0; i<width+2; i++) {
a[i][0] = a[i][n];
a[i][n+1] = a[i][1];
}
MPI_Waitall(2, req, stats);
}
void updateRow(const int n, const int width, double a[][n+2], double b[][n+2], const int row) {
for (int j=1; j<=n; j++)
b[row][j]=( a[row-1][j-1] + a[row-1][j] + a[row-1][j+1]
+a[ row ][j-1] + a[ row ][j+1]
+a[row+1][j-1] + a[row+1][j] + a[row+1][j+1])/8;
}
int main(int argc, const char * argv[])
{
int p, id;
MPI_Init(NULL,NULL);
MPI_Comm_size(MPI_COMM_WORLD, &p);
MPI_Comm_rank(MPI_COMM_WORLD, &id);
const double pi = 3.1415926;
int n=atoi(argv[argc-1]);
int width, start_row, id_before, id_after;
decomposition(n, p, id, &start_row, &width, &id_before, &id_after);
double a[width+2][n+2], b[width+2][n+2];
for (int i=0; i<width+2; i++)
for (int j=0; j<n+2; j++)
a[i][j]=0.5;
if (id==p-1)
for (int j=0; j<n+2; j++)
a[width+1][j]=5*sin(pi*((double)(j-1)/n)*((double)(j-1)/n));
if (id==0)
for (int j=0; j<n+2; j++)
a[0][j]=0.;
double start_time, end_time;
if (id==0)
start_time=MPI_Wtime();
MPI_Request reqs[2];
//begin iteration
for (int t=0; t<2; t++){
startBC(n, width, a, b, id_before, id_after, t, reqs);
/* interior rows */
for (int row=2; row<width; row++)
updateRow(n, width, a, b, row);
finishBC(n, width, a, b, id_before, id_after, t, reqs);
/* boundary rows */
updateRow(n, width, a, b, 1);
updateRow(n, width, a, b, width);
for (int i=1; i<width+1; i++)
for (int j=1; j<n+1; j++)
a[i][j]=b[i][j];
}
//calculate the sum
double sum=0.0;
for (int i=1; i<width+1; i++)
sum += a[i][i+start_row-1];
double total_sum;
MPI_Reduce(&sum, &total_sum,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
if (id==0){
end_time=MPI_Wtime();
printf("time elapse is %f n", end_time-start_time);
printf("at root processor %d, the calculated sum is %f, n", id, total_sum+5*sin(pi*((double)(n-1)/n)*((double)(n-1)/n)));
}
MPI_Finalize();
return 0;
}
相关文章:
- 为什么"do while"循环不断退出,即使条件计算结果为 false?
- #定义c-预处理器常量..我做错了什么
- valgrind-hellgrind与泄漏检查的结果不同
- 预处理器:插入结构名称中的前一个行号
- 如何在c++中实现处理器调度模拟器
- 用C++20 fmt限制结果的总大小
- 如何返回一个类的两个对象相加的结果
- 使用QProcess执行命令,并将结果存储在QStringList中
- 如果我std::dynamic_pointer_cast并且底层dynamic_cast的结果为null,那么返回的sh
- C/C++预处理器是否可以检测一些编译器选项
- 在没有定义返回类型的函数中返回布尔值,并将结果保存在无错误的char编译中-为什么
- 序列化,没有库的整数,得到奇怪的结果
- 使用取消引用的指针的多态性会产生意外的结果.为什么?
- 要与"if constexpr"一起使用的编译时消息(在预处理器之后)
- 在更改for循环的第三部分后,未使用for循环结果
- 在clang++预处理器中确定gcc工具链版本
- 如何在预处理器时追加到__FILE__并包含结果
- 是否可以查看"C++"预处理器的结果
- 不同数量的处理器最终会得到不同的结果
- 将处理器结果保存到MPI中的一个数组中