Boost mpi hangs on
Boost mpi hangs on
我有一个简单的异步消息程序,如下所示。我在两台 PC 上以 16 个进程运行它。这些进程按 4x4 矩阵排列并连接成一个环面(torus)。因此,在 main 函数的开头,您会看到每个进程的邻居列表。我要做的是实现一个检查点算法:当 i = 5 或 10(假设 i 是时间)时,每个进程都会向其所有邻居发送一条消息。当一个进程收到一条消息时,它会针对同一个发送方重新发起一个新的 irecv。但是如果收到的消息中 i 为 10,则不再发起 irecv,因为此后不会再收到任何消息。在程序结束时,所有进程等待尚未收到的消息。
/* Demo_01_Main.cpp */
#include <algorithm>
#include <deque>
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>

#include <boost/mpi.hpp>
#include <boost/serialization/string.hpp>
using namespace std;
/**
 * A small message exchanged between processes.
 *
 * Holds three integers: the receiver rank, the sender rank and a data value.
 * The class can be (de)serialized through Boost.Serialization via serialize().
 */
class Packet{
    friend class boost::serialization::access;
private:
    int receiver;
    int sender;
    int data;
public:
    /** Creates a packet whose receiver, sender and data are all 0. */
    Packet() : receiver(0), sender(0), data(0) {}

    /** Creates a packet with the given receiver, sender and data values. */
    Packet(int receiver, int sender, int data)
        : receiver(receiver), sender(sender), data(data) {}

    /** @return the data value carried by this packet. */
    int getData() const {
        return data;
    }
    /** Replaces the data value carried by this packet. */
    void setData(int data) {
        this->data = data;
    }
    /** @return the receiver stored in this packet. */
    int getReceiver() const {
        return receiver;
    }
    /** Replaces the receiver stored in this packet. */
    void setReceiver(int receiver) {
        this->receiver = receiver;
    }
    /** @return the sender stored in this packet. */
    int getSender() const {
        return sender;
    }
    /** Replaces the sender stored in this packet. */
    void setSender(int sender) {
        this->sender = sender;
    }

    /**
     * Boost.Serialization hook: streams the fields in the fixed order
     * receiver, sender, data. The version argument is not used.
     */
    template<class Archive>
    void serialize(Archive& ar, const unsigned int /*version*/) {
        ar & receiver;
        ar & sender;
        ar & data;
    }

    /**
     * @return a human-readable description of the form
     *         "Packet = [Data: D, Receiver: R, Sender: S]".
     */
    std::string toString() const {
        std::ostringstream out;
        out << "Packet = [Data: " << data << ", Receiver: " << receiver << ", Sender: " << sender << "]";
        return out.str();
    }
};
/** Rank of this process; selects which per-process log file log() writes to. */
int rank;

/**
 * Appends `str` verbatim (no newline is added) to the file "log_<rank>.txt"
 * in the current working directory, creating the file if necessary.
 *
 * The file is opened in append mode on every call and closed again when the
 * stream goes out of scope.
 *
 * @param str text to append to the log file.
 */
void log(const std::string& str){
    std::ostringstream logFileName;
    // Qualify the global explicitly: with `using namespace std;` in effect an
    // unqualified `rank` can be ambiguous with the std::rank type trait.
    logFileName << "log_" << ::rank << ".txt";
    std::ofstream outfile(logFileName.str().c_str(), std::ios_base::app);
    outfile << str;
}
int main(int argc, char* argv[]){
map<int, boost::mpi::request> mpiReceiveRequest;
map<int, boost::mpi::request> mpiSendRequest;
map<int, Packet *> receivedData;
vector<int> neighbors;
boost::mpi::environment env(argc, argv);
boost::mpi::communicator world;
rank = world.rank();
if(rank == 0){
neighbors.push_back(1);
neighbors.push_back(3);
neighbors.push_back(4);
neighbors.push_back(5);
neighbors.push_back(7);
neighbors.push_back(12);
neighbors.push_back(13);
neighbors.push_back(15);
}
else if(rank == 1){
neighbors.push_back(0);
neighbors.push_back(2);
neighbors.push_back(4);
neighbors.push_back(5);
neighbors.push_back(6);
neighbors.push_back(12);
neighbors.push_back(13);
neighbors.push_back(14);
}
else if(rank == 2){
neighbors.push_back(1);
neighbors.push_back(3);
neighbors.push_back(5);
neighbors.push_back(6);
neighbors.push_back(7);
neighbors.push_back(13);
neighbors.push_back(14);
neighbors.push_back(15);
}
else if(rank == 3){
neighbors.push_back(0);
neighbors.push_back(2);
neighbors.push_back(4);
neighbors.push_back(6);
neighbors.push_back(7);
neighbors.push_back(12);
neighbors.push_back(14);
neighbors.push_back(15);
}
else if(rank == 4){
neighbors.push_back(0);
neighbors.push_back(1);
neighbors.push_back(3);
neighbors.push_back(5);
neighbors.push_back(7);
neighbors.push_back(8);
neighbors.push_back(9);
neighbors.push_back(11);
}
else if(rank == 5){
neighbors.push_back(0);
neighbors.push_back(1);
neighbors.push_back(2);
neighbors.push_back(4);
neighbors.push_back(6);
neighbors.push_back(8);
neighbors.push_back(9);
neighbors.push_back(10);
}
else if(rank == 6){
neighbors.push_back(1);
neighbors.push_back(2);
neighbors.push_back(3);
neighbors.push_back(5);
neighbors.push_back(7);
neighbors.push_back(9);
neighbors.push_back(10);
neighbors.push_back(11);
}
else if(rank == 7){
neighbors.push_back(0);
neighbors.push_back(2);
neighbors.push_back(3);
neighbors.push_back(4);
neighbors.push_back(6);
neighbors.push_back(8);
neighbors.push_back(10);
neighbors.push_back(11);
}
else if(rank == 8){
neighbors.push_back(4);
neighbors.push_back(5);
neighbors.push_back(7);
neighbors.push_back(9);
neighbors.push_back(11);
neighbors.push_back(12);
neighbors.push_back(13);
neighbors.push_back(15);
}
else if(rank == 9){
neighbors.push_back(4);
neighbors.push_back(5);
neighbors.push_back(6);
neighbors.push_back(8);
neighbors.push_back(10);
neighbors.push_back(12);
neighbors.push_back(13);
neighbors.push_back(14);
}
else if(rank == 10){
neighbors.push_back(5);
neighbors.push_back(6);
neighbors.push_back(7);
neighbors.push_back(9);
neighbors.push_back(11);
neighbors.push_back(13);
neighbors.push_back(14);
neighbors.push_back(15);
}
else if(rank == 11){
neighbors.push_back(4);
neighbors.push_back(6);
neighbors.push_back(7);
neighbors.push_back(8);
neighbors.push_back(10);
neighbors.push_back(12);
neighbors.push_back(14);
neighbors.push_back(15);
}
else if(rank == 12){
neighbors.push_back(0);
neighbors.push_back(1);
neighbors.push_back(3);
neighbors.push_back(8);
neighbors.push_back(9);
neighbors.push_back(11);
neighbors.push_back(13);
neighbors.push_back(15);
}
else if(rank == 13){
neighbors.push_back(0);
neighbors.push_back(1);
neighbors.push_back(2);
neighbors.push_back(8);
neighbors.push_back(9);
neighbors.push_back(10);
neighbors.push_back(12);
neighbors.push_back(14);
}
else if(rank == 14){
neighbors.push_back(1);
neighbors.push_back(2);
neighbors.push_back(3);
neighbors.push_back(9);
neighbors.push_back(10);
neighbors.push_back(11);
neighbors.push_back(13);
neighbors.push_back(15);
}
else if(rank == 15){
neighbors.push_back(0);
neighbors.push_back(2);
neighbors.push_back(3);
neighbors.push_back(8);
neighbors.push_back(10);
neighbors.push_back(11);
neighbors.push_back(12);
neighbors.push_back(14);
}
for(int i=0; i<8; i++){
Packet * packet = new Packet();
receivedData[neighbors[i]] = packet;
mpiReceiveRequest[neighbors[i]] = world.irecv(neighbors[i], 100, *packet);
}
for(int i=1; i<=10; i++){
if(i%5 == 0){ // Checkpoint time
for(int j=0; j<8; j++){
Packet * p = new Packet(neighbors[j], rank, i);
mpiSendRequest[neighbors[j]] = world.isend(neighbors[j], 100, *p);
log("Sending: ");
log(p->toString());
log("n");
}
}
for(int j=0; j<8; j++){
if(mpiReceiveRequest[neighbors[j]].test()){
Packet * p = receivedData[neighbors[j]];
log("Received: ");
log(receivedData[neighbors[j]]->toString());
log("n");
if(p->getData() != 10){
Packet * packet = new Packet();
receivedData[neighbors[j]] = packet;
mpiReceiveRequest[neighbors[j]] = world.irecv(neighbors[j], 100, *packet);
}
}
}
}
for(int i=0; i<8; i++){
stringstream ss;
ss << " Wait from: " << neighbors[i] << endl;
log(ss.str());
mpiReceiveRequest[neighbors[i]].wait();
log("Received: ");
log(receivedData[neighbors[i]]->toString());
log("n");
}
stringstream ss;
ss << rank << " is done" << endl;
log(ss.str());
return 0;
}
问题是程序在调用 wait 时挂起。此外,一些接收到的消息包含无意义的数据。例如,进程 7 的输出文件如下:
Received: Packet = [Data: 5, Receiver: 7, Sender: 10]
Received: Packet = [Data: 5, Receiver: 7, Sender: 11]
Received: Packet = [Data: 5, Receiver: 7, Sender: 0]
Received: Packet = [Data: 5, Receiver: 7, Sender: 4]
Sending: Packet = [Data: 5, Receiver: 0, Sender: 7]
Sending: Packet = [Data: 5, Receiver: 2, Sender: 7]
Sending: Packet = [Data: 5, Receiver: 3, Sender: 7]
Sending: Packet = [Data: 5, Receiver: 4, Sender: 7]
Sending: Packet = [Data: 5, Receiver: 6, Sender: 7]
Sending: Packet = [Data: 5, Receiver: 8, Sender: 7]
Sending: Packet = [Data: 5, Receiver: 10, Sender: 7]
Sending: Packet = [Data: 5, Receiver: 11, Sender: 7]
Wait from: 0
Received: Packet = [Data: 537985024, Receiver: 0, Sender: 0]
Wait from: 2
我不知道问题出在哪里
问题出在那些在调用 wait 之前已经被 test() 检测为完成的请求上:对这些已完成的请求再次调用 wait 会导致程序挂起。实际上,我意识到这也是我之前另一个问题(“等待已经完成的请求”)的原因。
所以,我把代码修改如下,现在它可以正常运行了:/* ...
else if(rank == 15){
neighbors.push_back(0);
neighbors.push_back(2);
neighbors.push_back(3);
neighbors.push_back(8);
neighbors.push_back(10);
neighbors.push_back(11);
neighbors.push_back(12);
neighbors.push_back(14);
}
...After assigning neighbors
*/
// Excerpt: the tail of the revised main(); the declarations it relies on are
// not shown here.
//
// Post one non-blocking receive per neighbour. The request is stored under the
// neighbour's position i, while the receive buffer is stored under the
// neighbour's rank.
for(int i=0; i<8; i++){
Packet * packet = new Packet();
receivedData[neighbors[i]] = packet;
mpiReceiveRequest[i] = world.irecv(neighbors[i], 100, *packet);
}
// Positions whose receive request has completed and has not been re-posted.
vector<int> completed;
for(int i=1; i<=10; i++){
if(i%5 == 0){ // Checkpoint time
for(int j=0; j<8; j++){
Packet * p = new Packet(neighbors[j], rank, i);
// Each send request is appended, so earlier requests are not overwritten.
// NOTE(review): push_back implies mpiSendRequest is a sequence container
// here; its declaration is not part of this excerpt.
mpiSendRequest.push_back(world.isend(neighbors[j], 100, *p));
log("Sending: ");
log(p->toString());
// NOTE(review): "n" is presumably a mangled "\n" - confirm.
log("n");
}
}
for(int j=0; j<8; j++){
// Linear search: has position j already been recorded as completed?
vector<int>::iterator it = completed.begin();
bool passed = false;
while(it != completed.end()){
if(*it == j){
passed = true;
break;
}
it++;
}
// Only requests that are not recorded as completed are tested.
if(!passed){
if(mpiReceiveRequest[j].test()){
completed.push_back(j);
Packet * p = receivedData[neighbors[j]];
if(p->getData() != 10){
// Not the final packet: post a new receive for this neighbour and undo
// the push_back above, so position j counts as outstanding again.
Packet * packet = new Packet();
receivedData[neighbors[j]] = packet;
mpiReceiveRequest[j] = world.irecv(neighbors[j], 100, *packet);
completed.pop_back();
}
}
}
}
}
// Final phase: wait only on the requests that are not recorded as completed.
vector<boost::mpi::request> reqs;
for(int i=0; i<8; i++){
vector<int>::iterator it = completed.begin();
bool passed = false;
while(it != completed.end()){
if(*it == i){
passed = true;
break;
}
it++;
}
if(!passed){
mpiReceiveRequest[i].wait();
// The request is also copied into reqs; reqs is not read again in this excerpt.
reqs.push_back(mpiReceiveRequest[i]);
}
}
stringstream ss;
ss << rank << " is done" << endl;
log(ss.str());
return 0;
} // End of main
主要的区别是把已经完成的请求记录在一个 vector 中,并且在最后不再对这些请求调用 wait。
相关文章:
- 瓦尔格林德:数学函数"Conditional jump or move depends on uninitialised value(s)"
- 循环挂起迭代的 std::擦除 on std::list
- SIGSEGV on Boost UDP 套接字关闭 - tcache_get at malloc.c.
- CPP 中的瓦尔格林德和记忆泄漏:"Conditional jump or move depends on uninitialised values"
- std::bind on statd::array 的运算符 []
- 您将如何连接"on the fly"文本+整数并将其传递给函数?
- 如何修复输出日志中的"EnableInput can only be specified on a Pawn for its Controller"错误
- VS2019 - Sudo Remote Debugging on Linux with Cmake project
- C++ library with Tensorflow on Android
- SFML 交叉编译 for Windows on Linux.
- How to recover system gcc compiler on centos 6
- Cmake with Flex/Bison on windows
- 按原样保存用户输入 - Ruby on Rails
- OpenAL C++ on Linux
- Boost::process on Windows - with MinGW?
- CMake on FindOpenGL.cmake 中的错误,当使用导入的目标 OpenGL::GLU?
- Utilization of atomic_flag on C++
- XInputGetState hangs
- anaconda cythonize C++ on windows 10
- Boost mpi hangs on