CombBLAS: Hang in MPI_Barrier when using HWPM together with 3D factorisation
I encountered an issue when using SuperLU_dist's 3D factorisation (e.g., via the pddrive3d driver) together with CombBLAS for heavy-weight perfect matching (HWPM). The problem arises whenever the depth of the process grid is greater than one, because in that case only a subset of the MPI processes (namely those on layer zero in the z-direction) call into CombBLAS to perform the reordering.
The following lines from pdgssvx3d.c are relevant:
https://github.com/xiaoyeli/superlu_dist/blob/98695a40807ab962cc7d7ec140fa65a8d485bb31/SRC/pdgssvx3d.c#L632 https://github.com/xiaoyeli/superlu_dist/blob/98695a40807ab962cc7d7ec140fa65a8d485bb31/SRC/pdgssvx3d.c#L969
Unfortunately, parts of CombBLAS perform barrier synchronisation on MPI_COMM_WORLD instead of the MPI communicator that was passed in as an argument. Since the processes outside layer zero never enter CombBLAS, those barriers can never complete, and SuperLU_dist simply hangs with a stack backtrace like this (a minimal standalone reproducer of the same pattern follows the backtrace):
#0 0x0000155524153547 in sched_yield () from /lib64/libc.so.6
#1 0x000015552578e077 in MPIR_Wait_impl.part.0 () from /opt/cray/pe/lib64/libmpi_gnu_91.so.12
#2 0x000015552652e3a6 in MPIC_Wait () from /opt/cray/pe/lib64/libmpi_gnu_91.so.12
#3 0x0000155526540c11 in MPIC_Sendrecv () from /opt/cray/pe/lib64/libmpi_gnu_91.so.12
#4 0x0000155526448a2c in MPIR_Allreduce_intra_recursive_doubling () from /opt/cray/pe/lib64/libmpi_gnu_91.so.12
#5 0x0000155524a290b1 in MPIR_Allreduce_intra_auto () from /opt/cray/pe/lib64/libmpi_gnu_91.so.12
#6 0x0000155524a29295 in MPIR_Allreduce_impl () from /opt/cray/pe/lib64/libmpi_gnu_91.so.12
#7 0x000015552672db84 in MPIR_CRAY_Allreduce () from /opt/cray/pe/lib64/libmpi_gnu_91.so.12
#8 0x000015552656d60c in MPIR_Get_contextid_sparse_group () from /opt/cray/pe/lib64/libmpi_gnu_91.so.12
#9 0x000015552656b51c in MPII_Comm_copy () from /opt/cray/pe/lib64/libmpi_gnu_91.so.12
#10 0x0000155524cedd21 in MPIR_Comm_dup_impl () from /opt/cray/pe/lib64/libmpi_gnu_91.so.12
#11 0x000015552656f890 in MPIDI_CRAY_Setup_Shared_Mem_Coll () from /opt/cray/pe/lib64/libmpi_gnu_91.so.12
#12 0x000015552672e983 in MPIR_CRAY_Barrier () from /opt/cray/pe/lib64/libmpi_gnu_91.so.12
#13 0x0000155524b16ab0 in MPIR_Barrier () from /opt/cray/pe/lib64/libmpi_gnu_91.so.12
#14 0x0000155524b16fef in PMPI_Barrier () from /opt/cray/pe/lib64/libmpi_gnu_91.so.12
#15 0x0000155554fd2cd2 in combblas::WeightedGreedy<combblas::SpParMat<long, double, combblas::SpCCols<long, double> >, long> (A=..., mateRow2Col=..., mateCol2Row=...,
degCol=...) at /global/common/software/m2957/simber/emimm/x86-milan_gnu_libsci-1.0/include/CombBLAS/BipartiteMatchings/BPMaximalMatching.h:293
#16 0x0000155554fddf60 in combblas::AWPM<long, double> (A1=..., mateRow2Col=..., mateCol2Row=..., optimizeProd=optimizeProd@entry=true)
at /global/common/software/m2957/simber/emimm/x86-milan_gnu_libsci-1.0/include/CombBLAS/BipartiteMatchings/ApproxWeightPerfectMatching.h:1191
#17 0x0000155554f98a78 in dGetHWPM (A=<optimized out>, grid=<optimized out>, ScalePermstruct=0x7fffffff5160)
at /global/common/software/m2957/simber/emimm/x86-milan_gnu_libsci-1.0/src/superlu_dist-8.1.2/SRC/dHWPM_CombBLAS.hpp:122
#18 0x0000155554f990f9 in d_c2cpp_GetHWPM (A=A@entry=0x7fffffff5130, grid=grid@entry=0x7fffffff5208, ScalePermstruct=ScalePermstruct@entry=0x7fffffff5160)
at /global/common/software/m2957/simber/emimm/x86-milan_gnu_libsci-1.0/src/superlu_dist-8.1.2/SRC/d_c2cpp_GetHWPM.cpp:56
#19 0x0000155554f84ae9 in pdgssvx3d (options=0x7fffffff5260, A=0x7fffffff5130, ScalePermstruct=0x7fffffff5160, B=0x5fd1080, ldb=20573, nrhs=1, grid3d=0x7fffffff51e0,
LUstruct=0x7fffffff5110, SOLVEstruct=0x7fffffff5190, berr=0xb323c0, stat=0x7fffffff5310, info=0x7fffffff50d4)
at /global/common/software/m2957/simber/emimm/x86-milan_gnu_libsci-1.0/src/superlu_dist-8.1.2/SRC/pdgssvx3d.c:969
#20 0x000000000040278f in main (argc=<optimized out>, argv=<optimized out>)
at /global/common/software/m2957/simber/emimm/x86-milan_gnu_libsci-1.0/src/superlu_dist-8.1.2/EXAMPLE/pddrive3d.c:443
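The root cause can be demonstrated independently of SuperLU_dist and CombBLAS. Below is a minimal sketch of the same pattern; the layer assignment and the names buggy_matching and npdep are purely illustrative and do not reflect SuperLU_dist's actual grid mapping.

#include <mpi.h>
#include <cstdio>

// Stand-in for a CombBLAS routine that receives a sub-communicator but
// (incorrectly) synchronises on MPI_COMM_WORLD, like the unpatched code.
static void buggy_matching(MPI_Comm comm)
{
    (void)comm;                    // the passed-in communicator is ignored
    MPI_Barrier(MPI_COMM_WORLD);   // wrong: ranks outside layer 0 never call this -> hang
    // MPI_Barrier(comm);          // correct: only the participating ranks are required
}

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int myrank, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    const int npdep = 2;            // grid depth > 1 triggers the problem
    int layer = myrank % npdep;     // illustrative layer assignment

    MPI_Comm layer_comm;
    MPI_Comm_split(MPI_COMM_WORLD, layer, myrank, &layer_comm);

    if (layer == 0)                 // only layer 0 enters the "matching" code
        buggy_matching(layer_comm);

    MPI_Comm_free(&layer_comm);
    std::printf("rank %d done\n", myrank);  // layer-0 ranks never get here with >= 2 ranks
    MPI_Finalize();
    return 0;
}

Run with at least two ranks (e.g. mpirun -np 4): the layer-0 ranks block in the barrier while the remaining ranks skip it, which is exactly the situation SuperLU_dist ends up in when the grid depth is greater than one.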
The following patch can be applied to CombBLAS 1.6.2 to fix the issue:
--- BipartiteMatchings/BPMaximalMatching.h.orig 2022-10-20 19:25:37.085532000 -0700
+++ BipartiteMatchings/BPMaximalMatching.h 2022-10-20 19:28:39.813003451 -0700
@@ -27,9 +27,10 @@
{
typedef VertexTypeML < IT, IT> VertexType;
+ MPI_Comm comm = A.getcommgrid()->GetWorld();
int nprocs, myrank;
- MPI_Comm_size(MPI_COMM_WORLD,&nprocs);
- MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
+ MPI_Comm_size(comm,&nprocs);
+ MPI_Comm_rank(comm,&myrank);
int nthreads = 1;
#ifdef _OPENMP
#pragma omp parallel
@@ -77,7 +78,7 @@
cout << "=======================================================\n";
}
#endif
- MPI_Barrier(MPI_COMM_WORLD);
+ MPI_Barrier(comm);
while(curUnmatchedCol !=0 && curUnmatchedRow!=0 && newlyMatched != 0 )
@@ -179,7 +180,7 @@
#endif
curUnmatchedCol = unmatchedCol.getnnz();
curUnmatchedRow = unmatchedRow.getnnz();
- MPI_Barrier(MPI_COMM_WORLD);
+ MPI_Barrier(comm);
}
@@ -251,8 +252,9 @@
#endif
PreAllocatedSPA<VertexType> SPA(A.seq(), nthreads*4);
int nprocs, myrank;
- MPI_Comm_size(MPI_COMM_WORLD,&nprocs);
- MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
+ MPI_Comm comm = A.getcommgrid()->GetWorld();
+ MPI_Comm_size(comm,&nprocs);
+ MPI_Comm_rank(comm,&myrank);
double tStart = MPI_Wtime();
@@ -290,7 +292,7 @@
cout << "=======================================================\n";
}
#endif
- MPI_Barrier(MPI_COMM_WORLD);
+ MPI_Barrier(comm);
while(curUnmatchedCol !=0 && curUnmatchedRow!=0 && newlyMatched != 0 )
@@ -350,7 +352,7 @@
#endif
curUnmatchedCol = unmatchedCol.getnnz();
curUnmatchedRow = unmatchedRow.getnnz();
- MPI_Barrier(MPI_COMM_WORLD);
+ MPI_Barrier(comm);
}
@@ -404,7 +406,8 @@
{
typedef VertexTypeML < IT, IT> VertexType;
int myrank;
- MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
+ MPI_Comm comm = A.getcommgrid()->GetWorld();
+ MPI_Comm_rank(comm,&myrank);
FullyDistSpVec<IT, IT> fringeRow(A.getcommgrid(), A.getnrow());
FullyDistSpVec<IT, IT> fringeCol(A.getcommgrid(), A.getncol());
FullyDistSpVec<IT, IT> unmatchedRow(mateRow2Col, [](IT mate){return mate==-1;});
@azadcse, @aydinbuluc: Can you please take a look at this bug report and the proposed patch in BipartiteMatchings/BPMaximalMatching.h?
@jamtrott's patch refers to v1.6.2, but I checked the latest v2.0 and there are still a few places using MPI_COMM_WORLD, so his fixes should also be applied to v2.0.
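For what it's worth, the replacement is mechanical: every kernel should take its communicator from the matrix's communication grid rather than MPI_COMM_WORLD. A minimal sketch of the idiom, assuming only the getcommgrid()/GetWorld() accessors that the patch above already relies on (the function name and template parameter are illustrative):

#include <mpi.h>

// Any kernel that receives a distributed matrix can obtain the right
// communicator from the matrix's communication grid instead of assuming
// that all world ranks participate.
template <class ParMat>
void kernel_on_matrix_comm(const ParMat &A)
{
    MPI_Comm comm = A.getcommgrid()->GetWorld();  // communicator the matrix lives on

    int nprocs, myrank;
    MPI_Comm_size(comm, &nprocs);
    MPI_Comm_rank(comm, &myrank);

    // ... all collectives use comm, never MPI_COMM_WORLD ...
    MPI_Barrier(comm);  // completes even when comm is a strict subset of MPI_COMM_WORLD
}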