ompi
ompi copied to clipboard
Open MPI hangs on MPI_Comm_split_type
Thank you for taking the time to submit an issue!
Background information
What version of Open MPI are you using? (e.g., v3.0.5, v4.0.2, git branch name and hash, etc.)
$ mpiexec --version
mpiexec (Open MPI) 5.0.0
Report bugs to https://www.open-mpi.org/community/help/
Describe how Open MPI was installed (e.g., from a source/distribution tarball, from a git clone, from an operating system distribution package, etc.)
curl -O https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-5.0.0.tar.bz2
tar -jxf openmpi-5.0.0.tar.bz2
export PATH=/localdisk/yigoshev/mpi/openmpi-5.0.0-built/bin:$PATH
cd openmpi-5.0.0/
./configure --prefix=<path_to_ompi>
make -j44 all
pip install sphinx_rtd_theme # for some reason openmpi requires this package to install
pip install recommonmark # for some reason openmpi requires this package to install
make -j44 all
make install
export PATH=<path_to_ompi>/bin:$PATH
pip install --no-cache-dir mpi4py
If you are building/installing from a git clone, please copy-n-paste the output from git submodule status.
Please describe the system on which you are running
- Operating system/version:
$ cat /etc/os-release
PRETTY_NAME="Ubuntu 22.04.3 LTS"
NAME="Ubuntu"
VERSION_ID="22.04"
VERSION="22.04.3 LTS (Jammy Jellyfish)"
VERSION_CODENAME=jammy
ID=ubuntu
ID_LIKE=debian
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
UBUNTU_CODENAME=jammy
- Computer hardware: Intel(R) Xeon(R) Platinum 8276L CPU @ 2.20GHz
- Network type: tcp
Details of the problem
Please describe, in detail, the problem that you are having, including the behavior you expect to see, the actual behavior that you are seeing, steps to reproduce the problem, etc. It is most helpful if you can attach a small program that a developer can use to reproduce your problem.
We see Open MPI hanging on MPI_Comm_split_type while we expect it to work.
Note: If you include verbatim output (or a code block), please use a GitHub Markdown code block like below:
C example
#include <stdio.h>
#include <stdlib.h>
#include <sys/param.h>
#include <sys/types.h>
#include <unistd.h>
#include <mpi.h>
/*
 * Reproducer: parent spawns 8 children with MPI_Comm_spawn, merges the
 * intercommunicator, disconnects, then all ranks call
 * MPI_Comm_split_type(MPI_COMM_TYPE_SHARED) on the merged intracomm,
 * which hangs. Run as: mpiexec -n 1 reproducer
 */
int main(int argc, char *argv[])
{
    int rc;
    MPI_Comm parent, child, intracomm;
    int rank, size;
    char hostname[1024];
    pid_t pid;
    char *env_rank, *env_nspace;
    MPI_Info info;

    /* PMIx environment is populated by the launcher before MPI_Init;
     * either variable may be absent (e.g. singleton launch), so guard
     * against NULL before printing. */
    env_rank = getenv("PMIX_RANK");
    env_nspace = getenv("PMIX_NAMESPACE");
    pid = getpid();
    gethostname(hostname, sizeof hostname);
    hostname[sizeof hostname - 1] = '\0'; /* gethostname may not terminate on truncation */
    printf("[%s:%s pid %ld] starting up on node %s!\n",
           env_nspace ? env_nspace : "(none)",
           env_rank ? env_rank : "(none)",
           (long) pid, hostname);

    MPI_Init(NULL, NULL);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    printf("%d completed MPI_Init\n", rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_get_parent(&parent);

    /* If we get COMM_NULL back, then we're the parent */
    if (MPI_COMM_NULL == parent) {
        pid = getpid();
        printf("Parent [pid %ld] about to spawn!\n", (long) pid);
        MPI_Info_create(&info);
        rc = MPI_Comm_spawn(argv[0], MPI_ARGV_NULL, 8, info, 0, MPI_COMM_WORLD, &child,
                            MPI_ERRCODES_IGNORE);
        MPI_Info_free(&info); /* no longer needed once spawn returns */
        /* Validate the spawn BEFORE using 'child': on failure the
         * intercommunicator is not usable. */
        if (MPI_SUCCESS != rc) {
            printf("Child failed to spawn\n");
            return rc;
        }
        /* low=0: parent ranks come first in the merged intracomm */
        MPI_Intercomm_merge(child, 0, &intracomm);
        printf("Parent done with spawn\n");
        MPI_Comm_disconnect(&child);
        printf("Parent disconnected\n");
    }
    /* Otherwise, we're the child */
    else {
        printf("In Child process\n");
        /* high=1: child ranks are ordered after the parent's */
        MPI_Intercomm_merge(parent, 1, &intracomm);
        MPI_Comm_disconnect(&parent);
        printf("Child %d disconnected\n", rank);
    }

    MPI_Barrier(intracomm);

    /* This is the call that hangs. */
    MPI_Comm new_comm;
    MPI_Comm_split_type(intracomm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &new_comm);

    MPI_Finalize();
    /* pid is pid_t; match the %ld/(long) convention used above. */
    fprintf(stderr, "%ld: exiting\n", (long) pid);
    return 0;
}
Python example
# reproducer.py
# Reproducer for an Open MPI hang in MPI_Comm_split_type after a
# Comm.Spawn + Intercomm.Merge sequence.
# Run as: mpiexec -n 1 python reproducer.py
import sys
import mpi4py
# Disable matched-probe receives and automatic MPI initialization so
# Init_thread() can be called explicitly after configuration.
mpi4py.rc(recv_mprobe=False, initialize=False)
from mpi4py import MPI # noqa: E402
MPI.Init_thread()
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
# COMM_NULL parent means this process was launched directly (it is the
# parent); otherwise it is one of the spawned children.
parent_comm = MPI.Comm.Get_parent()
if rank == 0 and parent_comm == MPI.COMM_NULL and size == 1:
    nprocs_to_spawn = 8  # everything works on 128 and lower values
    args = ["reproducer.py"]
    info = MPI.Info.Create()
    # Spawn children running this same script under the Python interpreter.
    intercomm = MPI.COMM_SELF.Spawn(
        sys.executable,
        args,
        maxprocs=nprocs_to_spawn,
        info=info,
        root=rank,
    )
    # high=False: parent ranks come first in the merged intracommunicator.
    comm = intercomm.Merge(high=False)
    intercomm.Disconnect()
else:
    # Child side: merge with the parent, ordered after it (high=True).
    comm = parent_comm.Merge(high=True)
    parent_comm.Disconnect()
# Synchronize all ranks of the merged communicator before the split.
comm.Barrier()
# This is the call that hangs.
host_comm = comm.Split_type(MPI.COMM_TYPE_SHARED)
MPI.Finalize()
for C example
$ mpicc reproducer.c -o reproducer
$ mpiexec -n 1 reproducer
for Python example
$ mpiexec -n 1 python reproducer.py
Interestingly, I don't see a hang, but I get an error coming out of the TCP BTL when using main. I also tried the patch in PR #12119, but it didn't help.
[er-head][[56247,2],5][btl_tcp_endpoint.c:668:mca_btl_tcp_endpoint_recv_connect_ack] received unexpected process identifier: got [[56247,2],0] expected [[56247,1],0]
[er-head][[56247,2],5][btl_tcp_endpoint.c:668:mca_btl_tcp_endpoint_recv_connect_ack] received unexpected process identifier: got [[56247,2],0] expected [[56247,1],0]
[er-head][[56247,2],4][btl_tcp_endpoint.c:668:mca_btl_tcp_endpoint_recv_connect_ack] received unexpected process identifier: got [[56247,2],0] expected [[56247,1],0]
[er-head][[56247,2],2][btl_tcp_endpoint.c:668:mca_btl_tcp_endpoint_recv_connect_ack] received unexpected process identifier: got [[56247,2],0] expected [[56247,1],0]
[er-head][[56247,2],6][btl_tcp_endpoint.c:668:mca_btl_tcp_endpoint_recv_connect_ack] received unexpected process identifier: got [[56247,2],0] expected [[56247,1],0]
[er-head][[56247,2],4][btl_tcp_endpoint.c:668:mca_btl_tcp_endpoint_recv_connect_ack] received unexpected process identifier: got [[56247,2],0] expected [[56247,1],0]
[er-head][[56247,2],2][btl_tcp_endpoint.c:668:mca_btl_tcp_endpoint_recv_connect_ack] received unexpected process identifier: got [[56247,2],0] expected [[56247,1],0]
[er-head][[56247,2],6][btl_tcp_endpoint.c:668:mca_btl_tcp_endpoint_recv_connect_ack] received unexpected process identifier: got [[56247,2],0] expected [[56247,1],0]
[er-head][[56247,1],0][btl_tcp_proc.c:400:mca_btl_tcp_proc_create] opal_modex_recv: failed with return value=-46
[er-head][[56247,1],0][btl_tcp_proc.c:400:mca_btl_tcp_proc_create] opal_modex_recv: failed with return value=-46
@YarShev Is this issue only observed with Open MPI 5.0.0? Does 4.1.x manifest similar symptoms?
@wenduwan, 4.1.5 also hangs on my side.
4.1.5 also hangs on my side
☹️
Well that's not good news. Adding another label to this...