ompi
ompi copied to clipboard
Fortran Neighbor_Alltoallw fails with `MPI_ERR_TYPE: invalid datatype`
Hi all,
I'm running a Xubuntu 20.04.5 LTS with an OpenMPI 4.0.3, which was installed via the package manager. I also used a the 5.0.x branch of OpenMPI, which I compiled with gcc-11.2.0 for testing:
$ git log | head
commit 2ade147b0509fc70d600889fa74b0f4b972f1824
Merge: 647d793057 840174cf23
Please consider the following test code, where I set up a three dimensional cartesian non-periodic communicator and call a neighbor_alltoallw:
PROGRAM neighbor_alltoallw_cart
USE, INTRINSIC :: ISO_FORTRAN_ENV
USE, INTRINSIC :: ISO_C_BINDING
USE mpi
IMPLICIT NONE
CHARACTER(LEN=64) :: cmdargstr = ""
INTEGER :: comm_size
INTEGER :: my_rank
INTEGER :: nints = 0
INTEGER, DIMENSION(:), ALLOCATABLE :: sbuffer
INTEGER, DIMENSION(:), ALLOCATABLE :: sendcounts
INTEGER(KIND=MPI_ADDRESS_KIND), DIMENSION(:), ALLOCATABLE :: sdispls
INTEGER, DIMENSION(:), ALLOCATABLE :: sendtypes
INTEGER, DIMENSION(:), ALLOCATABLE :: rbuffer
INTEGER, DIMENSION(:), ALLOCATABLE :: recvcounts
INTEGER(KIND=MPI_ADDRESS_KIND), DIMENSION(:), ALLOCATABLE :: rdispls
INTEGER, DIMENSION(:), ALLOCATABLE :: recvtypes
INTEGER :: nstot, nrtot
INTEGER :: dummyint
INTEGER, PARAMETER :: ndims = 3
INTEGER, DIMENSION(ndims) :: dims = [2,2,1]
LOGICAL, DIMENSION(ndims) :: periods = [.FALSE., .FALSE., .FALSE.]
LOGICAL, PARAMETER :: reorder = .FALSE.
INTEGER :: comm_cart
INTEGER, PARAMETER :: nneighbors = 2*ndims
INTEGER, DIMENSION(nneighbors) :: neighbors
INTEGER :: i, ineighbor
LOGICAL :: valid_data
INTEGER :: refval
INTEGER :: ierr
CALL MPI_Init(ierr)
! Get the number of processes
CALL MPI_Comm_size(MPI_COMM_WORLD, comm_size, ierr);
! Get rank of processes
CALL MPI_Comm_rank(MPI_COMM_WORLD, my_rank, ierr);
! require cmd-line argument
IF (COMMAND_ARGUMENT_COUNT() < 1) THEN
WRITE(UNIT=OUTPUT_UNIT, FMT="(A)") "./neighbor_alltoallw_cart <msgsize in integers>"
STOP 1
END IF
! requires precicely 4 processes
IF (comm_size /= 4) THEN
WRITE(UNIT=OUTPUT_UNIT, FMT="(A)") "requires precicely 4 processes. Start with -np 4!"
STOP 1
ENDIF
! create the cartesian communicator
CALL MPI_Cart_create(MPI_COMM_WORLD, ndims, dims, periods, reorder, comm_cart, ierr)
! fill the neighbor list
neighbors(:) = -1
SELECT CASE (my_rank)
CASE (0)
neighbors(2) = 2
neighbors(4) = 1
CASE (1)
neighbors(2) = 3
neighbors(3) = 0
CASE (2)
neighbors(1) = 0
neighbors(4) = 3
CASE (3)
neighbors(1) = 1
neighbors(3) = 2
END SELECT
! Allocating send/recv buffer
CALL GET_COMMAND_ARGUMENT(1,cmdargstr)
READ(UNIT=cmdargstr, FMT=*) nints
nints = nints + my_rank
ALLOCATE(sendcounts(nneighbors))
ALLOCATE(sdispls(nneighbors))
ALLOCATE(sendtypes(nneighbors))
nstot = 0
DO ineighbor = 0, nneighbors - 1
IF (neighbors(ineighbor+1) /= -1) THEN
sendcounts(ineighbor+1) = nints
ELSE
sendcounts(ineighbor+1) = 0
END IF
sdispls(ineighbor+1) = INT(nstot*C_SIZEOF(dummyint), MPI_ADDRESS_KIND)
sendtypes(ineighbor+1) = MPI_INTEGER
nstot = nstot + sendcounts(ineighbor+1)
END DO
ALLOCATE(sbuffer(nstot))
sbuffer(:) = my_rank
ALLOCATE(recvcounts(nneighbors))
ALLOCATE(rdispls(nneighbors))
ALLOCATE(recvtypes(nneighbors))
nrtot = 0
DO ineighbor = 0, nneighbors - 1
IF (neighbors(ineighbor+1) /= -1) THEN
recvcounts(ineighbor+1) = nints - my_rank + neighbors(ineighbor+1)
ELSE
recvcounts(ineighbor+1) = 0
END IF
rdispls(ineighbor+1) = INT(nrtot*C_SIZEOF(dummyint), MPI_ADDRESS_KIND)
recvtypes(ineighbor+1) = MPI_INTEGER
nrtot = nrtot + recvcounts(ineighbor+1)
END DO
ALLOCATE(rbuffer(nrtot))
rbuffer(:) = -1
! Message cycle
CALL MPI_Neighbor_alltoallw(sbuffer, sendcounts, sdispls, sendtypes, &
rbuffer, recvcounts, rdispls, recvtypes, &
comm_cart, ierr)
! validate data
valid_data = .TRUE.
DO ineighbor = 0, nneighbors - 1
refval = neighbors(ineighbor+1)
DO i = 1, recvcounts(ineighbor+1)
IF (rbuffer(i+rdispls(ineighbor+1)/C_SIZEOF(dummyint)) /= refval) THEN
WRITE(UNIT=OUTPUT_UNIT, FMT="(A,I4,A,I4)") &
"Rank ", my_rank, " received faulty data from rank ", neighbors(ineighbor+1)
valid_data = .FALSE.
EXIT
END IF
END DO
END DO
DEALLOCATE(rbuffer)
DEALLOCATE(recvcounts)
DEALLOCATE(rdispls)
DEALLOCATE(recvtypes)
DEALLOCATE(sbuffer)
DEALLOCATE(sendcounts)
DEALLOCATE(sdispls)
DEALLOCATE(sendtypes)
CALL MPI_Finalize(ierr)
IF (.NOT.valid_data) STOP 1
END PROGRAM neighbor_alltoallw_cart
I compiled it with
$ mpif90 -o neighbor_alltoallw_cart.x neighbor_alltoallw_cart.F90
and executed it with
$ mpirun -np 4 --map-by :OVERSUBSCRIBE ./neighbor_alltoallw_cart.x 123
[thalia:554635] *** An error occurred in MPI_Neighbor_alltoallw
[thalia:554635] *** reported by process [2675113985,2]
[thalia:554635] *** on communicator MPI_COMMUNICATOR 3
[thalia:554635] *** MPI_ERR_TYPE: invalid datatype
[thalia:554635] *** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort,
[thalia:554635] *** and potentially your MPI job)
[thalia:554629] PMIX ERROR: UNREACHABLE in file ../../../src/server/pmix_server.c at line 2193
[thalia:554629] 3 more processes have sent help message help-mpi-errors.txt / mpi_errors_are_fatal
[thalia:554629] Set MCA parameter "orte_base_help_aggregate" to 0 to see all help / error messages
When switching to the OpenMPI 5.0.x I get a similar error of an invalid datatype:
$ mpirun -np 4 --map-by :OVERSUBSCRIBE ./neighbor_alltoallw_cart.x 123
[thalia:00000] *** An error occurred in MPI_Neighbor_alltoallw
[thalia:00000] *** reported by process [2751791105,1]
[thalia:00000] *** on communicator MPI COMMUNICATOR 3 CREATE FROM 0
[thalia:00000] *** MPI_ERR_TYPE: invalid datatype
[thalia:00000] *** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort,
[thalia:00000] *** and MPI will try to terminate your MPI job as well)
I tested the exact same code with intel-mpi and NEC-mpi, and both execute them perfectly, without any hiccups. I checked the code and cannot find any errors. It seems to me to be fully standard compliant. Maybe it is a problem with the OpenMPI?
Thanks and best regards, Felix