ufs-weather-model
Coupled, WAM, and Debug GNU cases failing on Hera Rocky8
Description
These cases are currently crashing when run on Hera's new Rocky8 OS. The three failing tests are:
- control_wam_debug_gnu
- cpld_control_p8_gnu
- cpld_debug_p8_gnu
To Reproduce:
- ssh onto Hera and ensure you're on Hera nodes 5-12 for Rocky8 OS
- Clone the repository for PR #2194, or ufs-community:develop once the PR has been merged.
- Run ./rt.sh against control_wam_debug_gnu, cpld_control_p8_gnu, or cpld_debug_p8_gnu (see the sketch below).
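For example, a single failing test can be run in isolation. This is only a sketch: <account> is a placeholder Slurm account, and the exact -n syntax has varied between rt.sh revisions, so check ./rt.sh -h on your checkout first.
cd ufs-weather-model/tests
# <account> is a placeholder; newer revisions may want the compiler appended,
# e.g. ./rt.sh -a <account> -n "control_wam_debug_gnu gnu"
./rt.sh -a <account> -n control_wam_debug_gnu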
Additional context
cpld_control_p8_gnu and cpld_debug_p8_gnu dump an "osc pt2pt" crash error during run_test.sh.
control_wam_debug_gnu shows slightly different crashing behavior, and PET000.ESMF_LogFile shows:
... realizeConnectedCplFields FV3 Import Field vtype is not connected
20240321 013524.316 INFO PET000 /scratch2/NCEPDEV/marine/Jong.Kim/UFS-RT/pr-2194/FV3/cpl/module_cplfields.F90:545 realizeConnectedCplFields FV3 Import Field stype is not connected
20240321 013524.316 INFO PET000 /scratch2/NCEPDEV/marine/Jong.Kim/UFS-RT/pr-2194/FV3/cpl/module_cplfields.F90:545 realizeConnectedCplFields FV3 Import Field vfrac is not connected
Solutions attempted so far without success include:
- forcing the tests to run on nodes 5-12
- adding -mcmodel=large to gnu.cmake, adjusting the file as below and testing both the medium and large settings
#if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin" AND ${CMAKE_HOST_SYSTEM_PROCESSOR} MATCHES "arm64")
set( CMAKE_Fortran_FLAGS_DEBUG "${CMAKE_Fortran_FLAGS_DEBUG} -mcmodel=small" )
#else()
set( CMAKE_Fortran_FLAGS_DEBUG "${CMAKE_Fortran_FLAGS_DEBUG} -mcmodel=medium" )
#endif()
Output
Related Hera Rocky8 PR #2194
@DomHeinzeller installed gnu-12 on Hera:
- gcc/12.2.0: /scratch1/NCEPDEV/nems/role.epic/spack-stack/modulefiles
- openmpi/4.1.6 (new install): /scratch1/NCEPDEV/nems/role.epic/gnu/modulefiles
- spack-stack built with these two installations: /scratch1/NCEPDEV/nems/role.epic/spack-stack/spack-stack-1.5.1/envs/unified-env-gnu12
I tried it and it didn't work.
I don't know if this will help, but I was able to get the following lua modulefile to load OK. I have yet to run a test with it, though:
-->cat ../modulefiles/ufs_hera.gnu.lua
help([[
loads UFS Model prerequisites for Hera/GNU
]])
-- prepend_path("MODULEPATH", "/scratch1/NCEPDEV/nems/role.epic/spack-stack/spack-stack-1.5.1/envs/unified-env-rocky8/install/modulefiles/Core")
-- EPIC GNU 12.2 INSTALL
prepend_path("MODULEPATH", "/scratch1/NCEPDEV/nems/role.epic/spack-stack/modulefiles")
gcc_ver=os.getenv("gcc_ver") or "12.2.0"
load(pathJoin("gcc", gcc_ver))
--EPIC OpenMPI 4.1.6 INSTALL
prepend_path("MODULEPATH", "/scratch1/NCEPDEV/nems/role.epic/gnu/modulefiles")
openmpi_ver=os.getenv("openmpi_ver") or "4.1.6"
load(pathJoin("openmpi", openmpi_ver))
prepend_path("MODULEPATH", "/scratch1/NCEPDEV/nems/role.epic/spack-stack/spack-stack-1.5.1/envs/unified-env-gnu12/install/modulefiles/Core")
stack_gnu_ver=os.getenv("stack_gnu_ver") or "12.2.0"
load(pathJoin("stack-gcc", stack_gnu_ver))
stack_openmpi_ver=os.getenv("stack_openmpi_ver") or "4.1.6"
load(pathJoin("stack-openmpi", stack_openmpi_ver))
cmake_ver=os.getenv("cmake_ver") or "3.23.1"
load(pathJoin("cmake", cmake_ver))
load("ufs_common")
nccmp_ver=os.getenv("nccmp_ver") or "1.9.0.1"
load(pathJoin("nccmp", nccmp_ver))
setenv("CC", "mpicc")
setenv("CXX", "mpic++")
setenv("FC", "mpif90")
setenv("CMAKE_Platform", "hera.gnu")
whatis("Description: UFS build environment")
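If it helps, a quick sanity check on that modulefile from a ufs-weather-model clone (a sketch, assuming the file is saved as modulefiles/ufs_hera.gnu.lua and Lmod is active) would be something like:
cd ufs-weather-model
module purge
module use $PWD/modulefiles
module load ufs_hera.gnu
module list                      # expect gcc/12.2.0, openmpi/4.1.6, stack-gcc, stack-openmpi, etc.
which mpif90 && mpif90 --version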
@jkbk2004 @RatkoVasic-NOAA did the SAs on Hera install Slurm correctly?
srun: lua: This job was submitted from a host running Rocky 8. Assigning job to el8 reservation.
7: [h18c53:3565779] OPAL ERROR: Unreachable in file ../../../../../opal/mca/pmix/pmix3x/pmix3x_client.c at line 111
12: [h18c53:3565784] OPAL ERROR: Unreachable in file ../../../../../opal/mca/pmix/pmix3x/pmix3x_client.c at line 111
[... the same OPAL ERROR is repeated by every other MPI rank on h18c53 and h18c56 ...]
29: --------------------------------------------------------------------------
29: The application appears to have been direct launched using "srun",
29: but OMPI was not built with SLURM's PMI support and therefore cannot
29: execute. There are several options for building PMI support under
29: SLURM, depending upon the SLURM version you are using:
29:
29: version 16.05 or later: you can use SLURM's PMIx support. This
29: requires that you configure and build SLURM --with-pmix.
29:
29: Versions earlier than 16.05: you must use either SLURM's PMI-1 or
29: PMI-2 support. SLURM builds PMI-1 by default, or you can manually
29: install PMI-2. You must then build Open MPI using --with-pmi pointing
29: to the SLURM PMI library location.
29:
29: Please configure as appropriate and try again.
@BrianCurtis-NOAA With:
Hera:/home/Ratko.Vasic>srun --mpi=list
MPI plugin types are...
none
cray_shasta
pmi2
You might try the --mpi option with srun in the job_card: srun --mpi=pmi2 -n xxx..... (let's see what happens).
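A minimal sketch of how that could look in a Slurm job_card (the account, QOS, node counts, and executable name are placeholders, not taken from the actual regression-test templates):
#!/bin/bash
#SBATCH --account=<account>
#SBATCH --qos=batch
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=40
#SBATCH --time=00:30:00

# pass the PMI-2 plugin explicitly to srun, as suggested above
srun --mpi=pmi2 -n ${SLURM_NTASKS} ./fv3.exe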
@RatkoVasic-NOAA I get the same error with --mpi=pmi2 added.
Sounds like slurm/rocky8 is another inconsistency. @RatkoVasic-NOAA @zach1221 @FernandoAndrade-NOAA let me know if you want me to send an email to get the SAs involved here.
Yes, please. Go ahead and send if you don't mind.
@RatkoVasic-NOAA When OpenMPI was built, did you have to specify the location of Slurm? If yes, what location was specified?
@rreddy2001 here is the configure for OpenMPI:
cd /scratch1/NCEPDEV/nems/role.epic/installs/openmpi-4.1.6
module purge
module use /scratch1/NCEPDEV/nems/role.epic/spack-stack/modulefiles
module load gcc/12.2.0 cmake/3.28.1
export LD_LIBRARY_PATH=/scratch1/NCEPDEV/nems/role.epic/openpmi/4.1.6/lib
./configure --prefix=/scratch1/NCEPDEV/nems/role.epic/openpmi/4.1.6 \
CC=gcc \
CXX=g++ \
FC=gfortran \
CFLAGS=-m64 FCFLAGS=-m64 \
--with-wrapper-cflags=-m64 \
--with-wrapper-cxxflags=-m64 \
--with-wrapper-fcflags=-m64 \
--with-hwloc=internal --with-libevent=internal \
--with-pmix=internal \
--enable-mpirun-prefix-by-default \
--with-slurm \
--with-mpi-param-check \
--enable-mpi-fortran
make -j6 all && make install
My recommendation would be to build OpenMPI against the PMI library that Slurm itself uses. The PMI library used by Slurm on Hera is at /apps/slurm_hera/default/lib.
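A sketch of how the configure line above might be adapted to that recommendation. The --with-pmi/--with-pmi-libdir options are standard Open MPI configure flags, and the /apps/slurm_hera/default path comes from the comment above; verify both against the actual Slurm install before building.
# Hypothetical reconfigure against Slurm's PMI, per the recommendation above.
# The internal hwloc/libevent/pmix and -m64 wrapper flags from the original configure
# can be kept as-is.
./configure --prefix=/scratch1/NCEPDEV/nems/role.epic/openpmi/4.1.6 \
  CC=gcc CXX=g++ FC=gfortran \
  --with-slurm \
  --with-pmi=/apps/slurm_hera/default \
  --with-pmi-libdir=/apps/slurm_hera/default/lib \
  --enable-mpirun-prefix-by-default \
  --enable-mpi-fortran
make -j6 all && make install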
@rreddy2001 scontrol show config shows that the Slurm config file is at /apps/slurm/23.11.3/etc/slurm.conf. However, ls /apps/slurm/ shows only:
21.08.8-2 22.05.9 22.05.9p2 23.11.1 ctld current d dbd default logues slurm src tools
So there is no such folder /apps/slurm/23.11.3/. Slurm is not configured properly on Rocky8.
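For anyone re-checking this on a Rocky8 node, the diagnosis above boils down to these commands (expected output as reported in the comments above; it will differ once the SAs fix the install):
scontrol show config | grep -i slurm_conf   # reports /apps/slurm/23.11.3/etc/slurm.conf
ls /apps/slurm/                              # no 23.11.3 directory exists
srun --mpi=list                              # only none, cray_shasta, pmi2 are available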
How about using the same options for compiling MPI as on Hercules (see the spack-stack documentation)? I remember on Rocky9 the Slurm installs are all in /opt/slurm*/ and they are [supposed to be] consistent across all nodes.
Updates for the new gnu/13.3.0 compiler with openmpi/4.1.6, used to build the spack-stack/1.6.0 environment with the packages for the WM and SRW. All regression tests for Hera GNU pass; rt.sh was run with the -c option, which generates a new baseline. For the update details, see the comment in PR #2093:
https://github.com/ufs-community/ufs-weather-model/pull/2093#issuecomment-2143694396
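For reference, regenerating the full baseline as described there looks roughly like this (a sketch; -a supplies the Slurm account and -c requests baseline creation, per the comment above):
cd ufs-weather-model/tests
./rt.sh -a <account> -c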