AMD compiler error with large GPU kernel in SHOC: unhandled SGPR spill to memory
I am using the HIP-enabled E3SM-MMF code, https://github.com/xyuan/e3sm_p3_shoc/tree/e3sm_p3_shoc_hip,
to build on crusher with the AMD compiler and the following modules:
[[email protected] cmake-bld]$ module list
Currently Loaded Modules:
1) amd/5.1.0 5) xalt/1.3.0 9) craype-accel-amd-gfx90a 13) subversion/1.14.0 17) cray-libsci/21.08.1.2
2) craype/2.7.15 6) DefApps/default 10) rocm/4.5.2 14) git/2.31.1 18) cray-hdf5-parallel/1.12.0.7
3) cray-dsmml/0.2.2 7) libfabric/1.15.0.0 11) cray-mpich/8.1.12 15) cmake/3.22.2 19) cray-netcdf-hdf5parallel/4.7.4.7
4) PrgEnv-amd/8.3.3 8) craype-network-ofi 12) cray-python/3.9.4.2 16) zlib/1.2.11 20) cray-parallel-netcdf/1.12.1.7
The build fails with "unhandled SGPR spill to memory", apparently due to a large GPU kernel (see the log below). Further investigation shows that, when some of the kernel code is commented out, the build passes without any problem.
cd /gpfs/alpine/cli115/scratch/yuanx/ACME_SIMULATIONS/F-MMFXX-P3.ne4pg2_ne4pg2.crusher.amdclanggpu.1x1/bld/cmake-bld/scream/src/physics/shoc && python3 /gpfs/alpine/cli115/scratch/yuanx/ACME_SIMULATIONS/F-MMFXX-P3.ne4pg2_ne4pg2.crusher.amdclanggpu.1x1/Tools/e3sm_compile_wrap.py /opt/cray/pe/craype/2.7.15/bin/CC -DKOKKOS_DEPENDENCE -DMPICH_SKIP_MPICXX -DSCREAM_CONFIG_IS_CMAKE -DSPDLOG_COMPILED_LIB -D__HIP_ROCclr__ -I/gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/components/eam/src/physics/crm/scream/src/physics/shoc/../share -I/gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/components/eam/src/physics/crm/scream/src -I/gpfs/alpine/cli115/scratch/yuanx/ACME_SIMULATIONS/F-MMFXX-P3.ne4pg2_ne4pg2.crusher.amdclanggpu.1x1/bld/cmake-bld/scream/src -I/gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/src -I/gpfs/alpine/cli115/scratch/yuanx/ACME_SIMULATIONS/F-MMFXX-P3.ne4pg2_ne4pg2.crusher.amdclanggpu.1x1/bld/cmake-bld/externals/ekat/src -I/gpfs/alpine/cli115/scratch/yuanx/ACME_SIMULATIONS/F-MMFXX-P3.ne4pg2_ne4pg2.crusher.amdclanggpu.1x1/bld/cmake-bld/externals/ekat/src/ekat/ekat_f90_modules -I/gpfs/alpine/cli115/scratch/yuanx/ACME_SIMULATIONS/F-MMFXX-P3.ne4pg2_ne4pg2.crusher.amdclanggpu.1x1/bld/cmake-bld/externals/kokkos -I/gpfs/alpine/cli115/scratch/yuanx/ACME_SIMULATIONS/F-MMFXX-P3.ne4pg2_ne4pg2.crusher.amdclanggpu.1x1/bld/cmake-bld/externals/kokkos/core/src -I/gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/extern/kokkos/core/src -I/gpfs/alpine/cli115/scratch/yuanx/ACME_SIMULATIONS/F-MMFXX-P3.ne4pg2_ne4pg2.crusher.amdclanggpu.1x1/bld/cmake-bld/externals/kokkos/containers/src -I/gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/extern/kokkos/containers/src -I/gpfs/alpine/cli115/scratch/yuanx/ACME_SIMULATIONS/F-MMFXX-P3.ne4pg2_ne4pg2.crusher.amdclanggpu.1x1/bld/cmake-bld/externals/kokkos/algorithms/src -I/gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/extern/kokkos/algorithms/src 
-I/gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/extern/yaml-cpp/include -I/gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/extern/spdlog/include -Wall -D__HIP_PLATFORM_HCC__= -D__HIP_PLATFORM_AMD__= -I/opt/rocm-4.5.2/hip/include -I/opt/rocm-4.5.2/llvm/bin/../lib/clang/13.0.0 -I/opt/rocm-4.5.2/include -O3 -DNDEBUG -fno-gpu-rdc -x hip --rocm-path=/opt/rocm-4.5.2 --offload-arch=gfx90a -std=gnu++17 -MD -MT scream/src/physics/shoc/CMakeFiles/shoc.dir/shoc_functions_f90.cpp.o -MF CMakeFiles/shoc.dir/shoc_functions_f90.cpp.o.d -o CMakeFiles/shoc.dir/shoc_functions_f90.cpp.o -c /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/components/eam/src/physics/crm/scream/src/physics/shoc/shoc_functions_f90.cpp No supported cpu target is set, CRAY_CPU_TARGET=x86-64 will be used. Load a valid targeting module or set CRAY_CPU_TARGET In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/components/eam/src/physics/crm/scream/src/physics/shoc/shoc_functions_f90.cpp:1: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/components/eam/src/physics/crm/scream/src/physics/shoc/shoc_functions_f90.hpp:4: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/components/eam/src/physics/crm/scream/src/share/scream_types.hpp:5: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/src/ekat/kokkos/ekat_kokkos_types.hpp:5: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/src/ekat/kokkos/ekat_kokkos_meta.hpp:4: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/extern/kokkos/core/src/Kokkos_Core.hpp:51: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/extern/kokkos/core/src/Kokkos_Core_fwd.hpp:52: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/extern/kokkos/core/src/Kokkos_Macros.hpp:110: In file included from 
/gpfs/alpine/cli115/scratch/yuanx/ACME_SIMULATIONS/F-MMFXX-P3.ne4pg2_ne4pg2.crusher.amdclanggpu.1x1/bld/cmake-bld/externals/kokkos/KokkosCore_Config_SetupBackend.hpp:47: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/extern/kokkos/core/src/setup/Kokkos_Setup_HIP.hpp:53: In file included from /opt/rocm-4.5.2/hip/include/hip/hip_runtime.h:62: In file included from /opt/rocm-4.5.2/hip/include/hip/amd_detail/amd_hip_runtime.h:85: /opt/rocm-4.5.2/hip/include/hip/amd_detail/texture_fetch_functions.h:143:5: warning: unused variable 's' [-Wunused-variable] TEXTURE_PARAMETERS_INIT; ^ /opt/rocm-4.5.2/hip/include/hip/amd_detail/texture_fetch_functions.h:37:42: note: expanded from macro 'TEXTURE_PARAMETERS_INIT' unsigned int ADDRESS_SPACE_CONSTANT* s = i + HIP_SAMPLER_OBJECT_OFFSET_DWORD; ^ /opt/rocm-4.5.2/hip/include/hip/amd_detail/texture_fetch_functions.h:263:5: warning: unused variable 's' [-Wunused-variable] TEXTURE_PARAMETERS_INIT; ^ /opt/rocm-4.5.2/hip/include/hip/amd_detail/texture_fetch_functions.h:37:42: note: expanded from macro 'TEXTURE_PARAMETERS_INIT' unsigned int ADDRESS_SPACE_CONSTANT* s = i + HIP_SAMPLER_OBJECT_OFFSET_DWORD; ^ /opt/rocm-4.5.2/hip/include/hip/amd_detail/texture_fetch_functions.h:273:5: warning: unused variable 's' [-Wunused-variable] TEXTURE_PARAMETERS_INIT; ^ /opt/rocm-4.5.2/hip/include/hip/amd_detail/texture_fetch_functions.h:37:42: note: expanded from macro 'TEXTURE_PARAMETERS_INIT' unsigned int ADDRESS_SPACE_CONSTANT* s = i + HIP_SAMPLER_OBJECT_OFFSET_DWORD; ^ In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/components/eam/src/physics/crm/scream/src/physics/shoc/shoc_functions_f90.cpp:1: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/components/eam/src/physics/crm/scream/src/physics/shoc/shoc_functions_f90.hpp:4: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/components/eam/src/physics/crm/scream/src/share/scream_types.hpp:5: In 
file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/src/ekat/kokkos/ekat_kokkos_types.hpp:5: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/src/ekat/kokkos/ekat_kokkos_meta.hpp:4: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/extern/kokkos/core/src/Kokkos_Core.hpp:51: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/extern/kokkos/core/src/Kokkos_Core_fwd.hpp:52: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/extern/kokkos/core/src/Kokkos_Macros.hpp:110: In file included from /gpfs/alpine/cli115/scratch/yuanx/ACME_SIMULATIONS/F-MMFXX-P3.ne4pg2_ne4pg2.crusher.amdclanggpu.1x1/bld/cmake-bld/externals/kokkos/KokkosCore_Config_SetupBackend.hpp:47: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/extern/kokkos/core/src/setup/Kokkos_Setup_HIP.hpp:53: In file included from /opt/rocm-4.5.2/hip/include/hip/hip_runtime.h:62: In file included from /opt/rocm-4.5.2/hip/include/hip/amd_detail/amd_hip_runtime.h:86: /opt/rocm-4.5.2/hip/include/hip/amd_detail/texture_indirect_functions.h:69:5: warning: unused variable 's' [-Wunused-variable] TEXTURE_OBJECT_PARAMETERS_INIT ^ /opt/rocm-4.5.2/hip/include/hip/amd_detail/texture_indirect_functions.h:37:42: note: expanded from macro 'TEXTURE_OBJECT_PARAMETERS_INIT' unsigned int ADDRESS_SPACE_CONSTANT* s = i + HIP_SAMPLER_OBJECT_OFFSET_DWORD; ^ /opt/rocm-4.5.2/hip/include/hip/amd_detail/texture_indirect_functions.h:360:5: warning: unused variable 's' [-Wunused-variable] TEXTURE_OBJECT_PARAMETERS_INIT ^ /opt/rocm-4.5.2/hip/include/hip/amd_detail/texture_indirect_functions.h:37:42: note: expanded from macro 'TEXTURE_OBJECT_PARAMETERS_INIT' unsigned int ADDRESS_SPACE_CONSTANT* s = i + HIP_SAMPLER_OBJECT_OFFSET_DWORD; ^ /opt/rocm-4.5.2/hip/include/hip/amd_detail/texture_indirect_functions.h:488:5: warning: unused 
variable 's' [-Wunused-variable] TEXTURE_OBJECT_PARAMETERS_INIT ^ /opt/rocm-4.5.2/hip/include/hip/amd_detail/texture_indirect_functions.h:37:42: note: expanded from macro 'TEXTURE_OBJECT_PARAMETERS_INIT' unsigned int ADDRESS_SPACE_CONSTANT* s = i + HIP_SAMPLER_OBJECT_OFFSET_DWORD; ^ In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/components/eam/src/physics/crm/scream/src/physics/shoc/shoc_functions_f90.cpp:1: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/components/eam/src/physics/crm/scream/src/physics/shoc/shoc_functions_f90.hpp:4: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/components/eam/src/physics/crm/scream/src/share/scream_types.hpp:5: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/src/ekat/kokkos/ekat_kokkos_types.hpp:5: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/src/ekat/kokkos/ekat_kokkos_meta.hpp:4: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/extern/kokkos/core/src/Kokkos_Core.hpp:51: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/extern/kokkos/core/src/Kokkos_Core_fwd.hpp:52: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/extern/kokkos/core/src/Kokkos_Macros.hpp:110: In file included from /gpfs/alpine/cli115/scratch/yuanx/ACME_SIMULATIONS/F-MMFXX-P3.ne4pg2_ne4pg2.crusher.amdclanggpu.1x1/bld/cmake-bld/externals/kokkos/KokkosCore_Config_SetupBackend.hpp:47: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/extern/kokkos/core/src/setup/Kokkos_Setup_HIP.hpp:53: In file included from /opt/rocm-4.5.2/hip/include/hip/hip_runtime.h:62: /opt/rocm-4.5.2/hip/include/hip/amd_detail/amd_hip_runtime.h:296:16: warning: attribute declaration must precede definition [-Wignored-attributes] attribute((weak)) ^ 
/opt/rocm-4.5.2/hip/include/hip/amd_detail/amd_hip_runtime.h:286:24: note: previous definition is here static constexpr __X x{}; ^ /opt/rocm-4.5.2/hip/include/hip/amd_detail/amd_hip_runtime.h:301:16: warning: attribute declaration must precede definition [-Wignored-attributes] attribute((weak)) ^ /opt/rocm-4.5.2/hip/include/hip/amd_detail/amd_hip_runtime.h:287:24: note: previous definition is here static constexpr __Y y{}; ^ /opt/rocm-4.5.2/hip/include/hip/amd_detail/amd_hip_runtime.h:306:16: warning: attribute declaration must precede definition [-Wignored-attributes] attribute((weak)) ^ /opt/rocm-4.5.2/hip/include/hip/amd_detail/amd_hip_runtime.h:288:24: note: previous definition is here static constexpr __Z z{}; ^ error: unhandled SGPR spill to memory error: unhandled SGPR spill to memory 9 warnings and 2 errors generated when compiling for gfx90a. Target CMakeFiles/shoc.dir/shoc_functions_f90.cpp.o built in 19.304051 seconds make[2]: *** [scream/src/physics/shoc/CMakeFiles/shoc.dir/build.make:93: scream/src/physics/shoc/CMakeFiles/shoc.dir/shoc_functions_f90.cpp.o] Error 1 make[2]: Leaving directory '/gpfs/alpine/cli115/scratch/yuanx/ACME_SIMULATIONS/F-MMFXX-P3.ne4pg2_ne4pg2.crusher.amdclanggpu.1x1/bld/cmake-bld' make[1]: *** [CMakeFiles/Makefile2:1785: scream/src/physics/shoc/CMakeFiles/shoc.dir/all] Error 2 make[1]: Leaving directory '/gpfs/alpine/cli115/scratch/yuanx/ACME_SIMULATIONS/F-MMFXX-P3.ne4pg2_ne4pg2.crusher.amdclanggpu.1x1/bld/cmake-bld' make: *** [Makefile:159: all] Error 2
I also tried the following modules:
[[email protected] F-MMFXX-P3.ne4pg2_ne4pg2.crusher.amdclanggpu.1x1]$ module list
Currently Loaded Modules:
1) amd/5.1.0 5) xalt/1.3.0 9) craype-accel-amd-gfx90a 13) subversion/1.14.0 17) cray-libsci/21.08.1.2
2) craype/2.7.15 6) DefApps/default 10) rocm/5.1.0 14) git/2.31.1 18) cray-hdf5-parallel/1.12.0.7
3) cray-dsmml/0.2.2 7) libfabric/1.15.0.0 11) cray-mpich/8.1.16 15) cmake/3.22.2 19) cray-netcdf-hdf5parallel/4.7.4.7
4) PrgEnv-amd/8.3.3 8) craype-network-ofi 12) cray-python/3.9.4.2 16) zlib/1.2.11 20) cray-parallel-netcdf/1.12.1.7
and the issues are the same, see below: [ 18%] Building CXX object scream/src/physics/shoc/CMakeFiles/shoc.dir/shoc_functions_f90.cpp.o cd /gpfs/alpine/cli115/scratch/yuanx/ACME_SIMULATIONS/F-MMFXX-P3.ne4pg2_ne4pg2.crusher.amdclanggpu.1x1/bld/cmake-bld/scream/src/physics/shoc && python3 /gpfs/alpine/cli115/scratch/yuanx/ACME_SIMULATIONS/F-MMFXX-P3.ne4pg2_ne4pg2.crusher.amdclanggpu.1x1/Tools/e3sm_compile_wrap.py /opt/cray/pe/craype/2.7.15/bin/CC -DKOKKOS_DEPENDENCE -DMPICH_SKIP_MPICXX -DSCREAM_CONFIG_IS_CMAKE -DSPDLOG_COMPILED_LIB -D__HIP_ROCclr -I/gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/components/eam/src/physics/crm/scream/src/physics/shoc/../share -I/gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/components/eam/src/physics/crm/scream/src -I/gpfs/alpine/cli115/scratch/yuanx/ACME_SIMULATIONS/F-MMFXX-P3.ne4pg2_ne4pg2.crusher.amdclanggpu.1x1/bld/cmake-bld/scream/src -I/gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/src -I/gpfs/alpine/cli115/scratch/yuanx/ACME_SIMULATIONS/F-MMFXX-P3.ne4pg2_ne4pg2.crusher.amdclanggpu.1x1/bld/cmake-bld/externals/ekat/src -I/gpfs/alpine/cli115/scratch/yuanx/ACME_SIMULATIONS/F-MMFXX-P3.ne4pg2_ne4pg2.crusher.amdclanggpu.1x1/bld/cmake-bld/externals/ekat/src/ekat/ekat_f90_modules -I/gpfs/alpine/cli115/scratch/yuanx/ACME_SIMULATIONS/F-MMFXX-P3.ne4pg2_ne4pg2.crusher.amdclanggpu.1x1/bld/cmake-bld/externals/kokkos -I/gpfs/alpine/cli115/scratch/yuanx/ACME_SIMULATIONS/F-MMFXX-P3.ne4pg2_ne4pg2.crusher.amdclanggpu.1x1/bld/cmake-bld/externals/kokkos/core/src -I/gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/extern/kokkos/core/src -I/gpfs/alpine/cli115/scratch/yuanx/ACME_SIMULATIONS/F-MMFXX-P3.ne4pg2_ne4pg2.crusher.amdclanggpu.1x1/bld/cmake-bld/externals/kokkos/containers/src -I/gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/extern/kokkos/containers/src 
-I/gpfs/alpine/cli115/scratch/yuanx/ACME_SIMULATIONS/F-MMFXX-P3.ne4pg2_ne4pg2.crusher.amdclanggpu.1x1/bld/cmake-bld/externals/kokkos/algorithms/src -I/gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/extern/kokkos/algorithms/src -I/gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/extern/yaml-cpp/include -I/gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/extern/spdlog/include -Wall -D__HIP_PLATFORM_HCC__= -D__HIP_PLATFORM_AMD__= -I/opt/rocm-5.1.0/hip/include -I/opt/rocm-5.1.0/llvm/bin/../lib/clang/14.0.0 -I/opt/rocm-5.1.0/include -O3 -DNDEBUG -fno-gpu-rdc -x hip --rocm-path=/opt/rocm-5.1.0 --offload-arch=gfx90a -std=gnu++17 -MD -MT scream/src/physics/shoc/CMakeFiles/shoc.dir/shoc_functions_f90.cpp.o -MF CMakeFiles/shoc.dir/shoc_functions_f90.cpp.o.d -o CMakeFiles/shoc.dir/shoc_functions_f90.cpp.o -c /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/components/eam/src/physics/crm/scream/src/physics/shoc/shoc_functions_f90.cpp__ [ 18%] Built target p3 No supported cpu target is set, CRAY_CPU_TARGET=x86-64 will be used. 
Load a valid targeting module or set CRAY_CPU_TARGET In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/components/eam/src/physics/crm/scream/src/physics/shoc/shoc_functions_f90.cpp:1: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/components/eam/src/physics/crm/scream/src/physics/shoc/shoc_functions_f90.hpp:4: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/components/eam/src/physics/crm/scream/src/share/scream_types.hpp:5: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/src/ekat/kokkos/ekat_kokkos_types.hpp:5: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/src/ekat/kokkos/ekat_kokkos_meta.hpp:4: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/extern/kokkos/core/src/Kokkos_Core.hpp:51: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/extern/kokkos/core/src/Kokkos_Core_fwd.hpp:52: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/extern/kokkos/core/src/Kokkos_Macros.hpp:110: Scanning dependencies of target lnd gmake[2]: Leaving directory '/gpfs/alpine/cli115/scratch/yuanx/ACME_SIMULATIONS/F-MMFXX-P3.ne4pg2_ne4pg2.crusher.amdclanggpu.1x1/bld/cmake-bld' gmake -f cmake/lnd/CMakeFiles/lnd.dir/build.make cmake/lnd/CMakeFiles/lnd.dir/build gmake[2]: Entering directory '/gpfs/alpine/cli115/scratch/yuanx/ACME_SIMULATIONS/F-MMFXX-P3.ne4pg2_ne4pg2.crusher.amdclanggpu.1x1/bld/cmake-bld' gmake[2]: Nothing to be done for 'cmake/lnd/CMakeFiles/lnd.dir/build'. 
gmake[2]: Leaving directory '/gpfs/alpine/cli115/scratch/yuanx/ACME_SIMULATIONS/F-MMFXX-P3.ne4pg2_ne4pg2.crusher.amdclanggpu.1x1/bld/cmake-bld' [ 57%] Built target lnd In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/components/eam/src/physics/crm/scream/src/physics/shoc/shoc_functions_f90.cpp:1: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/components/eam/src/physics/crm/scream/src/physics/shoc/shoc_functions_f90.hpp:4: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/components/eam/src/physics/crm/scream/src/share/scream_types.hpp:5: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/src/ekat/kokkos/ekat_kokkos_types.hpp:5: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/src/ekat/kokkos/ekat_kokkos_meta.hpp:4: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/extern/kokkos/core/src/Kokkos_Core.hpp:51: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/extern/kokkos/core/src/Kokkos_Core_fwd.hpp:52: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/extern/kokkos/core/src/Kokkos_Macros.hpp:110: In file included from /gpfs/alpine/cli115/scratch/yuanx/ACME_SIMULATIONS/F-MMFXX-P3.ne4pg2_ne4pg2.crusher.amdclanggpu.1x1/bld/cmake-bld/externals/kokkos/KokkosCore_Config_SetupBackend.hpp:47: In file included from /gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher/externals/ekat/extern/kokkos/core/src/setup/Kokkos_Setup_HIP.hpp:53: In file included from /opt/rocm-5.1.0/hip/include/hip/hip_runtime.h:62: In file included from /opt/rocm-5.1.0/hip/include/hip/amd_detail/amd_hip_runtime.h:117: In file included from /opt/rocm-5.1.0/hip/include/hip/amd_detail/amd_hip_atomic.h:25: /opt/rocm-5.1.0/hip/include/hip/amd_detail/amd_device_functions.h:84:28: warning: unused function '__fns64' [-Wunused-function] device static int32_t 
__fns64(uint64_t mask, uint32_t base, int32_t offset) { ^ /opt/rocm-5.1.0/hip/include/hip/amd_detail/amd_device_functions.h:120:27: warning: unused function '__fns32' [-Wunused-function] device static int32_t __fns32(uint64_t mask, uint32_t base, int32_t offset) { ^ error: unhandled SGPR spill to memory error: unhandled SGPR spill to memory 11 warnings and 2 errors generated when compiling for gfx90a. Target CMakeFiles/shoc.dir/shoc_functions_f90.cpp.o built in 17.921367 seconds gmake[2]: *** [scream/src/physics/shoc/CMakeFiles/shoc.dir/build.make:93: scream/src/physics/shoc/CMakeFiles/shoc.dir/shoc_functions_f90.cpp.o] Error 1 gmake[2]: Leaving directory '/gpfs/alpine/cli115/scratch/yuanx/ACME_SIMULATIONS/F-MMFXX-P3.ne4pg2_ne4pg2.crusher.amdclanggpu.1x1/bld/cmake-bld' gmake[1]: *** [CMakeFiles/Makefile2:1785: scream/src/physics/shoc/CMakeFiles/shoc.dir/all] Error 2 gmake[1]: Leaving directory '/gpfs/alpine/cli115/scratch/yuanx/ACME_SIMULATIONS/F-MMFXX-P3.ne4pg2_ne4pg2.crusher.amdclanggpu.1x1/bld/cmake-bld' gmake: *** [Makefile:159: all] Error 2 real 18.36 user 18.14
Looks like we need to submit an AMD compiler bug. Can you give me permission to access https://github.com/xyuan/e3sm_p3_shoc/tree/e3sm_p3_shoc_hip? Github is telling me it doesn't exist.
Or could you copy your build to /gpfs/alpine/cli115/world-shared?
Looks like we need to submit an AMD compiler bug. Can you give me permission to access https://github.com/xyuan/e3sm_p3_shoc/tree/e3sm_p3_shoc_hip? Github is telling me it doesn't exist.
Or could you copy your build to /gpfs/alpine/cli115/world-shared?
I have copied the files to /gpfs/alpine/cli115/world-shared. As for the GitHub branch, please let me push the crusher machine configuration file to it.
@twhite-cray this is the script that I used to run MMF jobs using P3
#!/bin/bash
# Creates, configures, builds, and submits an E3SM case from the current
# directory. The case name is assembled from the compset, resolution, machine,
# compiler, and PE layout chosen below.

# The case directory is created underneath the directory the script is run from.
CASE_ROOT=$(pwd)
#E3SM=/ccs/home/yuanx/e3sm
E3SM=/gpfs/alpine/cli115/scratch/yuanx/e3sm_p3_crusher
OUTPUT=/gpfs/alpine/cli115/scratch/yuanx/ACME_SIMULATIONS
DATA=/gpfs/alpine/cli115/scratch/yuanx/ACME_SIMULATIONS
#COMPSET=F2010-MMF1
COMPSET=F-MMFXX-P3
#COMPSET=F-MMFXX
RES=ne4pg2_ne4pg2
# NOTE(review): the failing builds reported in this thread have "amdclanggpu"
# in their case names, while this script selects "crayclanggpu" — confirm
# which compiler is intended.
COMPILER=crayclanggpu
MACH=crusher
PROJ=cli133
PELAYOUT=4x1
CASE=${COMPSET}.${RES}.${MACH}.${COMPILER}.${PELAYOUT}
echo
echo ${CASE}
echo
# The whole command is kept on one logical line with continuations: the pasted
# original was hard-wrapped in the middle of ${OUTPUT}, which broke the
# expansion.
"${E3SM}/cime/scripts/create_newcase" -case "${CASE_ROOT}/${CASE}" \
    -compset "${COMPSET}" -res "${RES}" -mach "${MACH}" \
    -compiler "${COMPILER}" -pecount "${PELAYOUT}" -project "${PROJ}" \
    --output-root "${OUTPUT}" --handle-preexisting-dirs r
# Stop here if the case directory is missing; otherwise the relative ./xmlchange
# and ./case.* commands below would run in the wrong directory.
cd "${CASE_ROOT}/${CASE}" || exit 1
#./xmlchange --append -id CAM_CONFIG_OPTS -val " -crm_dt 10 "
#./xmlchange ATM_NCPL=1440
#./xmlchange DEBUG=true
./xmlchange STOP_OPTION=ndays
./xmlchange STOP_N=1
./xmlchange CONTINUE_RUN=FALSE
./xmlchange JOB_WALLCLOCK_TIME=01:00
./xmlchange REST_OPTION=never
./xmlchange CHARGE_ACCOUNT="$PROJ"
# Write the EAM namelist overrides; the quoted 'eof' delimiter keeps the
# heredoc body literal (no shell expansion).
cat > user_nl_eam << 'eof'
transport_alg=0
hypervis_subcycle_q=1
dt_tracer_factor = 2
eof
./case.setup
./case.build
# Copy the data directory into the case's run directory before submitting.
cp -rf "${DATA}/data" "${CASE_ROOT}/${CASE}/run"
./case.submit
echo
echo ${CASE}
echo
I copied the data files into e3sm_p3_crusher; please modify the script to copy the data from there:
[[email protected] ACME_SIMULATIONS]$ ls /gpfs/alpine/cli115/world-shared/e3sm_p3_crusher
cime cime_config codemeta.json components CONTRIBUTING.md data driver-mct driver-moab externals LICENSE README.md run_e3sm.template.sh share
This issue is similar in flavor to https://github.com/E3SM-Project/E3SM/issues/5011 in that the problem appears with the same file in debug builds.
Any update on this @xyuan? We will soon have a pre-release version of AMD compiler that we can check to see.
I copied the branch to world-shared; please check. As for the GitHub branch, please let me push the most recent changes to it; it doesn't include the machine configuration file for crusher yet. Xingqiu Yuan
From: Trey White @.> Date: Monday, June 6, 2022 at 3:24 PM To: E3SM-Project/E3SM @.> Cc: Yuan, Xingqiu @.>, Author @.> Subject: Re: [E3SM-Project/E3SM] crusher AMD compiler build with HiP backend (Issue #4984)
Looks like we need to submit an AMD compiler bug. Can you give me permission to access https://github.com/xyuan/e3sm_p3_shoc/tree/e3sm_p3_shoc_hip? Github is telling me it doesn't exist.
Or could you copy your build to /gpfs/alpine/cli115/world-shared?
— Reply to this email directly, [view it on GitHub](https://github.com/E3SM-Project/E3SM/issues/4984#issuecomment-1147816336), or [unsubscribe](https://github.com/notifications/unsubscribe-auth/AAFCG2AGMUCFYUO55CWWR4LVNZF7DANCNFSM5W56TPYA). You are receiving this because you authored the thread. Message ID: @.***>
I was able to reproduce the error with just the compile command.
++ CC -DKOKKOS_DEPENDENCE -DMPICH_SKIP_MPICXX -DSCREAM_CONFIG_IS_CMAKE -DSPDLOG_COMPILED_LIB -D__HIP_ROCclr -I/gpfs/alpine/cli115/world-shared/e3sm_p3_crusher/components/eam/src/physics/crm/scream/src/physics/shoc/../share -I/gpfs/alpine/cli115/world-shared/e3sm_p3_crusher/components/eam/src/physics/crm/scream/src -I/gpfs/alpine/cli115/proj-shared/trey/4984/ACME_SIMULATIONS/F-MMFXX-P3.ne4pg2_ne4pg2.crusher.amdclanggpu.4x1/bld/cmake-bld/scream/src -I/gpfs/alpine/cli115/world-shared/e3sm_p3_crusher/externals/ekat/src -I/gpfs/alpine/cli115/proj-shared/trey/4984/ACME_SIMULATIONS/F-MMFXX-P3.ne4pg2_ne4pg2.crusher.amdclanggpu.4x1/bld/cmake-bld/externals/ekat/src -I/gpfs/alpine/cli115/proj-shared/trey/4984/ACME_SIMULATIONS/F-MMFXX-P3.ne4pg2_ne4pg2.crusher.amdclanggpu.4x1/bld/cmake-bld/externals/ekat/src/ekat/ekat_f90_modules -I/gpfs/alpine/cli115/proj-shared/trey/4984/ACME_SIMULATIONS/F-MMFXX-P3.ne4pg2_ne4pg2.crusher.amdclanggpu.4x1/bld/cmake-bld/externals/kokkos -I/gpfs/alpine/cli115/proj-shared/trey/4984/ACME_SIMULATIONS/F-MMFXX-P3.ne4pg2_ne4pg2.crusher.amdclanggpu.4x1/bld/cmake-bld/externals/kokkos/core/src -I/gpfs/alpine/cli115/world-shared/e3sm_p3_crusher/externals/ekat/extern/kokkos/core/src -I/gpfs/alpine/cli115/proj-shared/trey/4984/ACME_SIMULATIONS/F-MMFXX-P3.ne4pg2_ne4pg2.crusher.amdclanggpu.4x1/bld/cmake-bld/externals/kokkos/containers/src -I/gpfs/alpine/cli115/world-shared/e3sm_p3_crusher/externals/ekat/extern/kokkos/containers/src -I/gpfs/alpine/cli115/proj-shared/trey/4984/ACME_SIMULATIONS/F-MMFXX-P3.ne4pg2_ne4pg2.crusher.amdclanggpu.4x1/bld/cmake-bld/externals/kokkos/algorithms/src -I/gpfs/alpine/cli115/world-shared/e3sm_p3_crusher/externals/ekat/extern/kokkos/algorithms/src -I/gpfs/alpine/cli115/world-shared/e3sm_p3_crusher/externals/ekat/extern/yaml-cpp/include -I/gpfs/alpine/cli115/world-shared/e3sm_p3_crusher/externals/ekat/extern/spdlog/include -Wall -D__HIP_PLATFORM_HCC__= -D__HIP_PLATFORM_AMD__= -O3 -DNDEBUG -fno-gpu-rdc -x hip --offload-arch=gfx90a 
-std=gnu++17 -MD -MT shoc_functions_f90.cpp.o -MF shoc_functions_f90.cpp.o.d -o shoc_functions_f90.cpp.o -c /gpfs/alpine/cli115/world-shared/e3sm_p3_crusher/components/eam/src/physics/crm/scream/src/physics/shoc/shoc_functions_f90.cpp
error: unhandled SGPR spill to memory
error: unhandled SGPR spill to memory
2 errors generated when compiling for gfx90a.
I was able to work around the error with either of the following.
- Compile with `hipcc` instead.
- Add the compiler options `-mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false`.
Either of these strategies should also improve code performance.
FYI, I also updated an internal HPE ticket to make the above options the default for PrgEnv-amd and PrgEnv-cray-amd, as they already are for hipcc and PrgEnv-cray.