hypre icon indicating copy to clipboard operation
hypre copied to clipboard

cudaErrorInvalidDevice: invalid device ordinal when running examples

Open moulin1024 opened this issue 2 months ago • 10 comments

Hello,

I am trying to build hypre on our cluster with CMake with the following options:

cmake ../src -DBUILD_SHARED_LIBS=ON \
             -DHYPRE_ENABLE_PRINT_ERRORS=ON \
             -DHYPRE_ENABLE_CUDA=ON \
             -DCMAKE_CUDA_ARCHITECTURES='80' \
             -DHYPRE_ENABLE_UNIFIED_MEMORY=ON \
             -DHYPRE_BUILD_EXAMPLES=ON \
             -DHYPRE_BUILD_TESTS=ON \
             -DHYPRE_ENABLE_GPU_AWARE_MPI=ON \
             -DHYPRE_ENABLE_HYPRE_BLAS=OFF -DHYPRE_ENABLE_HYPRE_LAPACK=OFF \
             -DHYPRE_ENABLE_SUPERLU=ON -DCMAKE_PREFIX_PATH=/path/to/my/superlu
make -j8

When I try to run the ij test, it runs without any error

srun --ntasks=1 --gres=gpu:a100:1 --cpus-per-task=18 --mem=64G --time=01:00:00 ./build/test/ij

However, when I run the IJ example code (ex5),

srun --ntasks=1 --gres=gpu:a100:1 --cpus-per-task=18 --mem=64G --time=01:00:00 ./build/example/ex5

I get the following error:

terminate called after throwing an instance of 'thrust::THRUST_200500_800_NS::system::system_error'
  what():  after determining tmp storage requirements for exclusive_scan: cudaErrorInvalidDevice: invalid device ordinal

I also created a small sanity-check program to further explore what went wrong:

// hypre_sanity_5x5.cu
#include <cstdio>
#include <cstdlib>
#include <vector>
#include <iostream>
#include <mpi.h>
#include <cuda_runtime.h>

#include "HYPRE.h"
#include "HYPRE_krylov.h"
#include "HYPRE_parcsr_ls.h"


static void my_cuda_state(const char* tag){
  int dev=-1, ndev=0; 
  cudaGetDevice(&dev); 
  cudaGetDeviceCount(&ndev);
  size_t f=0,t=0; 
  cudaMemGetInfo(&f,&t);
  printf("[%s] device=%d of %d, total=%.2f GiB, free=%.2f GiB\n",
         tag, dev, ndev, (double)t/(1ull<<30), (double)f/(1ull<<30));
  fflush(stdout);
}

/*
 * Sanity check: compare the raw CUDA runtime's view of the device with
 * hypre's view right after HYPRE_Initialize().
 */
int main(int argc, char** argv)
{
  MPI_Init(&argc, &argv);

  int rank = 0;
  int size = 1;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  /* Query the CUDA runtime directly, before hypre touches the device. */
  my_cuda_state("My own Device Info check:");

  /* Initialize hypre, then print its own device report for comparison. */
  HYPRE_Initialize();
  std::cout << "HYPRE_PrintDeviceInfo:" << std::endl;
  HYPRE_PrintDeviceInfo();

  HYPRE_Finalize();
  MPI_Finalize();
  return 0;
}

and the output is

[My own Device Info check:] device=0 of 1, total=39.39 GiB, free=38.98 GiB
HYPRE_PrintDeviceInfo:
Running on "NVIDIA A100-SXM4-40GB", major 8, minor 0, total memory 0.00 GiB
MaxSharedMemoryPerBlock 49152, MaxSharedMemoryPerBlockOptin 166912

which makes me more confused: my native CUDA device query can detect the GPU and its valid graphics memory; however, HYPRE_PrintDeviceInfo reports a total memory of 0.00 GiB.

Could someone enlighten me what configuration I did wrong here? Thank you very much.

moulin1024 avatar Sep 30 '25 14:09 moulin1024

I also have this issue when configuring through PETSc with UMPIRE and unified memory on a NVIDIA RTX 4070 Ti Super, CUDA 12.9. This is the (largely unhelpful) backtrace

terminate called after throwing an instance of 'thrust::THRUST_200802_SM_890_NS::system::system_error'
  what():  after reduction step 1: cudaErrorInvalidDevice: invalid device ordinal
[zatkins-desktop:1517762] *** Process received signal ***
[zatkins-desktop:1517762] Signal: Aborted (6)
[zatkins-desktop:1517762] Signal code:  (-6)
[zatkins-desktop:1517762] [ 0] /lib/x86_64-linux-gnu/libc.so.6(+0x42520)[0x744f31242520]
[zatkins-desktop:1517762] [ 1] /lib/x86_64-linux-gnu/libc.so.6(pthread_kill+0x12c)[0x744f312969fc]
[zatkins-desktop:1517762] [ 2] /lib/x86_64-linux-gnu/libc.so.6(raise+0x16)[0x744f31242476]
[zatkins-desktop:1517762] [ 3] /lib/x86_64-linux-gnu/libc.so.6(abort+0xd3)[0x744f312287f3]
[zatkins-desktop:1517762] [ 4] /lib/x86_64-linux-gnu/libstdc++.so.6(+0xa2b9e)[0x744ed96a2b9e]
[zatkins-desktop:1517762] [ 5] /lib/x86_64-linux-gnu/libstdc++.so.6(+0xae20c)[0x744ed96ae20c]
[zatkins-desktop:1517762] [ 6] /lib/x86_64-linux-gnu/libstdc++.so.6(+0xae277)[0x744ed96ae277]
[zatkins-desktop:1517762] [ 7] /lib/x86_64-linux-gnu/libstdc++.so.6(+0xae4d8)[0x744ed96ae4d8]
[zatkins-desktop:1517762] [ 8] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libHYPRE-3.0.0.so(_ZN6thrust23THRUST_200802_SM_890_NS8cuda_cub6detail13reduce_n_implINS0_6detail22execute_with_allocatorIR22hypre_device_allocatorNS1_22execute_on_stream_baseEEENS0_18transform_iteratorI8in_rangeIiEPillEEllNS0_4plusIlEEEET2_RNS1_16execution_policyIT_EET0_T1_SH_T3_+0x1d5)[0x744f2e6338d5]
[zatkins-desktop:1517762] [ 9] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libHYPRE-3.0.0.so(hypre_IJVectorAssembleParDevice+0xc3)[0x744f2e638963]
[zatkins-desktop:1517762] [10] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(+0x384724)[0x744f32384724]
[zatkins-desktop:1517762] [11] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(+0x6b6fa3)[0x744f326b6fa3]
[zatkins-desktop:1517762] [12] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(MatAssemblyEnd+0xa0)[0x744f327c9130]
[zatkins-desktop:1517762] [13] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(+0x6af141)[0x744f326af141]
[zatkins-desktop:1517762] [14] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(MatConvert+0xd03)[0x744f327d7b63]
[zatkins-desktop:1517762] [15] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(+0xea7336)[0x744f32ea7336]
[zatkins-desktop:1517762] [16] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(PCSetUp+0x1cf)[0x744f32f4c45f]
[zatkins-desktop:1517762] [17] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(KSPSetUp+0x701)[0x744f32d542b1]
[zatkins-desktop:1517762] [18] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(+0xedd55c)[0x744f32edd55c]
[zatkins-desktop:1517762] [19] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(PCSetUp+0x1cf)[0x744f32f4c45f]
[zatkins-desktop:1517762] [20] /home/zatkins/project/micromorph/ratel/lib/libratel.so(+0x7fdb2)[0x744f3407fdb2]
[zatkins-desktop:1517762] [21] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(PCSetUp+0x1cf)[0x744f32f4c45f]
[zatkins-desktop:1517762] [22] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(KSPSetUp+0x701)[0x744f32d542b1]
[zatkins-desktop:1517762] [23] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(+0xd56aa9)[0x744f32d56aa9]
[zatkins-desktop:1517762] [24] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(KSPSolve+0x18)[0x744f32d56758]
[zatkins-desktop:1517762] [25] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(+0xf6463f)[0x744f32f6463f]
[zatkins-desktop:1517762] [26] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(SNESSolve+0x728)[0x744f32fa7d58]
[zatkins-desktop:1517762] [27] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(+0x1035d11)[0x744f33035d11]
[zatkins-desktop:1517762] [28] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(TSStep+0x1de)[0x744f330643ae]
[zatkins-desktop:1517762] [29] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(TSSolve+0x798)[0x744f33065e78]
[zatkins-desktop:1517762] *** End of error message ***

zatkins-dev avatar Oct 09 '25 23:10 zatkins-dev

@moulin1024 @zatkins-dev thanks for reporting!

I suspect this is related to GPU affinity.

Would you be able to save the following script to a file gpu_affinity.sh, give execution permissions to it, and add it just before your executable in the srun/mpirun command?

#!/bin/bash
#
# Usage: gpu_affinity.sh <your_program> [args...]
#
# Binds the calling MPI rank to one GPU (round-robin by local rank) via
# CUDA_VISIBLE_DEVICES, then execs the given command.
# Works under both SLURM (srun) and generic MPI (mpirun/mpiexec).

# Get number of GPUs on this node. Fail early if nvidia-smi is missing or
# reports none: otherwise NUM_GPUS=0 and the modulo below is a
# divide-by-zero error.
NUM_GPUS=$(nvidia-smi -L 2>/dev/null | wc -l)
if [ "${NUM_GPUS}" -lt 1 ]; then
    echo "Error: no GPUs detected via nvidia-smi on host $(hostname)" >&2
    exit 1
fi

# Determine local rank from whichever launcher-specific variable is set
if [ -n "${SLURM_LOCALID}" ]; then
    RANK=${SLURM_LOCALID}
elif [ -n "${OMPI_COMM_WORLD_LOCAL_RANK}" ]; then
    RANK=${OMPI_COMM_WORLD_LOCAL_RANK}
elif [ -n "${MV2_COMM_WORLD_LOCAL_RANK}" ]; then
    RANK=${MV2_COMM_WORLD_LOCAL_RANK}
elif [ -n "${PMI_LOCAL_RANK}" ]; then
    RANK=${PMI_LOCAL_RANK}
else
    echo "Warning: Could not determine local rank. Defaulting to 0."
    RANK=0
fi

# Bind GPU to rank (round-robin over the GPUs on this node)
GPU_ID=$((RANK % NUM_GPUS))
export CUDA_VISIBLE_DEVICES=$GPU_ID

GPU_UUID=$(nvidia-smi --query-gpu=uuid --format=csv,noheader -i $GPU_ID 2>/dev/null)
echo "Rank ${RANK} bound to GPU ${GPU_ID} (UUID=${GPU_UUID}) on host $(hostname)"

# exec replaces this shell with the user command so exit codes and signals
# propagate to the launcher unchanged
exec "$@"

victorapm avatar Oct 10 '25 00:10 victorapm

Unfortunately it still returns the same error.

My slurm script:

#!/bin/bash -l
# Job name
#SBATCH -J test
# number of nodes
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --time=00:10:00
#SBATCH --mem=64G
#SBATCH --partition=gpudev
#SBATCH --gres=gpu:a100:1
#SBATCH --cpus-per-task=18

# The affinity wrapper must run INSIDE the srun step (one instance per rank)
# so it can read SLURM_LOCALID and exec the program with the binding applied.
# Running it on its own line before srun sets CUDA_VISIBLE_DEVICES in a
# subshell that the job step never sees.
srun ./gpu_affinity.sh ./ex5

Output

Rank 0 bound to GPU 0 (UUID=GPU-6b2d8b2e-2dd8-d517-be97-069f75cd978a) on host ravg1002
terminate called after throwing an instance of 'thrust::THRUST_200500_800_NS::system::system_error'
  what():  after determining tmp storage requirements for exclusive_scan: cudaErrorInvalidDevice: invalid device ordinal
[ravg1002:94364] *** Process received signal ***
[ravg1002:94364] Signal: Aborted (6)
[ravg1002:94364] Signal code:  (-6)
[ravg1002:94364] [ 0] /lib64/libc.so.6(+0x57900)[0x154d921f9900]
[ravg1002:94364] [ 1] /lib64/libc.so.6(+0xa8dfc)[0x154d9224adfc]
[ravg1002:94364] [ 2] /lib64/libc.so.6(raise+0x14)[0x154d921f9842]
[ravg1002:94364] [ 3] /lib64/libc.so.6(abort+0xd5)[0x154d921e15cf]
[ravg1002:94364] [ 4] /mpcdf/soft/SLE_15/packages/x86_64/gcc/13.1.0/lib64/libstdc++.so.6(+0xa8377)[0x154e109b4377]
[ravg1002:94364] [ 5] /mpcdf/soft/SLE_15/packages/x86_64/gcc/13.1.0/lib64/libstdc++.so.6(+0xb7b3c)[0x154e109c3b3c]
[ravg1002:94364] [ 6] /mpcdf/soft/SLE_15/packages/x86_64/gcc/13.1.0/lib64/libstdc++.so.6(+0xb7ba7)[0x154e109c3ba7]
[ravg1002:94364] [ 7] /mpcdf/soft/SLE_15/packages/x86_64/gcc/13.1.0/lib64/libstdc++.so.6(+0xb7e07)[0x154e109c3e07]
[ravg1002:94364] [ 8] /u/limo/hypre/build/lib/libHYPRE.so.233(+0x1566de)[0x154e10e616de]
[ravg1002:94364] [ 9] /u/limo/hypre/build/lib/libHYPRE.so.233(hypreDevice_IntegerExclusiveScan+0x45)[0x154e10ec78e5]
[ravg1002:94364] [10] /u/limo/hypre/build/lib/libHYPRE.so.233(hypre_IJMatrixSetAddValuesParCSRDevice+0x100)[0x154e110d0f90]
[ravg1002:94364] [11] /u/limo/hypre/build/lib/libHYPRE.so.233(HYPRE_IJMatrixSetValues2+0x152)[0x154e110b4472]
[ravg1002:94364] [12] /u/limo/hypre/build/lib/libHYPRE.so.233(HYPRE_IJMatrixSetValues+0x81)[0x154e110b4761]
[ravg1002:94364] [13] /raven/u/limo/hypre/build/examples/./ex5[0x402bd6]
[ravg1002:94364] [14] /lib64/libc.so.6(+0x40e6c)[0x154d921e2e6c]
[ravg1002:94364] [15] /lib64/libc.so.6(__libc_start_main+0x87)[0x154d921e2f35]
[ravg1002:94364] [16] /raven/u/limo/hypre/build/examples/./ex5[0x403901]
[ravg1002:94364] *** End of error message ***
srun: error: ravg1002: task 0: Aborted
srun: Terminating StepId=22428226.0

moulin1024 avatar Oct 10 '25 08:10 moulin1024

@moulin1024, could you change it slightly to srun ./gpu_affinity.sh ./ex5?

victorapm avatar Oct 10 '25 13:10 victorapm

Using the PETSc example used in make check (snes tutorials ex19), I get:

$ ./arch-parallel-cuda-opt/bin/mpiexec -n 2 ./gpu_affinity.sh arch-parallel-cuda-opt/tests/snes/tutorials/ex19 -dm_vec_type cuda -dm_mat_type aijcusparse -da_refine 3 -snes_monitor_short -ksp_norm_type unpreconditioned -pc_type hypre

Rank 0 bound to GPU 0 (UUID=GPU-a8b70e2e-f730-2a1c-27cf-d727da6b56ac) on host zatkins-desktop
Rank 1 bound to GPU 0 (UUID=GPU-a8b70e2e-f730-2a1c-27cf-d727da6b56ac) on host zatkins-desktop
lid velocity = 0.0016, prandtl # = 1., grashof # = 1.
  0 SNES Function norm 0.0406612
CUDA ERROR (code = 101, invalid device ordinal) at memory.c:199
CUDA ERROR (code = 101, invalid device ordinal) at memory.c:199
terminate called after throwing an instance of 'thrust::THRUST_200802_SM_890_NS::system::system_error'
  what():  after reduction step 1: cudaErrorInvalidDevice: invalid device ordinal
[zatkins-desktop:17881] *** Process received signal ***
[zatkins-desktop:17881] Signal: Aborted (6)
[zatkins-desktop:17881] Signal code:  (-6)
terminate called after throwing an instance of 'thrust::THRUST_200802_SM_890_NS::system::system_error'
[zatkins-desktop:17881] [ 0]   what():  after reduction step 1: cudaErrorInvalidDevice: invalid device ordinal
[zatkins-desktop:17882] *** Process received signal ***
[zatkins-desktop:17882] Signal: Aborted (6)
[zatkins-desktop:17882] Signal code:  (-6)
/lib/x86_64-linux-gnu/libc.so.6(+0x42520)[0x75724c242520]
[zatkins-desktop:17881] [ 1] /lib/x86_64-linux-gnu/libc.so.6(pthread_kill+0x12c)[0x75724c2969fc]
[zatkins-desktop:17881] [ 2] /lib/x86_64-linux-gnu/libc.so.6(raise+0x16)[0x75724c242476]
[zatkins-desktop:17881] [ 3] /lib/x86_64-linux-gnu/libc.so.6(abort+0xd3)[0x75724c2287f3]
[zatkins-desktop:17881] [ 4] [zatkins-desktop:17882] [ 0] /lib/x86_64-linux-gnu/libc.so.6(+0x42520)[0x79fb07e42520]
[zatkins-desktop:17882] [ 1] /lib/x86_64-linux-gnu/libc.so.6(pthread_kill+0x12c)[0x79fb07e969fc]
[zatkins-desktop:17882] [ 2] /lib/x86_64-linux-gnu/libc.so.6(raise+0x16)[0x79fb07e42476]
[zatkins-desktop:17882] [ 3] /lib/x86_64-linux-gnu/libstdc++.so.6(+0xa2b9e)[0x75724caa2b9e]
[zatkins-desktop:17881] [ 5] /lib/x86_64-linux-gnu/libstdc++.so.6(+0xae20c)[0x75724caae20c]
[zatkins-desktop:17881] [ 6] /lib/x86_64-linux-gnu/libc.so.6(abort+0xd3)[0x79fb07e287f3]
[zatkins-desktop:17882] [ 4] /lib/x86_64-linux-gnu/libstdc++.so.6(+0xae277)[0x75724caae277]
[zatkins-desktop:17881] [ 7] /lib/x86_64-linux-gnu/libstdc++.so.6(+0xa2b9e)[0x79fb086a2b9e]
[zatkins-desktop:17882] [ 5] /lib/x86_64-linux-gnu/libstdc++.so.6(+0xae4d8)[0x75724caae4d8]
[zatkins-desktop:17881] [ 8] /lib/x86_64-linux-gnu/libstdc++.so.6(+0xae20c)[0x79fb086ae20c]
[zatkins-desktop:17882] [ 6] /lib/x86_64-linux-gnu/libstdc++.so.6(+0xae277)[0x79fb086ae277]
[zatkins-desktop:17882] [ 7] /lib/x86_64-linux-gnu/libstdc++.so.6(+0xae4d8)[0x79fb086ae4d8]
[zatkins-desktop:17882] [ 8] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libHYPRE-3.0.0.so(_ZN6thrust23THRUST_200802_SM_890_NS8cuda_cub6detail13reduce_n_implINS0_6detail22execute_with_allocatorIR22hypre_device_allocatorNS1_22execute_on_stream_baseEEENS0_18transform_iteratorI8in_rangeIiEPillEEllNS0_4plusIlEEEET2_RNS1_16execution_policyIT_EET0_T1_SH_T3_+0x1d5)[0x7572a1e338d5]
[zatkins-desktop:17881] [ 9] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libHYPRE-3.0.0.so(hypre_IJVectorAssembleParDevice+0xc3)[0x7572a1e38963]
[zatkins-desktop:17881] [10] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libHYPRE-3.0.0.so(_ZN6thrust23THRUST_200802_SM_890_NS8cuda_cub6detail13reduce_n_implINS0_6detail22execute_with_allocatorIR22hypre_device_allocatorNS1_22execute_on_stream_baseEEENS0_18transform_iteratorI8in_rangeIiEPillEEllNS0_4plusIlEEEET2_RNS1_16execution_policyIT_EET0_T1_SH_T3_+0x1d5)[0x79fb5d8338d5]
[zatkins-desktop:17882] [ 9] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libHYPRE-3.0.0.so(hypre_IJVectorAssembleParDevice+0xc3)[0x79fb5d838963]
[zatkins-desktop:17882] [10] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(+0x384724)[0x7572a4d84724]
[zatkins-desktop:17881] [11] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(+0x384724)[0x79fb60784724]
[zatkins-desktop:17882] [11] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(+0x6b6fa3)[0x7572a50b6fa3]
[zatkins-desktop:17881] [12] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(MatAssemblyEnd+0xa0)[0x7572a51c9130]
[zatkins-desktop:17881] [13] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(+0x6b6fa3)[0x79fb60ab6fa3]
[zatkins-desktop:17882] [12] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(MatAssemblyEnd+0xa0)[0x79fb60bc9130]
[zatkins-desktop:17882] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(+0x6af141)[0x7572a50af141]
[zatkins-desktop:17881] [14] [13] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(MatConvert+0xd03)[0x7572a51d7b63]
[zatkins-desktop:17881] [15] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(+0x6af141)[0x79fb60aaf141]
[zatkins-desktop:17882] [14] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(MatConvert+0xd03)[0x79fb60bd7b63]
[zatkins-desktop:17882] [15] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(+0xea7336)[0x7572a58a7336]
[zatkins-desktop:17881] [16] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(+0xea7336)[0x79fb612a7336]
[zatkins-desktop:17882] [16] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(PCSetUp+0x1cf)[0x7572a594c45f]
[zatkins-desktop:17881] [17] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(KSPSetUp+0x701)[0x7572a57542b1]
[zatkins-desktop:17881] [18] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(PCSetUp+0x1cf)[0x79fb6134c45f]
[zatkins-desktop:17882] [17] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(+0xd56aa9)[0x7572a5756aa9]
[zatkins-desktop:17881] [19] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(KSPSetUp+0x701)[0x79fb611542b1]
[zatkins-desktop:17882] [18] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(KSPSolve+0x18)[0x7572a5756758]
[zatkins-desktop:17881] [20] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(+0xd56aa9)[0x79fb61156aa9]
[zatkins-desktop:17882] [19] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(KSPSolve+0x18)[0x79fb61156758]
[zatkins-desktop:17882] [20] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(+0xf6463f)[0x7572a596463f]
[zatkins-desktop:17881] [21] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(SNESSolve+0x728)[0x7572a59a7d58]
[zatkins-desktop:17881] [22] arch-parallel-cuda-opt/tests/snes/tutorials/ex19(+0x269a)[0x61eb2309a69a]
[zatkins-desktop:17881] [23] /lib/x86_64-linux-gnu/libc.so.6(+0x29d90)[0x75724c229d90]
[zatkins-desktop:17881] [24] /home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(+0xf6463f)[0x79fb6136463f]
[zatkins-desktop:17882] [21] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0x80)[0x75724c229e40]
[zatkins-desktop:17881] [25] arch-parallel-cuda-opt/tests/snes/tutorials/ex19(_start+0x25)[0x61eb2309a2e5]
[zatkins-desktop:17881] *** End of error message ***
/home/zatkins/project/micromorph/petsc/arch-parallel-cuda-opt/lib/libpetsc.so.3.24(SNESSolve+0x728)[0x79fb613a7d58]
[zatkins-desktop:17882] [22] arch-parallel-cuda-opt/tests/snes/tutorials/ex19(+0x269a)[0x5760eb36469a]
[zatkins-desktop:17882] [23] /lib/x86_64-linux-gnu/libc.so.6(+0x29d90)[0x79fb07e29d90]
[zatkins-desktop:17882] [24] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0x80)[0x79fb07e29e40]
[zatkins-desktop:17882] [25] arch-parallel-cuda-opt/tests/snes/tutorials/ex19(_start+0x25)[0x5760eb3642e5]
[zatkins-desktop:17882] *** End of error message ***
--------------------------------------------------------------------------
prterun noticed that process rank 0 with PID 17881 on node zatkins-desktop exited on
signal 6 (Aborted).
--------------------------------------------------------------------------

This is my PETSc configure script:

#!/usr/bin/python3
"""Driver script that invokes PETSc's configure with a fixed option set
(CUDA-enabled parallel optimized build with hypre, Umpire, SuperLU, etc.)."""
if __name__ == '__main__':
    import os
    import sys

    # Make PETSc's config/ package importable from the source-tree root.
    sys.path.insert(0, os.path.abspath('config'))
    import configure

    # Option order is preserved exactly as originally invoked.
    opts = [
        '--download-blis',
        '--download-blis-complex-return=gnu',
        '--download-blis-enable-cblas-headers=1',
        '--download-cgns',
        '--download-ctetgen',
        '--download-exodusii',
        '--download-hdf5',
        '--download-hypre',
        '--download-hypre-configure-arguments=--enable-unified-memory',
        '--download-umpire',
        '--download-metis',
        '--download-netcdf',
        '--download-openmpi',
        '--download-parmetis',
        '--download-pnetcdf',
        '--download-superlu',
        '--download-superlu_dist',
        '--download-triangle',
        '--download-zlib',
        '--with-X=1',
        '--with-openmp',
        '--with-clanguage=c',
        '--with-cuda',
        '--with-cuda-arch=89',
        '--with-cuda-dialect=17',
        '--with-cuda-dir=/usr/local/cuda',
        '--with-cudac=nvcc',
        '--with-cc=clang-20',
        '--with-cxx=clang++-20',
        '--with-debugger=lldb-20',
        '--with-debugging=0',
        '--with-fc=0',
        '--with-fortran-bindings=0',
        '--with-gmsh',
        '--with-precision=double',
        '--with-scalar-type=real',
        '--with-strict-petscerrorcode',
        '--with-ld=ld.lld-20',
        'COPTFLAGS=-O3 -g -march=native -fPIC -Wno-pass-failed -fassociative-math -fno-math-errno -fno-omit-frame-pointer -ffp-contract=fast',
        'CUDAOPTFLAGS=-ccbin=clang-20 -allow-unsupported-compiler -O3 -ftz=true -prec-div=false',
        'CXXOPTFLAGS=-O3 -g -march=native -fPIC -Wno-pass-failed -fassociative-math -fno-math-errno -fno-omit-frame-pointer -ffp-contract=fast',
        'PETSC_ARCH=arch-parallel-cuda-opt',
    ]
    configure.petsc_configure(opts)

zatkins-dev avatar Oct 10 '25 15:10 zatkins-dev

Every time I've seen this error, hypre_IJVectorAssembleParDevice has been in the stack trace

zatkins-dev avatar Oct 10 '25 15:10 zatkins-dev

I figured out the issue for me @victorapm. On Windows and WSL, the device property concurrentManagedAccess is set to 0. As such the call to cudaMemPrefetchAsync in src/utilities/memory.c:199 causes an error. Umpire properly checks for this property in their UM implementation, perhaps HYPRE should call their CudaMemPrefetchOperation::apply instead of using HYPRE_MEM_PREFETCH_DEVICE, if HYPRE was configured with Umpire UM support.

zatkins-dev avatar Oct 17 '25 18:10 zatkins-dev

Also, not compiling with unified memory fixed the problem.

zatkins-dev avatar Oct 17 '25 18:10 zatkins-dev

Hi Zach, great to hear that. Thanks for your investigation and suggestion! We will look at incorporating it in the code.

@moulin1024, I assume you are on a linux cluster, so Zach's point does not apply to you. Please, let me know if the affinity script helps you in any way.

Best!

victorapm avatar Oct 17 '25 22:10 victorapm

Hi Victor. Sorry I was working on something else in the last couple of weeks.

I tried it today and unfortunately I still got the same error.

But I am sort of bypassing this issue right now, as it is really triggered by HYPRE_IJMatrixSetValues. I wrote glue code that loads the CSR matrix from binary files directly, and then it works. I leave it here for future reference.

/*
 * Build a hypre ParCSR matrix for this rank's row range [row_start, row_end)
 * from three raw binary files: the CSR row-pointer array (i_path, HYPRE_Int),
 * global column indices (j_path, HYPRE_BigInt), and values (v_path,
 * HYPRE_Real). Host-side assembly bypasses HYPRE_IJMatrixSetValues entirely;
 * the finished matrix is migrated to hypre's current memory location.
 *
 * NOTE(review): global_part is a stack array handed to
 * hypre_ParCSRMatrixCreate — this assumes hypre copies the partitioning
 * rather than retaining the pointer; verify against the hypre version in use.
 * NOTE(review): fread results for J/V are cast to void, so short reads on
 * those files go undetected; only the I file's read length is checked.
 * NOTE(review): when fopen fails the code prints an error but continues;
 * hypre_CTAlloc zero-fills, so nnz_total then ends up 0 rather than garbage.
 */
static inline hypre_ParCSRMatrix*
LoadCSRMatrixFromFiles(MPI_Comm comm,
                       HYPRE_BigInt n_global_rows,
                       HYPRE_BigInt row_start, HYPRE_BigInt row_end,
                       const char *i_path, const char *j_path, const char *v_path)
{
    hypre_ParCSRMatrix *A;
    hypre_CSRMatrix *Adiag, *Aoffd;

    /* Number of rows owned by this rank, and its [start, end) partition. */
    const HYPRE_Int  n_local = (HYPRE_Int)(row_end - row_start);
    HYPRE_BigInt     global_part[2] = { row_start, row_end };

    /* --- read I (int32, size n_local+1) to learn nnz_total --- */
    HYPRE_Int *I32 = hypre_CTAlloc(HYPRE_Int, (size_t)n_local + 1, HYPRE_MEMORY_HOST);
    HYPRE_BigInt nnz_total = 0;

    {
        FILE *f = fopen(i_path, "rb");
        if (!f) { hypre_printf("ERROR: cannot open %s\n", i_path); }
        else
        {
            size_t got = fread(I32, sizeof(*I32), (size_t)n_local + 1, f);
            fclose(f);
            if (got != (size_t)n_local + 1)
            {
                hypre_printf("ERROR: short read in %s (got %zu, want %zu)\n",
                             i_path, got, (size_t)n_local + 1);
            }
        }
        /* Last row-pointer entry is the local nonzero count. */
        nnz_total = (HYPRE_BigInt) I32[n_local];
    }

    /* --- read J, V (size nnz_total) --- */
    HYPRE_BigInt *J_in = hypre_CTAlloc(HYPRE_BigInt, (size_t)nnz_total, HYPRE_MEMORY_HOST);
    HYPRE_Real   *V_in = hypre_CTAlloc(HYPRE_Real,   (size_t)nnz_total, HYPRE_MEMORY_HOST);

    { FILE *f = fopen(j_path, "rb");
      if (!f) { hypre_printf("ERROR: cannot open %s\n", j_path); }
      else { (void)fread(J_in, sizeof(*J_in), (size_t)nnz_total, f); fclose(f); } }

    { FILE *f = fopen(v_path, "rb");
      if (!f) { hypre_printf("ERROR: cannot open %s\n", v_path); }
      else { (void)fread(V_in, sizeof(*V_in), (size_t)nnz_total, f); fclose(f); } }

    /* --- count diag/offd per row ---
     * A column is "diag" if it falls in this rank's own row range,
     * "offd" otherwise; build the row-pointer arrays for both parts. */
    HYPRE_Int *diag_i = hypre_CTAlloc(HYPRE_Int, (size_t)n_local + 1, HYPRE_MEMORY_HOST);
    HYPRE_Int *offd_i = hypre_CTAlloc(HYPRE_Int, (size_t)n_local + 1, HYPRE_MEMORY_HOST);
    HYPRE_BigInt diag_nnz = 0, offd_nnz = 0;

    diag_i[0] = 0; offd_i[0] = 0;
    for (HYPRE_Int r = 0; r < n_local; ++r)
    {
        HYPRE_BigInt rs = (HYPRE_BigInt) I32[r];
        HYPRE_BigInt re = (HYPRE_BigInt) I32[r+1];
        HYPRE_Int d = 0, o = 0;
        for (HYPRE_BigInt k = rs; k < re; ++k)
        {
            HYPRE_BigInt gc = J_in[k];
            if (gc >= row_start && gc < row_end) ++d; else ++o;
        }
        diag_nnz += d; offd_nnz += o;
        diag_i[r+1] = (HYPRE_Int)diag_nnz;
        offd_i[r+1] = (HYPRE_Int)offd_nnz;
    }

    /* --- allocate and fill diag / gather offd globals ---
     * Second pass: split each row's entries. Diag columns are shifted to
     * local indices; offd columns are kept global for now and remapped
     * after col_map_offd is built. */
    HYPRE_Int  *diag_j    = hypre_CTAlloc(HYPRE_Int,  (size_t)diag_nnz, HYPRE_MEMORY_HOST);
    HYPRE_Real *diag_data = hypre_CTAlloc(HYPRE_Real, (size_t)diag_nnz, HYPRE_MEMORY_HOST);
    HYPRE_Int  *offd_j    = hypre_CTAlloc(HYPRE_Int,  (size_t)offd_nnz, HYPRE_MEMORY_HOST);
    HYPRE_Real *offd_data = hypre_CTAlloc(HYPRE_Real, (size_t)offd_nnz, HYPRE_MEMORY_HOST);
    HYPRE_BigInt *offd_cols_global = hypre_CTAlloc(HYPRE_BigInt, (size_t)offd_nnz, HYPRE_MEMORY_HOST);

    HYPRE_BigInt dpos = 0, opos = 0;
    for (HYPRE_Int r = 0; r < n_local; ++r)
    {
        for (HYPRE_BigInt k = (HYPRE_BigInt)I32[r]; k < (HYPRE_BigInt)I32[r+1]; ++k)
        {
            HYPRE_BigInt gc = J_in[k];
            HYPRE_Real   vv = V_in[k];
            if (gc >= row_start && gc < row_end) {
                diag_j[dpos] = (HYPRE_Int)(gc - row_start);
                diag_data[dpos++] = vv;
            } else {
                offd_cols_global[opos] = gc;
                offd_data[opos++] = vv;
            }
        }
    }

    /* --- build unique, sorted col_map_offd and map offd_j ---
     * Sort the gathered global offd columns, deduplicate in place, shrink
     * to exactly num_cols_offd entries, then binary-search each original
     * offd column to get its local offd index. */
    HYPRE_Int num_cols_offd = 0;
    HYPRE_BigInt *col_map_offd = NULL;
    if (offd_nnz > 0)
    {
        col_map_offd = hypre_CTAlloc(HYPRE_BigInt, (size_t)offd_nnz, HYPRE_MEMORY_HOST);
        for (HYPRE_BigInt t = 0; t < offd_nnz; ++t) col_map_offd[t] = offd_cols_global[t];
        hypre_BigQsort0(col_map_offd, 0, (HYPRE_Int)offd_nnz - 1);

        HYPRE_Int m = 0;
        for (HYPRE_BigInt t = 0; t < offd_nnz; ++t)
            if (m == 0 || col_map_offd[m-1] != col_map_offd[t]) col_map_offd[m++] = col_map_offd[t];
        num_cols_offd = m;

        /* Reallocate to the deduplicated size so hypre owns a tight array. */
        HYPRE_BigInt *cm = hypre_CTAlloc(HYPRE_BigInt, (size_t)num_cols_offd, HYPRE_MEMORY_HOST);
        for (HYPRE_Int t = 0; t < num_cols_offd; ++t) cm[t] = col_map_offd[t];
        hypre_TFree(col_map_offd, HYPRE_MEMORY_HOST);
        col_map_offd = cm;

        for (HYPRE_BigInt t = 0; t < offd_nnz; ++t)
            offd_j[t] = hypre_BigBinarySearch(col_map_offd, offd_cols_global[t], num_cols_offd);
    }

    /* --- create ParCSR matrix ---
     * Square matrix: the same partition is used for rows and columns. */
    A = hypre_ParCSRMatrixCreate(comm, n_global_rows, n_global_rows,
                                 global_part, global_part, num_cols_offd,
                                 (HYPRE_Int)diag_nnz, (HYPRE_Int)offd_nnz);

    hypre_ParCSRMatrixColMapOffd(A) = col_map_offd;

    /* Hand the host arrays to the matrix; hypre takes ownership and will
     * free them with the matrix. */
    Adiag = hypre_ParCSRMatrixDiag(A);
    hypre_CSRMatrixI(Adiag)    = diag_i;
    hypre_CSRMatrixJ(Adiag)    = diag_j;
    hypre_CSRMatrixData(Adiag) = diag_data;

    Aoffd = hypre_ParCSRMatrixOffd(A);
    hypre_CSRMatrixI(Aoffd)    = offd_i;
    if (num_cols_offd) {
        hypre_CSRMatrixJ(Aoffd)    = offd_j;
        hypre_CSRMatrixData(Aoffd) = offd_data;
    }

    /* Mark the attached arrays as host memory, then let hypre move the
     * whole matrix to wherever it currently computes (e.g. device). */
    hypre_CSRMatrixMemoryLocation(Adiag) = HYPRE_MEMORY_HOST;
    hypre_CSRMatrixMemoryLocation(Aoffd) = HYPRE_MEMORY_HOST;
    hypre_ParCSRMatrixMigrate(A, hypre_HandleMemoryLocation(hypre_handle()));

    /* cleanup temporaries (arrays NOT handed to the matrix) */
    hypre_TFree(I32, HYPRE_MEMORY_HOST);
    hypre_TFree(J_in, HYPRE_MEMORY_HOST);
    hypre_TFree(V_in, HYPRE_MEMORY_HOST);
    hypre_TFree(offd_cols_global, HYPRE_MEMORY_HOST);

    return A;
}

moulin1024 avatar Nov 06 '25 09:11 moulin1024