relion
relion copied to clipboard
Build issue with CUDA 11.2
Environment:
- RHEL 8.1
- CUDA 11.2
- Open MPI 4.1.0
- Intel icc/icpc 19.1.3 (2020u4)
- MKL FFTW
- GPU devices: V100-SXM2
- CUDA driver: 460.32.03
Cmake invocation: cmake -Wno-dev -DGUI=no -DCUDA_ARCH=70 ..
CMakeCache.txt file attached: relion4b-CMakeCache.txt
The Makefile was generated successfully, but running make generates this error:
[ 0%] Built target copy_scripts
[ 0%] Built target class_ranker_model_file
[ 0%] Building NVCC (Device) object src/apps/CMakeFiles/relion_jaz_gpu_util.dir/__/jaz/cuda/relion_jaz_gpu_util_generated_test00.cu.o
[ 0%] Building NVCC (Device) object src/apps/CMakeFiles/relion_jaz_gpu_util.dir/__/jaz/cuda/kernels/relion_jaz_gpu_util_generated_add.cu.o
[ 1%] Linking CXX shared library ../../lib/librelion_jaz_gpu_util.so
CMakeFiles/relion_jaz_gpu_util.dir/__/jaz/cuda/relion_jaz_gpu_util_generated_test00.cu.o: In function `__device_stub__Z3addiPfS_(int, float*, float*)':
tmpxft_000072a7_00000000-8_test00.cudafe1.cpp:(.text+0x10): multiple definition of `__device_stub__Z3addiPfS_(int, float*, float*)'
CMakeFiles/relion_jaz_gpu_util.dir/__/jaz/cuda/kernels/relion_jaz_gpu_util_generated_add.cu.o:tmpxft_000072e5_00000000-8_add.cudafe1.cpp:(.text+0x10): first defined here
CMakeFiles/relion_jaz_gpu_util.dir/__/jaz/cuda/relion_jaz_gpu_util_generated_test00.cu.o: In function `add(int, float*, float*)':
tmpxft_000072a7_00000000-8_test00.cudafe1.cpp:(.text+0xc0): multiple definition of `add(int, float*, float*)'
CMakeFiles/relion_jaz_gpu_util.dir/__/jaz/cuda/kernels/relion_jaz_gpu_util_generated_add.cu.o:tmpxft_000072e5_00000000-8_add.cudafe1.cpp:(.text+0xc0): first defined here
make[2]: *** [src/apps/CMakeFiles/relion_jaz_gpu_util.dir/build.make:601: lib/librelion_jaz_gpu_util.so] Error 1
make[1]: *** [CMakeFiles/Makefile2:3013: src/apps/CMakeFiles/relion_jaz_gpu_util.dir/all] Error 2
make: *** [Makefile:136: all] Error 2
I heard someone failed to compile RELION 3.1 with CUDA 11 but RELION 4.0 was OK. Can you test the following to find what is wrong?
- Try CUDA 10.x with ICC
- Try CUDA 11 but with GCC, not ICC
Please also read https://github.com/3dem/relion/issues/825#issuecomment-949998857.
Please also read #825 (comment).
I used ccmake TUI to set the Cmake variables, rather than relying on environment variables. All the values are in the attached CMakeCache.txt file.
I received this same error building v4.0.0 on Alma 8.6 using GCC 8.5, CUDA 11.4, MKL 2022.0 and OpenMPI 4.1.1. Adding -DCMAKE_CXX_FLAGS='-Wl,--allow-multiple-definition' to the cmake invocation as a workaround allowed the code to build successfully. It is always better to isolate the root cause, of course, but the error apparently comes from a small test file (test00.cu), so the workaround seems unlikely to cause a major issue.
@acaprez
Can you test whether the following patch solves the issue (i.e. compiles fine without the extra CXX flags)? That is, please remove "${CMAKE_SOURCE_DIR}/src/jaz/cuda/kernels/*.cu" from the relevant line in src/apps/CMakeLists.txt and try a fresh build in a new directory.
diff --git a/src/apps/CMakeLists.txt b/src/apps/CMakeLists.txt
index 9870b735..e223be2f 100644
--- a/src/apps/CMakeLists.txt
+++ b/src/apps/CMakeLists.txt
@@ -273,7 +273,7 @@ if (CUDA_FOUND)
- file(GLOB REL_JAZ_CUDA_SRC "${CMAKE_SOURCE_DIR}/src/jaz/cuda/*.cu" "${CMAKE_SOURCE_DIR}/src/jaz/cuda/kernels/*.cu" )
+ file(GLOB REL_JAZ_CUDA_SRC "${CMAKE_SOURCE_DIR}/src/jaz/cuda/*.cu")
cuda_add_library(relion_jaz_gpu_util ${REL_JAZ_CUDA_SRC})
#list(APPEND EXTRA_LIBS "${CUDA_CUFFT_LIBRARIES}")
@biochem-fan I appreciate the quick response. Yes, that resolved the issue. It looks like add.cu was getting pulled in twice: once by that second glob expression and once by the #include in test00.cu. Thanks for the fix!
Oops, hit another snag. The build succeeds, but at runtime I get
[root@7efab0d444de /]# relion
relion: error while loading shared libraries: librelion_jaz_gpu_util.so: cannot open shared object file: No such file or directory
The librelion_jaz_gpu_util.so library is built, but it does not seem to be in the list of install targets. It looks like the relevant lines are already in CMakeLists.txt, but commented out. Uncommenting them includes the library in the install, i.e.:
diff --git a/src/apps/CMakeLists.txt b/src/apps/CMakeLists.txt
index 9870b735..7f47a664 100644
--- a/src/apps/CMakeLists.txt
+++ b/src/apps/CMakeLists.txt
@@ -273,16 +273,16 @@ if (CUDA_FOUND)
- file(GLOB REL_JAZ_CUDA_SRC "${CMAKE_SOURCE_DIR}/src/jaz/cuda/*.cu" "${CMAKE_SOURCE_DIR}/src/jaz/cuda/kernels/*.cu" )
+ file(GLOB REL_JAZ_CUDA_SRC "${CMAKE_SOURCE_DIR}/src/jaz/cuda/*.cu")
cuda_add_library(relion_jaz_gpu_util ${REL_JAZ_CUDA_SRC})
#list(APPEND EXTRA_LIBS "${CUDA_CUFFT_LIBRARIES}")
- #if(BUILD_SHARED_LIBS)
- # install (TARGETS relion_jaz_gpu_util LIBRARY DESTINATION lib)
- #else()
- # target_link_libraries(relion_jaz_gpu_util relion_lib)
- # target_link_libraries(relion_jaz_gpu_util ${CUDA_CUFFT_LIBRARIES})
- #endif()
+ if(BUILD_SHARED_LIBS)
+ install (TARGETS relion_jaz_gpu_util LIBRARY DESTINATION lib)
+ else()
+ target_link_libraries(relion_jaz_gpu_util relion_lib)
+ target_link_libraries(relion_jaz_gpu_util ${CUDA_CUFFT_LIBRARIES})
+ endif()
target_link_libraries(relion_lib relion_jaz_gpu_util ${CUDA_CUFFT_LIBRARIES})
target_link_libraries(relion_lib relion_jaz_gpu_util ${CUDA_CUFFT_LIBRARIES} ${CUDA_curand_LIBRARY})
With this change, the relion binary works.
Thank you very much. I pushed the fix (including yours) as commit 138b9c7 to the ver4.0 branch.
I get a similar error with relion 5.0 and cuda 12.2 when using intelmpi (but not with openmpi).
cmake warning:
CMake Warning (dev) in build/CMakeFiles/copy_scripts.dir/DependInfo.cmake:
Syntax Warning in cmake code at
/relion/build/CMakeFiles/copy_scripts.dir/DependInfo.cmake:27:8
Argument not separated from preceding token by whitespace.
This warning is for project developers. Use -Wno-dev to suppress it.
Scanning dependencies of target copy_scripts
CMake Warning (dev) in /relion/build/src/apps/CMakeFiles/relion_lib.dir/DependInfo.cmake:
Syntax Warning in cmake code at
/relion/build/src/apps/CMakeFiles/relion_lib.dir/DependInfo.cmake:312:8
Argument not separated from preceding token by whitespace.
This warning is for project developers. Use -Wno-dev to suppress it.
make error:
[ 71%] Building NVCC (Device) object src/apps/CMakeFiles/relion_jaz_gpu_util.dir//jaz/cuda/./relion_jaz_gpu_util_generated_test00.cu.o CMake Warning (dev) in relion_jaz_gpu_util_generated_test00.cu.o.cmake: Syntax Warning in cmake code at /relion/build/src/apps/CMakeFiles/relion_jaz_gpu_util.dir//jaz/cuda/relion_jaz_gpu_util_generated_test00.cu.o.cmake:80:136
Argument not separated from preceding token by whitespace. This warning is for project developers. Use -Wno-dev to suppress it.
nvcc fatal : Stray '"' character /relion/build/src/apps/CMakeFiles/relion_jaz_gpu_util.dir/__/jaz/cuda/./relion_jaz_gpu_util_generated_test00.cu.o
make[2]: *** [src/apps/CMakeFiles/relion_jaz_gpu_util.dir/__/jaz/cuda/./relion_jaz_gpu_util_generated_test00.cu.o] Error 1 make[1]: *** [src/apps/CMakeFiles/relion_jaz_gpu_util.dir/all] Error 2 make: *** [all] Error 2
The include paths for MPI_CXX and MPI_C were being double-quoted twice. Removing the quotes in CMakeLists.txt fixed the problem for me.
@hnndlp Thanks. What is your suggested change? Can you paste it as text?