[gpu] Driver installation breaking in Dataproc 2.1 image during initialization
Hi Team,
I was able to attach GPUs to a Dataproc 2.1 cluster, and it was working fine after disabling secure boot. I am using the latest install_gpu_driver.sh from this repository, but I am now getting the following error during cluster initialization:
++ lsb_release -is ++ tr '[:upper:]' '[:lower:]'
- OS_NAME=debian ++ . /etc/os-release +++ PRETTY_NAME='Debian GNU/Linux 11 (bullseye)' +++ NAME='Debian GNU/Linux' +++ VERSION_ID=11 +++ VERSION='11 (bullseye)' +++ VERSION_CODENAME=bullseye +++ ID=debian +++ HOME_URL=https://www.debian.org/ +++ SUPPORT_URL=https://www.debian.org/support +++ BUG_REPORT_URL=https://bugs.debian.org/ ++ echo debian11
- distribution=debian11
- readonly OS_NAME ++ /usr/share/google/get_metadata_value attributes/dataproc-role
- ROLE=Worker
- readonly ROLE
- DRIVER_FOR_CUDA=(['10.1']='418.88' ['10.2']='440.64.00' ['11.0']='450.51.06' ['11.1']='455.45.01' ['11.2']='460.73.01' ['11.5']='495.29.05' ['11.6']='510.47.03' ['11.7']='515.65.01' ['11.8']='520.56.06')
- readonly -A DRIVER_FOR_CUDA
- CUDNN_FOR_CUDA=(['10.1']='7.6.4.38' ['10.2']='7.6.5.32' ['11.0']='8.0.4.30' ['11.1']='8.0.5.39' ['11.2']='8.1.1.33' ['11.5']='8.3.3.40' ['11.6']='8.4.1.50' ['11.7']='8.5.0.96' ['11.8']='8.6.0.163')
- readonly -A CUDNN_FOR_CUDA
- NCCL_FOR_CUDA=(['10.1']='2.4.8' ['10.2']='2.5.6' ['11.0']='2.7.8' ['11.1']='2.8.3' ['11.2']='2.8.3' ['11.5']='2.11.4' ['11.6']='2.11.4' ['11.7']='2.12.12' ['11.8']='2.15.5')
- readonly -A NCCL_FOR_CUDA
- CUDA_SUBVER=(['10.1']='10.1.243' ['10.2']='10.2.89' ['11.0']='11.0.3' ['11.1']='11.1.0' ['11.2']='11.2.2' ['11.5']='11.5.2' ['11.6']='11.6.2' ['11.7']='11.7.1' ['11.8']='11.8.0')
- readonly -A CUDA_SUBVER ++ get_metadata_attribute rapids-runtime SPARK ++ local -r attribute_name=rapids-runtime ++ local -r default_value=SPARK ++ /usr/share/google/get_metadata_value attributes/rapids-runtime ++ echo -n SPARK
- RUNTIME=SPARK
- DEFAULT_CUDA_VERSION=11.2
- [[ 2.1 == 2.* ]]
- [[ SPARK == \S\P\A\R\K ]]
- DEFAULT_CUDA_VERSION=11.5
- readonly DEFAULT_CUDA_VERSION ++ get_metadata_attribute cuda-version 11.5 ++ local -r attribute_name=cuda-version ++ local -r default_value=11.5 ++ /usr/share/google/get_metadata_value attributes/cuda-version ++ echo -n 11.5
- readonly CUDA_VERSION=11.5
- CUDA_VERSION=11.5
- readonly DEFAULT_NVIDIA_DEBIAN_GPU_DRIVER_VERSION=495.29.05
- DEFAULT_NVIDIA_DEBIAN_GPU_DRIVER_VERSION=495.29.05 ++ get_metadata_attribute gpu-driver-version 495.29.05 ++ local -r attribute_name=gpu-driver-version ++ local -r default_value=495.29.05 ++ /usr/share/google/get_metadata_value attributes/gpu-driver-version ++ echo -n 495.29.05
- readonly NVIDIA_DEBIAN_GPU_DRIVER_VERSION=495.29.05
- NVIDIA_DEBIAN_GPU_DRIVER_VERSION=495.29.05
- readonly NVIDIA_DEBIAN_GPU_DRIVER_VERSION_PREFIX=495
- NVIDIA_DEBIAN_GPU_DRIVER_VERSION_PREFIX=495
- readonly DRIVER=495
- DRIVER=495
- [[ debian == \r\o\c\k\y ]]
- [[ debian == \r\o\c\k\y ]]
- [[ debian == \d\e\b\i\a\n ]] ++ uname -r ++ awk -F- '{print $1}'
- KERNEL_VERSION=5.10.0
- [[ 495 < 455 ]]
- DEFAULT_NCCL_VERSION=2.11.4
- [[ debian == \r\o\c\k\y ]]
- readonly DEFAULT_NCCL_VERSION ++ get_metadata_attribute nccl-version 2.11.4 ++ local -r attribute_name=nccl-version ++ local -r default_value=2.11.4 ++ /usr/share/google/get_metadata_value attributes/nccl-version ++ echo -n 2.11.4
- readonly NCCL_VERSION=2.11.4
- NCCL_VERSION=2.11.4
- DEFAULT_NVIDIA_DEBIAN_GPU_DRIVER_URL=https://download.nvidia.com/XFree86/Linux-x86_64/495.29.05/NVIDIA-Linux-x86_64-495.29.05.run ++ curl -s -I https://download.nvidia.com/XFree86/Linux-x86_64/495.29.05/NVIDIA-Linux-x86_64-495.29.05.run ++ head -1 ++ awk '{print $2}'
- [[ 200 != \2\0\0 ]]
- readonly DEFAULT_NVIDIA_DEBIAN_GPU_DRIVER_URL ++ get_metadata_attribute gpu-driver-url https://download.nvidia.com/XFree86/Linux-x86_64/495.29.05/NVIDIA-Linux-x86_64-495.29.05.run ++ local -r attribute_name=gpu-driver-url ++ local -r default_value=https://download.nvidia.com/XFree86/Linux-x86_64/495.29.05/NVIDIA-Linux-x86_64-495.29.05.run ++ /usr/share/google/get_metadata_value attributes/gpu-driver-url ++ echo -n https://download.nvidia.com/XFree86/Linux-x86_64/495.29.05/NVIDIA-Linux-x86_64-495.29.05.run
- NVIDIA_DEBIAN_GPU_DRIVER_URL=https://download.nvidia.com/XFree86/Linux-x86_64/495.29.05/NVIDIA-Linux-x86_64-495.29.05.run
- readonly NVIDIA_DEBIAN_GPU_DRIVER_URL
- readonly NVIDIA_BASE_DL_URL=https://developer.download.nvidia.com/compute
- NVIDIA_BASE_DL_URL=https://developer.download.nvidia.com/compute
- readonly DEFAULT_NCCL_REPO_URL=https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/nvidia-machine-learning-repo-ubuntu1804_1.0.0-1_amd64.deb
- DEFAULT_NCCL_REPO_URL=https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/nvidia-machine-learning-repo-ubuntu1804_1.0.0-1_amd64.deb ++ get_metadata_attribute nccl-repo-url https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/nvidia-machine-learning-repo-ubuntu1804_1.0.0-1_amd64.deb ++ local -r attribute_name=nccl-repo-url ++ local -r default_value=https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/nvidia-machine-learning-repo-ubuntu1804_1.0.0-1_amd64.deb ++ /usr/share/google/get_metadata_value attributes/nccl-repo-url ++ echo -n https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/nvidia-machine-learning-repo-ubuntu1804_1.0.0-1_amd64.deb
- NCCL_REPO_URL=https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/nvidia-machine-learning-repo-ubuntu1804_1.0.0-1_amd64.deb
- readonly NCCL_REPO_URL
- readonly NCCL_REPO_KEY=https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
- NCCL_REPO_KEY=https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
- DEFAULT_NVIDIA_DEBIAN_CUDA_URLS=(['10.1']='https://developer.download.nvidia.com/compute/cuda/10.1/Prod/local_installers/cuda_10.1.243_418.87.00_linux.run' ['10.2']='https://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/cuda_10.2.89_440.33.01_linux.run' ['11.0']='https://developer.download.nvidia.com/compute/cuda/11.0.3/local_installers/cuda_11.0.3_450.51.06_linux.run' ['11.1']='https://developer.download.nvidia.com/compute/cuda/11.1.0/local_installers/cuda_11.1.0_455.23.05_linux.run' ['11.2']='https://developer.download.nvidia.com/compute/cuda/11.2.2/local_installers/cuda_11.2.2_460.32.03_linux.run' ['11.5']='https://developer.download.nvidia.com/compute/cuda/11.5.2/local_installers/cuda_11.5.2_495.29.05_linux.run' ['11.6']='https://developer.download.nvidia.com/compute/cuda/11.6.2/local_installers/cuda_11.6.2_510.47.03_linux.run' ['11.7']='https://developer.download.nvidia.com/compute/cuda/11.7.1/local_installers/cuda_11.7.1_515.65.01_linux.run' ['11.8']='https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run')
- readonly -A DEFAULT_NVIDIA_DEBIAN_CUDA_URLS
- readonly DEFAULT_NVIDIA_DEBIAN_CUDA_URL=https://developer.download.nvidia.com/compute/cuda/11.5.2/local_installers/cuda_11.5.2_495.29.05_linux.run
- DEFAULT_NVIDIA_DEBIAN_CUDA_URL=https://developer.download.nvidia.com/compute/cuda/11.5.2/local_installers/cuda_11.5.2_495.29.05_linux.run ++ get_metadata_attribute cuda-url https://developer.download.nvidia.com/compute/cuda/11.5.2/local_installers/cuda_11.5.2_495.29.05_linux.run ++ local -r attribute_name=cuda-url ++ local -r default_value=https://developer.download.nvidia.com/compute/cuda/11.5.2/local_installers/cuda_11.5.2_495.29.05_linux.run ++ /usr/share/google/get_metadata_value attributes/cuda-url ++ echo -n https://developer.download.nvidia.com/compute/cuda/11.5.2/local_installers/cuda_11.5.2_495.29.05_linux.run
- NVIDIA_DEBIAN_CUDA_URL=https://developer.download.nvidia.com/compute/cuda/11.5.2/local_installers/cuda_11.5.2_495.29.05_linux.run
- readonly NVIDIA_DEBIAN_CUDA_URL
- readonly NVIDIA_UBUNTU_REPO_URL=https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64
- NVIDIA_UBUNTU_REPO_URL=https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64
- readonly NVIDIA_UBUNTU_REPO_KEY_PACKAGE=https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb
- NVIDIA_UBUNTU_REPO_KEY_PACKAGE=https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb
- readonly NVIDIA_UBUNTU_REPO_CUDA_PIN=https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin
- NVIDIA_UBUNTU_REPO_CUDA_PIN=https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin
- readonly NVIDIA_ROCKY_REPO_URL=https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
- NVIDIA_ROCKY_REPO_URL=https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
- DEFAULT_CUDNN_VERSION=8.3.3.40
- [[ debian == \r\o\c\k\y ]]
- readonly DEFAULT_CUDNN_VERSION ++ get_metadata_attribute cudnn-version 8.3.3.40 ++ local -r attribute_name=cudnn-version ++ local -r default_value=8.3.3.40 ++ /usr/share/google/get_metadata_value attributes/cudnn-version ++ echo -n 8.3.3.40
- readonly CUDNN_VERSION=8.3.3.40
- CUDNN_VERSION=8.3.3.40
- CUDNN_TARBALL=cudnn-11.5-linux-x64-v8.3.3.40.tgz
- CUDNN_TARBALL_URL=https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.3/cudnn-11.5-linux-x64-v8.3.3.40.tgz
- compare_versions_lte 8.3.1.22 8.3.3.40 ++ echo -e '8.3.1.22\n8.3.3.40' ++ sort -V ++ head -n1
- '[' 8.3.1.22 = 8.3.1.22 ']'
- CUDNN_TARBALL=cudnn-linux-x86_64-8.3.3.40_cuda11-archive.tar.xz
- compare_versions_lte 8.3.3.40 8.4.1.50 ++ echo -e '8.3.3.40\n8.4.1.50' ++ sort -V ++ head -n1
- '[' 8.3.3.40 = 8.3.3.40 ']'
- CUDNN_TARBALL=cudnn-linux-x86_64-8.3.3.40_cuda11.5-archive.tar.xz
- CUDNN_TARBALL_URL=https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.3/local_installers/11.5/cudnn-linux-x86_64-8.3.3.40_cuda11.5-archive.tar.xz
- readonly CUDNN_TARBALL
- readonly CUDNN_TARBALL_URL ++ get_metadata_attribute gpu-driver-provider NVIDIA ++ local -r attribute_name=gpu-driver-provider ++ local -r default_value=NVIDIA ++ /usr/share/google/get_metadata_value attributes/gpu-driver-provider ++ echo -n NVIDIA
- GPU_DRIVER_PROVIDER=NVIDIA
- readonly GPU_DRIVER_PROVIDER
- readonly GPU_AGENT_REPO_URL=https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/master/dlvm/gcp-gpu-utilization-metrics
- GPU_AGENT_REPO_URL=https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/master/dlvm/gcp-gpu-utilization-metrics ++ get_metadata_attribute install-gpu-agent false ++ local -r attribute_name=install-gpu-agent ++ local -r default_value=false ++ /usr/share/google/get_metadata_value attributes/install-gpu-agent ++ echo -n false
- INSTALL_GPU_AGENT=false
- readonly INSTALL_GPU_AGENT
- readonly HADOOP_CONF_DIR=/etc/hadoop/conf
- HADOOP_CONF_DIR=/etc/hadoop/conf
- readonly HIVE_CONF_DIR=/etc/hive/conf
- HIVE_CONF_DIR=/etc/hive/conf
- readonly SPARK_CONF_DIR=/etc/spark/conf
- SPARK_CONF_DIR=/etc/spark/conf
- NVIDIA_SMI_PATH=/usr/bin
- MIG_MAJOR_CAPS=0
- IS_MIG_ENABLED=0
- main
- [[ debian != debian ]]
- remove_old_backports ++ curl -s https://deb.debian.org/debian/dists/oldstable/Release ++ awk '/^Codename/ {print $2}'
- oldstable=bullseye ++ curl -s https://deb.debian.org/debian/dists/stable/Release ++ awk '/^Codename/ {print $2}'
- stable=bookworm ++ grep -rsil '-backports' /etc/apt/sources.list /etc/apt/sources.list.d
- matched_files=/etc/apt/sources.list
- [[ -n /etc/apt/sources.list ]]
- for filename in "$matched_files"
- grep -e bullseye-backports -e bookworm-backports /etc/apt/sources.list deb https://deb.debian.org/debian bullseye-backports main deb-src https://deb.debian.org/debian bullseye-backports main
- [[ debian == debian ]]
- export DEBIAN_FRONTEND=noninteractive
- DEBIAN_FRONTEND=noninteractive
- execute_with_retries 'apt-get update'
- local -r 'cmd=apt-get update'
- (( i = 0 ))
- (( i < 10 ))
- eval 'apt-get update' ++ apt-get update Hit:1 https://deb.debian.org/debian bullseye InRelease Get:2 https://deb.debian.org/debian-security bullseye-security InRelease [48.4 kB] Get:3 https://download.docker.com/linux/debian bullseye InRelease [43.3 kB] Get:4 https://deb.debian.org/debian bullseye-updates InRelease [44.1 kB] Get:5 https://storage.googleapis.com/goog-dataproc-bigtop-repo-us-east4/2_1_deb11_20240513_020335-RC01 dataproc InRelease [3708 B] Get:6 https://deb.debian.org/debian bullseye-backports InRelease [49.0 kB] Hit:7 https://repo.mysql.com/apt/debian bullseye InRelease Hit:8 https://storage.googleapis.com/dataproc-bigtop-repo/2_1_deb11_20240513_020335-RC01 dataproc InRelease Hit:9 https://packages.cloud.google.com/apt google-cloud-logging-bullseye-all InRelease Get:10 https://download.docker.com/linux/debian bullseye/stable amd64 Packages [38.7 kB] Hit:11 https://packages.cloud.google.com/apt google-cloud-monitoring-bullseye-all InRelease Get:12 https://packages.cloud.google.com/apt google-compute-engine-bullseye-stable InRelease [1321 B] Hit:13 https://packages.adoptium.net/artifactory/deb bullseye InRelease Get:14 https://deb.debian.org/debian-security bullseye-security/main Sources [179 kB] Get:15 https://deb.debian.org/debian-security bullseye-security/main amd64 Packages [275 kB] Get:16 https://packages.cloud.google.com/apt cloud-sdk-bullseye InRelease [1602 B] Get:17 https://storage.googleapis.com/goog-dataproc-bigtop-repo-us-east4/2_1_deb11_20240513_020335-RC01 dataproc/contrib Sources [8460 B] Get:18 https://storage.googleapis.com/goog-dataproc-bigtop-repo-us-east4/2_1_deb11_20240513_020335-RC01 dataproc/contrib amd64 Packages [19.2 kB] Get:19 https://packages.cloud.google.com/apt cloud-sdk-bullseye/main all Packages [1480 kB] Get:20 https://packages.cloud.google.com/apt cloud-sdk-bullseye/main amd64 Packages [3083 kB] Fetched 5274 kB in 2s (3266 kB/s) Reading package lists...
- return 0
- execute_with_retries 'apt-get install -y -q pciutils'
- local -r 'cmd=apt-get install -y -q pciutils'
- (( i = 0 ))
- (( i < 10 ))
- eval 'apt-get install -y -q pciutils' ++ apt-get install -y -q pciutils Reading package lists... Building dependency tree... Reading state information... pciutils is already the newest version (1:3.7.0-5). 0 upgraded, 0 newly installed, 0 to remove and 6 not upgraded.
- return 0
- configure_yarn
- [[ ! -f /etc/hadoop/conf/resource-types.xml ]]
- printf '\n
' - set_hadoop_property resource-types.xml yarn.resource-types yarn.io/gpu
- local -r config_file=resource-types.xml
- local -r property=yarn.resource-types
- local -r value=yarn.io/gpu
- bdconfig set_property --configuration_file /etc/hadoop/conf/resource-types.xml --name yarn.resource-types --value yarn.io/gpu --clobber
- set_hadoop_property capacity-scheduler.xml yarn.scheduler.capacity.resource-calculator org.apache.hadoop.yarn.util.resource.DominantResourceCalculator
- local -r config_file=capacity-scheduler.xml
- local -r property=yarn.scheduler.capacity.resource-calculator
- local -r value=org.apache.hadoop.yarn.util.resource.DominantResourceCalculator
- bdconfig set_property --configuration_file /etc/hadoop/conf/capacity-scheduler.xml --name yarn.scheduler.capacity.resource-calculator --value org.apache.hadoop.yarn.util.resource.DominantResourceCalculator --clobber
- set_hadoop_property yarn-site.xml yarn.resource-types yarn.io/gpu
- local -r config_file=yarn-site.xml
- local -r property=yarn.resource-types
- local -r value=yarn.io/gpu
- bdconfig set_property --configuration_file /etc/hadoop/conf/yarn-site.xml --name yarn.resource-types --value yarn.io/gpu --clobber
- lspci
- grep -q NVIDIA
- /usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader
- uniq /etc/google-dataproc/startup-scripts/dataproc-initialization-script-0: line 533: /usr/bin/nvidia-smi: No such file or directory
- wc -l 0
- [[ debian == debian ]] ++ uname -r
- execute_with_retries 'apt-get install -y -q '''linux-headers-5.10.0-30-cloud-amd64''''
- local -r 'cmd=apt-get install -y -q '''linux-headers-5.10.0-30-cloud-amd64''''
- (( i = 0 ))
- (( i < 10 ))
- eval 'apt-get install -y -q '''linux-headers-5.10.0-30-cloud-amd64'''' ++ apt-get install -y -q linux-headers-5.10.0-30-cloud-amd64 Reading package lists... Building dependency tree... Reading state information... linux-headers-5.10.0-30-cloud-amd64 is already the newest version (5.10.218-1). linux-headers-5.10.0-30-cloud-amd64 set to manually installed. 0 upgraded, 0 newly installed, 0 to remove and 6 not upgraded.
- return 0
- [[ 0 -eq 0 ]]
- install_nvidia_gpu_driver
- [[ debian == debian ]]
- curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb -o /tmp/cuda-keyring.deb
- dpkg -i /tmp/cuda-keyring.deb Selecting previously unselected package cuda-keyring. (Reading database ... 168905 files and directories currently installed.) Preparing to unpack /tmp/cuda-keyring.deb ... Unpacking cuda-keyring (1.0-1) ... Setting up cuda-keyring (1.0-1) ...
- curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 https://download.nvidia.com/XFree86/Linux-x86_64/495.29.05/NVIDIA-Linux-x86_64-495.29.05.run -o driver.run
- bash ./driver.run --silent --install-libglvnd Verifying archive integrity... OK Uncompressing NVIDIA Accelerated Graphics Driver for Linux-x86_64 495.29.05.....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
ERROR: An error occurred while performing the step: "Building kernel modules". See /var/log/nvidia-installer.log for details.
ERROR: An error occurred while performing the step: "Checking to see whether the nvidia kernel module was successfully built". See /var/log/nvidia-installer.log for details.
ERROR: The nvidia kernel module was not created.
ERROR: Installation has failed. Please see the file '/var/log/nvidia-installer.log' for details. You may find suggestions on fixing installation problems in the README available on the Linux driver download page at www.nvidia.com.
I can also see the following error in the file /var/log/nvidia-installer.log on one of the cluster machines:
ld -r -o /tmp/selfgz13916/NVIDIA-Linux-x86_64-495.29.05/kernel/nvidia-modeset/nv-modeset-interface.o /tmp/selfgz13916/NVIDIA-Linux-x86_64-495.29.05/kernel/nvidia-modeset/nvidia-modeset-linux.o /tmp/selfgz13916/NVIDIA-Linux-x86_64-495.29.05/kernel/nvidia-modeset/nv-kthread-q.o
LD [M] /tmp/selfgz13916/NVIDIA-Linux-x86_64-495.29.05/kernel/nvidia-modeset.o
LD [M] /tmp/selfgz13916/NVIDIA-Linux-x86_64-495.29.05/kernel/nvidia-peermem.o
MODPOST /tmp/selfgz13916/NVIDIA-Linux-x86_64-495.29.05/kernel/Module.symvers
FATAL: modpost: GPL-incompatible module nvidia.ko uses GPL-only symbol 'rcu_read_unlock_strict'
make[3]: *** [/usr/src/linux-headers-5.10.0-30-common/scripts/Makefile.modpost:123: /tmp/selfgz13916/NVIDIA-Linux-x86_64-495.29.05/kernel/Module.symvers] Error 1
make[3]: Target '__modpost' not remade because of errors.
make[2]: *** [/usr/src/linux-headers-5.10.0-30-common/Makefile:1783: modules] Error 2
make[2]: Leaving directory '/usr/src/linux-headers-5.10.0-30-cloud-amd64'
make[1]: *** [Makefile:192: __sub-make] Error 2
make[1]: Target 'modules' not remade because of errors.
make[1]: Leaving directory '/usr/src/linux-headers-5.10.0-30-common'
make: *** [Makefile:80: modules] Error 2
-> Checking to see whether the nvidia kernel module was successfully built
executing: 'cd ./kernel; /opt/conda/default/bin/make -k -j8 NV_KERNEL_MODULES="nvidia" NV_EXCLUDE_KERNEL_MODULES="" SYSSRC="/lib/modules/5.10.0-30-cloud-amd64/source" SYSOUT="/lib/modules/5.10.0-30-cloud-amd64/build"'...
make[1]: Entering directory '/usr/src/linux-headers-5.10.0-30-common'
make[2]: Entering directory '/usr/src/linux-headers-5.10.0-30-cloud-amd64'
scripts/Makefile.lib:8: 'always' is deprecated. Please use 'always-y' instead
MODPOST /tmp/selfgz13916/NVIDIA-Linux-x86_64-495.29.05/kernel/Module.symvers
FATAL: modpost: GPL-incompatible module nvidia.ko uses GPL-only symbol 'rcu_read_unlock_strict'
make[3]: *** [/usr/src/linux-headers-5.10.0-30-common/scripts/Makefile.modpost:123: /tmp/selfgz13916/NVIDIA-Linux-x86_64-495.29.05/kernel/Module.symvers] Error 1
make[3]: Target '__modpost' not remade because of errors.
make[2]: *** [/usr/src/linux-headers-5.10.0-30-common/Makefile:1783: modules] Error 2
make[2]: Leaving directory '/usr/src/linux-headers-5.10.0-30-cloud-amd64'
make[1]: *** [Makefile:192: __sub-make] Error 2
make[1]: Target 'modules' not remade because of errors.
make[1]: Leaving directory '/usr/src/linux-headers-5.10.0-30-common'
make: *** [Makefile:80: modules] Error 2
-> Error.
ERROR: An error occurred while performing the step: "Checking to see whether the nvidia kernel module was successfully built". See /var/log/nvidia-installer.log for details.
-> The command cd ./kernel; /opt/conda/default/bin/make -k -j8 NV_KERNEL_MODULES="nvidia" NV_EXCLUDE_KERNEL_MODULES="" SYSSRC="/lib/modules/5.10.0-30-cloud-amd64/source" SYSOUT="/lib/modules/5.10.0-30-cloud-amd64/build" failed with the following output:
make[1]: Entering directory '/usr/src/linux-headers-5.10.0-30-common' make[2]: Entering directory '/usr/src/linux-headers-5.10.0-30-cloud-amd64' scripts/Makefile.lib:8: 'always' is deprecated. Please use 'always-y' instead MODPOST /tmp/selfgz13916/NVIDIA-Linux-x86_64-495.29.05/kernel/Module.symvers FATAL: modpost: GPL-incompatible module nvidia.ko uses GPL-only symbol 'rcu_read_unlock_strict' make[3]: *** [/usr/src/linux-headers-5.10.0-30-common/scripts/Makefile.modpost:123: /tmp/selfgz13916/NVIDIA-Linux-x86_64-495.29.05/kernel/Module.symvers] Error 1 make[3]: Target '__modpost' not remade because of errors. make[2]: *** [/usr/src/linux-headers-5.10.0-30-common/Makefile:1783: modules] Error 2 make[2]: Leaving directory '/usr/src/linux-headers-5.10.0-30-cloud-amd64' make[1]: *** [Makefile:192: __sub-make] Error 2 make[1]: Target 'modules' not remade because of errors. make[1]: Leaving directory '/usr/src/linux-headers-5.10.0-30-common' make: *** [Makefile:80: modules] Error 2 ERROR: The nvidia kernel module was not created. ERROR: Installation has failed. Please see the file '/var/log/nvidia-installer.log' for details. You may find suggestions on fixing installation problems in the README available on the Linux driver download page at www.nvidia.com.
Is anyone facing a similar issue with driver installation in Dataproc 2.1/2.2 clusters?
I could change the default driver and cuda versions on 2.1 images to be more current.
@cjac Thank you! Is there a specific CUDA and driver version to try as a workaround to get past this error in 2.1 images now?
I don't think I've tested the current code with cuda 12, but I think that's what we should be targeting with a recent 5xx series driver.
I recently reimagined the installer to use, on bookworm and later, the stock dkms from non-free packages and to sign drivers using the MOK. That requires that the MOK x509 cert be inserted into the EFI header of the block device. I'll be writing it up with some example code shortly.
I will try to set it up to do cuda 12 on a 5xx series kernel module, but I haven't tested it yet. In 2.2 we should be able to use the one from Debian stable non-free with dkms to install the current open module.
On Wed, Jun 12, 2024, 12:25 santhoshvly @.***> wrote:
@cjac https://github.com/cjac Thank you! Is there a specific CUDA and driver version to try as a workaround to get past this error in 2.1 images now?
— Reply to this email directly, view it on GitHub https://github.com/GoogleCloudDataproc/initialization-actions/issues/1189#issuecomment-2163747827, or unsubscribe https://github.com/notifications/unsubscribe-auth/AAAM6USGZGNQTQ3ZR6XCEE3ZHCOA3AVCNFSM6AAAAABJG5NGYOVHI2DSMVQWIX3LMV43OSLTON2WKQ3PNVWWK3TUHMZDCNRTG42DOOBSG4 . You are receiving this because you were mentioned.Message ID: @.*** com>
@cjac Okay, thank you! I tried with the latest CUDA version 12.0 and the corresponding driver version using the latest install_gpu_driver.sh script from this repo, but got the same error. So, it looks like we can't really attach any GPUs to Dataproc 2.1/2.2 until we fix it. Please let me know if there are any other workarounds.
I'm seeing a gcc error when trying to link GPL-incompatible code into kernel modules for all variants available on Debian 11; Debian 12 offers open driver support, so I will start there tomorrow.
On Thu, Jun 13, 2024, 06:49 santhoshvly @.***> wrote:
@cjac https://github.com/cjac Okay, Thank you!. I tried with latest CUDA version 12.0 and corresponding driver version using the latest install_gpu_driver.sh script from this repo, but got the same error. So, it looks like we can't really attach any GPUs to Dataproc 2.1/2.2 until we fix it. Please let me know if there are any other workarounds
— Reply to this email directly, view it on GitHub https://github.com/GoogleCloudDataproc/initialization-actions/issues/1189#issuecomment-2165732651, or unsubscribe https://github.com/notifications/unsubscribe-auth/AAAM6UUNC5SUOIVPDJUSYMLZHGPNVAVCNFSM6AAAAABJG5NGYOVHI2DSMVQWIX3LMV43OSLTON2WKQ3PNVWWK3TUHMZDCNRVG4ZTENRVGE . You are receiving this because you were mentioned.Message ID: @.*** com>
The latest Dataproc image that works with the .run file is 2.1.46-debian11
I am pushing a new change to the installer script. Please see #1190 for something that has been tested to work with images prior to and including 2.1.46-debian11
I am also working on bookworm (2.2-debian12) support for installation using apt-get.
I'm also working on something in the custom-images repo. I've got an in-progress PR open over there:
https://github.com/GoogleCloudDataproc/custom-images/pull/83
@cjac Okay, Thank you for the update. We are unable to use GPU with latest 2.1/2.2 images until we get the fixed install_gpu_driver.sh. We always use the latest 2.1 image to launch the Dataproc cluster. Will this script change help attach the GPU to the latest 2.1 Debian 11 image (currently 2.1.53-debian11), or can we only use versions prior to and including 2.1.46-debian11?
We have been running data pipelines using the latest Dataproc 2.1 images with GPU attached, and they have been breaking for some time. However, the documentation does not mention this issue: https://cloud.google.com/dataproc/docs/concepts/compute/gpus. This makes Dataproc GPU clusters seem very unreliable if they can break at any time.
Yes, I agree. I'm doing some work internally to build and distribute the kernel drivers with the stock image. I hope to have the change reviewed and published this quarter.
You are correct that the initialization-actions script will presently only work with those versions mentioned. I will do some work today to see if I can build drivers from bullseye-backports.
I've had some luck building from the open source GitHub repo on the latest 2.1 images; I'm integrating these changes into the open PR now.
The update is working on the latest 2.1 image.
Thu Jun 20 18:47:35 2024
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.14 Driver Version: 550.54.14 CUDA Version: 12.4 |
|-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA L4 Off | 00000000:00:03.0 Off | 0 |
| N/A 76C P0 37W / 72W | 0MiB / 23034MiB | 0% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
| No running processes found |
+-----------------------------------------------------------------------------------------+
Okay, cool. Thank you so much! So, we should be able to attach the GPU to the latest 2.1 image once you merge this PR, https://github.com/GoogleCloudDataproc/initialization-actions/pull/1190. Is that correct?
The update is also working on 2.0 images:
Thu Jun 20 19:08:20 2024
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.14 Driver Version: 550.54.14 CUDA Version: 12.4 |
|-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA L4 Off | 00000000:00:03.0 Off | 0 |
| N/A 62C P0 32W / 72W | 0MiB / 23034MiB | 0% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
| No running processes found |
+-----------------------------------------------------------------------------------------+
Okay, cool. Thank you so much! So, we should be able to attach the GPU to the latest 2.1 image once you merge this PR, https://github.com/GoogleCloudDataproc/initialization-actions/pull/1190. Is that correct?
#1190 is correct
@cjac I am facing the following error while attaching a GPU to a Dataproc 2.2 cluster:
The following packages have unmet dependencies: systemd : Depends: libsystemd0 (= 252.26-1~deb12u2) E: Error, pkgProblemResolver::Resolve generated breaks, this may be caused by held packages.
- sleep 5
- (( i++ ))
- (( i < 10 ))
- eval 'apt-get install -y -q pciutils' ++ apt-get install -y -q pciutils Reading package lists... Building dependency tree... Reading state information... pciutils is already the newest version (1:3.9.0-4). Some packages could not be installed. This may mean that you have requested an impossible situation or if you are using the unstable distribution that some required packages have not yet been created or been moved out of Incoming. The following information may help to resolve the situation:
The following packages have unmet dependencies: systemd : Depends: libsystemd0 (= 252.26-1~deb12u2) E: Error, pkgProblemResolver::Resolve generated breaks, this may be caused by held packages.
- sleep 5
- (( i++ ))
- (( i < 10 ))
- eval 'apt-get install -y -q pciutils' ++ apt-get install -y -q pciutils Reading package lists... Building dependency tree... Reading state information... pciutils is already the newest version (1:3.9.0-4). Some packages could not be installed. This may mean that you have requested an impossible situation or if you are using the unstable distribution that some required packages have not yet been created or been moved out of Incoming. The following information may help to resolve the situation:
The following packages have unmet dependencies: systemd : Depends: libsystemd0 (= 252.26-1~deb12u2) E: Error, pkgProblemResolver::Resolve generated breaks, this may be caused by held packages.
- sleep 5
- (( i++ ))
- (( i < 10 ))
- eval 'apt-get install -y -q pciutils' ++ apt-get install -y -q pciutils Reading package lists... Building dependency tree... Reading state information... pciutils is already the newest version (1:3.9.0-4). Some packages could not be installed. This may mean that you have requested an impossible situation or if you are using the unstable distribution that some required packages have not yet been created or been moved out of Incoming. The following information may help to resolve the situation:
The following packages have unmet dependencies: systemd : Depends: libsystemd0 (= 252.26-1~deb12u2) E: Error, pkgProblemResolver::Resolve generated breaks, this may be caused by held packages.
- sleep 5
- (( i++ ))
- (( i < 10 ))
- return 1 Any workaround to proceed further? Tried with latest from master branch and the changes from 1190 PR
TL;DR: Debian started enforcing the deprecation of apt-key add; each repo signing key must be moved to its own file and referenced by path in the sources.list file.
I am fixing it. You can find a workaround at the end of install_gpu_driver.sh in my rapids work branch:
https://github.com/cjac/initialization-actions/blob/e43a1eaa402dc8a81aa8853cafb32e906f72f80f/gpu/install_gpu_driver.sh#L1077
@cjac Okay, Thank you. I will try this workaround.
You can likely use that whole file if extracting the function is too complicated.
@cjac Okay, thanks! We have disabled secure boot in Dataproc. Is that okay, or should we enable it?
@cjac I tried the workaround script you mentioned, but it is still breaking with a similar error in Dataproc 2.2:
-----END PGP PUBLIC KEY BLOCK-----'
- sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list
- rm -rf /etc/apt/trusted.gpg
- main
- is_debian ++ os_id ++ cut -d= -f2 ++ grep '^ID=' /etc/os-release ++ xargs
- [[ debian == \d\e\b\i\a\n ]]
- remove_old_backports
- is_debian12
- is_debian ++ os_id ++ xargs ++ cut -d= -f2 ++ grep '^ID=' /etc/os-release
- [[ debian == \d\e\b\i\a\n ]] ++ os_version ++ xargs ++ cut -d= -f2 ++ grep '^VERSION_ID=' /etc/os-release
- [[ 12 == \1\2* ]]
- return
- is_debian ++ os_id ++ xargs ++ cut -d= -f2 ++ grep '^ID=' /etc/os-release
- [[ debian == \d\e\b\i\a\n ]]
- export DEBIAN_FRONTEND=noninteractive
- DEBIAN_FRONTEND=noninteractive
- execute_with_retries 'apt-get install -y -qq pciutils linux-headers-6.1.0-25-cloud-amd64'
- local -r 'cmd=apt-get install -y -qq pciutils linux-headers-6.1.0-25-cloud-amd64'
- (( i = 0 ))
- (( i < 3 ))
- eval 'apt-get install -y -qq pciutils linux-headers-6.1.0-25-cloud-amd64' ++ apt-get install -y -qq pciutils linux-headers-6.1.0-25-cloud-amd64 E: Error, pkgProblemResolver::Resolve generated breaks, this may be caused by held packages.
- sleep 5
- (( i++ ))
- (( i < 3 ))
- eval 'apt-get install -y -qq pciutils linux-headers-6.1.0-25-cloud-amd64' ++ apt-get install -y -qq pciutils linux-headers-6.1.0-25-cloud-amd64 E: Error, pkgProblemResolver::Resolve generated breaks, this may be caused by held packages.
- sleep 5
- (( i++ ))
- (( i < 3 ))
- eval 'apt-get install -y -qq pciutils linux-headers-6.1.0-25-cloud-amd64' ++ apt-get install -y -qq pciutils linux-headers-6.1.0-25-cloud-amd64 E: Error, pkgProblemResolver::Resolve generated breaks, this may be caused by held packages.
- sleep 5
- (( i++ ))
- (( i < 3 ))
- return 1
I didn't explicitly recommend that you run apt-get update after you fix the trust database. You'll still get the errors until you run apt-get update to rebuild the package cache. I'll encode that into the workaround.
package cache update command included in #1240 as commit 234515d
@cjac I tried with the package cache update, but I am still getting the same error:
cAZUlaj3id3TxquAlud4lWDz =h5nH -----END PGP PUBLIC KEY BLOCK-----'
- gpg --dearmor -o /usr/share/keyrings/mysql.gpg
- sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list
- rm -rf /etc/apt/trusted.gpg
- apt-get update Get:1 file:/etc/apt/mirrors/debian.list Mirrorlist [30 B] Get:5 file:/etc/apt/mirrors/debian-security.list Mirrorlist [39 B] Get:7 https://packages.cloud.google.com/apt google-compute-engine-bookworm-stable InRelease [1321 B] Get:8 https://download.docker.com/linux/debian bookworm InRelease [43.3 kB] Hit:9 https://repo.mysql.com/apt/debian bookworm InRelease Hit:2 https://deb.debian.org/debian bookworm InRelease Get:3 https://deb.debian.org/debian bookworm-updates InRelease [55.4 kB] Get:4 https://deb.debian.org/debian bookworm-backports InRelease [59.0 kB] Get:6 https://deb.debian.org/debian-security bookworm-security InRelease [48.0 kB] Get:10 https://packages.adoptium.net/artifactory/deb bookworm InRelease [7517 B] Get:11 https://packages.cloud.google.com/apt cloud-sdk-bookworm InRelease [1654 B] Get:12 https://packages.cloud.google.com/apt google-compute-engine-bookworm-stable/main amd64 Packages [3128 B] Get:13 https://download.docker.com/linux/debian bookworm/stable amd64 Packages [31.3 kB] Get:14 https://storage.googleapis.com/goog-dataproc-bigtop-repo-us-west4/2_2_deb12_20240606_230238-RC01 dataproc InRelease [3708 B] Get:15 https://deb.debian.org/debian bookworm-updates/main Sources.diff/Index [11.7 kB] Get:16 https://deb.debian.org/debian bookworm-updates/main amd64 Packages.diff/Index [11.7 kB] Get:17 https://deb.debian.org/debian bookworm-updates/main Translation-en.diff/Index [11.7 kB] Get:18 https://deb.debian.org/debian bookworm-updates/main Sources T-2024-09-10-2011.55-F-2024-09-10-2011.55.pdiff [562 B] Get:19 https://deb.debian.org/debian bookworm-updates/main amd64 Packages T-2024-09-10-2011.55-F-2024-09-10-2011.55.pdiff [1116 B] Get:18 https://deb.debian.org/debian bookworm-updates/main Sources T-2024-09-10-2011.55-F-2024-09-10-2011.55.pdiff [562 B] Get:19 https://deb.debian.org/debian bookworm-updates/main amd64 Packages T-2024-09-10-2011.55-F-2024-09-10-2011.55.pdiff [1116 B] Get:20 https://deb.debian.org/debian 
bookworm-updates/main Translation-en T-2024-09-10-2011.55-F-2024-09-10-2011.55.pdiff [538 B] Get:20 https://deb.debian.org/debian bookworm-updates/main Translation-en T-2024-09-10-2011.55-F-2024-09-10-2011.55.pdiff [538 B] Get:21 https://deb.debian.org/debian bookworm-backports/main Sources.diff/Index [63.3 kB] Ign:21 https://deb.debian.org/debian bookworm-backports/main Sources.diff/Index Get:22 https://deb.debian.org/debian bookworm-backports/main amd64 Packages.diff/Index [63.3 kB] Get:23 https://deb.debian.org/debian bookworm-backports/main Translation-en.diff/Index [63.3 kB] Get:25 https://deb.debian.org/debian bookworm-backports/main amd64 Packages T-2024-09-25-2006.54-F-2024-09-03-2007.15.pdiff [57.0 kB] Get:25 https://deb.debian.org/debian bookworm-backports/main amd64 Packages T-2024-09-25-2006.54-F-2024-09-03-2007.15.pdiff [57.0 kB] Get:26 https://deb.debian.org/debian bookworm-backports/main Translation-en T-2024-09-25-0804.34-F-2024-09-06-2122.28.pdiff [11.6 kB] Get:26 https://deb.debian.org/debian bookworm-backports/main Translation-en T-2024-09-25-0804.34-F-2024-09-06-2122.28.pdiff [11.6 kB] Get:24 https://deb.debian.org/debian bookworm-backports/main Sources [276 kB] Get:27 https://deb.debian.org/debian-security bookworm-security/main Sources [110 kB] Get:28 https://deb.debian.org/debian-security bookworm-security/main amd64 Packages [182 kB] Get:29 https://deb.debian.org/debian-security bookworm-security/main Translation-en [111 kB] Get:30 https://packages.adoptium.net/artifactory/deb bookworm/main amd64 Packages [7417 B] Get:31 https://storage.googleapis.com/goog-dataproc-bigtop-repo-us-west4/2_2_deb12_20240606_230238-RC01 dataproc/contrib Sources [8442 B] Get:32 https://packages.cloud.google.com/apt cloud-sdk-bookworm/main all Packages [1555 kB] Get:33 https://packages.cloud.google.com/apt cloud-sdk-bookworm/main amd64 Packages [3337 kB] Get:34 https://storage.googleapis.com/goog-dataproc-bigtop-repo-us-west4/2_2_deb12_20240606_230238-RC01 
dataproc/contrib amd64 Packages [19.3 kB] Fetched 6156 kB in 2s (3423 kB/s) Reading package lists...
- main
- is_debian ++ os_id ++ grep '^ID=' /etc/os-release ++ xargs ++ cut -d= -f2
- [[ debian == \d\e\b\i\a\n ]]
- remove_old_backports
- is_debian12
- is_debian ++ os_id ++ grep '^ID=' /etc/os-release ++ xargs ++ cut -d= -f2
- [[ debian == \d\e\b\i\a\n ]] ++ os_version ++ grep '^VERSION_ID=' /etc/os-release ++ xargs ++ cut -d= -f2
- [[ 12 == \1\2* ]]
- return
- is_debian ++ os_id ++ grep '^ID=' /etc/os-release ++ xargs ++ cut -d= -f2
- [[ debian == \d\e\b\i\a\n ]]
- export DEBIAN_FRONTEND=noninteractive
- DEBIAN_FRONTEND=noninteractive
- execute_with_retries 'apt-get install -y -qq pciutils linux-headers-6.1.0-25-cloud-amd64'
- local -r 'cmd=apt-get install -y -qq pciutils linux-headers-6.1.0-25-cloud-amd64'
- (( i = 0 ))
- (( i < 3 ))
- eval 'apt-get install -y -qq pciutils linux-headers-6.1.0-25-cloud-amd64' ++ apt-get install -y -qq pciutils linux-headers-6.1.0-25-cloud-amd64 E: Error, pkgProblemResolver::Resolve generated breaks, this may be caused by held packages.
- sleep 5
- (( i++ ))
- (( i < 3 ))
- eval 'apt-get install -y -qq pciutils linux-headers-6.1.0-25-cloud-amd64' ++ apt-get install -y -qq pciutils linux-headers-6.1.0-25-cloud-amd64 E: Error, pkgProblemResolver::Resolve generated breaks, this may be caused by held packages.
- sleep 5
- (( i++ ))
- (( i < 3 ))
- eval 'apt-get install -y -qq pciutils linux-headers-6.1.0-25-cloud-amd64' ++ apt-get install -y -qq pciutils linux-headers-6.1.0-25-cloud-amd64 E: Error, pkgProblemResolver::Resolve generated breaks, this may be caused by held packages.
- sleep 5
- (( i++ ))
- (( i < 3 ))
- return 1