nvidia-5090
cuda12.8
python3.11.7
torch2.7.1+cuda128
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
warnings.warn(
[rank0]: Traceback (most recent call last):
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2506, in _run_ninja_build
[rank0]: subprocess.run(
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/subprocess.py", line 571, in run
[rank0]: raise CalledProcessError(retcode, process.args,
[rank0]: subprocess.CalledProcessError: Command '['ninja', '-v']' returned non-zero exit status 1.
[rank0]: The above exception was the direct cause of the following exception:
[rank0]: Traceback (most recent call last):
[rank0]: File "<frozen runpy>", line 198, in _run_module_as_main
[rank0]: File "<frozen runpy>", line 88, in _run_code
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/FlagEmbedding/finetune/embedder/encoder_only/m3/main.py", line 27, in <module>
[rank0]: main()
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/FlagEmbedding/finetune/embedder/encoder_only/m3/main.py", line 23, in main
[rank0]: runner.run()
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/FlagEmbedding/abc/finetune/embedder/AbsRunner.py", line 149, in run
[rank0]: self.trainer.train(resume_from_checkpoint=self.training_args.resume_from_checkpoint)
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/transformers/trainer.py", line 2325, in train
[rank0]: return inner_training_loop(
[rank0]: ^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/transformers/trainer.py", line 2483, in inner_training_loop
[rank0]: model, self.optimizer, self.lr_scheduler = self.accelerator.prepare(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/accelerate/accelerator.py", line 1551, in prepare
[rank0]: result = self.prepare_deepspeed(*args)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/accelerate/accelerator.py", line 2296, in prepare_deepspeed
[rank0]: engine, optimizer, _, lr_scheduler = ds_initialize(**kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/deepspeed/__init__.py", line 203, in initialize
[rank0]: engine = DeepSpeedEngine(args=args,
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 355, in __init__
[rank0]: self._configure_optimizer(optimizer, model_parameters)
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1445, in _configure_optimizer
[rank0]: basic_optimizer = self._configure_basic_optimizer(model_parameters)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1528, in _configure_basic_optimizer
[rank0]: optimizer = FusedAdam(
[rank0]: ^^^^^^^^^^
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/deepspeed/ops/adam/fused_adam.py", line 94, in __init__
[rank0]: fused_adam_cuda = FusedAdamBuilder().load()
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 542, in load
[rank0]: return self.jit_load(verbose)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 591, in jit_load
[rank0]: op_module = load(name=self.name,
[rank0]: ^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1623, in load
[rank0]: return _jit_compile(
[rank0]: ^^^^^^^^^^^^^
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2076, in _jit_compile
[rank0]: _write_ninja_file_and_build_library(
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2222, in _write_ninja_file_and_build_library
[rank0]: _run_ninja_build(
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2522, in _run_ninja_build
[rank0]: raise RuntimeError(message) from e
[rank0]: RuntimeError: Error building extension 'fused_adam': [1/3] /usr/local/cuda/bin/nvcc --generate-dependencies-with-compile --dependency-output multi_tensor_adam.cuda.o.d -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE="_gcc" -DPYBIND11_STDLIB="_libstdcpp" -DPYBIND11_BUILD_ABI="_cxxabi1016" -I/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -I/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/deepspeed/ops/csrc/adam -isystem /data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/torch/include -isystem /data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /usr/local/cuda/include -isystem /data/miniconda3/envs/emb-ft/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_120,code=compute_120 -gencode=arch=compute_120,code=sm_120 --compiler-options '-fPIC' -O3 -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -lineinfo --use_fast_math -gencode=arch=compute_120,code=sm_120 -gencode=arch=compute_120,code=compute_120 -UC10_USE_GLOG -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -std=c++17 -c /data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/deepspeed/ops/csrc/adam/multi_tensor_adam.cu -o multi_tensor_adam.cuda.o
[rank0]: FAILED: [code=1] multi_tensor_adam.cuda.o
[rank0]: /usr/local/cuda/bin/nvcc --generate-dependencies-with-compile --dependency-output multi_tensor_adam.cuda.o.d -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE="_gcc" -DPYBIND11_STDLIB="_libstdcpp" -DPYBIND11_BUILD_ABI="_cxxabi1016" -I/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -I/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/deepspeed/ops/csrc/adam -isystem /data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/torch/include -isystem /data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /usr/local/cuda/include -isystem /data/miniconda3/envs/emb-ft/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_120,code=compute_120 -gencode=arch=compute_120,code=sm_120 --compiler-options '-fPIC' -O3 -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -lineinfo --use_fast_math -gencode=arch=compute_120,code=sm_120 -gencode=arch=compute_120,code=compute_120 -UC10_USE_GLOG -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -std=c++17 -c /data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/deepspeed/ops/csrc/adam/multi_tensor_adam.cu -o multi_tensor_adam.cuda.o
[rank0]: nvcc fatal : Unsupported gpu architecture 'compute_120'
[rank0]: [2/3] c++ -MMD -MF fused_adam_frontend.o.d -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE="_gcc" -DPYBIND11_STDLIB="_libstdcpp" -DPYBIND11_BUILD_ABI="_cxxabi1016" -I/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -I/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/deepspeed/ops/csrc/adam -isystem /data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/torch/include -isystem /data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /usr/local/cuda/include -isystem /data/miniconda3/envs/emb-ft/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -fPIC -std=c++17 -O3 -std=c++17 -g -Wno-reorder -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -UC10_USE_GLOG -DBF16_AVAILABLE -c /data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/deepspeed/ops/csrc/adam/fused_adam_frontend.cpp -o fused_adam_frontend.o
[rank0]: ninja: build stopped: subcommand failed.
[rank0]:[W1104 15:58:51.171196633 ProcessGroupNCCL.cpp:1479] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
E1104 15:58:54.409000 2899866 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 0 (pid: 2899981) of binary: /data/miniconda3/envs/emb-ft/bin/python3.11
Traceback (most recent call last):
File "/data/miniconda3/envs/emb-ft/bin/torchrun", line 7, in <module>
sys.exit(main())
^^^^^^
File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
return f(*args, **kwargs)
^^^^^^^^^^^^^^^^^^
File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/torch/distributed/run.py", line 892, in main
run(args)
File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/torch/distributed/run.py", line 883, in run
elastic_launch(
File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 139, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 270, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
FlagEmbedding.finetune.embedder.encoder_only.m3 FAILED