Is there an existing issue for this?
- [X] I have searched the existing issues
Current Behavior
使用全精度多卡训练时,编译 torch extensions 报错:
In file included from /home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/include/c10/core/SymIntArrayRef.h:6:0,
from /home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/include/c10/core/TensorImpl.h:9,
from /home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/include/c10/core/GeneratorImpl.h:12,
from /home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/include/ATen/core/Generator.h:22,
from /home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/include/ATen/CPUGeneratorImpl.h:3,
from /home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/include/ATen/Context.h:3,
from /home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/include/ATen/ATen.h:7,
from /home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/include/torch/csrc/utils/tensor_flatten.h:3,
from /home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/deepspeed/ops/csrc/utils/flatten_unflatten.cpp:11:
/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/include/c10/util/Optional.h: In instantiation of ‘typename std::enable_if<(((std::is_constructible<T, U>::value && (! std::is_same<typename std::decay<_Tp>::type, c10::optional<T> >::value)) && (std::is_scalar<_Tp>::value || std::is_same<typename std::decay<_Tp>::type, T>::value)) && std::is_assignable<T&, U>::value), c10::optional<T>&>::type c10::optional<T>::operator=(U&&) [with U = const long int&; T = long int; typename std::enable_if<(((std::is_constructible<T, U>::value && (! std::is_same<typename std::decay<_Tp>::type, c10::optional<T> >::value)) && (std::is_scalar<_Tp>::value || std::is_same<typename std::decay<_Tp>::type, T>::value)) && std::is_assignable<T&, U>::value), c10::optional<T>&>::type = c10::optional<long int>&]’:
/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/include/torch/csrc/api/include/torch/expanding_array.h:140:24: required from ‘torch::ExpandingArrayWithOptionalElem<D, T>::ExpandingArrayWithOptionalElem(c10::ArrayRef<T>) [with long unsigned int D = 2ul; T = long int]’
/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/include/torch/csrc/api/include/torch/expanding_array.h:127:60: required from ‘torch::ExpandingArrayWithOptionalElem<D, T>::ExpandingArrayWithOptionalElem(std::vector<T>) [with long unsigned int D = 2ul; T = long int]’
/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/include/torch/csrc/api/include/torch/nn/functional/upsampling.h:172:56: required from here
/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/include/c10/util/Optional.h:700:23: error: assignment of read-only location ‘c10::optional<T>::contained_val()’
contained_val() = std::forward<U>(v);
^
In file included from /home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/include/c10/util/ArrayRef.h:21:0,
from /home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/include/c10/core/DispatchKey.h:5,
from /home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/include/c10/core/DispatchKeySet.h:2,
from /home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/include/ATen/core/Generator.h:15,
from /home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/include/ATen/CPUGeneratorImpl.h:3,
from /home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/include/ATen/Context.h:3,
from /home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/include/ATen/ATen.h:7,
from /home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/include/torch/csrc/utils/tensor_flatten.h:3,
from /home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/deepspeed/ops/csrc/utils/flatten_unflatten.cpp:11:
/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/include/c10/util/SmallVector.h: In instantiation of ‘void c10::SmallVectorImpl<T>::append(in_iter, in_iter) [with in_iter = const c10::SymInt*; <template-parameter-1-2> = void; T = c10::SymInt]’:
/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/include/c10/util/SmallVector.h:750:32: required from ‘void c10::SmallVectorImpl<T>::append(std::initializer_list<_Tp>) [with T = c10::SymInt]’
/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/include/c10/util/SmallVector.h:789:14: required from ‘void c10::SmallVectorImpl<T>::assign(std::initializer_list<_Tp>) [with T = c10::SymInt]’
/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/include/c10/util/SmallVector.h:1329:5: required from ‘c10::SmallVector<T, N>::SmallVector(std::initializer_list<_Tp>) [with T = c10::SymInt; unsigned int N = 5u]’
/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/include/c10/core/TensorImpl.h:254:27: required from here
/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/include/c10/util/SmallVector.h:735:5: error: ‘class c10::SmallVectorImpl<c10::SymInt>’ has no member named ‘assertSafeToAddRange’
this->assertSafeToAddRange(in_start, in_end);
^
/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/include/c10/util/SmallVector.h:737:5: error: ‘class c10::SmallVectorImpl<c10::SymInt>’ has no member named ‘reserve’
this->reserve(this->size() + NumInputs);
^
/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/include/c10/util/SmallVector.h:737:32: error: ‘class c10::SmallVectorImpl<c10::SymInt>’ has no member named ‘size’
this->reserve(this->size() + NumInputs);
^
/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/include/c10/util/SmallVector.h:738:5: error: ‘class c10::SmallVectorImpl<c10::SymInt>’ has no member named ‘uninitialized_copy’
this->uninitialized_copy(in_start, in_end, this->end());
^
/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/include/c10/util/SmallVector.h:738:5: error: ‘class c10::SmallVectorImpl<c10::SymInt>’ has no member named ‘end’
/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/include/c10/util/SmallVector.h:739:5: error: ‘class c10::SmallVectorImpl<c10::SymInt>’ has no member named ‘set_size’
this->set_size(this->size() + NumInputs);
^
/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/include/c10/util/SmallVector.h:739:33: error: ‘class c10::SmallVectorImpl<c10::SymInt>’ has no member named ‘size’
this->set_size(this->size() + NumInputs);
^
ninja: build stopped: subcommand failed.
Traceback (most recent call last):
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/utils/cpp_extension.py", line 1900, in _run_ninja_build
subprocess.run(
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/subprocess.py", line 528, in run
raise CalledProcessError(retcode, process.args,
subprocess.CalledProcessError: Command '['ninja', '-v']' returned non-zero exit status 1.
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/adamzhangchao/ChatGLM-6B/ptuning/main.py", line 430, in <module>
main()
File "/home/adamzhangchao/ChatGLM-6B/ptuning/main.py", line 369, in main
train_result = trainer.train(resume_from_checkpoint=checkpoint)
File "/home/adamzhangchao/ChatGLM-6B/ptuning/trainer.py", line 1635, in train
return inner_training_loop(
File "/home/adamzhangchao/ChatGLM-6B/ptuning/trainer.py", line 1704, in _inner_training_loop
deepspeed_engine, optimizer, lr_scheduler = deepspeed_init(
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/transformers/deepspeed.py", line 378, in deepspeed_init
deepspeed_engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/deepspeed/__init__.py", line 156, in initialize
engine = DeepSpeedEngine(args=args,
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 328, in __init__
self._configure_optimizer(optimizer, model_parameters)
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 1187, in _configure_optimizer
self.optimizer = self._configure_zero_optimizer(basic_optimizer)
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 1418, in _configure_zero_optimizer
optimizer = DeepSpeedZeroOptimizer(
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 154, in __init__
util_ops = UtilsBuilder().load()
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/deepspeed/ops/op_builder/builder.py", line 449, in load
return self.jit_load(verbose)
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/deepspeed/ops/op_builder/builder.py", line 480, in jit_load
op_module = load(name=self.name,
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/utils/cpp_extension.py", line 1284, in load
return _jit_compile(
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/utils/cpp_extension.py", line 1508, in _jit_compile
_write_ninja_file_and_build_library(
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/utils/cpp_extension.py", line 1623, in _write_ninja_file_and_build_library
_run_ninja_build(
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/utils/cpp_extension.py", line 1916, in _run_ninja_build
raise RuntimeError(message) from e
RuntimeError: Error building extension 'utils'
Loading extension module utils...
Traceback (most recent call last):
File "/home/adamzhangchao/ChatGLM-6B/ptuning/main.py", line 430, in <module>
main()
File "/home/adamzhangchao/ChatGLM-6B/ptuning/main.py", line 369, in main
train_result = trainer.train(resume_from_checkpoint=checkpoint)
File "/home/adamzhangchao/ChatGLM-6B/ptuning/trainer.py", line 1635, in train
return inner_training_loop(
File "/home/adamzhangchao/ChatGLM-6B/ptuning/trainer.py", line 1704, in _inner_training_loop
deepspeed_engine, optimizer, lr_scheduler = deepspeed_init(
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/transformers/deepspeed.py", line 378, in deepspeed_init
deepspeed_engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/deepspeed/__init__.py", line 156, in initialize
engine = DeepSpeedEngine(args=args,
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 328, in __init__
self._configure_optimizer(optimizer, model_parameters)
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 1187, in _configure_optimizer
self.optimizer = self._configure_zero_optimizer(basic_optimizer)
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 1418, in _configure_zero_optimizer
optimizer = DeepSpeedZeroOptimizer(
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 154, in __init__
util_ops = UtilsBuilder().load()
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/deepspeed/ops/op_builder/builder.py", line 449, in load
return self.jit_load(verbose)
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/deepspeed/ops/op_builder/builder.py", line 480, in jit_load
op_module = load(name=self.name,
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/utils/cpp_extension.py", line 1284, in load
return _jit_compile(
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/utils/cpp_extension.py", line 1534, in _jit_compile
return _import_module_from_library(name, build_directory, is_python_module)
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/utils/cpp_extension.py", line 1936, in _import_module_from_library
module = importlib.util.module_from_spec(spec)
File "<frozen importlib._bootstrap>", line 565, in module_from_spec
File "<frozen importlib._bootstrap_external>", line 1173, in create_module
File "<frozen importlib._bootstrap>", line 228, in _call_with_frames_removed
ImportError: /home/adamzhangchao/.cache/torch_extensions/py39_cu116/utils/utils.so: cannot open shared object file: No such file or directory
Loading extension module utils...
Traceback (most recent call last):
File "/home/adamzhangchao/ChatGLM-6B/ptuning/main.py", line 430, in <module>
main()
File "/home/adamzhangchao/ChatGLM-6B/ptuning/main.py", line 369, in main
train_result = trainer.train(resume_from_checkpoint=checkpoint)
File "/home/adamzhangchao/ChatGLM-6B/ptuning/trainer.py", line 1635, in train
return inner_training_loop(
File "/home/adamzhangchao/ChatGLM-6B/ptuning/trainer.py", line 1704, in _inner_training_loop
deepspeed_engine, optimizer, lr_scheduler = deepspeed_init(
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/transformers/deepspeed.py", line 378, in deepspeed_init
deepspeed_engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/deepspeed/__init__.py", line 156, in initialize
engine = DeepSpeedEngine(args=args,
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 328, in __init__
self._configure_optimizer(optimizer, model_parameters)
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 1187, in _configure_optimizer
self.optimizer = self._configure_zero_optimizer(basic_optimizer)
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 1418, in _configure_zero_optimizer
optimizer = DeepSpeedZeroOptimizer(
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 154, in __init__
util_ops = UtilsBuilder().load()
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/deepspeed/ops/op_builder/builder.py", line 449, in load
return self.jit_load(verbose)
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/deepspeed/ops/op_builder/builder.py", line 480, in jit_load
op_module = load(name=self.name,
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/utils/cpp_extension.py", line 1284, in load
return _jit_compile(
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/utils/cpp_extension.py", line 1534, in _jit_compile
return _import_module_from_library(name, build_directory, is_python_module)
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/utils/cpp_extension.py", line 1936, in _import_module_from_library
module = importlib.util.module_from_spec(spec)
File "<frozen importlib._bootstrap>", line 565, in module_from_spec
File "<frozen importlib._bootstrap_external>", line 1173, in create_module
File "<frozen importlib._bootstrap>", line 228, in _call_with_frames_removed
ImportError: /home/adamzhangchao/.cache/torch_extensions/py39_cu116/utils/utils.so: cannot open shared object file: No such file or directory
Loading extension module utils...
Traceback (most recent call last):
File "/home/adamzhangchao/ChatGLM-6B/ptuning/main.py", line 430, in <module>
main()
File "/home/adamzhangchao/ChatGLM-6B/ptuning/main.py", line 369, in main
train_result = trainer.train(resume_from_checkpoint=checkpoint)
File "/home/adamzhangchao/ChatGLM-6B/ptuning/trainer.py", line 1635, in train
return inner_training_loop(
File "/home/adamzhangchao/ChatGLM-6B/ptuning/trainer.py", line 1704, in _inner_training_loop
deepspeed_engine, optimizer, lr_scheduler = deepspeed_init(
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/transformers/deepspeed.py", line 378, in deepspeed_init
deepspeed_engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/deepspeed/__init__.py", line 156, in initialize
engine = DeepSpeedEngine(args=args,
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 328, in __init__
self._configure_optimizer(optimizer, model_parameters)
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 1187, in _configure_optimizer
self.optimizer = self._configure_zero_optimizer(basic_optimizer)
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 1418, in _configure_zero_optimizer
optimizer = DeepSpeedZeroOptimizer(
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 154, in __init__
util_ops = UtilsBuilder().load()
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/deepspeed/ops/op_builder/builder.py", line 449, in load
return self.jit_load(verbose)
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/deepspeed/ops/op_builder/builder.py", line 480, in jit_load
op_module = load(name=self.name,
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/utils/cpp_extension.py", line 1284, in load
return _jit_compile(
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/utils/cpp_extension.py", line 1534, in _jit_compile
return _import_module_from_library(name, build_directory, is_python_module)
File "/home/adamzhangchao/anaconda3/envs/glm/lib/python3.9/site-packages/torch/utils/cpp_extension.py", line 1936, in _import_module_from_library
Expected Behavior
No response
Steps To Reproduce
4卡A100 80G
Environment
- OS: linux
- Python: 3.9
- Transformers: 4.29.2(使用 4.27 时结果一致)
- PyTorch: 1.13.1+cu116
- CUDA Support (`python -c "import torch; print(torch.cuda.is_available())"`) : True
Anything else?
No response