VisCPM
VisCPM copied to clipboard
CUDA error: an illegal memory access was encountered
我尝试在两张 A800 80G 上进行微调(CUDA 11.8),报了如下错误,请问要怎么解决呢?
Traceback (most recent call last):
File "/home/hpc/LAB-data/disk1/VisCPM/./finetune/ft_viscpm_chat/train_viscpm_chat.py", line 210, in <module>
main()
File "/home/hpc/LAB-data/disk1/VisCPM/./finetune/ft_viscpm_chat/train_viscpm_chat.py", line 206, in main
train(model, args)
File "/home/hpc/LAB-data/disk1/VisCPM/./finetune/ft_viscpm_chat/train_viscpm_chat.py", line 91, in train
vllm_engine, vllm_optim, _, _ = deepspeed.initialize(
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/__init__.py", line 165, in initialize
engine = DeepSpeedEngine(args=args,
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 308, in __init__
self._configure_optimizer(optimizer, model_parameters)
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1167, in _configure_optimizer
self.optimizer = self._configure_zero_optimizer(basic_optimizer)
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1398, in _configure_zero_optimizer
optimizer = DeepSpeedZeroOptimizer(
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 485, in __init__
self.initialize_optimizer_states()
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 614, in initialize_optimizer_states
self.optimizer.step()
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/torch/optim/optimizer.py", line 280, in wrapper
out = func(*args, **kwargs)
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/ops/adam/fused_adam.py", line 151, in step
multi_tensor_applier(self.multi_tensor_adam, self._dummy_overflow_buf, [g_32, p_32, m_32, v_32],
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/ops/adam/multi_tensor_apply.py", line 17, in __call__
return op(self.chunk_size, noop_flag_buffer, tensor_lists, *args)
RuntimeError: CUDA error: an illegal memory access was encountered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
Traceback (most recent call last):
File "/home/hpc/LAB-data/disk1/VisCPM/./finetune/ft_viscpm_chat/train_viscpm_chat.py", line 210, in <module>
main()
File "/home/hpc/LAB-data/disk1/VisCPM/./finetune/ft_viscpm_chat/train_viscpm_chat.py", line 206, in main
train(model, args)
File "/home/hpc/LAB-data/disk1/VisCPM/./finetune/ft_viscpm_chat/train_viscpm_chat.py", line 91, in train
vllm_engine, vllm_optim, _, _ = deepspeed.initialize(
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/__init__.py", line 165, in initialize
engine = DeepSpeedEngine(args=args,
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 308, in __init__
self._configure_optimizer(optimizer, model_parameters)
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1167, in _configure_optimizer
Traceback (most recent call last):
File "/home/hpc/LAB-data/disk1/VisCPM/./finetune/ft_viscpm_chat/train_viscpm_chat.py", line 210, in <module>
self.optimizer = self._configure_zero_optimizer(basic_optimizer)
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1398, in _configure_zero_optimizer
main()
File "/home/hpc/LAB-data/disk1/VisCPM/./finetune/ft_viscpm_chat/train_viscpm_chat.py", line 206, in main
optimizer = DeepSpeedZeroOptimizer(
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 485, in __init__
train(model, args)
File "/home/hpc/LAB-data/disk1/VisCPM/./finetune/ft_viscpm_chat/train_viscpm_chat.py", line 91, in train
vllm_engine, vllm_optim, _, _ = deepspeed.initialize(
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/__init__.py", line 165, in initialize
self.initialize_optimizer_states()
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 614, in initialize_optimizer_states
engine = DeepSpeedEngine(args=args,
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 308, in __init__
self.optimizer.step()
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/torch/optim/optimizer.py", line 280, in wrapper
self._configure_optimizer(optimizer, model_parameters)
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1167, in _configure_optimizer
out = func(*args, **kwargs)
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/ops/adam/fused_adam.py", line 151, in step
multi_tensor_applier(self.multi_tensor_adam, self._dummy_overflow_buf, [g_32, p_32, m_32, v_32],
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/ops/adam/multi_tensor_apply.py", line 17, in __call__
return op(self.chunk_size, noop_flag_buffer, tensor_lists, *args)
RuntimeError: CUDA error: an illegal memory access was encountered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
self.optimizer = self._configure_zero_optimizer(basic_optimizer)
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1398, in _configure_zero_optimizer
optimizer = DeepSpeedZeroOptimizer(
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 485, in __init__
self.initialize_optimizer_states()
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 614, in initialize_optimizer_states
self.optimizer.step()
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/torch/optim/optimizer.py", line 280, in wrapper
out = func(*args, **kwargs)
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/ops/adam/fused_adam.py", line 151, in step
multi_tensor_applier(self.multi_tensor_adam, self._dummy_overflow_buf, [g_32, p_32, m_32, v_32],
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/ops/adam/multi_tensor_apply.py", line 17, in __call__
return op(self.chunk_size, noop_flag_buffer, tensor_lists, *args)
RuntimeError: CUDA error: an illegal memory access was encountered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
你好,解决了吗?
你好,解决了吗?
没有啊,后面就没搞了。请问有解决方案吗?