LLaMA-Factory
PPO training fails with: Tensors must be CUDA and dense
Error:
Assistant:<s>
Traceback (most recent call last):
  File "/tmp/cct/src/train_ppo.py", line 82, in <module>
    main()
  File "/tmp/cct/src/train_ppo.py", line 55, in main
    ppo_trainer = PPOPeftTrainer(
  File "/tmp/cct/src/utils/ppo.py", line 72, in __init__
    PPOTrainer.__init__(self, **kwargs)
  File "/root/miniconda3/envs/ppo/lib/python3.11/site-packages/trl/trainer/ppo_trainer.py", line 290, in __init__
    ) = self.accelerator.prepare(
  File "/root/miniconda3/envs/ppo/lib/python3.11/site-packages/accelerate/accelerator.py", line 1182, in prepare
    result = tuple(
  File "/root/miniconda3/envs/ppo/lib/python3.11/site-packages/accelerate/accelerator.py", line 1183, in <genexpr>
    self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
  File "/root/miniconda3/envs/ppo/lib/python3.11/site-packages/accelerate/accelerator.py", line 1022, in _prepare_one
    return self.prepare_model(obj, device_placement=device_placement)
  File "/root/miniconda3/envs/ppo/lib/python3.11/site-packages/accelerate/accelerator.py", line 1275, in prepare_model
    model = torch.nn.parallel.DistributedDataParallel(
  File "/root/miniconda3/envs/ppo/lib/python3.11/site-packages/torch/nn/parallel/distributed.py", line 676, in __init__
    _sync_module_states(
  File "/root/miniconda3/envs/ppo/lib/python3.11/site-packages/torch/distributed/utils.py", line 142, in _sync_module_states
    _sync_params_and_buffers(
  File "/root/miniconda3/envs/ppo/lib/python3.11/site-packages/torch/distributed/utils.py", line 160, in _sync_params_and_buffers
    dist._broadcast_coalesced(
RuntimeError: Tensors must be CUDA and dense
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2344665 closing signal SIGTERM
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 1 (pid: 2344666) of binary: /root/miniconda3/envs/ppo/bin/python
Traceback (most recent call last):
  File "/root/miniconda3/envs/ppo/bin/accelerate", line 8, in <module>
    sys.exit(main())
             ^^^^^^
  File "/root/miniconda3/envs/ppo/lib/python3.11/site-packages/accelerate/commands/accelerate_cli.py", line 45, in main
    args.func(args)
  File "/root/miniconda3/envs/ppo/lib/python3.11/site-packages/accelerate/commands/launch.py", line 932, in launch_command
    multi_gpu_launcher(args)
  File "/root/miniconda3/envs/ppo/lib/python3.11/site-packages/accelerate/commands/launch.py", line 627, in multi_gpu_launcher
    distrib_run.run(args)
  File "/root/miniconda3/envs/ppo/lib/python3.11/site-packages/torch/distributed/run.py", line 785, in run
    elastic_launch(
  File "/root/miniconda3/envs/ppo/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/miniconda3/envs/ppo/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
src/train_ppo.py FAILED
------------------------------------------------------------
Failures:
[1]:
time : 2023-06-13_10:49:40
host : mpudgx202302-DGX-Station-A100-920-23487-2531-000
rank : 2 (local_rank: 2)
exitcode : 1 (pid: 2344667)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
time : 2023-06-13_10:49:40
host : mpudgx202302-DGX-Station-A100-920-23487-2531-000
rank : 3 (local_rank: 3)
exitcode : 1 (pid: 2344668)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2023-06-13_10:49:40
host : mpudgx202302-DGX-Station-A100-920-23487-2531-000
rank : 1 (local_rank: 1)
exitcode : 1 (pid: 2344666)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
Command:
accelerate launch src/train_ppo.py \
--model_name_or_path llama-hf/ \
--do_train \
--dataset CCT \
--quantization_bit 4 \
--checkpoint_dir sft/checkpoint-3000 \
--reward_model rm \
--output_dir ppo \
--per_device_train_batch_size 4 \
--gradient_accumulation_steps 4 \
--lr_scheduler_type cosine \
--logging_steps 10 \
--save_steps 1000 \
--learning_rate 1e-5 \
--num_train_epochs 2.0 \
--resume_lora_training False \
--plot_loss
Could you try printing the tensors' dtype and device right after the AutoModel is loaded in load_pretrained?
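For reference, a minimal sketch of that check (the helper name and the `model` argument are placeholders, not code from the repo):

```python
import torch

def dump_param_devices(model: torch.nn.Module) -> None:
    """Print dtype and device of every parameter, as suggested above."""
    for name, param in model.named_parameters():
        print(name, param.dtype, param.device)

# e.g. call dump_param_devices(model) right after the AutoModel load in load_pretrained
```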
I printed and checked them; nothing looked wrong. It still fails with:
RuntimeError: Tensors must be CUDA and dense
Could you share your versions of CUDA, cuDNN, peft, transformers, bitsandbytes, accelerate, PyTorch, torchvision, etc.?
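Not from the original thread, but a quick way to collect those versions in one go (package names are assumed to match the install names; trl is added only because it appears in the traceback):

```python
import importlib.metadata as md
import torch

print("pytorch      :", torch.__version__)
print("cuda (torch) :", torch.version.cuda)
print("cudnn        :", torch.backends.cudnn.version())
for pkg in ["peft", "transformers", "bitsandbytes", "accelerate", "torchvision", "trl"]:
    try:
        print(f"{pkg:13}:", md.version(pkg))
    except md.PackageNotFoundError:
        print(f"{pkg:13}: not installed")
```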
Failing scenario 1:
- sft: LoRA
- rm: LoRA
- ppo: QLoRA
Failing scenario 2:
- sft: QLoRA
- rm: QLoRA
- ppo: QLoRA or LoRA
Working scenario:
- sft: LoRA
- rm: LoRA
- ppo: LoRA
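Since every failing combination involves QLoRA (4-bit quantization) at some stage while the all-LoRA pipeline works, it might help to list which tensors fail the condition the DDP broadcast in the traceback complains about, before accelerator.prepare wraps the model. A hypothetical helper sketch, not code from the repo:

```python
import torch

def find_non_cuda_or_sparse(model: torch.nn.Module):
    """Return parameters/buffers that are not dense CUDA tensors --
    the condition behind "Tensors must be CUDA and dense"."""
    bad = []
    for name, t in list(model.named_parameters()) + list(model.named_buffers()):
        if (not t.is_cuda) or t.is_sparse:
            bad.append((name, type(t).__name__, str(t.dtype), str(t.device)))
    return bad

# e.g. print(find_non_cuda_or_sparse(model)) before constructing PPOPeftTrainer
```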