MiniCPM-V
Error during fine-tuning
- error output:
File "/projects/MiniCPM-V/finetune/finetune.py", line 124, in <module>
Traceback (most recent call last):
File "/projects/MiniCPM-V/finetune/finetune.py", line 124, in <module>
Traceback (most recent call last):
File "/projects/MiniCPM-V/finetune/finetune.py", line 124, in <module>
train()
File "/projects/MiniCPM-V/finetune/finetune.py", line 119, in train
train()
File "/projects/MiniCPM-V/finetune/finetune.py", line 119, in train
train()
File "/projects/MiniCPM-V/finetune/finetune.py", line 119, in train
trainer.train()
File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1537, in train
trainer.train()
File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1537, in train
return inner_training_loop(
File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1914, in _inner_training_loop
trainer.train()
File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1537, in train
return inner_training_loop(
File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1914, in _inner_training_loop
self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 2274, in _maybe_log_save_evaluate
return inner_training_loop(
File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1914, in _inner_training_loop
self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 2274, in _maybe_log_save_evaluate
self._save_checkpoint(model, trial, metrics=metrics)
File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 2383, in _save_checkpoint
self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 2274, in _maybe_log_save_evaluate
self._save_checkpoint(model, trial, metrics=metrics)
File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 2383, in _save_checkpoint
os.rename(staging_output_dir, output_dir)
FileNotFoundError: [Errno 2] No such file or directory: 'output/output_minicpmv2/tmp-checkpoint-2' -> 'output/output_minicpmv2/checkpoint-2'
self._save_checkpoint(model, trial, metrics=metrics)
File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 2383, in _save_checkpoint
os.rename(staging_output_dir, output_dir)
FileNotFoundError: [Errno 2] No such file or directory: 'output/output_minicpmv2/tmp-checkpoint-2' -> 'output/output_minicpmv2/checkpoint-2'
os.rename(staging_output_dir, output_dir)
FileNotFoundError: [Errno 2] No such file or directory: 'output/output_minicpmv2/tmp-checkpoint-2' -> 'output/output_minicpmv2/checkpoint-2'
20%|█████████████████████████████████▍ | 2/10 [19:10<1:16:41, 575.24s/it]
[2024-05-10 17:01:00,045] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 855 closing signal SIGTERM
[2024-05-10 17:01:00,515] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 854) of binary: /opt/conda/bin/python
Traceback (most recent call last):
File "/opt/conda/bin/torchrun", line 8, in <module>
sys.exit(main())
File "/opt/conda/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/distributed/run.py", line 806, in main
run(args)
File "/opt/conda/lib/python3.10/site-packages/torch/distributed/run.py", line 797, in run
elastic_launch(
File "/opt/conda/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/opt/conda/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
finetune.py FAILED
------------------------------------------------------------
Failures:
[1]:
time : 2024-05-10_17:01:00
host : 5be762607d25
rank : 2 (local_rank: 2)
exitcode : 1 (pid: 856)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
time : 2024-05-10_17:01:00
host : 5be762607d25
rank : 3 (local_rank: 3)
exitcode : 1 (pid: 857)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2024-05-10_17:01:00
host : 5be762607d25
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 854)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
- finetune_ds.sh
#!/bin/bash
GPUS_PER_NODE=4
NNODES=1
NODE_RANK=0
MASTER_ADDR=localhost
MASTER_PORT=6001
MODEL="./MiniCPM-V-2"
# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
# See the section for finetuning in README for more information.
DATA="./minicpm.json"
EVAL_DATA="./minicpm.json"
DISTRIBUTED_ARGS="
    --nproc_per_node $GPUS_PER_NODE \
    --nnodes $NNODES \
    --node_rank $NODE_RANK \
    --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT
"
torchrun $DISTRIBUTED_ARGS finetune.py \
    --model_name_or_path $MODEL \
    --data_path $DATA \
    --eval_data_path $EVAL_DATA \
    --remove_unused_columns false \
    --label_names "labels" \
    --prediction_loss_only false \
    --bf16 true \
    --bf16_full_eval true \
    --do_train \
    --do_eval \
    --max_steps 10 \
    --eval_steps 1 \
    --output_dir output/output_minicpmv2 \
    --logging_dir output/output_minicpmv2 \
    --logging_strategy "steps" \
    --per_device_train_batch_size 2 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 1 \
    --evaluation_strategy "steps" \
    --save_strategy "steps" \
    --save_steps 1 \
    --save_total_limit 10 \
    --learning_rate 5e-7 \
    --weight_decay 0.1 \
    --adam_beta2 0.95 \
    --warmup_ratio 0.01 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --gradient_checkpointing True \
    --deepspeed ds_config_zero2.json \
    --report_to none # wandb tensorboard
- dataset example
[
    {
        "id": "267",
        "image": "absolute_path",
        "conversations": [
            {
                "role": "user",
                "content": "<image>\nprompt"
            },
            {
                "role": "assistant",
                "content": "label"
            }
        ]
    }
]
Hi, this looks like a problem with the directory where training outputs are saved; please double-check the save path.
I don't think so: the rename of that path actually succeeded the first time. Could it be that multiple GPU processes each try to rename the same path, as in the small repro sketch below?
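A minimal, self-contained sketch of that hypothesis (plain Python, no MiniCPM-V or transformers code; the temporary paths are made up): four "ranks" race to rename the same staging directory, only one rename can succeed, and the others fail with exactly the FileNotFoundError shown above.

import multiprocessing as mp
import os
import tempfile


def rank_worker(rank, staging, final):
    # every "rank" attempts the same staging -> final rename
    try:
        os.rename(staging, final)
        print(f"rank {rank}: rename succeeded")
    except FileNotFoundError as e:
        print(f"rank {rank}: {e}")


if __name__ == "__main__":
    run_dir = tempfile.mkdtemp()
    staging = os.path.join(run_dir, "tmp-checkpoint-2")
    final = os.path.join(run_dir, "checkpoint-2")
    os.mkdir(staging)

    procs = [mp.Process(target=rank_worker, args=(r, staging, final)) for r in range(4)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()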
Could you try setting save_steps to a larger value?
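Raising save_steps reduces how often the rename runs, and upgrading transformers may also help, since this checkpoint-saving code path has changed in later releases. If neither is an option, a blunt stop-gap, sketched below under the assumption that the crash really is several ranks racing on the same staging directory (the _tolerant_rename helper is made up, not part of transformers), is to wrap os.rename near the top of finetune.py so a rank that loses the race does not abort the run:

import os

_orig_rename = os.rename


def _tolerant_rename(src, dst, *args, **kwargs):
    # made-up helper for illustration only, not an official fix
    try:
        _orig_rename(src, dst, *args, **kwargs)
    except FileNotFoundError:
        # another rank already moved tmp-checkpoint-N to checkpoint-N;
        # if the destination now exists, treat the rename as already done
        if not os.path.isdir(dst):
            raise


# must run before trainer.train() so Trainer._save_checkpoint picks it up
os.rename = _tolerant_rename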