BELLE
BELLE copied to clipboard
finetune多卡报错binascii.Error: Incorrect padding
多卡启动指令:
CUDA_VISIBLE_DEVICES=2,3 torchrun --nproc_per_node 2 train.py \
--model_name_or_path /workspace/BELLE-7B-2M \
--deepspeed configs/deepspeed_config_stage3.json \
--train_file /workspace/BELLE-main-3/data/convert_all_0525.json \
--validation_file /workspace/BELLE-main-3/data/convert_all_0525.json \
--per_device_train_batch_size 2 \
--per_device_eval_batch_size 2 \
--gradient_accumulation_steps 4 \
--num_train_epochs 2 \
--model_max_length 1024 \
--save_strategy "steps" \
--save_total_limit 3 \
--learning_rate 8e-6 \
--weight_decay 0.00001 \
--warmup_ratio 0.05 \
--lr_scheduler_type "cosine" \
--logging_steps 10 \
--evaluation_strategy "steps" \
--fp16 True \
--seed 1234 \
--gradient_checkpointing True \
--cache_dir ./cache_dir \
--output_dir ./output_dir >train.log &
But it fails with the following error:
Traceback (most recent call last):
File "train.py", line 394, in <module>
main()
File "train.py", line 163, in main
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
File "/opt/conda/lib/python3.8/site-packages/transformers/hf_argparser.py", line 332, in parse_args_into_dataclasses
obj = dtype(**inputs)
File "<string>", line 114, in __init__
File "/opt/conda/lib/python3.8/site-packages/transformers/training_args.py", line 1446, in __post_init__
self.hf_deepspeed_config = HfTrainerDeepSpeedConfig(self.deepspeed)
File "/opt/conda/lib/python3.8/site-packages/transformers/deepspeed.py", line 77, in __init__
super().__init__(config_file_or_dict)
File "/opt/conda/lib/python3.8/site-packages/transformers/deepspeed.py", line 67, in __init__
super().__init__(config_file_or_dict)
File "/opt/conda/lib/python3.8/site-packages/accelerate/utils/deepspeed.py", line 52, in __init__
config_decoded = base64.urlsafe_b64decode(config_file_or_dict).decode("utf-8")
File "/opt/conda/lib/python3.8/base64.py", line 133, in urlsafe_b64decode
return b64decode(s)
File "/opt/conda/lib/python3.8/base64.py", line 87, in b64decode
return binascii.a2b_base64(s)
binascii.Error: Incorrect padding
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 4963) of binary: /opt/conda/bin/python3
Traceback (most recent call last):
File "/opt/conda/bin/torchrun", line 8, in <module>
sys.exit(main())
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/run.py", line 794, in main
run(args)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/run.py", line 785, in run
elastic_launch(
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
train.py FAILED
------------------------------------------------------------
Failures:
[1]:
time : 2023-05-29_01:52:23
host : 93fef725676a
rank : 1 (local_rank: 1)
exitcode : 1 (pid: 4964)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2023-05-29_01:52:23
host : 93fef725676a
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 4963)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
请问这种情况需要修改什么?
deepspeed_config_stage3.json 这个文件是utf-8的啊
File "/opt/conda/lib/python3.8/site-packages/transformers/deepspeed.py", line 67, in __init__ super().__init__(config_file_or_dict) File "/opt/conda/lib/python3.8/site-packages/accelerate/utils/deepspeed.py", line 52, in __init__ config_decoded = base64.urlsafe_b64decode(config_file_or_dict).decode("utf-8")
看样子是读取 deepspeed_config_stage3.json 时报错了,可是这个 JSON 文件不应该存在读取错误啊,它并不存在格式问题。
是的。很奇怪。
是的。很奇怪。
多卡启动指令:
CUDA_VISIBLE_DEVICES=2,3 torchrun --nproc_per_node 2 train.py \ --model_name_or_path /workspace/BELLE-7B-2M \ --deepspeed configs/deepspeed_config_stage3.json \ --train_file /workspace/BELLE-main-3/data/convert_all_0525.json \ --validation_file /workspace/BELLE-main-3/data/convert_all_0525.json \ --per_device_train_batch_size 2 \ --per_device_eval_batch_size 2 \ --gradient_accumulation_steps 4 \ --num_train_epochs 2 \ --model_max_length 1024 \ --save_strategy "steps" \ --save_total_limit 3 \ --learning_rate 8e-6 \ --weight_decay 0.00001 \ --warmup_ratio 0.05 \ --lr_scheduler_type "cosine" \ --logging_steps 10 \ --evaluation_strategy "steps" \ --fp16 True \ --seed 1234 \ --gradient_checkpointing True \ --cache_dir ./cache_dir \ --output_dir ./output_dir >train.log &
But it fails with the following error:
Traceback (most recent call last): File "train.py", line 394, in <module> main() File "train.py", line 163, in main model_args, data_args, training_args = parser.parse_args_into_dataclasses() File "/opt/conda/lib/python3.8/site-packages/transformers/hf_argparser.py", line 332, in parse_args_into_dataclasses obj = dtype(**inputs) File "<string>", line 114, in __init__ File "/opt/conda/lib/python3.8/site-packages/transformers/training_args.py", line 1446, in __post_init__ self.hf_deepspeed_config = HfTrainerDeepSpeedConfig(self.deepspeed) File "/opt/conda/lib/python3.8/site-packages/transformers/deepspeed.py", line 77, in __init__ super().__init__(config_file_or_dict) File "/opt/conda/lib/python3.8/site-packages/transformers/deepspeed.py", line 67, in __init__ super().__init__(config_file_or_dict) File "/opt/conda/lib/python3.8/site-packages/accelerate/utils/deepspeed.py", line 52, in __init__ config_decoded = base64.urlsafe_b64decode(config_file_or_dict).decode("utf-8") File "/opt/conda/lib/python3.8/base64.py", line 133, in urlsafe_b64decode return b64decode(s) File "/opt/conda/lib/python3.8/base64.py", line 87, in b64decode return binascii.a2b_base64(s) binascii.Error: Incorrect padding ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 4963) of binary: /opt/conda/bin/python3 Traceback (most recent call last): File "/opt/conda/bin/torchrun", line 8, in <module> sys.exit(main()) File "/opt/conda/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper return f(*args, **kwargs) File "/opt/conda/lib/python3.8/site-packages/torch/distributed/run.py", line 794, in main run(args) File "/opt/conda/lib/python3.8/site-packages/torch/distributed/run.py", line 785, in run elastic_launch( File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 134, in __call__ return launch_agent(self._config, self._entrypoint, list(args)) File 
"/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent raise ChildFailedError( torch.distributed.elastic.multiprocessing.errors.ChildFailedError: ============================================================ train.py FAILED ------------------------------------------------------------ Failures: [1]: time : 2023-05-29_01:52:23 host : 93fef725676a rank : 1 (local_rank: 1) exitcode : 1 (pid: 4964) error_file: <N/A> traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------ Root Cause (first observed failure): [0]: time : 2023-05-29_01:52:23 host : 93fef725676a rank : 0 (local_rank: 0) exitcode : 1 (pid: 4963) error_file: <N/A> traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ============================================================
请问这种情况需要修改什么?
我也遇到了相同的问题,你解决了吗?
貌似是因为 GPU 卡上有正在运行的程序。