FlagEmbedding
Fine-tuning issue
(ft_emb) b405@b405-CVN-Z790-GAMING-FROZEN:/media/b405/新加卷1/Workspace_linux/b405/ZH/llmProjects/FlagEmbedding$ torchrun --nproc_per_node 1 \
-m FlagEmbedding.reranker.run \
--output_dir /media/b405/新加卷1/Workspace_linux/b405/ZH/llmProjects/FlagEmbedding/fine_tune_rerank_models/beg_rerank_ft \
--model_name_or_path /media/b405/新加卷1/Workspace_linux/b405/ZH/embeddingModels/bge-reranker-large/ \
--train_data /media/b405/新加卷1/Workspace_linux/b405/ZH/llmProjects/FlagEmbedding/post_process_embedding_finetune_dataset.jsonl \
--learning_rate 6e-5 \
--fp16 \
--num_train_epochs 5 \
--per_device_train_batch_size 10 \
--gradient_accumulation_steps 4 \
--dataloader_drop_last True \
--train_group_size 16 \
--max_len 512 \
--weight_decay 0.01 \
--logging_steps 10
04/20/2024 18:56:35 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: True, 16-bits training: True
04/20/2024 18:56:35 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=True,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=4,
gradient_checkpointing=False,
gradient_checkpointing_kwargs=None,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_always_push=False,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_inputs_for_metrics=False,
include_num_input_tokens_seen=False,
include_tokens_per_second=False,
jit_mode_eval=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=6e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=0,
log_level=passive,
log_level_replica=warning,
log_on_each_node=True,
logging_dir=/media/b405/新加卷1/Workspace_linux/b405/ZH/llmProjects/FlagEmbedding/fine_tune_rerank_models/beg_rerank_ft/runs/Apr20_18-56-35_b405-CVN-Z790-GAMING-FROZEN,
logging_first_step=False,
logging_nan_inf_filter=True,
logging_steps=10,
logging_strategy=steps,
lr_scheduler_kwargs={},
lr_scheduler_type=linear,
max_grad_norm=1.0,
max_steps=-1,
metric_for_best_model=None,
mp_parameters=,
neftune_noise_alpha=None,
no_cuda=False,
num_train_epochs=5.0,
optim=adamw_torch,
optim_args=None,
optim_target_modules=None,
output_dir=/media/b405/新加卷1/Workspace_linux/b405/ZH/llmProjects/FlagEmbedding/fine_tune_rerank_models/beg_rerank_ft,
overwrite_output_dir=False,
past_index=-1,
per_device_eval_batch_size=8,
per_device_train_batch_size=10,
prediction_loss_only=False,
push_to_hub=False,
push_to_hub_model_id=None,
push_to_hub_organization=None,
push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
ray_scope=last,
remove_unused_columns=True,
report_to=[],
resume_from_checkpoint=None,
run_name=/media/b405/新加卷1/Workspace_linux/b405/ZH/llmProjects/FlagEmbedding/fine_tune_rerank_models/beg_rerank_ft,
save_on_each_node=False,
save_only_model=False,
save_safetensors=True,
save_steps=500,
save_strategy=steps,
save_total_limit=None,
seed=42,
skip_memory_metrics=True,
split_batches=None,
tf32=None,
torch_compile=False,
torch_compile_backend=None,
torch_compile_mode=None,
torchdynamo=None,
tpu_metrics_debug=False,
tpu_num_cores=None,
use_cpu=False,
use_ipex=False,
use_legacy_prediction_loop=False,
use_mps_device=False,
warmup_ratio=0.0,
warmup_steps=0,
weight_decay=0.01,
)
04/20/2024 18:56:35 - INFO - __main__ - Model parameters ModelArguments(model_name_or_path='/media/b405/新加卷1/Workspace_linux/b405/ZH/embeddingModels/bge-reranker-large/', config_name=None, tokenizer_name=None, cache_dir=None)
04/20/2024 18:56:35 - INFO - __main__ - Data parameters DataArguments(train_data='/media/b405/新加卷1/Workspace_linux/b405/ZH/llmProjects/FlagEmbedding/post_process_embedding_finetune_dataset.jsonl', train_group_size=16, max_len=512)
/home/b405/.local/lib/python3.10/site-packages/accelerate/accelerator.py:436: FutureWarning: Passing the following arguments to Accelerator is deprecated and will be removed in version 1.0 of Accelerate: dict_keys(['dispatch_batches', 'split_batches', 'even_batches', 'use_seedable_sampler']). Please pass an accelerate.DataLoaderConfiguration instead:
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
warnings.warn(
0%| | 0/25 [00:00<?, ?it/s]
Traceback (most recent call last):
File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/media/b405/新加卷1/Workspace_linux/b405/ZH/llmProjects/FlagEmbedding/FlagEmbedding/reranker/run.py", line 95, in
main()
File "/media/b405/新加卷1/Workspace_linux/b405/ZH/llmProjects/FlagEmbedding/FlagEmbedding/reranker/run.py", line 90, in main
trainer.train()
File "/home/b405/.local/lib/python3.10/site-packages/transformers/trainer.py", line 1780, in train
return inner_training_loop(
File "/home/b405/.local/lib/python3.10/site-packages/transformers/trainer.py", line 2118, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs)
File "/home/b405/.local/lib/python3.10/site-packages/transformers/trainer.py", line 3036, in training_step
loss = self.compute_loss(model, inputs)
File "/media/b405/新加卷1/Workspace_linux/b405/ZH/llmProjects/FlagEmbedding/FlagEmbedding/reranker/trainer.py", line 31, in compute_loss
return model(inputs)['loss']
File "/home/b405/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1523, in forward
else self._run_ddp_forward(*inputs, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1359, in _run_ddp_forward
return self.module(*inputs, **kwargs) # type: ignore[index]
File "/home/b405/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/accelerate/utils/operations.py", line 825, in forward
return model_forward(*args, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/accelerate/utils/operations.py", line 813, in call
return convert_to_fp32(self.model_forward(*args, **kwargs))
File "/home/b405/.local/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 16, in decorate_autocast
return func(*args, **kwargs)
File "/media/b405/新加卷1/Workspace_linux/b405/ZH/llmProjects/FlagEmbedding/FlagEmbedding/reranker/modeling.py", line 34, in forward
ranker_out: SequenceClassifierOutput = self.hf_model(**batch, return_dict=True)
File "/home/b405/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/transformers/models/xlm_roberta/modeling_xlm_roberta.py", line 1208, in forward
outputs = self.roberta(
File "/home/b405/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/transformers/models/xlm_roberta/modeling_xlm_roberta.py", line 837, in forward
encoder_outputs = self.encoder(
File "/home/b405/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/transformers/models/xlm_roberta/modeling_xlm_roberta.py", line 525, in forward
layer_outputs = layer_module(
File "/home/b405/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/transformers/models/xlm_roberta/modeling_xlm_roberta.py", line 456, in forward
layer_output = apply_chunking_to_forward(
File "/home/b405/.local/lib/python3.10/site-packages/transformers/pytorch_utils.py", line 237, in apply_chunking_to_forward
return forward_fn(*input_tensors)
File "/home/b405/.local/lib/python3.10/site-packages/transformers/models/xlm_roberta/modeling_xlm_roberta.py", line 468, in feed_forward_chunk
intermediate_output = self.intermediate(attention_output)
File "/home/b405/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/transformers/models/xlm_roberta/modeling_xlm_roberta.py", line 367, in forward
hidden_states = self.intermediate_act_fn(hidden_states)
File "/home/b405/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/transformers/activations.py", line 78, in forward
return self.act(input)
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 198.00 MiB. GPU 0 has a total capacity of 23.64 GiB of which 210.94 MiB is free. Including non-PyTorch memory, this process has 22.93 GiB memory in use. Of the allocated memory 21.74 GiB is allocated by PyTorch, and 628.00 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
0%| | 0/25 [00:00<?, ?it/s]
[2024-04-20 18:56:38,931] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 358104) of binary: /usr/bin/python3
Traceback (most recent call last):
File "/home/b405/.local/bin/torchrun", line 8, in
sys.exit(main())
File "/home/b405/.local/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/init.py", line 347, in wrapper
return f(*args, **kwargs)
File "/home/b405/.local/lib/python3.10/site-packages/torch/distributed/run.py", line 812, in main
run(args)
File "/home/b405/.local/lib/python3.10/site-packages/torch/distributed/run.py", line 803, in run
elastic_launch(
File "/home/b405/.local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 135, in call
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/b405/.local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 268, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
FlagEmbedding.reranker.run FAILED
------------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time       : 2024-04-20_18:56:38
  host       : b405-CVN-Z790-GAMING-FROZEN
  rank       : 0 (local_rank: 0)
  exitcode   : 1 (pid: 358104)
  error_file : <N/A>
  traceback  : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
Why does this error out? Is it because there isn't enough memory?
This is an OOM error: the process ran out of GPU memory (VRAM), not system RAM. The traceback shows GPU 0's 23.64 GiB is almost fully consumed mid-forward-pass. With these arguments, each training step scores per_device_train_batch_size × train_group_size = 10 × 16 = 160 query-passage pairs of up to max_len = 512 tokens through bge-reranker-large, which is what drives the memory use. Lowering per_device_train_batch_size and train_group_size reduces the VRAM footprint.
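As a minimal sketch (the paths are unchanged from above; the new values are illustrative starting points, not tuned numbers): halving train_group_size to 8 and per_device_train_batch_size to 5 cuts the pairs scored per step from 160 to 40, while doubling gradient_accumulation_steps to 8 keeps the same 40 training examples per optimizer update as before. The PYTORCH_CUDA_ALLOC_CONF setting is the one the error message itself suggests for fragmentation:

PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True torchrun --nproc_per_node 1 \
-m FlagEmbedding.reranker.run \
--output_dir /media/b405/新加卷1/Workspace_linux/b405/ZH/llmProjects/FlagEmbedding/fine_tune_rerank_models/beg_rerank_ft \
--model_name_or_path /media/b405/新加卷1/Workspace_linux/b405/ZH/embeddingModels/bge-reranker-large/ \
--train_data /media/b405/新加卷1/Workspace_linux/b405/ZH/llmProjects/FlagEmbedding/post_process_embedding_finetune_dataset.jsonl \
--learning_rate 6e-5 \
--fp16 \
--num_train_epochs 5 \
--per_device_train_batch_size 5 \
--gradient_accumulation_steps 8 \
--dataloader_drop_last True \
--train_group_size 8 \
--max_len 512 \
--weight_decay 0.01 \
--logging_steps 10

Note that train_group_size also sets how many negatives each query is trained against, so lowering it trades some ranking quality for memory. If it is still too tight, --gradient_checkpointing True (a standard transformers TrainingArguments flag, visible as gradient_checkpointing=False in the dump above) trades extra compute for a large reduction in activation memory, provided the reranker wrapper forwards it to the underlying model.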