FlagEmbedding
训练过程中train_loss和eval_loss都是先下降后上升再缓慢下降
我的训练脚本如下:
#!/bin/bash
# Driver: exports every environment variable consumed by train_bge_with_eval.sh,
# creates the output/log directories, then launches the training wrapper.
set -euo pipefail # fail fast; pipefail so a failed training run is not masked by `tee`
# --------------------------
# Timestamp (format: YYYYMMDD_HHMMSS)
# --------------------------
TIMESTAMP=$(date +"%Y%m%d_%H%M%S") # split from `export` so a failure is not masked (SC2155)
export TIMESTAMP
# --------------------------
# Base paths
# --------------------------
export HOME_DIR="/inno-vepfs/languoxing"
export TRAIN_DATA="${HOME_DIR}/datasets/train_data_no_hn_dedup_shuffle456_202410_to_202503.jsonl"
export EVAL_DATA="${HOME_DIR}/datasets/dev_data_no_hn_dedup.jsonl"
# --------------------------
# Model parameters
# --------------------------
export MODEL_NAME_OR_PATH="${HOME_DIR}/models/bge-base-zh-v1.5"
export HF_HUB_CACHE="${HOME_DIR}/cache/huggingface/hub" # `cache` (not `.cache`) to match the LOG_DIR layout
# --------------------------
# Data parameters
# --------------------------
export TRAIN_GROUP_SIZE=16
export QUERY_MAX_LEN=512
export PASSAGE_MAX_LEN=512
export PAD_TO_MULTIPLE_OF=8
export EVAL_GROUP_SIZE=8
# --------------------------
# Training parameters
# --------------------------
export NUM_GPUS=2
export OUTPUT_DIR="${HOME_DIR}/outputs/bge-base-zh-v1.5-finetuned_202410_202503_with_eval_${NUM_GPUS}_${TIMESTAMP}"
export LOG_DIR="${OUTPUT_DIR}/logs"
export OVERWRITE_OUTPUT_DIR="false" # NOTE(review): exported but not consumed by train_bge_with_eval.sh — wire it up or drop it
export LEARNING_RATE="5e-6"
export NUM_TRAIN_EPOCHS=2
export PER_DEVICE_TRAIN_BATCH_SIZE=128
export WARMUP_RATIO=0.1
export DEEPSPEED="${HOME_DIR}/projects/bge_embedder/ds_stage0.json"
export LOGGING_STRATEGY="steps"
export LOGGING_STEPS=1
export SAVE_STRATEGY="steps"
export SAVE_STEPS=100
export TEMPERATURE=0.02
export EVAL_STRATEGY="steps"
# NOTE(review): EVAL_STEPS=1 runs a full evaluation after every optimizer step,
# which dominates wall-clock time; consider a value like 100-500.
export EVAL_STEPS=1
# Paths referenced by the child script but previously never defined anywhere;
# an empty expansion there corrupts the generated CLI (e.g. a bare `--cache_path`).
export DATA_CACHE_PATH="${HOME_DIR}/cache/data"
export TENSORBOARD_LOG_PATH="${LOG_DIR}/tensorboard"
# --------------------------
# Directory creation
# --------------------------
mkdir -p "${LOG_DIR}" "${DATA_CACHE_PATH}" "${TENSORBOARD_LOG_PATH}"
# --------------------------
# Run the training wrapper
# --------------------------
echo "即将开始训练,模型将保存至: ${OUTPUT_DIR}"
bash "${HOME_DIR}/projects/bge_embedder/train_bge_with_eval.sh" 2>&1 | tee -a "${LOG_DIR}/output.log"
# With `set -e -o pipefail`, this success message is only reached if training succeeded.
echo "训练完成!模型已保存至: ${OUTPUT_DIR}"
其中train_bge_with_eval.sh的内容如下:
# Training wrapper: expects the driver script to export the full configuration
# (HOME_DIR, TRAIN_DATA, EVAL_DATA, NUM_GPUS, ...). Launches the FlagEmbedding
# encoder-only fine-tuning entry point via torchrun.

# Activate the Conda environment first: conda's activation scripts are not
# `set -u`-clean, so strict mode is enabled only afterwards.
source /root/miniconda3/etc/profile.d/conda.sh
conda activate bge_with_eval
set -euo pipefail

export WANDB_MODE=disabled

echo "系统的tensorboard目录:${TENSORBOARD_LOG_PATH:-}"

# Fall back to the default HF hub cache when the driver did not set one.
if [[ -z "${HF_HUB_CACHE:-}" ]]; then
  export HF_HUB_CACHE="${HOME_DIR}/.cache/huggingface/hub"
fi

# Build argument lists as Bash arrays instead of strings re-parsed by `eval`:
# arrays keep quoting intact for values containing spaces (e.g. the retrieval
# instruction below) and avoid eval's injection/word-splitting hazards.
model_args=(
  --model_name_or_path "${MODEL_NAME_OR_PATH}"
  --cache_dir "${HF_HUB_CACHE}"
)

data_args=(
  --train_data "${TRAIN_DATA}"
  --eval_data "${EVAL_DATA}"
  --train_group_size "${TRAIN_GROUP_SIZE}"
  --query_max_len "${QUERY_MAX_LEN}"
  --passage_max_len "${PASSAGE_MAX_LEN}"
  --pad_to_multiple_of "${PAD_TO_MULTIPLE_OF}"
  --query_instruction_for_retrieval '为这个句子生成表示以用于检索相关文章: '
  --query_instruction_format '{}{}'
  --eval_group_size "${EVAL_GROUP_SIZE}"
)
# Only pass --cache_path when the variable is actually set: an empty expansion
# would make the argument parser consume the *next* flag as its value.
if [[ -n "${DATA_CACHE_PATH:-}" ]]; then
  data_args+=(--cache_path "${DATA_CACHE_PATH}")
fi

training_args=(
  --output_dir "${OUTPUT_DIR}"
  --learning_rate "${LEARNING_RATE}"
  --fp16
  --num_train_epochs "${NUM_TRAIN_EPOCHS}"
  --per_device_train_batch_size "${PER_DEVICE_TRAIN_BATCH_SIZE}"
  --dataloader_drop_last
  --warmup_ratio "${WARMUP_RATIO}"
  --gradient_checkpointing
  --deepspeed "${DEEPSPEED}"
  --logging_strategy "${LOGGING_STRATEGY}"
  --logging_steps "${LOGGING_STEPS}"
  --save_strategy "${SAVE_STRATEGY}"
  --save_steps "${SAVE_STEPS}"
  --negatives_cross_device
  --temperature "${TEMPERATURE}"
  --sentence_pooling_method cls
  --normalize_embeddings True
  --kd_loss_type kl_div
  --eval_strategy "${EVAL_STRATEGY}"
  --eval_steps "${EVAL_STEPS}"
  --eval_accumulation_steps 1
  --load_best_model_at_end
  --metric_for_best_model 'eval_loss'
  --include_for_metrics loss
)
# Same empty-expansion guard for the tensorboard log directory.
if [[ -n "${TENSORBOARD_LOG_PATH:-}" ]]; then
  training_args+=(--logging_dir "${TENSORBOARD_LOG_PATH}")
fi

cmd=(
  torchrun --nproc_per_node "${NUM_GPUS}"
  -m FlagEmbedding.finetune.embedder.encoder_only.base
  "${model_args[@]}"
  "${data_args[@]}"
  "${training_args[@]}"
)
# Log the exact, shell-quoted command, then execute it directly (no eval).
printf '%q ' "${cmd[@]}"
printf '\n'
"${cmd[@]}"
总共2个epoch,训练总步数为4718。eval_loss、train_loss、grad_norm和learning_rate的变化趋势见图片。为什么会出现train_loss和eval_loss先下降到局部最小值,然后上升,然后又下降(且下降幅度较小,难以达到之前的局部最小值)?应如何调整参数以优化训练过程?