InternVL
InternVL copied to clipboard
finetune脚本有什么问题?一直卡住
您好,我最近想在一个数据集上做 finetune,使用的脚本是 internvl_chat_gpt_oss/shell/internvl3_5_qwen3/internvl3_5_8b_sft.sh(脚本内容附在日志之后)。训练启动后一直卡在下面日志的位置,没有任何后续输出:
Replace train dataloader!!
Replace train dataloader!!
Replace train dataloader!!
[2025-10-13 16:58:58,051] [INFO] [comm.py:821:init_distributed] cdb=None
[2025-10-13 16:58:58,051] [INFO] [comm.py:852:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
Replace train dataloader!!
[2025-10-13 16:58:59,683] [INFO] [comm.py:821:init_distributed] cdb=None
[2025-10-13 16:58:59,683] [INFO] [comm.py:821:init_distributed] cdb=None
[2025-10-13 16:58:59,698] [INFO] [comm.py:821:init_distributed] cdb=None
sh脚本内容
# Launches supervised fine-tuning of InternVL3.5-8B through torchrun on one
# node with 4 GPUs (physical devices 4-7). Written for POSIX sh so that it
# keeps working when invoked as `sh <script>`.
#
# Optional environment overrides:
#   BATCH_SIZE             global batch size               (default 512)
#   PER_DEVICE_BATCH_SIZE  micro-batch size per GPU        (default 1)
#   WORLD_SIZE             number of nodes                 (default 1)
#   RANK                   rank of this node               (default 0)
#   MASTER_ADDR            rendezvous host                 (default localhost)
#   MASTER_PORT            rendezvous port, must be non-0  (default 29500)
set -x

# Without pipefail the exit status of `torchrun ... | tee` is tee's, so a
# crashed run would look successful. Older dash has no pipefail and would
# abort on the unknown option, hence the probe in a subshell first.
if (set -o pipefail) 2>/dev/null; then
  set -o pipefail
fi

# Port 0 means "let the OS pick a free port"; it is not an address that other
# participants can connect to, so fall back to torchrun's documented default.
# NOTE(review): this may or may not be what caused the reported stall right
# after init_distributed -- confirm by re-running with a fixed port.
if [ "${MASTER_PORT:-0}" = "0" ]; then
  MASTER_PORT=29500
fi
export MASTER_PORT
export TF_CPP_MIN_LOG_LEVEL=3
export USE_TCS_LOADER=0
export LAUNCHER=pytorch
export CUDA_VISIBLE_DEVICES=4,5,6,7
# Must equal the number of devices listed in CUDA_VISIBLE_DEVICES.
NPROC_PER_NODE=4

# Derive the task name from this script's file name (extension stripped).
CURRENT_PATH=$(pwd)
PROJECT_NAME=internvl3_5_8b_sft
TASK_NAME=$(basename "$0")
TASK_NAME="${TASK_NAME%.*}"
echo "TASK_NAME: $TASK_NAME"
echo "PROJECT_NAME: $PROJECT_NAME"

export OUTPUT_DIR="${CURRENT_PATH}/work_dirs/${PROJECT_NAME}/${TASK_NAME}"
export TENSORBOARD_DIR="${OUTPUT_DIR}/tensorboard"
# Exported only; this script itself logs to training_log.txt (see the tee below).
export JOBLOG="${OUTPUT_DIR}/training.log"

# mkdir -p is idempotent, so no prior existence test is needed.
mkdir -p "$OUTPUT_DIR" || {
  echo "error: cannot create output directory: $OUTPUT_DIR" >&2
  exit 1
}

BATCH_SIZE=${BATCH_SIZE:-512}
PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-1}
WORLD_SIZE=${WORLD_SIZE:-1}
RANK=${RANK:-0}
MASTER_ADDR=${MASTER_ADDR:-localhost}

# Integer division: truncates to 0 when BATCH_SIZE is smaller than
# PER_DEVICE_BATCH_SIZE * WORLD_SIZE * NPROC_PER_NODE. A "${VAR:-1}" default
# does not catch that (0 is non-empty), so reject it explicitly.
GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / WORLD_SIZE / NPROC_PER_NODE))
if [ "$GRADIENT_ACC" -lt 1 ]; then
  echo "error: BATCH_SIZE=$BATCH_SIZE is too small for" \
    "PER_DEVICE_BATCH_SIZE=$PER_DEVICE_BATCH_SIZE x WORLD_SIZE=$WORLD_SIZE" \
    "x NPROC_PER_NODE=$NPROC_PER_NODE (gradient accumulation would be 0)" >&2
  exit 1
fi

# Append the working directory; avoid a leading ':' (an empty path entry)
# when PYTHONPATH was previously unset.
export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}$(pwd)"
export TRITON_CACHE_DIR="/dev/shm/triton_wwy/"
export VLLM_CACHE_ROOT="/dev/shm/vllmca_wwy/"

# The entry point and the DeepSpeed config are relative paths, so the script
# has to be started from the directory that contains them.
torchrun \
  --node-rank="${RANK}" \
  --nnodes="${WORLD_SIZE}" \
  --nproc-per-node="${NPROC_PER_NODE}" \
  --master-addr="${MASTER_ADDR}" \
  --master-port="${MASTER_PORT}" \
  internvl/train/internvl_chat_finetune.py \
  --model_name_or_path "/mnt/models/InternVL3_5-8B" \
  --conv_style internvl2_5 \
  --use_fast_tokenizer False \
  --output_dir "${OUTPUT_DIR}" \
  --meta_path "${CURRENT_PATH}/shell/data/tour_video_data.json" \
  --overwrite_output_dir True \
  --force_image_size 448 \
  --max_dynamic_patch 12 \
  --down_sample_ratio 0.5 \
  --drop_path_rate 0.1 \
  --min_num_frame 8 \
  --max_num_frame 32 \
  --freeze_llm False \
  --freeze_mlp False \
  --freeze_backbone False \
  --vision_select_layer -1 \
  --dataloader_num_workers 16 \
  --bf16 True \
  --max_steps 8000 \
  --per_device_train_batch_size "${PER_DEVICE_BATCH_SIZE}" \
  --gradient_accumulation_steps "${GRADIENT_ACC}" \
  --save_strategy "steps" \
  --save_steps 500 \
  --save_total_limit 100 \
  --learning_rate 8e-5 \
  --weight_decay 0.05 \
  --warmup_ratio 0.03 \
  --lr_scheduler_type "cosine" \
  --logging_steps 1 \
  --max_seq_length 16384 \
  --split_annotations True \
  --do_train True \
  --grad_checkpoint True \
  --gradient_checkpointing True \
  --group_by_length False \
  --dynamic_image_size True \
  --use_thumbnail True \
  --ps_version 'v2' \
  --use_custom_flash_attn True \
  --report_to "tensorboard" \
  --deepspeed "zero_stage3_config.json" \
  --use_packed_ds True \
  --num_images_expected 40 \
  --max_packed_tokens 16384 \
  --max_buffer_size 20 \
  --log_freq 1000 \
  --strict_mode False \
  --replacement True \
  --allow_overflow False \
  --remove_unused_columns False \
  --loss_reduction "square" \
  --seed 42 \
  2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"