MGM
Failed to continue SFT for Yi-34B with 8 GPUs! (DeepSpeed ZeRO-3)
bash ./scripts/yi/train/stage_2_full_yi34b_672_hr_1536.sh
error log:
Traceback (most recent call last):
  File "/home/xxx/MiniGemini/minigemini/train/train_mem.py", line 4, in <module>
    train(attn_implementation="flash_attention_2")
  File "/home/xxx/MiniGemini/minigemini/train/train.py", line 1172, in train
    model.get_model().initialize_vision_modules(
  File "/home/xxx/MiniGemini/minigemini/model/mini_gemini_arch.py", line 137, in initialize_vision_modules
    self.mm_projector.load_state_dict(named_parameters)
  File "/home/xxx/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 2152, in load_state_dict
    raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
RuntimeError: Error(s) in loading state_dict for Sequential:
    size mismatch for 0.weight: copying a param with shape torch.Size([7168, 1024]) from checkpoint, the shape in current model is torch.Size([0]).
    size mismatch for 0.bias: copying a param with shape torch.Size([7168]) from checkpoint, the shape in current model is torch.Size([0]).
    size mismatch for 2.weight: copying a param with shape torch.Size([7168, 7168]) from checkpoint, the shape in current model is torch.Size([0]).
    size mismatch for 2.bias: copying a param with shape torch.Size([7168]) from checkpoint, the shape in current model is torch.Size([0]).
./scripts/yi/train/stage_2_full_yi34b_672_hr_1536.sh:
#!/bin/bash
PRETRAIN_NAME=Mini-Gemini-34B-Pretrain
FINETUNE_NAME=Mini-Gemini-34B-HD
AUX_SIZE=1536
IMAGE_GRID=2
IMAGE_GLOBAL=True
LR_MULTI="model.mm_projector:2,model.vlm_uni:2"
deepspeed \
./minigemini/train/train_mem.py \
--deepspeed ./scripts/zero3.json \
--model_name_or_path model_zoo/YanweiLi/Mini-Gemini-34B-HD \
--version chatml_direct \
--data_path ./dataset.json \
--image_grid $IMAGE_GRID \
--image_global $IMAGE_GLOBAL \
--pretrain_mm_mlp_adapter ./$PRETRAIN_NAME/mm_projector.bin \
--vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \
--vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \
--mm_projector_type mlp2x_gelu \
--mm_vision_select_layer -2 \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--image_aspect_ratio pad \
--group_by_modality_length True \
--image_size_aux $AUX_SIZE \
--bf16 True \
--output_dir ./$FINETUNE_NAME \
--num_train_epochs 1 \
--per_device_train_batch_size 2 \
--per_device_eval_batch_size 2 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 1000 \
--save_total_limit 1 \
--learning_rate 1e-5 \
--lr_multi $LR_MULTI \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 4096 \
--gradient_checkpointing True \
--dataloader_num_workers 4 \
--lazy_preprocess True \
--report_to tensorboard
Hi, if you want to continue the SFT training stage, please disable the hyper-parameter --pretrain_mm_mlp_adapter.

@yanwei-li
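
For context, the torch.Size([0]) shapes on the model side are expected under DeepSpeed ZeRO-3, where parameters are partitioned across ranks until they are gathered; the mismatch is triggered by reloading the stage-1 adapter on top of Mini-Gemini-34B-HD, which already contains its own trained projector weights. A minimal sketch of the change, assuming you edit the script in place with sed (removing the line by hand in any editor works just as well):

# Drop the pretrained-adapter flag from the launch script, then rerun it.
# The sed pattern is illustrative; it deletes the whole
# "--pretrain_mm_mlp_adapter ./$PRETRAIN_NAME/mm_projector.bin \" line,
# and the surrounding backslash-continued command stays valid.
sed -i '/--pretrain_mm_mlp_adapter/d' ./scripts/yi/train/stage_2_full_yi34b_672_hr_1536.sh
bash ./scripts/yi/train/stage_2_full_yi34b_672_hr_1536.sh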