
After completing the fine-tuning of 'visionbranch stage2', what should the loss value be for training to be considered correctly converged?

joeking11829 opened this issue 9 months ago · 2 comments

Hi guys,

Now I can fine-tune 'visionbranch_stage2_finetune.yaml' on four A100 80GB GPUs using gradient accumulation.

I'd like to know at what loss value training is considered to have converged. For example, is it 0.7, 0.5, 0.1...? Currently, the loss I observe during training fluctuates between 0.8 and 1.0.
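In case it helps, this is roughly how I am judging the plateau: I smooth the per-iteration loss with an exponential moving average and look at the tail of the curve rather than the raw fluctuating values. The log path and the "loss: x.xxxx" line format below are just placeholders for my own setup, not something taken from the Video-LLaMA code:

import re

def smoothed_losses(log_path, alpha=0.02):
    # Exponential moving average of the "loss: x.xxxx" values found in a training log.
    ema, curve = None, []
    with open(log_path) as f:
        for line in f:
            m = re.search(r"loss:\s*([0-9.]+)", line)
            if m:
                loss = float(m.group(1))
                ema = loss if ema is None else alpha * loss + (1 - alpha) * ema
                curve.append(ema)
    return curve

# Placeholder path; I check whether the smoothed tail is still trending down
# or has flattened out, instead of reading single per-iteration numbers.
curve = smoothed_losses("output/videollama_stage2_finetune/log.txt")
print(curve[-10:])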

Any suggestions would be very helpful.

Thanks!!



Here is my configuration:

model:
  arch: video_llama
  model_type: pretrain_vicuna
  freeze_vit: True
  freeze_qformer: True


  # Q-Former
  num_query_token: 32

  # If you want to train models based on LLaMA-2-chat,
  # some ckpts can be downloaded from our provided Hugging Face repo,
  # i.e. https://huggingface.co/DAMO-NLP-SG/Video-LLaMA-2-13B-Finetuned
  llama_model: "../ckpt/Video-LLaMA-2-13B-Pretrained/llama-2-13b-chat-hf"
  imagebind_ckpt_path: "../ckpt/Video-LLaMA-2-13B-Pretrained/imagebind_huge.pth"

  # The ckpt of the vision branch after stage-1 pretraining
  ckpt: '../ckpt/Video-LLaMA-2-13B-Pretrained/VL_LLaMA_2_13B_Pretrained.pth'   # you can use our pretrained ckpt from https://huggingface.co/DAMO-NLP-SG/Video-LLaMA-2-13B-Pretrained/


  # only train vision branch
  equip_audio_branch: False  # whether to equip the audio branch
  frozen_llama_proj: False
  frozen_video_Qformer: False
  frozen_audio_Qformer: True
  
  fusion_head_layers: 2
  max_frame_pos: 32
  fusion_header_type: "seqTransf"

  max_txt_len: 320

  # vicuna and llama_2_chat use different templates!

  # for llama_2_chat:
  end_sym: "</s>"
  prompt_path: "prompts/alignment_image.txt"
  prompt_template: '[INST] <<SYS>>\n \n<</SYS>>\n\n{} [/INST] '

  # for vicuna:
  #end_sym: "###"
  #prompt_path: "prompts/alignment_image.txt"
  #prompt_template: '###Human: {} ###Assistant: '


  

datasets:
  cc_sbu_align:
    data_type: images
    build_info:
      storage: ../datasets/MiniGPT-4/cc_sbu_align/
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 224
    text_processor:
      train:
        name: "blip_caption"
      
  llava_instruct:
    data_type: images
    build_info:
      anno_dir: ../datasets/LLaVA/llava_instruct_150k.json
      videos_dir: ../datasets/LLaVA/train2017/
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 224
    text_processor:
      train:
        name: "blip_caption"
    num_video_query_token: 32
    tokenizer_name: "../ckpt/Video-LLaMA-2-13B-Pretrained/llama-2-13b-chat-hf"
    model_type: "llama_v2"  # need to set, as vicuna and llama_2_chat use different template
    
  webvid_instruct:
    data_type: video
    build_info:
      anno_dir: ../datasets/VideoChat/videochat_instruct_11k.json
      videos_dir: ../datasets/VideoChat/videos/
    vis_processor:
      train:
        name: "alpro_video_train"
        n_frms: 8
        image_size: 224
    text_processor:
      train:
        name: "blip_caption"
    num_video_query_token: 32
    tokenizer_name: "../ckpt/Video-LLaMA-2-13B-Pretrained/llama-2-13b-chat-hf"
    model_type: "llama_v2"  # need to set, as vicuna and llama_2_chat use different template
    
run:
  task: video_text_pretrain
  # optimizer
  lr_sched: "linear_warmup_cosine_lr"
  init_lr: 3e-5
  min_lr: 1e-5
  warmup_lr: 1e-6

  weight_decay: 0.05
  max_epoch: 3
  iters_per_epoch: 3000
  accum_grad_iters: 3
  batch_size_train: 3
  batch_size_eval: 3
  num_workers: 3
  warmup_steps: 1000

  seed: 42
  output_dir: "output/videollama_stage2_finetune"

  amp: True
  resume_ckpt_path: null

  evaluate: False 
  train_splits: ["train"]

  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True
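
For reference, here is my own back-of-the-envelope check of what these settings amount to, assuming the usual gradient-accumulation semantics (each GPU processes batch_size_train samples per iteration and gradients are accumulated over accum_grad_iters iterations):

# Rough check of the effective batch size and total iterations with the settings above.
batch_size_train = 3
accum_grad_iters = 3
num_gpus = 4                        # four A100 80GB
iters_per_epoch, max_epoch = 3000, 3

effective_batch = batch_size_train * accum_grad_iters * num_gpus
total_iters = iters_per_epoch * max_epoch
print(effective_batch, total_iters)  # 36 9000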

joeking11829 · Sep 18 '23