fairseq icon indicating copy to clipboard operation
fairseq copied to clipboard

How to save the fine-tune checkpoint

Open asrays opened this issue 6 months ago • 0 comments

I'm running fairseq_cli/hydra_train.py to fine-tune a HuBERT model. The script runs, but it is not saving any checkpoints. Am I missing something?

My base_10h.yaml looks like this:

# @package _group_

# Run-wide settings: numeric precision, logging and random seed.
common:
  fp16: true
  # JSON-formatted log lines.
  # NOTE(review): log_interval is presumably counted in updates - confirm.
  log_format: json
  log_interval: 200
  # Relative path.
  # NOTE(review): presumably resolved against hydra.run.dir - confirm.
  tensorboard_logdir: tblog
  seed: 1337

# Checkpoint-saving options; this is the stanza most relevant to the question
# asked in this issue (training runs but no checkpoint appears).
# NOTE(review): no save_dir is set here, so files go to whatever directory the
# trainer defaults to - presumably relative to hydra.run.dir; confirm.
checkpoint:
  # NOTE(review): presumably counted in epochs, i.e. a save is only attempted
  # every 5th epoch - confirm against fairseq's checkpoint options.
  save_interval: 5
  # NOTE(review): no save_interval_updates is set in this stanza, so there are
  # presumably no update-based checkpoints for this setting to prune - confirm.
  keep_interval_updates: 1
  # Per-epoch checkpoint files are switched off.
  no_epoch_checkpoints: true
  # The "best" checkpoint is chosen by WER.
  # NOTE(review): that presumably requires a validation pass first, and
  # dataset.validate_after_updates resolves to 10000 in this file - confirm.
  best_checkpoint_metric: wer

# Distributed / multi-process launch settings.
distributed_training:
  ddp_backend: c10d
  find_unused_parameters: true
  # NOTE(review): world size is 1 while nprocs_per_node below is 8 - confirm
  # which of the two governs the number of processes actually started.
  distributed_world_size: 1
  distributed_port: 29671
  nprocs_per_node: 8

# Task definition: the hubert_pretraining task with fine_tuning switched on.
task:
  _name: hubert_pretraining
  # `???` leaves the value unset in this file.
  # NOTE(review): presumably a required value supplied as a command-line
  # override - confirm.
  data: ???
  fine_tuning: true
  # Unset here as well; see the note on `data`.
  label_dir: ???
  normalize: false  # must be consistent with pre-training
  # A single label set named "ltr".
  # NOTE(review): presumably letter-level transcripts - confirm.
  labels: ["ltr"]
  single_target: true

# Data loading, batch sizing and validation schedule.
dataset:
  num_workers: 0
  max_tokens: 3200000
  # Interpolated from model.freeze_finetune_updates (10000 in this file).
  # NOTE(review): presumably no validation runs before that many updates,
  # which would also delay any metric-based "best" checkpoint - confirm.
  validate_after_updates: ${model.freeze_finetune_updates}
  # NOTE(review): presumably counted in epochs; same value as
  # checkpoint.save_interval (5) - confirm.
  validate_interval: 5
  train_subset: train
  valid_subset: valid

# Training criterion: CTC.
criterion:
  _name: ctc
  # NOTE(review): presumably zeroes out infinite CTC losses instead of
  # propagating them - confirm against the criterion's documentation.
  zero_infinity: true

# Optimization schedule: update budget, learning rate and batching policy.
optimization:
  max_update: 25000
  # Written with an explicit mantissa dot and a signed, zero-padded exponent so
  # that YAML 1.1 loaders (e.g. PyYAML) resolve it as a float; a bare `2e-5`
  # is resolved as a string by those loaders. The numeric value is unchanged.
  lr: [2.0e-05]
  sentence_avg: true
  update_freq: [1]

# Optimizer: Adam.
optimizer:
  _name: adam
  # Quoted so the value is unambiguously a string (same content as before).
  # NOTE(review): presumably converted to a pair of floats by the consumer -
  # confirm.
  adam_betas: '(0.9,0.98)'
  # Explicit mantissa dot so that YAML 1.1 loaders (e.g. PyYAML) resolve this
  # as a float; a bare `1e-08` is resolved as a string by those loaders.
  # The numeric value is unchanged.
  adam_eps: 1.0e-08

# Learning-rate schedule: tri_stage, configured as warm-up / hold / decay steps.
lr_scheduler:
  _name: tri_stage
  warmup_steps: 8000
  hold_steps: 0
  # NOTE(review): warmup + hold + decay adds up to 80000 steps, while
  # optimization.max_update is 25000 in this file - confirm that stopping
  # part-way through the decay phase is intended.
  decay_steps: 72000
  final_lr_scale: 0.05

# Model definition: hubert_ctc.
model:
  _name: hubert_ctc
  # `???` leaves the value unset in this file.
  # NOTE(review): presumably the path to the pre-trained checkpoint, supplied
  # as a command-line override - confirm.
  w2v_path: ???
  # Masking along the time axis (presumably - confirm axis semantics).
  apply_mask: true
  mask_selection: static
  mask_length: 10
  mask_other: 0
  mask_prob: 0.75
  # Masking along the channel axis (presumably - confirm axis semantics).
  mask_channel_selection: static
  mask_channel_length: 64
  mask_channel_other: 0
  mask_channel_prob: 0.5
  # Dropout-style regularization rates.
  layerdrop: 0.1
  dropout: 0.0
  activation_dropout: 0.1
  attention_dropout: 0.0
  # NOTE(review): 0.0 presumably stops gradients from reaching the feature
  # extractor - confirm.
  feature_grad_mult: 0.0
  # Also read by dataset.validate_after_updates through interpolation.
  freeze_finetune_updates: 10000

# Hydra runtime settings: formatting of the override-derived directory name and
# the output directories for single runs and sweeps.
hydra:
  job:
    config:
      override_dirname:
        # Separator between a key and its value, and between items.
        kv_sep: '-'
        item_sep: '__'
        # Keys listed here are excluded when the directory name is built.
        exclude_keys:
          - run
          - task.data
          - task.label_dir
          - model.w2v_path
          - dataset.train_subset
          - dataset.valid_subset
          - criterion.wer_kenlm_model
          - criterion.wer_lexicon
  run:
    # Absolute, machine-specific output directory for a single run.
    # NOTE(review): checkpoint.save_dir is not set in this file, so checkpoint
    # files presumably end up in a sub-directory of this path - confirm.
    dir: /home/ashray/fairseq/checkpoint
  sweep:
    # `???` leaves the value unset.
    # NOTE(review): presumably must be provided for multi-run sweeps - confirm.
    dir: ???
    # Composed from the config name and the override-derived directory name.
    subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}

asrays avatar May 15 '25 15:05 asrays