StableSR
StableSR copied to clipboard
How many epochs should I train for fine-tuning?
Dear Author,
Thanks for sharing this great work.
I wonder how many epochs you would recommend for fine-tuning?
I have a custom training dataset of about 100 images.
I'm training the data as below.
While checking the logged images during training, the image quality at 6000 epochs does not seem very good.
Should I train for more epochs, or change some other parameters?
# Super-resolution scale factor (x4 upscaling).
sf: 4
model:
  base_learning_rate: 5.0e-05
  target: ldm.models.diffusion.ddpm.LatentDiffusionSRTextWT
  params:
    # parameterization: "v"
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: image
    cond_stage_key: caption
    image_size: 512
    channels: 4
    cond_stage_trainable: false  # Note: different from the one we trained before
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: false
    # for training only
    ckpt_path: '/data2/LHC/StableSR/stablesr_000117.ckpt'
    unfrozen_diff: false
    random_size: false
    time_replace: 1000
    use_usm: true
    # P2 weighting, we do not use in final version
    p2_gamma: null
    p2_k: null
    # ignore_keys: []
    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModelDualcondV2
      params:
        image_size: 32  # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [4, 2, 1]
        num_res_blocks: 2
        channel_mult: [1, 2, 4, 4]
        num_head_channels: 64
        use_spatial_transformer: true
        use_linear_in_transformer: true
        transformer_depth: 1
        context_dim: 1024
        use_checkpoint: false
        legacy: false
        semb_channels: 256
    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        # for training only
        ckpt_path: '/data2/LHC/StableSR/stablesr_000117.ckpt'
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 512
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
            - 1
            - 2
            - 4
            - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        # Identity loss: the autoencoder is frozen during fine-tuning.
        lossconfig:
          target: torch.nn.Identity
    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
      params:
        freeze: true
        layer: "penultimate"
    structcond_stage_config:
      target: ldm.modules.diffusionmodules.openaimodel.EncoderUNetModelWT
      params:
        image_size: 96
        in_channels: 4
        model_channels: 256
        out_channels: 256
        num_res_blocks: 2
        attention_resolutions: [4, 2, 1]
        dropout: 0
        channel_mult: [1, 1, 2, 2]
        conv_resample: true
        dims: 2
        use_checkpoint: false
        use_fp16: false
        num_heads: 4
        num_head_channels: -1
        num_heads_upsample: -1
        use_scale_shift_norm: false
        resblock_updown: false
        use_new_attention_order: false
# Real-ESRGAN-style two-stage degradation pipeline used to synthesize LR inputs.
degradation:
  # the first degradation process
  resize_prob: [0.2, 0.7, 0.1]  # up, down, keep
  resize_range: [0.3, 1.5]
  gaussian_noise_prob: 0.5
  noise_range: [1, 15]
  poisson_scale_range: [0.05, 2.0]
  gray_noise_prob: 0.4
  jpeg_range: [60, 95]
  # the second degradation process
  second_blur_prob: 0.5
  resize_prob2: [0.3, 0.4, 0.3]  # up, down, keep
  resize_range2: [0.6, 1.2]
  gaussian_noise_prob2: 0.5
  noise_range2: [1, 12]
  poisson_scale_range2: [0.05, 1.0]
  gray_noise_prob2: 0.4
  jpeg_range2: [60, 100]
  gt_size: 512
  no_degradation_prob: 0.01
# Training/validation data module; both splits use the RealESRGAN dataset loader.
data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 6
    num_workers: 10
    wrap: false
    train:
      target: basicsr.data.realesrgan_dataset.RealESRGANDataset
      params:
        queue_size: 180
        gt_path: '/data2/LHC/StableSR/StableSR/DNA_512/train'
        face_gt_path: '/mnt/lustre/share/jywang/dataset/FFHQ/1024/'
        num_face: 10000
        crop_size: 512
        io_backend:
          type: disk
        # first-stage blur kernel settings
        blur_kernel_size: 21
        kernel_list: ['iso', 'aniso', 'generalized_iso', 'generalized_aniso', 'plateau_iso', 'plateau_aniso']
        kernel_prob: [0.45, 0.25, 0.12, 0.03, 0.12, 0.03]
        sinc_prob: 0.1
        blur_sigma: [0.2, 1.5]
        betag_range: [0.5, 2.0]
        betap_range: [1, 1.5]
        # second-stage blur kernel settings
        blur_kernel_size2: 11
        kernel_list2: ['iso', 'aniso', 'generalized_iso', 'generalized_aniso', 'plateau_iso', 'plateau_aniso']
        kernel_prob2: [0.45, 0.25, 0.12, 0.03, 0.12, 0.03]
        sinc_prob2: 0.1
        blur_sigma2: [0.2, 1.0]
        betag_range2: [0.5, 2.0]
        betap_range2: [1, 1.5]
        final_sinc_prob: 0.8
        gt_size: 512
        use_hflip: true
        use_rot: false
    validation:
      target: basicsr.data.realesrgan_dataset.RealESRGANDataset
      params:
        gt_path: '/data2/LHC/StableSR/StableSR/DNA_512/val'
        crop_size: 512
        io_backend:
          type: disk
        blur_kernel_size: 21
        kernel_list: ['iso', 'aniso', 'generalized_iso', 'generalized_aniso', 'plateau_iso', 'plateau_aniso']
        kernel_prob: [0.45, 0.25, 0.12, 0.03, 0.12, 0.03]
        sinc_prob: 0.1
        blur_sigma: [0.2, 1.5]
        betag_range: [0.5, 2.0]
        betap_range: [1, 1.5]
        blur_kernel_size2: 11
        kernel_list2: ['iso', 'aniso', 'generalized_iso', 'generalized_aniso', 'plateau_iso', 'plateau_aniso']
        kernel_prob2: [0.45, 0.25, 0.12, 0.03, 0.12, 0.03]
        sinc_prob2: 0.1
        blur_sigma2: [0.2, 1.0]
        betag_range2: [0.5, 2.0]
        betap_range2: [1, 1.5]
        final_sinc_prob: 0.8
        gt_size: 512
        use_hflip: true
        use_rot: false
# Test-time data module (batch size 1, same degradation settings as training).
test_data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 1
    num_workers: 6
    wrap: false
    test:
      target: basicsr.data.realesrgan_dataset.RealESRGANDataset
      params:
        gt_path: '/data2/LHC/StableSR/StableSR/DNA_512/test'
        crop_size: 512
        io_backend:
          type: disk
        blur_kernel_size: 21
        kernel_list: ['iso', 'aniso', 'generalized_iso', 'generalized_aniso', 'plateau_iso', 'plateau_aniso']
        kernel_prob: [0.45, 0.25, 0.12, 0.03, 0.12, 0.03]
        sinc_prob: 0.1
        blur_sigma: [0.2, 1.5]
        betag_range: [0.5, 2.0]
        betap_range: [1, 1.5]
        blur_kernel_size2: 11
        kernel_list2: ['iso', 'aniso', 'generalized_iso', 'generalized_aniso', 'plateau_iso', 'plateau_aniso']
        kernel_prob2: [0.45, 0.25, 0.12, 0.03, 0.12, 0.03]
        sinc_prob2: 0.1
        blur_sigma2: [0.2, 1.0]
        betag_range2: [0.5, 2.0]
        betap_range2: [1, 1.5]
        final_sinc_prob: 0.8
        gt_size: 512
        use_hflip: true
        use_rot: true
# PyTorch Lightning settings: checkpoint/image-log every 1500 steps,
# 100k total optimizer steps with 4x gradient accumulation.
lightning:
  modelcheckpoint:
    params:
      every_n_train_steps: 1500
  callbacks:
    image_logger:
      target: main.ImageLogger
      params:
        batch_frequency: 1500
        max_images: 4
        increase_log_steps: false
  trainer:
    benchmark: true
    max_steps: 100000
    accumulate_grad_batches: 4