OneTrainer
[Bug]: Cosine with hard restarts ends with peak
What happened?
I am experimenting with multires training (unrelated), batch sizes and gradient accumulation. I come from Kohya and have had great results with OneTrainer as well. In fact, I want to keep using OneTrainer in the future, but a bug has appeared.
Also important if you try to reproduce this: sometimes the end peak only shows up in TensorBoard after restarting it!
After the 4 configured cycles, the LR goes back up in the last 2 steps as if there were another cycle to run. Settings for this run:
{
"__version": 4,
"training_method": "LORA",
"model_type": "STABLE_DIFFUSION_15",
"debug_mode": false,
"debug_dir": "debug",
"workspace_dir": "workspace/run",
"cache_dir": "workspace-cache/run",
"tensorboard": true,
"tensorboard_expose": false,
"continue_last_backup": false,
"include_train_config": "NONE",
"base_model_name": "P:/SD_model_safe/PreTrainModel.safetensors",
"weight_dtype": "FLOAT_16",
"output_dtype": "FLOAT_32",
"output_model_format": "SAFETENSORS",
"output_model_destination": "P:/cui/ComfyUI/models/loras/multires.safetensors",
"gradient_checkpointing": false,
"force_circular_padding": false,
"concept_file_name": "training_concepts/concepts.json",
"concepts": null,
"aspect_ratio_bucketing": false,
"latent_caching": true,
"clear_cache_before_training": true,
"learning_rate_scheduler": "COSINE_WITH_HARD_RESTARTS",
"custom_learning_rate_scheduler": null,
"scheduler_params": [],
"learning_rate": 0.00022,
"learning_rate_warmup_steps": 0,
"learning_rate_cycles": 4,
"epochs": 92,
"batch_size": 1,
"gradient_accumulation_steps": 2,
"ema": "OFF",
"ema_decay": 0.999,
"ema_update_step_interval": 5,
"dataloader_threads": 2,
"train_device": "cuda",
"temp_device": "cpu",
"train_dtype": "FLOAT_16",
"fallback_train_dtype": "BFLOAT_16",
"enable_autocast_cache": true,
"only_cache": false,
"resolution": "768",
"attention_mechanism": "XFORMERS",
"align_prop": false,
"align_prop_probability": 0.1,
"align_prop_loss": "AESTHETIC",
"align_prop_weight": 0.01,
"align_prop_steps": 20,
"align_prop_truncate_steps": 0.5,
"align_prop_cfg_scale": 7.0,
"mse_strength": 1.0,
"mae_strength": 0.0,
"vb_loss_strength": 1.0,
"loss_weight_fn": "MIN_SNR_GAMMA",
"loss_weight_strength": 5.0,
"dropout_probability": 0.0,
"loss_scaler": "NONE",
"learning_rate_scaler": "NONE",
"offset_noise_weight": 0.12,
"perturbation_noise_weight": 0.0,
"rescale_noise_scheduler_to_zero_terminal_snr": false,
"force_v_prediction": false,
"force_epsilon_prediction": false,
"min_noising_strength": 0.0,
"max_noising_strength": 1.0,
"timestep_distribution": "UNIFORM",
"noising_weight": 0.0,
"noising_bias": 0.0,
"unet": {
"__version": 0,
"model_name": "",
"include": true,
"train": true,
"stop_training_after": 0,
"stop_training_after_unit": "NEVER",
"learning_rate": 0.00022,
"weight_dtype": "NONE",
"dropout_probability": 0.0,
"train_embedding": true,
"attention_mask": false
},
"prior": {
"__version": 0,
"model_name": "",
"include": true,
"train": true,
"stop_training_after": 0,
"stop_training_after_unit": "NEVER",
"learning_rate": null,
"weight_dtype": "NONE",
"dropout_probability": 0.0,
"train_embedding": true,
"attention_mask": false
},
"text_encoder": {
"__version": 0,
"model_name": "",
"include": true,
"train": true,
"stop_training_after": 92,
"stop_training_after_unit": "NEVER",
"learning_rate": 0.00011,
"weight_dtype": "NONE",
"dropout_probability": 0.0,
"train_embedding": true,
"attention_mask": false
},
"text_encoder_layer_skip": 0,
"text_encoder_2": {
"__version": 0,
"model_name": "",
"include": true,
"train": true,
"stop_training_after": 30,
"stop_training_after_unit": "EPOCH",
"learning_rate": null,
"weight_dtype": "NONE",
"dropout_probability": 0.0,
"train_embedding": true,
"attention_mask": false
},
"text_encoder_2_layer_skip": 0,
"text_encoder_3": {
"__version": 0,
"model_name": "",
"include": true,
"train": true,
"stop_training_after": 30,
"stop_training_after_unit": "EPOCH",
"learning_rate": null,
"weight_dtype": "NONE",
"dropout_probability": 0.0,
"train_embedding": true,
"attention_mask": false
},
"text_encoder_3_layer_skip": 0,
"vae": {
"__version": 0,
"model_name": "",
"include": true,
"train": true,
"stop_training_after": null,
"stop_training_after_unit": "NEVER",
"learning_rate": null,
"weight_dtype": "FLOAT_32",
"dropout_probability": 0.0,
"train_embedding": true,
"attention_mask": false
},
"effnet_encoder": {
"__version": 0,
"model_name": "",
"include": true,
"train": true,
"stop_training_after": null,
"stop_training_after_unit": "NEVER",
"learning_rate": null,
"weight_dtype": "NONE",
"dropout_probability": 0.0,
"train_embedding": true,
"attention_mask": false
},
"decoder": {
"__version": 0,
"model_name": "",
"include": true,
"train": true,
"stop_training_after": null,
"stop_training_after_unit": "NEVER",
"learning_rate": null,
"weight_dtype": "NONE",
"dropout_probability": 0.0,
"train_embedding": true,
"attention_mask": false
},
"decoder_text_encoder": {
"__version": 0,
"model_name": "",
"include": true,
"train": true,
"stop_training_after": null,
"stop_training_after_unit": "NEVER",
"learning_rate": null,
"weight_dtype": "NONE",
"dropout_probability": 0.0,
"train_embedding": true,
"attention_mask": false
},
"decoder_vqgan": {
"__version": 0,
"model_name": "",
"include": true,
"train": true,
"stop_training_after": null,
"stop_training_after_unit": "NEVER",
"learning_rate": null,
"weight_dtype": "NONE",
"dropout_probability": 0.0,
"train_embedding": true,
"attention_mask": false
},
"masked_training": false,
"unmasked_probability": 0.1,
"unmasked_weight": 0.1,
"normalize_masked_area_loss": false,
"embedding_learning_rate": null,
"preserve_embedding_norm": false,
"embedding": {
"__version": 0,
"uuid": "dcd1a1ec-1b61-4aba-b3d8-6854c3199b32",
"model_name": "",
"placeholder": "
I have a suspicion why this is happening: my concepts have an uneven number of images, I selected batch size 1 with gradient accumulation of 2, and every concept repeats two times. This puts images together in (accumulation) batches that previously would not have shared one: at the end of the first repeat of the dataset, one image is missing to fill the accumulation batch, so it takes the first image of the next repeat, which offsets how all the following batches are composed.
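A toy example of the offset I mean (the 5-image concept and the letters are made up, and this is only how I picture the dataloader behaving, not OneTrainer's actual code):

# Hypothetical concept with an uneven number of images, repeated twice and
# grouped into accumulation batches of 2.
images = ["A", "B", "C", "D", "E"]
repeats = 2
accumulation = 2

stream = images * repeats  # sample order over one epoch
batches = [stream[i:i + accumulation] for i in range(0, len(stream), accumulation)]
print(batches)
# [['A', 'B'], ['C', 'D'], ['E', 'A'], ['B', 'C'], ['D', 'E']]
# The last image of the first repeat ("E") pulls in the first image of the
# second repeat ("A"), so every batch of the second repeat is shifted by one.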
My experiment somehow throws off the LR scheduler, and the sudden rise at the end ruins some training runs. I could butcher the code and pin the last 2% of training to min_lr, but that is not a proper solution, so I'm posting here.
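For reference, the kind of clamp I mean would look roughly like this, written here as a plain PyTorch LambdaLR around the same simplified multiplier (an illustration with made-up numbers, not a patch for OneTrainer's scheduler code):

import math

import torch
from torch.optim.lr_scheduler import LambdaLR

def clamped_cosine_hard_restarts(optimizer, total_steps, num_cycles=4,
                                 min_factor=0.0, tail_fraction=0.02):
    # Cosine with hard restarts, but the last tail_fraction of the planned
    # steps (and anything beyond them) is pinned to min_factor.
    def lr_lambda(step):
        if step >= total_steps * (1.0 - tail_fraction):
            return min_factor  # hold min LR through the tail, even on overshoot
        progress = step / max(1, total_steps)
        return max(min_factor,
                   0.5 * (1.0 + math.cos(math.pi * ((num_cycles * progress) % 1.0))))
    return LambdaLR(optimizer, lr_lambda)

# Usage sketch:
params = [torch.nn.Parameter(torch.zeros(1))]
opt = torch.optim.AdamW(params, lr=2.2e-4)
sched = clamped_cosine_hard_restarts(opt, total_steps=644)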
What did you expect would happen?
Described above: the LR should stay at its minimum after the fourth cycle instead of rising back to the peak in the last steps.
Relevant log output
No response
Output of pip freeze
No response