Finetuned Model
Hi, I have folders like epoch_6_whole that contain .pt files, but I can't load them. How can I use my fine-tuned LLM model?
I got this error:
RuntimeError: Error(s) in loading state_dict for TransformerLM:
    Missing key(s) in state_dict: "text_embedding.weight", "text_encoder_affine_layer.weight", "text_encoder_affine_layer.bias", "llm_embedding.weight", "llm_decoder.weight", "llm_decoder.bias", "speech_embedding.weight", "spk_embed_affine_layer.weight", "spk_embed_affine_layer.bias".
    Unexpected key(s) in state_dict: "module", "buffer_names", "optimizer", "param_shapes", "frozen_param_shapes", "shared_params", "frozen_param_fragments", "lr_scheduler", "data_sampler", "random_ltd", "sparse_tensor_module_names", "skipped_steps", "global_steps", "global_samples", "dp_world_size", "mp_world_size", "ds_config", "ds_version", "optim", "optim_conf", "scheduler", "scheduler_conf", "max_epoch", "grad_clip", "accum_grad", "log_interval", "save_per_step", "train_engine", "model", "config", "train_data", "cv_data", "checkpoint", "model_dir", "tensorboard_dir", "dist_backend", "num_workers", "prefetch", "pin_memory", "save_states", "timeout", "deepspeed", "deepspeed_config", "deepscale", "deepscale_config", "dtype", "save_time", "tag", "step", "epoch", "batch_idx", "loss_dict", "is_gradient_accumulation_boundary", "lr", "grad_norm".
I've never seen this bug; try setting strict=False in llm.load_state_dict.
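For what it's worth, a minimal sketch of that suggestion (llm stands for the model already built from cosyvoice.yaml, and the checkpoint path is just an example of the DeepSpeed layout mentioned later in this thread; strict=False skips mismatched keys instead of raising, and returns what it skipped):

import torch

state = torch.load("./epoch_6_whole/mp_rank_00_model_states.pt", map_location="cpu")
# strict=False does not raise on missing/unexpected keys; it returns them
# so you can inspect what was actually skipped.
missing, unexpected = llm.load_state_dict(state, strict=False)
print("missing:", len(missing), "unexpected:", len(unexpected))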
I added the following lines to cosyvoice.yaml, but the training doesn't resume from the specified checkpoint. Instead, it starts from epoch 0, step 0.
current_epoch: 1  # added
current_step: 30000  # added
Do batch_size and gradient_accumulation_steps need to be set to one? Increasing batch_size causes an error in train_utils.py (line 85), while gradient_accumulation_steps can be increased without issue. Does having a higher batch_size * gradient_accumulation_steps improve accuracy and affect the learning rate?
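(As a general note, not specific to CosyVoice: the number of samples that contribute to each optimizer update is roughly batch_size * accum_grad * number of GPUs, so raising either factor mostly trades memory for a larger effective batch; whether the learning rate should be scaled along with it depends on the scheduler. A tiny sketch of the arithmetic, with made-up numbers:)

# Hypothetical values, for illustration only.
batch_size = 2       # per-GPU micro-batch
accum_grad = 8       # gradient accumulation steps
world_size = 4       # number of GPUs
effective_batch = batch_size * accum_grad * world_size
print(effective_batch)   # 64 samples per optimizer step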
TEST 1
from cosyvoice.cli.cosyvoice import CosyVoice
from cosyvoice.utils.file_utils import load_wav
import torchaudio
cosyvoice = CosyVoice('./pretrained_models/CosyVoice-300M-TR')
import torch
from cosyvoice.utils.train_utils import (
    init_distributed, init_dataset_and_dataloader,
    init_optimizer_and_scheduler, init_summarywriter, save_model,
    wrap_cuda_model, check_modify_and_save_config)
from hyperpyyaml import load_hyperpyyaml
config_file = "./pretrained_models/CosyVoice-300M-TR/cosyvoice.yaml"
override_dict = {k: None for k in ['llm', 'flow', 'hift'] if k != "llm"}
with open(config_file, 'r') as f:
    configs = load_hyperpyyaml(f, overrides=override_dict)
#configs['train_conf'].update(vars(args))
print(configs)
# load checkpoint
model = configs["llm"]
model.load_state_dict(torch.load("./pretrained_models/CosyVoice-300M-TR/epoch_1_whole/mp_rank_00_model_states.pt", map_location='cpu')["module"])
cosyvoice.model.llm = model.to("cuda").half()
If I modify the LLM model in this way, there is no problem, and I get a good Turkish voice output.
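If that works, one option (just a sketch, not part of the official pipeline) is to dump the extracted "module" weights back out as a plain state_dict, so the standard loader can pick them up like the stock llm.pt:

import torch

ckpt = torch.load("./pretrained_models/CosyVoice-300M-TR/epoch_1_whole/mp_rank_00_model_states.pt", map_location="cpu")
# Keep only the model weights and drop the DeepSpeed training state
# (back up the original llm.pt before overwriting it).
torch.save(ckpt["module"], "./pretrained_models/CosyVoice-300M-TR/llm.pt")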
TEST 2
from cosyvoice.cli.cosyvoice import CosyVoice
from cosyvoice.utils.file_utils import load_wav
import torchaudio
cosyvoice = CosyVoice('./pretrained_models/CosyVoice-300M-TR')
def load(self, llm_model, flow_model, hift_model):
    self.llm.load_state_dict(torch.load(llm_model, map_location=self.device), strict=False)
    self.llm.to(self.device).eval()
    self.llm.half()
    self.flow.load_state_dict(torch.load(flow_model, map_location=self.device), strict=False)
    self.flow.to(self.device).eval()
    self.hift.load_state_dict(torch.load(hift_model, map_location=self.device), strict=False)
    self.hift.to(self.device).eval()
cosyvoice.model.load(
    "./pretrained_models/CosyVoice-300M-TR/epoch_1_whole/mp_rank_00_model_states.pt",
    "./pretrained_models/CosyVoice-300M-TR/flow.pt",
    "./pretrained_models/CosyVoice-300M-TR/hift.pt",
)
However, if I do this, I get incorrect voice output. It seems there are significant mismatches in the key names, and with strict=False the model doesn't actually load most of the trained weights.
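One way to see how little actually matched (a diagnostic sketch, reusing the cosyvoice object and checkpoint path from above):

import torch

state = torch.load("./pretrained_models/CosyVoice-300M-TR/epoch_1_whole/mp_rank_00_model_states.pt", map_location="cpu")
model_keys = set(cosyvoice.model.llm.state_dict().keys())
ckpt_keys = set(state.keys())
# With the raw DeepSpeed file almost nothing overlaps, because the actual
# weights sit one level down under state["module"].
print("matching keys:", len(model_keys & ckpt_keys), "of", len(model_keys))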
TEST 3
from cosyvoice.cli.cosyvoice import CosyVoice
from cosyvoice.utils.file_utils import load_wav
import torchaudio
cosyvoice = CosyVoice('./pretrained_models/CosyVoice-300M-TR')
def load(self, llm_model, flow_model, hift_model):
    self.llm.load_state_dict(torch.load(llm_model, map_location=self.device), strict=False)
    self.llm.to(self.device).eval()
    self.llm.half()
    self.flow.load_state_dict(torch.load(flow_model, map_location=self.device))
    self.flow.to(self.device).eval()
    self.hift.load_state_dict(torch.load(hift_model, map_location=self.device))
    self.hift.to(self.device).eval()
cosyvoice.model.load(
    "./pretrained_models/CosyVoice-300M-TR/epoch_1_whole/mp_rank_00_model_states.pt",
    "./pretrained_models/CosyVoice-300M-TR/flow.pt",
    "./pretrained_models/CosyVoice-300M-TR/hift.pt",
)
If I apply strict=False only to the LLM model, it still throws an error when loading the untouched flow.pt and hift.pt models.
RuntimeError: Error(s) in loading state_dict for MaskedDiffWithXvec:
    Unexpected key(s) in state_dict: "encoder.encoders.0.self_attn.pos_bias_u", ... (and many more)
@aluminumbox
Hi @osmankrblt, I'm guessing the model-parameter mismatch you're experiencing is because the .pt file saved after distributed training hasn't been aggregated.
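If the checkpoint really is a sharded DeepSpeed ZeRO checkpoint, DeepSpeed has a helper for consolidating it into a single fp32 state_dict. This is only a sketch: it assumes the usual DeepSpeed folder layout, with epoch_1_whole as the tag directory and the zero_* optimizer-state files present, and llm is the model built from cosyvoice.yaml as in TEST 1. DeepSpeed also usually drops a zero_to_fp32.py script next to its checkpoints that does the same thing from the command line.

from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint

# checkpoint_dir is the parent folder that contains the tag directory (epoch_1_whole here).
state_dict = get_fp32_state_dict_from_zero_checkpoint(
    "./pretrained_models/CosyVoice-300M-TR", tag="epoch_1_whole")
llm.load_state_dict(state_dict)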
Hi @SongYao2, how can I fix that? I can't resume training because of this.
Are the training scripts out of date? I see that examples/libritts/cosyvoice/* haven't changed in a long time.
You can use run.sh to train the llm/flow of CosyVoice.