TTS
TTS copied to clipboard
[Bug] Error training VITS with multiple GPUs
Describe the bug
Running train_vits_tts.py
with multiple GPUs fails with the following exception:
Traceback (most recent call last):
File "/home/ysmu/miniconda3/envs/yourtts/lib/python3.10/site-packages/trainer/trainer.py", line 1533, in fit
self._fit()
File "/home/ysmu/miniconda3/envs/yourtts/lib/python3.10/site-packages/trainer/trainer.py", line 1517, in _fit
self.train_epoch()
File "/home/ysmu/miniconda3/envs/yourtts/lib/python3.10/site-packages/trainer/trainer.py", line 1281, in train_epoch
for cur_step, batch in enumerate(self.train_loader):
File "/home/ysmu/miniconda3/envs/yourtts/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 681, in __next__
data = self._next_data()
File "/home/ysmu/miniconda3/envs/yourtts/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1376, in _next_data
return self._process_data(data)
File "/home/ysmu/miniconda3/envs/yourtts/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1402, in _process_data
data.reraise()
File "/home/ysmu/miniconda3/envs/yourtts/lib/python3.10/site-packages/torch/_utils.py", line 461, in reraise
raise exception
TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "/home/ysmu/miniconda3/envs/yourtts/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop
data = fetcher.fetch(index)
File "/home/ysmu/miniconda3/envs/yourtts/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 49, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
TypeError: 'int' object is not iterable
I believe the issue lies in https://github.com/coqui-ai/TTS/blob/0a112f78412310b09fb7ee788a05f7a0016b4bbd/TTS/tts/models/vits.py#L1570
DistributedSampler
is not a batch sampler, but was erroneously passed as the batch_sampler
to DataLoader
.
https://github.com/coqui-ai/TTS/blob/0a112f78412310b09fb7ee788a05f7a0016b4bbd/TTS/tts/models/vits.py#L1626-L1632
The issue goes away when I update the call to
# A DistributedSampler yields individual sample indices, so it has to be passed
# as `sampler=` together with an explicit `batch_size`; handing it to
# `batch_sampler=` makes the DataLoader try to iterate over a single int.
# NOTE(review): get_sampler may also return a BatchSampler, in which case it
# would still need to go through `batch_sampler=` -- confirm before merging.
loader = DataLoader(
    dataset,
    sampler=sampler,
    batch_size=config.eval_batch_size if is_eval else config.batch_size,
    collate_fn=dataset.collate_fn,
    num_workers=config.num_eval_loader_workers if is_eval else config.num_loader_workers,
    pin_memory=False,
)
To Reproduce
Run python -m trainer.distribute --gpus "0,1" --script train_vits_tts.py
Expected behavior
No response
Logs
No response
Environment
{
"CUDA": {
"GPU": [
"Tesla V100-PCIE-16GB",
"Tesla V100-PCIE-16GB",
"Tesla V100-PCIE-16GB",
"Tesla V100-PCIE-16GB"
],
"available": true,
"version": "11.6"
},
"Packages": {
"PyTorch_debug": false,
"PyTorch_version": "1.12.1",
"TTS": "0.8.0",
"numpy": "1.22.3"
},
"System": {
"OS": "Linux",
"architecture": [
"64bit",
"ELF"
],
"processor": "x86_64",
"python": "3.10.4",
"version": "#20~20.04.1-Ubuntu SMP Fri Aug 5 12:16:53 UTC 2022"
}
}
Additional context
Related to https://github.com/coqui-ai/Trainer/issues/68
Hey @ysmu, I am currently encountering the same problem. If I find out anything I will let you know.
We don't use multi-gpu and don't have a setup for it. Someone from the community needs to take on this issue.
Hey @ysmu, could you share your vits.py
? I have tried your changes but I still run into the same error.
That's the only change I made. I am working off the dev branch and not the pypi package if that makes a difference. Also do note that my change is not the proper fix because get_sampler
can return a BatchSampler
https://github.com/coqui-ai/TTS/blob/0a112f78412310b09fb7ee788a05f7a0016b4bbd/TTS/tts/models/vits.py#L1559-L1564
@GerrySant FYI, the changes @ysmu suggested (replacing batch_sampler with sampler plus batch_size) worked for me.
I am trying to run this script:
import os
from trainer import Trainer, TrainerArgs
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.vits import Vits, VitsAudioConfig
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
# Training recipe: VITS on the Spanish Common Voice corpus.
output_path = os.path.dirname("/home/luis/Documentos/VCSpanish/")

# Dataset location and metadata files; train.tsv is used for training and
# test.tsv for evaluation.
dataset_config = BaseDatasetConfig(
    name="common_voice",
    meta_file_train="train.tsv",
    meta_file_val="test.tsv",
    path=os.path.join(output_path, "cv-corpus-10.0-2022-07-04/es/"),
)

audio_config = VitsAudioConfig(
    sample_rate=16000, win_length=1024, hop_length=256, num_mels=80, mel_fmin=0, mel_fmax=None
)

config = VitsConfig(
    audio=audio_config,
    run_name="vits_common_voice",
    batch_size=8,
    eval_batch_size=4,
    batch_group_size=0,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1000,
    text_cleaner="spanish_cleaners",
    use_phonemes=False,
    compute_input_seq_cache=True,
    print_step=25,
    print_eval=True,
    mixed_precision=True,
    output_path=output_path,
    datasets=[dataset_config],
    cudnn_benchmark=True,
)

ap = AudioProcessor.init_from_config(config)
tokenizer, config = TTSTokenizer.init_from_config(config)

# eval_split must be True here: with eval_split=False load_tts_samples returns
# None for the evaluation samples, and because run_eval=True the trainer then
# fails at the first eval epoch with
# "TypeError: object of type 'NoneType' has no len()".
# NOTE(review): since meta_file_val is set, the eval samples are expected to be
# read from test.tsv rather than split off the training set -- confirm against
# the installed TTS version.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
)

model = Vits(config, ap, tokenizer, speaker_manager=None)

trainer = Trainer(
    TrainerArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)
trainer.fit()
I did the same replacement, and everything seemed to work until it almost "finished training". I got this error:
--> STEP: 13600/13611 -- GLOBAL_STEP: 13600
| > loss_disc: 2.46658 (2.52236)
| > loss_disc_real_0: 0.10858 (0.13588)
| > loss_disc_real_1: 0.19376 (0.24477)
| > loss_disc_real_2: 0.20712 (0.24531)
| > loss_disc_real_3: 0.26703 (0.24557)
| > loss_disc_real_4: 0.25831 (0.23901)
| > loss_disc_real_5: 0.22525 (0.23405)
| > loss_0: 2.46658 (2.52236)
| > grad_norm_0: 123.99548 (41.43769)
| > loss_gen: 2.24558 (2.26243)
| > loss_kl: 0.92354 (1.16580)
| > loss_feat: 3.90392 (3.88051)
| > loss_mel: 23.78493 (26.65310)
| > loss_duration: 2.69822 (2.72713)
| > amp_scaler: 64.00000 (115.10118)
| > loss_1: 33.55618 (36.68887)
| > grad_norm_1: 601.39746 (439.01791)
| > current_lr_0: 0.00020
| > current_lr_1: 0.00020
| > step_time: 0.40920 (0.38077)
| > loader_time: 0.00510 (0.00555)
['<BLNK>', 'w', '<BLNK>', 'a', '<BLNK>', 'ł', '<BLNK>', 'ę', '<BLNK>', 's', '<BLNK>', 'a', '<BLNK>', ' ', '<BLNK>', 'f', '<BLNK>', 'u', '<BLNK>', 'e', '<BLNK>', ' ', '<BLNK>', 'e', '<BLNK>', 'l', '<BLNK>', ' ', '<BLNK>', 'l', '<BLNK>', 'í', '<BLNK>', 'd', '<BLNK>', 'e', '<BLNK>', 'r', '<BLNK>', ' ', '<BLNK>', 'i', '<BLNK>', 'n', '<BLNK>', 'f', '<BLNK>', 'o', '<BLNK>', 'r', '<BLNK>', 'm', '<BLNK>', 'a', '<BLNK>', 'l', '<BLNK>', ' ', '<BLNK>', 'd', '<BLNK>', 'e', '<BLNK>', 'l', '<BLNK>', ' ', '<BLNK>', 'l', '<BLNK>', 'a', '<BLNK>', 'd', '<BLNK>', 'o', '<BLNK>', ' ', '<BLNK>', 'n', '<BLNK>', 'o', '<BLNK>', ' ', '<BLNK>', 'g', '<BLNK>', 'u', '<BLNK>', 'b', '<BLNK>', 'e', '<BLNK>', 'r', '<BLNK>', 'n', '<BLNK>', 'a', '<BLNK>', 'm', '<BLNK>', 'e', '<BLNK>', 'n', '<BLNK>', 't', '<BLNK>', 'a', '<BLNK>', 'l', '<BLNK>', ' ', '<BLNK>', 'd', '<BLNK>', 'u', '<BLNK>', 'r', '<BLNK>', 'a', '<BLNK>', 'n', '<BLNK>', 't', '<BLNK>', 'e', '<BLNK>', ' ', '<BLNK>', 'e', '<BLNK>', 's', '<BLNK>', 't', '<BLNK>', 'a', '<BLNK>', 's', '<BLNK>', ' ', '<BLNK>', 'c', '<BLNK>', 'o', '<BLNK>', 'n', '<BLNK>', 'v', '<BLNK>', 'e', '<BLNK>', 'r', '<BLNK>', 's', '<BLNK>', 'a', '<BLNK>', 'c', '<BLNK>', 'i', '<BLNK>', 'o', '<BLNK>', 'n', '<BLNK>', 'e', '<BLNK>', 's', '<BLNK>', '.', '<BLNK>']
[!] Character 'ę' not found in the vocabulary. Discarding it.
['<BLNK>', 'a', '<BLNK>', 'r', '<BLNK>', 'n', '<BLNK>', 'ó', '<BLNK>', 'r', '<BLNK>', ' ', '<BLNK>', 'e', '<BLNK>', 'r', '<BLNK>', 'a', '<BLNK>', ' ', '<BLNK>', 'u', '<BLNK>', 'n', '<BLNK>', 'o', '<BLNK>', ' ', '<BLNK>', 'd', '<BLNK>', 'e', '<BLNK>', ' ', '<BLNK>', 'l', '<BLNK>', 'o', '<BLNK>', 's', '<BLNK>', ' ', '<BLNK>', 'h', '<BLNK>', 'i', '<BLNK>', 'j', '<BLNK>', 'o', '<BLNK>', 's', '<BLNK>', ' ', '<BLNK>', 'd', '<BLNK>', 'e', '<BLNK>', 'l', '<BLNK>', ' ', '<BLNK>', 'c', '<BLNK>', 'a', '<BLNK>', 'u', '<BLNK>', 'd', '<BLNK>', 'i', '<BLNK>', 'l', '<BLNK>', 'l', '<BLNK>', 'o', '<BLNK>', ' ', '<BLNK>', 'd', '<BLNK>', 'i', '<BLNK>', 'g', '<BLNK>', 'u', '<BLNK>', 'r', '<BLNK>', ' ', '<BLNK>', 'h', '<BLNK>', 'e', '<BLNK>', 'l', '<BLNK>', 'g', '<BLNK>', 'a', '<BLNK>', ' ', '<BLNK>', 'þ', '<BLNK>', 'o', '<BLNK>', 'r', '<BLNK>', 's', '<BLNK>', 't', '<BLNK>', 'e', '<BLNK>', 'i', '<BLNK>', 'n', '<BLNK>', 's', '<BLNK>', 's', '<BLNK>', 'o', '<BLNK>', 'n', '<BLNK>', '.', '<BLNK>']
[!] Character 'þ' not found in the vocabulary. Discarding it.
> DataLoader initialization
| > Tokenizer:
| > add_blank: True
| > use_eos_bos: False
| > use_phonemes: False
Traceback (most recent call last):
File "/home/luis/.local/lib/python3.10/site-packages/trainer/trainer.py", line 1533, in fit
self._fit()
File "/home/luis/.local/lib/python3.10/site-packages/trainer/trainer.py", line 1519, in _fit
self.eval_epoch()
File "/home/luis/.local/lib/python3.10/site-packages/trainer/trainer.py", line 1377, in eval_epoch
self.get_eval_dataloader(
File "/home/luis/.local/lib/python3.10/site-packages/trainer/trainer.py", line 828, in get_eval_dataloader
return self._get_loader(
File "/home/luis/.local/lib/python3.10/site-packages/trainer/trainer.py", line 740, in _get_loader
loader = model.module.get_data_loader(
File "/home/luis/Documentos/VCSpanish/TTS/TTS/tts/models/vits.py", line 1592, in get_data_loader
dataset = VitsDataset(
File "/home/luis/Documentos/VCSpanish/TTS/TTS/tts/models/vits.py", line 255, in __init__
super().__init__(*args, **kwargs)
File "/home/luis/Documentos/VCSpanish/TTS/TTS/tts/datasets/dataset.py", line 153, in __init__
self.print_logs()
File "/home/luis/Documentos/VCSpanish/TTS/TTS/tts/datasets/dataset.py", line 188, in print_logs
print(f"{indent}| > Number of instances : {len(self.samples)}")
TypeError: object of type 'NoneType' has no len()
Also, in the path where the run was saved, it appears that my customized characters and punctuations were not used. I don't know why it still uses the last default configuration even though I overwrote them... If someone can give me a hand, the full code is here: Common_Voice_Vits
Hi @slegroux, thank you for your comment.
Yes, I made the changes and have been working with them for 2 weeks.
At first it didn't work for me because I was running the training with an environment that pointed to another clone of the repository so the changes didn't apply.😅
These last few days I have been focusing on this other issue #1964
I verified that the solution suggested by @ysmu works great. Just filed a PR for this issue: https://github.com/coqui-ai/TTS/pull/2077. Hope that it can be included in the code so that people don't need to patch things manually.
Will this fix my TypeError too? Do you have any idea?
I updated my comments above and my PR. I don't see any TypeError
in my runs. Hope it fixes your issue. If not, more than happy to debug together.
That's because you are running with the original script. I am trying to train on the Common Voice (Spanish) dataset, so I made some modifications for it, but I am new to this repo, so I don't really know if my modifications are fine. I first hit this bug about a month ago, but I was not sure whether it was a real bug or my lack of expertise with the repo. Fortunately it was not me: I used @ysmu's correction and it finally trained, but when it was about to end, this new TypeError appeared. All the scripts that I changed are here: Common_Voice_Vits, and the Common Voice dataset I am using is here: https://commonvoice.mozilla.org/es/datasets. Maybe you can help me get this to work?
Hi @AlexSteveChungAlvarez, could you maybe solve this bug? I am having the same trouble.
This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs. Thank you for your contributions. You might also look at our discussion channels.
@AlexSteveChungAlvarez have you solved the issue? I have the same problem. Can you please advise me on what can be done to solve this problem?
Here is my code and error logs:
# Expose only GPU index 1 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"


def main():
    """Train an AlignTTS model on LJSpeech-style data with separate train and validation folders."""
    # BaseDatasetConfig defines the name, formatter and path of the dataset.
    output_path = "/home/elias/normalized_training/align-tts-female/align/TTS/normalized_checkpoints_bn_female"
    dataset_config = BaseDatasetConfig(
        formatter="ljspeech",
        meta_file_train="/home/elias/audio_normalized/train_female/metadata.txt",
        path="/home/elias/audio_normalized/train_female",
    )
    # Separate dataset config for the validation data. The metadata file is
    # given as meta_file_train because the samples are loaded below with
    # eval_split=False, which only reads meta_file_train; a meta_file_val entry
    # would be ignored on that path.
    dataset_config_val = BaseDatasetConfig(
        formatter="ljspeech",
        meta_file_train="/home/elias/audio_normalized/validation_female/metadata.txt",
        path="/home/elias/audio_normalized/validation_female",
    )

    my_valid_lis = ["-", "অ", "আ", "ই", "ঈ", "উ", "ঊ", "ঋ", "এ", "ঐ", "ও", "ঔ", "ক", "খ", "গ", "ঘ", "ঙ", "চ", "ছ", "ঝ", "জ", "ট", "ঠ", "ড", "ঢ", "ণ", "ত", "থ", "দ", "ধ", "ন", "প", "ফ", "ব", "ভ", "ম",
                    "য", "য়", "র", "ল", "শ", "ষ", "স", "হ", "ঞ", "ড়", "ঢ়", "ৎ", "ি", "া", "ূ", "ু", "ৃ", "ৈ", "ঃ", "ো", "ৌ",
                    "ী", "্", "়", "\u200c", "ং", "ে", "ঁ", "\u200d", ".", ",", ";", ":", "?", "!", "।", "—", "…", "#", "১", "২",
                    "৩", "৪", "৫", "৬", "৭", "৮", "৯", "০", "য়", "ড়", "ঢ়", "৷", "্য", "্ব", "্র", "্ম", "্ণ", "্ল", "্ষ", "্স",
                    "্হ", "্জ", "্ঞ", "্দ", "্ধ", "ঽ", "৳", " ", "‘", "’", "“", "”", "(", ")", "ী", "্", "\u200c", "়", "ং",
                    "ে", "ঁ", "\u200d", "ড়", "ঢ়", "৷", "'", "'", "\u200b"]

    # All special tokens are left empty; the vocabulary is exactly the list above.
    characters_config = CharactersConfig(
        pad="",
        eos="",
        bos="",
        blank="",
        phonemes=None,
        characters=my_valid_lis,
        punctuations="",
    )

    audio_config = BaseAudioConfig(
        sample_rate=16000,
        resample=True,
    )

    config = AlignTTSConfig(
        batch_size=8,
        eval_batch_size=8,
        num_loader_workers=8,
        num_eval_loader_workers=8,
        run_eval=True,
        test_delay_epochs=-1,
        epochs=5000,
        text_cleaner="collapse_whitespace",
        use_phonemes=False,
        # phoneme_language="bn",
        phoneme_cache_path=os.path.join(output_path, "grapheme_cache"),
        print_step=25,
        print_eval=False,
        mixed_precision=True,
        output_path=output_path,
        datasets=[dataset_config],
        save_step=1000,
        audio=audio_config,
        characters=characters_config,
        cudnn_benchmark=True,
        # test_sentences = [
        #     "পিপলস ইন্স্যুরেন্স অব চায়না ছেষট্টি বছর আগে ব্যবসা চালু করে।"
        # ],
    )

    ap = AudioProcessor.init_from_config(config)
    tokenizer, config = TTSTokenizer.init_from_config(config)

    def formatter(root_path, meta_file, **kwargs):  # pylint: disable=unused-argument
        """Normalizes the LJSpeech meta data file to TTS format
        https://keithito.com/LJ-Speech-Dataset/

        Each line is expected to look like ``<file id>|<text>``; the audio is
        looked up at ``<root_path>/wav/<file id>.wav``. Returns a list of
        sample dicts with "text", "audio_file", "speaker_name" and "root_path".
        """
        speaker_name = "ljspeech"
        items = []
        with open(meta_file, "r", encoding="utf-8") as ttf:
            for line in ttf:
                cols = line.split("|")
                if len(cols) < 2:
                    # Line has no text column. Skip it instead of appending a
                    # sample with an unbound (or stale, from the previous
                    # line) text, which the former bare `except:` allowed.
                    print("not found")
                    continue
                wav_file = os.path.join(root_path, "wav", cols[0] + ".wav")
                items.append(
                    {
                        "text": cols[1],
                        "audio_file": wav_file,
                        "speaker_name": speaker_name,
                        "root_path": root_path,
                    }
                )
        return items

    # load_tts_samples returns a (train_samples, eval_samples) tuple; with
    # eval_split=False the second element is None, so keep only the first.
    train_samples, _ = load_tts_samples(
        dataset_config,
        eval_split=False,
        formatter=formatter,
    )
    eval_samples, _ = load_tts_samples(
        dataset_config_val,
        eval_split=False,
        formatter=formatter,
    )

    model = AlignTTS(config, ap, tokenizer)
    trainer = Trainer(
        TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
    )
    trainer.fit()


if __name__ == "__main__":
    main()
Traceback (most recent call last):
File "/home/elias/miniconda3/envs/elias-dev/lib/python3.10/site-packages/trainer/trainer.py", line 1591, in fit
self._fit()
File "/home/elias/miniconda3/envs/elias-dev/lib/python3.10/site-packages/trainer/trainer.py", line 1546, in _fit
self.eval_epoch()
File "/home/elias/miniconda3/envs/elias-dev/lib/python3.10/site-packages/trainer/trainer.py", line 1404, in eval_epoch
self.get_eval_dataloader(
File "/home/elias/miniconda3/envs/elias-dev/lib/python3.10/site-packages/trainer/trainer.py", line 844, in get_eval_dataloader
return self._get_loader(
File "/home/elias/miniconda3/envs/elias-dev/lib/python3.10/site-packages/trainer/trainer.py", line 767, in _get_loader
loader = model.get_data_loader(
File "/home/elias/normalized_training/align-tts-female/align/TTS/TTS/tts/models/base_tts.py", line 311, in get_data_loader
dataset = TTSDataset(
File "/home/elias/normalized_training/align-tts-female/align/TTS/TTS/tts/datasets/dataset.py", line 172, in __init__
self.print_logs()
File "/home/elias/normalized_training/align-tts-female/align/TTS/TTS/tts/datasets/dataset.py", line 207, in print_logs
print(f"{indent}| > Number of instances : {len(self.samples)}")
TypeError: object of type 'NoneType' has no len()