YourTTS
The timbre of the voice conversion output is very different from the target speaker's voice. Is there any direction that can be optimized?
Here is my training code:
import os
from trainer import Trainer, TrainerArgs
from TTS.config.shared_configs import BaseAudioConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.vits import Vits, CharactersConfig, VitsArgs
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.tts.utils.speakers import SpeakerManager
output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(
    name="aishell", path="/datasets/data_aishell3", language="zh-cn"
)
audio_config = BaseAudioConfig(
    sample_rate=22050,
    win_length=1024,
    hop_length=256,
    num_mels=80,
    preemphasis=0.0,
    ref_level_db=20,
    log_func="np.log",
    do_trim_silence=True,
    trim_db=45,
    mel_fmin=0,
    mel_fmax=None,
    spec_gain=1.0,
    signal_norm=False,
    do_amp_to_db_linear=False,
)
vitsArgs = VitsArgs(
    use_speaker_embedding=True,
    use_sdp=False,
    use_speaker_encoder_as_loss=True,
    speaker_encoder_config_path="/TTS/models/tts_models--multilingual--multi-dataset--your_tts/config_se.json",
    speaker_encoder_model_path="/TTS/models/tts_models--multilingual--multi-dataset--your_tts/model_se.pth",
    speaker_embedding_channels=512,
)
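# Note: use_speaker_encoder_as_loss=True enables the YourTTS speaker consistency
# loss (SCL), which pulls the synthesized voice towards the target speaker's
# d-vector; if the converted timbre stays off, the SCL weight may be worth tuning
# (assumption: this TTS version exposes it as speaker_encoder_loss_alpha on VitsConfig).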
config = VitsConfig(
    model_args=vitsArgs,
    audio=audio_config,
    run_name="vits_aishell",
    batch_size=100,
    eval_batch_size=50,
    batch_group_size=15,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1000,
    text_cleaner="chinese_mandarin_cleaners",
    use_phonemes=True,
    phoneme_language="zh-cn",
    phonemizer="zh_cn_phonemizer",
    add_blank=False,
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    compute_input_seq_cache=False,
    print_step=25,
    print_eval=True,
    mixed_precision=True,
    output_path=output_path,
    datasets=[dataset_config],
    characters=CharactersConfig(
        characters_class=None,
        vocab_dict=None,
        pad="_",
        eos="~",
        bos="^",
        blank=None,
        characters="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),.:;? ",
        punctuations="\uff0c\u3002\uff1f\uff01\uff5e\uff1a\uff1b*\u2014\u2014-\uff08\uff09\u3010\u3011!'(),-.:;? “”",
        phonemes="12345giy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b",
        is_unique=False,
        is_sorted=True,
    ),
    test_sentences=[
        ["你在做什么?", "SSB0005", None, "zh-cn"],
        ["篮球场上没有人", "SSB0005", None, "zh-cn"],
        ["今天心情怎么样?", "SSB0112", None, "zh-cn"],
        ["我想去长城上奔跑", "SSB0112", None, "zh-cn"],
        ["永远不要相信那些一直说谎的人,本性难移", "SSB0287", None, "zh-cn"],
        ["相信明天会比今天更好", "SSB0287", None, "zh-cn"],
        ["梦想总是要有的,万一实现了呢", "SSB0415", None, "zh-cn"],
    ],
)
# INITIALIZE THE AUDIO PROCESSOR
# Audio processor is used for feature extraction and audio I/O.
# It mainly serves to the dataloader and the training loggers.
ap = AudioProcessor.init_from_config(config)
# INITIALIZE THE TOKENIZER
# Tokenizer is used to convert text to sequences of token IDs.
# config is updated with the default characters if not defined in the config.
tokenizer, config = TTSTokenizer.init_from_config(config)
# LOAD DATA SAMPLES
# Each sample is a list of ```[text, audio_file_path, speaker_name]```
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)
speaker_manager = SpeakerManager()
speaker_manager.use_cuda = True
speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name")
config.model_args.num_speakers = speaker_manager.num_speakers
# init model
model = Vits(config, ap, tokenizer, speaker_manager=speaker_manager)
# init the trainer and start training
trainer = Trainer(
    TrainerArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)
trainer.fit()
For voice conversion, I run:

tts --model_path checkpoint.pth --config_path config.json --speaker_idx "SSB0005" --out_path output.wav --reference_wav 000011.wav
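To quantify the mismatch, I compare the d-vector of the converted output against a real clip of the target speaker, using the same encoder that drives the speaker consistency loss. A minimal sketch, assuming this TTS version exposes SpeakerManager.compute_embedding_from_clip, with a hypothetical reference file name:

import numpy as np
from TTS.tts.utils.speakers import SpeakerManager

# load the same speaker encoder used for the SCL during training
encoder = SpeakerManager(
    encoder_model_path="/TTS/models/tts_models--multilingual--multi-dataset--your_tts/model_se.pth",
    encoder_config_path="/TTS/models/tts_models--multilingual--multi-dataset--your_tts/config_se.json",
    use_cuda=True,
)

# d-vectors for a ground-truth target-speaker clip and the converted output
# ("SSB0005_ref.wav" is a hypothetical placeholder for a real SSB0005 recording)
target_emb = np.asarray(encoder.compute_embedding_from_clip("SSB0005_ref.wav"))
output_emb = np.asarray(encoder.compute_embedding_from_clip("output.wav"))

# cosine similarity: values close to 1.0 mean the converted voice matches the target
cos_sim = float(
    np.dot(target_emb, output_emb)
    / (np.linalg.norm(target_emb) * np.linalg.norm(output_emb))
)
print(f"speaker cosine similarity: {cos_sim:.3f}")

A low score here would suggest the problem is in the speaker conditioning rather than elsewhere in the pipeline.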
@Edresson