espnet_onnx
espnet_onnx copied to clipboard
The inference results of espnet_onnx are inconsistent with espnet.
After converting the FastSpeech2 model with espnet_onnx, the audio generated by the model is distorted.
Model used: kan-bayashi/jsut_fastspeech2
Download method:
python
from espnet_model_zoo.downloader import ModelDownloader

# Argument handed to ModelDownloader (presumably the local cache directory —
# confirm against the espnet_model_zoo documentation).
cache_dir = "~/.cache/espnet"
# Tag of the pretrained model to fetch.
model_tag = "kan-bayashi/jsut_fastspeech2"

d = ModelDownloader(cache_dir)
d.download_and_unpack(model_tag)
Inference with the original ESPnet (PyTorch) model:
import os

import torch
from espnet2.bin.tts_inference import Text2Speech

# Set the paths for the model file and configuration file.
# A leading "~" is not expanded automatically when a path string is opened,
# so expand it explicitly; otherwise loading would most likely fail with a
# file-not-found error.
model_path = os.path.expanduser("~/.cache/espnet/6bcf613d7d73d2ba1ec6508e6b9f1177/exp/tts_train_fastspeech2_raw_phn_jaconv_pyopenjtalk/train.loss.ave_5best.pth")  # Model weights file
config_path = os.path.expanduser("~/.cache/espnet/6bcf613d7d73d2ba1ec6508e6b9f1177/exp/tts_train_fastspeech2_raw_phn_jaconv_pyopenjtalk/config.yaml")  # Configuration file

# Load the model onto the first CUDA device
model = Text2Speech.from_pretrained(model_tag=None, train_config=config_path, model_file=model_path, device='cuda:0')

# Text to be synthesized
text = "私はあなたに好意を持っていますが、あなたが少し控えめなのが感じられます。それが社交的な不安からなのか、それとも私に対する興味がないからなのか、わかりません。もし可能であれば、もう少し積極的になってくれませんか?"

# Perform speech synthesis; gradients are not needed for inference
with torch.no_grad():
    wav = model(text)["wav"]
Convert to ONNX format
import os

import torch
from espnet2.bin.tts_inference import Text2Speech
from espnet_onnx.export import TTSModelExport

# Set the paths for the model file and configuration file.
# A leading "~" is not expanded automatically when a path string is opened,
# so expand it explicitly; otherwise loading would most likely fail with a
# file-not-found error.
model_path = os.path.expanduser("~/.cache/espnet/6bcf613d7d73d2ba1ec6508e6b9f1177/exp/tts_train_fastspeech2_raw_phn_jaconv_pyopenjtalk/train.loss.ave_5best.pth")  # Model weights file
config_path = os.path.expanduser("~/.cache/espnet/6bcf613d7d73d2ba1ec6508e6b9f1177/exp/tts_train_fastspeech2_raw_phn_jaconv_pyopenjtalk/config.yaml")  # Configuration file

# Load the model (no device argument is passed here, unlike the inference snippet)
model = Text2Speech.from_pretrained(model_tag=None, train_config=config_path, model_file=model_path)

# Export the loaded model with espnet_onnx using its default export options
ex = TTSModelExport()
ex.export(model=model)
ONNX inference method
from espnet_onnx import Text2Speech

# Execution providers handed to the ONNX Text2Speech wrapper.
PROVIDERS = ["CUDAExecutionProvider"]

# '${path_to_onnx_model}' is a placeholder for the exported model directory.
text2speech = Text2Speech(
    model_dir="${path_to_onnx_model}",
    providers=PROVIDERS,
)

# Same input text as used for the PyTorch inference.
text = "私はあなたに好意を持っていますが、あなたが少し控えめなのが感じられます。それが社交的な不安からなのか、それとも私に対する興味がないからなのか、わかりません。もし可能であれば、もう少し積極的になってくれませんか?"

# Run synthesis and pull the waveform out of the returned mapping.
output_dict = text2speech(text)
wav = output_dict["wav"]