sherpa-onnx
sherpa-onnx copied to clipboard
Unity crash when using Mimic 3 model with Sherpa-ONNX (Piper & Coqui working fine)
Hi team,
First of all, thanks for your amazing work on Sherpa-ONNX!
I'm currently working on a Unity-based TTS project where I successfully integrated Sherpa-ONNX with VITS models from both Piper and Coqui by manually adding the required ONNX metadata (as described in your documentation and notebooks).
However, when I attempt to use a Mimic 3 VITS model, Unity crashes at runtime during initialization — specifically in the call to SherpaOnnxCreateOfflineTts, which is reached from the Start() method of the C# script below. The same setup works perfectly with Piper and Coqui models.
Metadata Script Used (Python)
import json
import onnx
from typing import Any, Dict
def add_meta_data(input_filename: str, output_filename: str, meta_data: Dict[str, Any]):
    """Replace the metadata of an ONNX model and save the result.

    Loads the model at ``input_filename``, removes every existing
    ``metadata_props`` entry, adds one entry per item of ``meta_data``
    (each value converted with ``str``), and writes the model to
    ``output_filename``.

    Args:
        input_filename: Path of the ONNX model to read.
        output_filename: Path the updated model is written to.
        meta_data: Mapping of metadata keys to values.
    """
    onnx_model = onnx.load(input_filename)
    props = onnx_model.metadata_props

    # Drop whatever metadata the model already carries.
    del props[:]

    # Append the new entries; every value is stored as a string.
    for name in meta_data:
        entry = props.add()
        entry.key = name
        entry.value = str(meta_data[name])

    onnx.save(onnx_model, output_filename)
    print(f"Metadatos añadidos correctamente. Modelo guardado como {output_filename}")
def main():
    """Write a copy of ``model.onnx`` that carries the metadata defined below."""
    source_path = "model.onnx"
    target_path = "model-metadataadded.onnx"

    # Adjust these values to match your model.
    # Note that 'normalize' and 'version' are added here as well.
    vits_meta = {
        "model_type": "vits",
        "comment": "mimic3",
        "language": "Spanish",
        "voice": "espeak",
        "has_espeak": 1,
        "n_speakers": 3,
        "sample_rate": 22050,
        "normalize": "True",  # or "False"
        "version": "1.0",
    }

    add_meta_data(source_path, target_path, vits_meta)


if __name__ == "__main__":
    main()
Unity Script:
using System;
using System.IO;
using System.Text;
using UnityEngine;
using UnityEngine.UI;
using TMPro;
using System.Runtime.InteropServices;
namespace SherpaOnnx
{
    /// <summary>
    /// Unity component that creates a Sherpa-ONNX offline TTS engine for a VITS model
    /// located under StreamingAssets, synthesizes the text typed into an input field
    /// when a button is clicked, and plays the result through an AudioSource.
    /// </summary>
    public class TTSMimic : MonoBehaviour
    {
        [Header("UI")]
        [SerializeField] private TMP_InputField inputField;
        [SerializeField] private Button runButton;
        [SerializeField] private AudioSource audioSource;

        [Header("Modelo Mimic 3")]
        public string modelFolder = "models/Mimic/Spanish"; // direct path to the Spanish folder
        public int speakerId = 0;
        public float noiseScale = 0.667f;
        public float noiseScaleW = 0.8f;
        public float lengthScale = 1.0f;
        public int numThreads = 1;
        public string provider = "cpu";
        public bool debug = false;

        // Native engine handle; IntPtr.Zero means "not initialized".
        private IntPtr ttsHandle = IntPtr.Zero;

        /// <summary>Samples handed from GenerateSpeech to Update for playback.</summary>
        private class PendingAudio
        {
            public float[] samples;
            public int sampleRate;
        }

        private PendingAudio pendingAudio = null;

        // The clip this component created for the last utterance. Tracked separately
        // from audioSource.clip so that only clips created here are ever destroyed.
        private AudioClip generatedClip = null;

        /// <summary>Hooks up the button and creates the TTS engine.</summary>
        private void Start()
        {
            if (runButton != null)
            {
                runButton.onClick.AddListener(GenerateSpeech);
            }

            InitializeTTS();
        }

        /// <summary>
        /// Verifies that the model files exist and creates the native TTS engine.
        /// On failure an error is logged and <c>ttsHandle</c> stays <c>IntPtr.Zero</c>.
        /// </summary>
        private void InitializeTTS()
        {
            string basePath = Path.Combine(Application.streamingAssetsPath, modelFolder);
            // NOTE(review): the metadata script in this report writes
            // "model-metadataadded.onnx"; confirm that the file named "model.onnx"
            // here is the one that actually carries the added metadata.
            string modelPath = Path.Combine(basePath, "model.onnx");
            string tokensPath = Path.Combine(basePath, "tokens.txt");
            string dataDir = Path.Combine(basePath, "espeak-ng-data");

            if (!File.Exists(modelPath) || !File.Exists(tokensPath) || !Directory.Exists(dataDir))
            {
                Debug.LogError("❌ Faltan archivos del modelo Mimic.");
                Debug.LogError($"Buscando en:\n📁 model.onnx: {modelPath}\n📁 tokens.txt: {tokensPath}\n📁 espeak-ng-data: {dataDir}");
                return;
            }

            SherpaOnnx.OfflineTtsConfig config = new SherpaOnnx.OfflineTtsConfig
            {
                Model = new SherpaOnnx.OfflineTtsModelConfig
                {
                    Vits = new SherpaOnnx.OfflineTtsVitsModelConfig
                    {
                        Model = modelPath,
                        Tokens = tokensPath,
                        DataDir = dataDir,
                        Lexicon = "",
                        DictDir = "",
                        NoiseScale = noiseScale,
                        NoiseScaleW = noiseScaleW,
                        LengthScale = lengthScale
                    },
                    NumThreads = numThreads,
                    Debug = debug ? 1 : 0,
                    Provider = provider
                },
                RuleFsts = "",
                RuleFars = "",
                MaxNumSentences = 1
            };

            ttsHandle = Sherpita.SherpaOnnxCreateOfflineTts(ref config);

            if (ttsHandle == IntPtr.Zero)
            {
                Debug.LogError("❌ Error al inicializar Sherpa ONNX con modelo Mimic.");
            }
            else
            {
                Debug.Log("✅ Sherpa ONNX inicializado con modelo Mimic 3.");
            }
        }

        /// <summary>
        /// Synthesizes the current input-field text and queues the samples for playback
        /// in <c>Update</c>. Does nothing when the engine is not initialized, the input
        /// field is not assigned, or the text is empty/whitespace.
        /// </summary>
        private void GenerateSpeech()
        {
            if (ttsHandle == IntPtr.Zero || inputField == null || string.IsNullOrWhiteSpace(inputField.text))
            {
                return;
            }

            // The text is passed to the native call as UTF-8 bytes followed by a 0 byte.
            // A freshly allocated array is zero-filled, so the last byte is already 0.
            byte[] utf8 = Encoding.UTF8.GetBytes(inputField.text);
            byte[] utf8WithNull = new byte[utf8.Length + 1];
            Array.Copy(utf8, utf8WithNull, utf8.Length);

            IntPtr audioPtr = Sherpita.SherpaOnnxOfflineTtsGenerate(ttsHandle, utf8WithNull, speakerId, 1.0f);
            if (audioPtr == IntPtr.Zero)
            {
                Debug.LogError("❌ El puntero de audio está vacío.");
                return;
            }

            SherpaOnnx.OfflineTtsGeneratedAudio audio = new SherpaOnnx.OfflineTtsGeneratedAudio(audioPtr);
            try
            {
                // Read the samples once and reuse the array.
                // NOTE(review): the property presumably copies from native memory on
                // each access — confirm against the wrapper implementation.
                float[] samples = audio.Samples;
                if (samples == null || samples.Length == 0)
                {
                    Debug.LogError("⚠️ Audio generado inválido o vacío.");
                    return;
                }

                int sampleRate = audio.SampleRate;
                Debug.Log($"✅ Audio generado: {samples.Length} muestras a {sampleRate} Hz.");

                pendingAudio = new PendingAudio
                {
                    samples = samples,
                    sampleRate = sampleRate
                };
            }
            finally
            {
                // Dispose on every path, including the early return above.
                audio.Dispose();
            }
        }

        /// <summary>
        /// Plays queued audio, if any. The previously generated clip is destroyed
        /// once the new one has been created successfully.
        /// </summary>
        private void Update()
        {
            if (pendingAudio == null)
            {
                return;
            }

            // Take ownership of the queued audio so it is consumed exactly once,
            // whatever happens below.
            PendingAudio queued = pendingAudio;
            pendingAudio = null;

            if (audioSource == null)
            {
                Debug.LogError("❌ AudioSource no asignado.");
                return;
            }

            AudioClip clip = null;
            try
            {
                clip = AudioClip.Create("MimicTTS", queued.samples.Length, 1, queued.sampleRate, false);
                clip.SetData(queued.samples, 0);

                // Runtime-created clips are not released automatically; free the
                // previous one before replacing it.
                ReleaseGeneratedClip();
                generatedClip = clip;

                audioSource.clip = clip;
                audioSource.Play();
                Debug.Log("✅ AudioClip reproducido desde Update().");
            }
            catch (Exception ex)
            {
                Debug.LogError($"❌ Error al crear/reproducir AudioClip: {ex.Message}");

                // If the failure happened before the clip was adopted, do not leak it.
                if (clip != null && clip != generatedClip)
                {
                    Destroy(clip);
                }
            }
        }

        /// <summary>Destroys the clip created by this component, if there is one.</summary>
        private void ReleaseGeneratedClip()
        {
            if (generatedClip == null)
            {
                return;
            }

            // Detach the clip from the source before destroying it.
            if (audioSource != null && audioSource.clip == generatedClip)
            {
                audioSource.Stop();
                audioSource.clip = null;
            }

            Destroy(generatedClip);
            generatedClip = null;
        }

        /// <summary>Unhooks the button and releases the clip and the native engine.</summary>
        private void OnDestroy()
        {
            if (runButton != null)
            {
                runButton.onClick.RemoveListener(GenerateSpeech);
            }

            pendingAudio = null;
            ReleaseGeneratedClip();

            if (ttsHandle != IntPtr.Zero)
            {
                Sherpita.SherpaOnnxDestroyOfflineTts(ttsHandle);
                ttsHandle = IntPtr.Zero;
            }
        }
    }
}
I'd be really grateful if you could help me figure this out. Thanks in advance for your time and support!