sherpa-onnx icon indicating copy to clipboard operation
sherpa-onnx copied to clipboard

Unity crash when using Mimic 3 model with Sherpa-ONNX (Piper & Coqui working fine)

Open AdrianPresno opened this issue 8 months ago • 17 comments

Hi team,

First of all, thanks for your amazing work on Sherpa-ONNX!

I'm currently working on a Unity-based TTS project where I successfully integrated Sherpa-ONNX with VITS models from both Piper and Coqui by manually adding the required ONNX metadata (as described in your documentation and notebooks).

However, when attempting to use a Mimic 3 VITS model, Unity crashes at runtime — specifically during initialization, at the SherpaOnnxCreateOfflineTts call that InitializeTTS() makes from the Start() method of the C# script. The same setup works perfectly with Piper and Coqui models.

Metadata Script Used (Python)

import json
import onnx
from typing import Any, Dict

def add_meta_data(input_filename: str, output_filename: str, meta_data: Dict[str, Any]):
    """Replace the metadata of an ONNX model and save the result to a new file.

    Args:
        input_filename: Path of the ONNX model to load.
        output_filename: Path the modified model is saved to.
        meta_data: Key/value pairs to store; every value is written as ``str(value)``.
    """
    onnx_model = onnx.load(input_filename)
    props = onnx_model.metadata_props

    # Drop every metadata entry the model already carries.
    del props[:]

    # Append the new entries in the dictionary's iteration order.
    for name in meta_data:
        entry = props.add()
        entry.key = name
        entry.value = str(meta_data[name])

    onnx.save(onnx_model, output_filename)
    print(f"Metadatos añadidos correctamente. Modelo guardado como {output_filename}")

def main():
    """Write the Mimic 3 metadata set into ``model.onnx`` and save a copy."""
    source_path = "model.onnx"
    target_path = "model-metadataadded.onnx"

    # Adjust these values to match your model.
    # Note that 'normalize' and 'version' are included here as well.
    entries = [
        ("model_type", "vits"),
        ("comment", "mimic3"),
        ("language", "Spanish"),
        ("voice", "espeak"),
        ("has_espeak", 1),
        ("n_speakers", 3),
        ("sample_rate", 22050),
        ("normalize", "True"),  # or "False"
        ("version", "1.0"),
    ]

    add_meta_data(source_path, target_path, dict(entries))

if __name__ == "__main__":
    main()

Unity Script:

using System;
using System.IO;
using System.Text;
using UnityEngine;
using UnityEngine.UI;
using TMPro;
using System.Runtime.InteropServices;

namespace SherpaOnnx
{
    /// <summary>
    /// Unity component that creates a Sherpa-ONNX offline TTS instance for a VITS model
    /// located under StreamingAssets, synthesizes the text typed into an input field when
    /// a button is clicked, and plays the result through an <see cref="AudioSource"/>.
    /// </summary>
    public class TTSMimic : MonoBehaviour
    {
        [Header("UI")]
        [SerializeField] private TMP_InputField inputField;
        [SerializeField] private Button runButton;
        [SerializeField] private AudioSource audioSource;

        [Header("Modelo Mimic 3")]
        public string modelFolder = "models/Mimic/Spanish"; // Direct path to the Spanish folder (relative to StreamingAssets)
        public int speakerId = 0;
        public float noiseScale = 0.667f;
        public float noiseScaleW = 0.8f;
        public float lengthScale = 1.0f;
        public int numThreads = 1;
        public string provider = "cpu";
        public bool debug = false;

        // Native TTS handle returned by SherpaOnnxCreateOfflineTts; IntPtr.Zero while uninitialized.
        private IntPtr ttsHandle = IntPtr.Zero;

        /// <summary>Samples and sample rate waiting to be turned into an AudioClip.</summary>
        private class PendingAudio
        {
            public float[] samples;
            public int sampleRate;
        }

        // Set by GenerateSpeech, consumed (and cleared) by the next Update.
        private PendingAudio pendingAudio = null;

        // Clip most recently created by Update. Tracked so it can be destroyed when it is
        // replaced or when this component goes away; clips made with AudioClip.Create are
        // not released just because nothing references them anymore.
        private AudioClip generatedClip = null;

        /// <summary>Hooks the button click to <see cref="GenerateSpeech"/> and creates the TTS instance.</summary>
        private void Start()
        {
            if (runButton != null)
            {
                runButton.onClick.AddListener(GenerateSpeech);
            }

            InitializeTTS();
        }

        /// <summary>
        /// Checks that model.onnx, tokens.txt and espeak-ng-data exist under
        /// StreamingAssets/<see cref="modelFolder"/>, then creates the native TTS instance.
        /// Leaves <see cref="ttsHandle"/> at IntPtr.Zero when files are missing or creation fails.
        /// </summary>
        private void InitializeTTS()
        {
            string basePath = Path.Combine(Application.streamingAssetsPath, modelFolder);
            string modelPath = Path.Combine(basePath, "model.onnx");
            string tokensPath = Path.Combine(basePath, "tokens.txt");
            string dataDir = Path.Combine(basePath, "espeak-ng-data");

            if (!File.Exists(modelPath) || !File.Exists(tokensPath) || !Directory.Exists(dataDir))
            {
                Debug.LogError("❌ Faltan archivos del modelo Mimic.");
                Debug.LogError($"Buscando en:\n📁 model.onnx: {modelPath}\n📁 tokens.txt: {tokensPath}\n📁 espeak-ng-data: {dataDir}");
                return;
            }

            SherpaOnnx.OfflineTtsConfig config = new SherpaOnnx.OfflineTtsConfig
            {
                Model = new SherpaOnnx.OfflineTtsModelConfig
                {
                    Vits = new SherpaOnnx.OfflineTtsVitsModelConfig
                    {
                        Model = modelPath,
                        Tokens = tokensPath,
                        DataDir = dataDir,
                        Lexicon = "",
                        DictDir = "",
                        NoiseScale = noiseScale,
                        NoiseScaleW = noiseScaleW,
                        LengthScale = lengthScale
                    },
                    NumThreads = numThreads,
                    Debug = debug ? 1 : 0,
                    Provider = provider
                },
                RuleFsts = "",
                RuleFars = "",
                MaxNumSentences = 1
            };

            // NOTE(review): this is the call reported to crash with the Mimic 3 model. A failure
            // inside the native library may not come back as IntPtr.Zero, so the check below
            // cannot be relied on to catch it — confirm against the native library's behavior.
            ttsHandle = Sherpita.SherpaOnnxCreateOfflineTts(ref config);

            if (ttsHandle == IntPtr.Zero)
            {
                Debug.LogError("❌ Error al inicializar Sherpa ONNX con modelo Mimic.");
            }
            else
            {
                Debug.Log("✅ Sherpa ONNX inicializado con modelo Mimic 3.");
            }
        }

        /// <summary>
        /// Synthesizes the current input-field text and queues the samples for playback in
        /// <see cref="Update"/>. Does nothing when the TTS instance is missing or the text is blank.
        /// </summary>
        private void GenerateSpeech()
        {
            if (ttsHandle == IntPtr.Zero)
            {
                return;
            }

            // The other serialized references are null-checked before use; do the same here
            // instead of failing with a NullReferenceException on click.
            if (inputField == null)
            {
                Debug.LogError("❌ InputField no asignado.");
                return;
            }

            string text = inputField.text;
            if (string.IsNullOrWhiteSpace(text))
            {
                return;
            }

            // Build a null-terminated UTF-8 buffer; the extra trailing byte of a freshly
            // allocated array is already zero.
            byte[] utf8 = Encoding.UTF8.GetBytes(text);
            byte[] utf8WithNull = new byte[utf8.Length + 1];
            Array.Copy(utf8, utf8WithNull, utf8.Length);

            // The last argument is presumably the speed factor — TODO confirm against the binding.
            IntPtr audioPtr = Sherpita.SherpaOnnxOfflineTtsGenerate(ttsHandle, utf8WithNull, speakerId, 1.0f);
            if (audioPtr == IntPtr.Zero)
            {
                Debug.LogError("❌ El puntero de audio está vacío.");
                return;
            }

            SherpaOnnx.OfflineTtsGeneratedAudio audio = new SherpaOnnx.OfflineTtsGeneratedAudio(audioPtr);
            try
            {
                // Read the samples once and reuse the array (the wrapper may build a new
                // managed copy on every access — TODO confirm).
                float[] samples = audio.Samples;
                if (samples == null || samples.Length == 0)
                {
                    Debug.LogError("⚠️ Audio generado inválido o vacío.");
                    return;
                }

                int sampleRate = audio.SampleRate;
                Debug.Log($"✅ Audio generado: {samples.Length} muestras a {sampleRate} Hz.");

                pendingAudio = new PendingAudio
                {
                    samples = samples,
                    sampleRate = sampleRate
                };
            }
            finally
            {
                // Dispose on every path, including the "invalid or empty" early return above.
                audio.Dispose();
            }
        }

        /// <summary>
        /// Turns queued audio into a mono AudioClip and plays it. The queue entry is cleared
        /// whether or not playback succeeds.
        /// </summary>
        private void Update()
        {
            if (pendingAudio == null)
            {
                return;
            }

            PendingAudio queued = pendingAudio;
            pendingAudio = null;

            if (audioSource == null)
            {
                Debug.LogError("❌ AudioSource no asignado.");
                return;
            }

            AudioClip clip = null;
            bool adopted = false;
            try
            {
                clip = AudioClip.Create("MimicTTS", queued.samples.Length, 1, queued.sampleRate, false);
                clip.SetData(queued.samples, 0);

                AudioClip previousClip = generatedClip;
                audioSource.clip = clip;
                generatedClip = clip;
                adopted = true;

                // The previous clip is no longer assigned to the source; release it.
                if (previousClip != null)
                {
                    Destroy(previousClip);
                }

                audioSource.Play();
                Debug.Log("✅ AudioClip reproducido desde Update().");
            }
            catch (Exception ex)
            {
                Debug.LogError($"❌ Error al crear/reproducir AudioClip: {ex.Message}");

                // A clip that was created but never handed to the AudioSource would otherwise linger.
                if (clip != null && !adopted)
                {
                    Destroy(clip);
                }
            }
        }

        /// <summary>
        /// Unhooks the button listener, destroys the last generated clip and releases the
        /// native TTS instance.
        /// </summary>
        private void OnDestroy()
        {
            if (runButton != null)
            {
                runButton.onClick.RemoveListener(GenerateSpeech);
            }

            pendingAudio = null;

            if (generatedClip != null)
            {
                Destroy(generatedClip);
                generatedClip = null;
            }

            if (ttsHandle != IntPtr.Zero)
            {
                Sherpita.SherpaOnnxDestroyOfflineTts(ttsHandle);
                ttsHandle = IntPtr.Zero;
            }
        }
    }
}

I'd be really grateful if you could help me figure this out. Thanks in advance for your time and support!

AdrianPresno avatar Mar 24 '25 10:03 AdrianPresno