ttsmms: allow 16-bit PCM output - which is more common - and return the audio as a bytes array

This makes it all the more useful, as you can then manipulate the audio far more easily in, say, pyaudio or other libraries.

Jun 23 '24 00:06 willwade

I'm hoping this shouldn't break anything.

Jun 23 '24 00:06 willwade

I thought I'd share this somewhere - this probably isn't the best place for it, but it does use my pcm16 code.

The problem: it's really slow for repeated synthesis calls. The answer: wrap it in a worker thread:

import pyaudio
from ttsmms import TTS
import threading
from queue import Queue, Empty

def play_audio(audio_bytes, sample_rate=16000):
    """Play a buffer of 16-bit PCM mono audio on the default output device.

    Playback is best-effort: any error is reported on stdout rather than
    raised, so a failed playback does not abort the caller.

    Args:
        audio_bytes: Raw sample data; the stream is opened as ``paInt16``
            with a single channel, so the buffer must match that layout.
        sample_rate: Sampling rate in Hz used to open the output stream.
    """
    try:
        p = pyaudio.PyAudio()
        try:
            stream = p.open(format=pyaudio.paInt16,  # Ensure the format matches 16-bit PCM
                            channels=1,
                            rate=sample_rate,
                            output=True)
            try:
                stream.write(audio_bytes)
                stream.stop_stream()
            finally:
                # Close the stream even if writing failed, so the audio
                # device is not left open.
                stream.close()
        finally:
            # Always release the PyAudio instance, including when
            # opening the stream itself failed.
            p.terminate()
    except Exception as e:
        print(f"Error playing audio: {e}")


class TTSWorker(threading.Thread):
    """Thread that keeps one ``TTS`` object alive and serves synthesis requests.

    Text strings are taken from ``queue``; for each one the value returned by
    ``tts.synthesis(text, convert_to_pcm16=True)`` is put on
    ``response_queue``. Putting ``None`` on ``queue`` makes the thread exit.

    If synthesis raises, the exception instance is put on ``response_queue``
    in place of a result and the worker keeps running. This way a consumer
    blocked on ``response_queue.get()`` is always woken up; consumers can
    detect a failure with ``isinstance(item, Exception)``.
    """

    def __init__(self, model_path, queue, response_queue):
        """Create the worker and construct its ``TTS`` object.

        Args:
            model_path: Path passed straight to ``TTS(...)``.
            queue: Queue of incoming text requests (``None`` means stop).
            response_queue: Queue on which one item per request is placed.
        """
        super().__init__()
        self.tts = TTS(model_path)
        self.queue = queue
        self.response_queue = response_queue
        self.running = True

    def run(self):
        """Serve requests until ``None`` is received or ``running`` is cleared."""
        while self.running:
            try:
                # Wake up once a second so a cleared ``running`` flag is
                # noticed even when no request arrives.
                text = self.queue.get(timeout=1)
            except Empty:
                continue

            if text is None:
                self.running = False
                break

            try:
                result = self.tts.synthesis(text, convert_to_pcm16=True)
            except Exception as exc:
                # Hand the failure to the consumer instead of letting it
                # kill the thread and leave the consumer waiting forever.
                result = exc
            self.response_queue.put(result)

    def stop(self):
        """Ask the thread to exit and wait for it to finish."""
        self.running = False
        # The sentinel wakes the thread immediately if it is blocked in get().
        self.queue.put(None)
        self.join()

# Two queues connect callers to the worker: one carries text in,
# the other carries synthesis results back out.
response_queue = Queue()
tts_queue = Queue()

# Build the worker (its constructor creates the TTS object) and start its thread
tts_worker = TTSWorker(
    model_path='/Users/willwade/mms_models/eng',
    queue=tts_queue,
    response_queue=response_queue,
)
tts_worker.start()

def synthesize_speech(text):
    """Send ``text`` to the TTS worker and block until a response arrives.

    Returns the next item taken from ``response_queue``.
    """
    tts_queue.put(text)
    return response_queue.get()

# Example usage: synthesize and play each phrase in turn. Both phrases go
# through the same worker, so the TTS object is only constructed once.
try:
    for text in ("Hello world", "And all my friends"):
        result = synthesize_speech(text)
        audio_bytes = result["audio_bytes"]
        sample_rate = result["sampling_rate"]
        # Play the audio bytes
        play_audio(audio_bytes, sample_rate)
finally:
    # Stop the TTS worker when done - also on error, because the worker is a
    # non-daemon thread and would otherwise keep the process alive.
    tts_worker.stop()

It's really quick the second, third, etc. time around.

Jun 23 '24 00:06 willwade