Scrambled audio playback
Congratulations on this amazing repo! I am using it for real-time communication with an LLM. I had no problem using KokoroTTS, but when I switch to OrpheusTTS (https://github.com/freddyaboulton/orpheus-cpp) the audio playback is scrambled. Below is the whole code. Can you help me, please?
import gradio as gr
import os
from fastrtc import (ReplyOnPause, Stream, get_tts_model)
from huggingface_hub import login
import numpy as np
import spaces
import torch
from transformers import pipeline
from twilio.rest import Client
from fastapi import FastAPI, WebSocket, Request
from distil_whisper_fastrtc import get_stt_model
from orpheus_cpp import OrpheusCpp
# Credentials and runtime mode come from the environment (e.g. Hugging Face
# Space secrets): hf_token, twilio_account_sid, twilio_auth_token, MODE.
hf_token = os.getenv("hf_token")
twilio_account_sid = os.getenv("twilio_account_sid")
twilio_auth_token = os.getenv("twilio_auth_token")
mode = os.getenv("MODE")  # "UI", "PHONE", or anything else -> bare FastAPI server

# Log in to the Hugging Face Hub (required for gated models such as Llama 3.2).
login(token=hf_token)

# Speech-to-text model (Whisper large-v3-turbo via distil_whisper_fastrtc).
stt_model = get_stt_model("openai/whisper-large-v3-turbo")
# Text-to-speech: Orpheus (llama.cpp backend) replaces the default Kokoro model.
# tts_model = get_tts_model()
tts_model = OrpheusCpp()

# Chat LLM used to generate the assistant's replies.
model_id = "meta-llama/Llama-3.2-3B-Instruct"
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Twilio RTC Configuration: fetch ephemeral TURN credentials and force relayed
# ICE candidates so WebRTC connects behind restrictive NATs.
client = Client(twilio_account_sid, twilio_auth_token)
token = client.tokens.create()
rtc_configuration = {
    "iceServers": token.ice_servers,
    "iceTransportPolicy": "relay",
}

# Conversation history shared across turns; seeded lazily inside echo().
history = None
@spaces.GPU
def echo(audio):
    """Handle one voice-chat turn.

    Transcribes the caller's audio, appends it to the shared conversation
    history, generates an LLM reply, and streams the synthesized speech back
    to the client chunk by chunk.
    """
    global history

    # Speech -> text.
    transcript = stt_model.stt(audio)

    # Lazily seed the conversation with the system prompt on the first turn.
    if history is None:
        history = [{
            "role": "system",
            "content": """You are an smart assistant.""",
        }]
    history.append({"role": "user", "content": transcript})
    print(f"User said: {transcript}")

    # Text -> text via the chat pipeline; keep only the newest assistant message.
    outputs = pipe(
        history,
        max_new_tokens=500,
        pad_token_id=pipe.tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.3,
    )
    reply = outputs[-1]["generated_text"][-1]["content"]
    history.append({"role": "assistant", "content": reply})
    print(f"AI said: {reply}")

    # Text -> speech, streamed as audio chunks to the caller.
    yield from tts_model.stream_tts_sync(reply, options={"voice_id": "giulia"})
# Interruptions during playback are disabled for the Gradio UI and enabled
# for the phone/API modes.
can_interrupt = mode != "UI"

stream = Stream(
    handler=ReplyOnPause(echo, can_interrupt=can_interrupt),
    rtc_configuration=rtc_configuration,
    modality="audio",
    mode="send-receive",
)

# Mount the WebRTC endpoints on a FastAPI app so any mode can serve them.
app = FastAPI()
stream.mount(app)

if __name__ == "__main__":
    # `mode` was already read from MODE at import time; reuse it instead of
    # re-importing os and re-reading the environment.
    if mode == "UI":
        stream.ui.launch(server_port=7860, server_name="0.0.0.0")
    elif mode == "PHONE":
        stream.fastphone(host="0.0.0.0", port=7860)
    else:
        import uvicorn

        uvicorn.run(app, host="0.0.0.0", port=7860)
Will take a look later today
What do you mean by scrambled @eriadhami ?
I thought you meant that it would be incomprehensible. But I can understand it.
Giulia is an Italian voice, so make sure to set lang="it" in OrpheusCpp. If the issue persists, please share a video so I can see what is happening.
@freddyaboulton I did what you suggested, but the voice is still bad. You can check the video below (the voice starts after 30 s). I am using a Hugging Face Space with a T4 GPU.
Video: https://github.com/user-attachments/assets/e8e6f41c-9c7d-400c-a0d0-0540ef426a94
app.py
import gradio as gr
import os
from fastrtc import (ReplyOnPause, Stream, get_tts_model)
from huggingface_hub import login
import numpy as np
import spaces
import torch
from transformers import pipeline
from twilio.rest import Client
from fastapi import FastAPI, WebSocket, Request
from distil_whisper_fastrtc import get_stt_model
from orpheus_cpp import OrpheusCpp
# Credentials and runtime mode come from the environment (e.g. Hugging Face
# Space secrets): hf_token, twilio_account_sid, twilio_auth_token, MODE.
hf_token = os.getenv("hf_token")
twilio_account_sid = os.getenv("twilio_account_sid")
twilio_auth_token = os.getenv("twilio_auth_token")
mode = os.getenv("MODE")  # "UI", "PHONE", or anything else -> bare FastAPI server

# Log in to the Hugging Face Hub (required for gated models such as Llama 3.2).
login(token=hf_token)

# Speech-to-text model (Whisper large-v3-turbo via distil_whisper_fastrtc).
stt_model = get_stt_model("openai/whisper-large-v3-turbo")
# Text-to-speech: Orpheus (llama.cpp backend) replaces the default Kokoro model.
# lang="it" matches the Italian "giulia" voice used in echo() below.
# tts_model = get_tts_model()
tts_model = OrpheusCpp(lang="it")

# Chat LLM used to generate the assistant's replies.
model_id = "meta-llama/Llama-3.2-3B-Instruct"
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Twilio RTC Configuration: fetch ephemeral TURN credentials and force relayed
# ICE candidates so WebRTC connects behind restrictive NATs.
client = Client(twilio_account_sid, twilio_auth_token)
token = client.tokens.create()
rtc_configuration = {
    "iceServers": token.ice_servers,
    "iceTransportPolicy": "relay",
}

# Conversation history shared across turns; seeded lazily inside echo().
history = None
@spaces.GPU
def echo(audio):
    """Handle one voice-chat turn.

    Transcribes the caller's audio, appends it to the shared conversation
    history, generates an LLM reply, and streams the synthesized speech back
    to the client chunk by chunk.
    """
    global history

    # Speech -> text.
    transcript = stt_model.stt(audio)

    # Lazily seed the conversation with the system prompt on the first turn.
    # NOTE(review): os.getenv("system_prompt") is None when the variable is
    # unset — presumably the Space always defines it; verify.
    if history is None:
        history = [{"role": "system", "content": os.getenv("system_prompt")}]
    history.append({"role": "user", "content": transcript})
    print(f"User said: {transcript}")

    # Text -> text via the chat pipeline; keep only the newest assistant message.
    outputs = pipe(
        history,
        max_new_tokens=500,
        pad_token_id=pipe.tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.3,
    )
    reply = outputs[-1]["generated_text"][-1]["content"]
    history.append({"role": "assistant", "content": reply})
    print(f"AI said: {reply}")

    # Text -> speech, streamed as audio chunks to the caller.
    yield from tts_model.stream_tts_sync(reply, options={"voice_id": "giulia"})
# Interruptions during playback are disabled for the Gradio UI and enabled
# for the phone/API modes.
can_interrupt = mode != "UI"

stream = Stream(
    handler=ReplyOnPause(echo, can_interrupt=can_interrupt),
    rtc_configuration=rtc_configuration,
    modality="audio",
    mode="send-receive",
)

# Mount the WebRTC endpoints on a FastAPI app so any mode can serve them.
app = FastAPI()
stream.mount(app)

if __name__ == "__main__":
    # `mode` was already read from MODE at import time; reuse it instead of
    # re-importing os and re-reading the environment.
    if mode == "UI":
        stream.ui.launch(server_port=7860, server_name="0.0.0.0")
    elif mode == "PHONE":
        stream.fastphone(host="0.0.0.0", port=7860)
    else:
        import uvicorn

        uvicorn.run(app, host="0.0.0.0", port=7860)
I think the issue is that the audio generation is really slow. It's slower than on my laptop so something is off.
Did you install llama.cpp with gpu support? Also there is a n_gpu_layers parameter in OrpheusCpp. You can set to -1 so that all layers are loaded on the GPU but you need to be careful about GPU memory. There is already whisper + llama on the gpu.
@freddyaboulton thanks a lot for your efforts. The T4 Space has 30 GB RAM and 16 GB VRAM, so indeed that is the issue. The llama.cpp installation is as follows: --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu122 llama-cpp-python
Ok in that case then I would set n_gpu_layers in the OrpheusCpp class and see
I added the n_gpu_layers=-1 parameter (tts_model = OrpheusCpp(n_gpu_layers=-1, lang="it")) and also upgraded the hardware to 48 vCPU, 186 GB RAM, and 96 GB VRAM, but now the Space keeps restarting whenever audio is generated.
😱