fastrtc icon indicating copy to clipboard operation
fastrtc copied to clipboard

Scramble Audio playback

Open eriadhami opened this issue 1 year ago • 10 comments

Congratulations on this amazing repo. I am using it for real-time communication with an LLM. I had no problem using KokoroTTS, but when I switched to OrpheusTTS (https://github.com/freddyaboulton/orpheus-cpp) the audio playback is scrambled. Below is the whole code. Can you help me please?

import gradio as gr
import os
from fastrtc import (ReplyOnPause, Stream, get_tts_model)
from huggingface_hub import login
import numpy as np
import spaces
import torch
from transformers import pipeline
from twilio.rest import Client
from fastapi import FastAPI, WebSocket, Request
from distil_whisper_fastrtc import get_stt_model
from orpheus_cpp import OrpheusCpp

# Credentials and runtime mode come from the environment; MODE selects the
# entry-point behaviour (UI / PHONE / server) at the bottom of the file.
hf_token = os.getenv("hf_token")
twilio_account_sid = os.getenv("twilio_account_sid")
twilio_auth_token = os.getenv("twilio_auth_token")
mode = os.getenv("MODE")
login(token=hf_token)

stt_model = get_stt_model("openai/whisper-large-v3-turbo")
# "giulia" (the voice_id passed to stream_tts_sync below) is an Italian
# voice, so the TTS model must be loaded with lang="it" — with the default
# English model the playback comes out scrambled.
tts_model = OrpheusCpp(lang="it")

# Llama 3.2 3B Instruct as the conversational LLM; device_map="auto" lets
# accelerate place the weights on whatever GPU(s) are available, and
# bfloat16 halves the memory footprint versus fp32.
model_id = "meta-llama/Llama-3.2-3B-Instruct"
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Twilio RTC Configuration
# Fetch ephemeral TURN credentials from Twilio; "relay" forces all media
# through the TURN servers rather than attempting direct peer-to-peer paths.
client = Client(twilio_account_sid, twilio_auth_token)
token = client.tokens.create()

rtc_configuration = {
    "iceServers": token.ice_servers,
    "iceTransportPolicy": "relay",
}

# Conversation history shared across turns; lazily initialised with the
# system prompt on the first call to echo().
history = None

@spaces.GPU
def echo(audio):
    """Transcribe the caller's audio, generate an LLM reply, and stream it as TTS.

    Appends both the user turn and the assistant turn to the module-level
    `history`, and yields audio chunks so playback starts before synthesis
    finishes.
    """
    global history

    # Speech-to-text on the incoming utterance.
    user_input = stt_model.stt(audio)

    if history is None:
        # First turn: seed the conversation with the system prompt.
        history = [
            {
                "role": "system",
                "content": """You are an smart assistant.""",
            }
        ]

    history.append({"role": "user", "content": user_input})
    print("User said: " + user_input)

    generation = pipe(
        history,
        max_new_tokens=500,
        pad_token_id=pipe.tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.3,
    )
    response = generation[-1]["generated_text"][-1]["content"]

    history.append({"role": "assistant", "content": response})
    print("AI said: " + response)

    # Stream synthesized speech chunk-by-chunk back to the caller.
    yield from tts_model.stream_tts_sync(response, options={"voice_id": "giulia"})

    
# In UI mode interruptions are disabled; in every other mode the caller may
# barge in over the assistant's reply.
can_interrupt = mode != "UI"

stream = Stream(handler=ReplyOnPause(echo, can_interrupt=can_interrupt), rtc_configuration=rtc_configuration, modality="audio", mode="send-receive")

app = FastAPI()
stream.mount(app)

if __name__ == "__main__":
    # Reuse the module-level `mode` instead of re-importing os and
    # re-reading MODE with a walrus here, so the entry point and
    # can_interrupt always agree on the same value.
    if mode == "UI":
        stream.ui.launch(server_port=7860, server_name="0.0.0.0")
    elif mode == "PHONE":
        stream.fastphone(host="0.0.0.0", port=7860)
    else:
        import uvicorn

        uvicorn.run(app, host="0.0.0.0", port=7860)

eriadhami avatar Apr 21 '25 17:04 eriadhami

Will take a look later today

freddyaboulton avatar Apr 21 '25 19:04 freddyaboulton

What do you mean by scrambled @eriadhami ?

freddyaboulton avatar Apr 23 '25 17:04 freddyaboulton

I thought you meant that it would be incomprehensible. But I can understand it.

freddyaboulton avatar Apr 23 '25 17:04 freddyaboulton

"giulia" is an Italian voice, so make sure to set lang="it" in OrpheusCpp. If the issue persists, please share a video so I can see.

freddyaboulton avatar Apr 23 '25 17:04 freddyaboulton

@freddyaboulton I did what you suggested, but the voice is still bad. You can check the video (the voice starts after 30s). I am using a Hugging Face Space T4.

Video: https://github.com/user-attachments/assets/e8e6f41c-9c7d-400c-a0d0-0540ef426a94

app.py

import gradio as gr
import os
from fastrtc import (ReplyOnPause, Stream, get_tts_model)
from huggingface_hub import login
import numpy as np
import spaces
import torch
from transformers import pipeline
from twilio.rest import Client
from fastapi import FastAPI, WebSocket, Request
from distil_whisper_fastrtc import get_stt_model
from orpheus_cpp import OrpheusCpp

# Credentials and runtime mode come from the environment; MODE selects the
# entry-point behaviour (UI / PHONE / server) at the bottom of the file.
hf_token = os.getenv("hf_token")
twilio_account_sid = os.getenv("twilio_account_sid")
twilio_auth_token = os.getenv("twilio_auth_token")
mode = os.getenv("MODE")
login(token=hf_token)

stt_model = get_stt_model("openai/whisper-large-v3-turbo")
# tts_model = get_tts_model()
# lang="it" matches the Italian "giulia" voice used in echo() below.
# NOTE(review): if generation is too slow on GPU hardware, consider
# n_gpu_layers=-1 to offload all layers — but confirm there is GPU memory
# to spare, since Whisper and Llama are already resident.
tts_model = OrpheusCpp(lang="it")

# Llama 3.2 3B Instruct as the conversational LLM; device_map="auto" lets
# accelerate place the weights on whatever GPU(s) are available, and
# bfloat16 halves the memory footprint versus fp32.
model_id = "meta-llama/Llama-3.2-3B-Instruct"
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Twilio RTC Configuration
# Fetch ephemeral TURN credentials from Twilio; "relay" forces all media
# through the TURN servers rather than attempting direct peer-to-peer paths.
client = Client(twilio_account_sid, twilio_auth_token)
token = client.tokens.create()

rtc_configuration = {
    "iceServers": token.ice_servers,
    "iceTransportPolicy": "relay",
}

# Conversation history shared across turns; lazily initialised with the
# system prompt on the first call to echo().
history = None

@spaces.GPU
def echo(audio):
    """Transcribe the caller's audio, generate an LLM reply, and stream it as TTS.

    Appends both the user turn and the assistant turn to the module-level
    `history`, and yields audio chunks from OrpheusCpp so playback starts
    before synthesis finishes.
    """
    global history

    user_input = stt_model.stt(audio)

    if history is None:
        # First turn: seed the conversation with the system prompt.
        # Fall back to a generic prompt so a missing "system_prompt" env var
        # does not put {"content": None} into the chat history.
        history = [{
            "role": "system",
            "content": os.getenv("system_prompt") or "You are a smart assistant.",
        }]

    history.append({"role": "user", "content": user_input})
    print("User said: " + user_input)

    response = pipe(
        history,
        max_new_tokens=500,
        pad_token_id=pipe.tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.3
    )[-1]["generated_text"][-1]["content"]

    history.append({"role": "assistant", "content": response})
    print("AI said: " + response)

    # "giulia" is an Italian voice; the TTS model above is loaded with lang="it".
    for audio_chunk in tts_model.stream_tts_sync(response, options={"voice_id": "giulia"}):
        yield audio_chunk

# In UI mode interruptions are disabled; in every other mode the caller may
# barge in over the assistant's reply.
can_interrupt = mode != "UI"

stream = Stream(handler=ReplyOnPause(echo, can_interrupt=can_interrupt), rtc_configuration=rtc_configuration, modality="audio", mode="send-receive")

# Initialize FastAPI app
app = FastAPI()
stream.mount(app)

if __name__ == "__main__":
    # Reuse the module-level `mode` instead of re-importing os and
    # re-reading MODE with a walrus here, so the entry point and
    # can_interrupt always agree on the same value.
    if mode == "UI":
        stream.ui.launch(server_port=7860, server_name="0.0.0.0")
    elif mode == "PHONE":
        stream.fastphone(host="0.0.0.0", port=7860)
    else:
        import uvicorn

        uvicorn.run(app, host="0.0.0.0", port=7860)

eriadhami avatar Apr 23 '25 18:04 eriadhami

I think the issue is that the audio generation is really slow. It's slower than on my laptop so something is off.

Did you install llama.cpp with gpu support? Also there is a n_gpu_layers parameter in OrpheusCpp. You can set to -1 so that all layers are loaded on the GPU but you need to be careful about GPU memory. There is already whisper + llama on the gpu.

freddyaboulton avatar Apr 23 '25 19:04 freddyaboulton

@freddyaboulton thanks a lot for your efforts. The T4 space has 30GB RAM and 16GB VRAM, so indeed that could be the issue. The llama.cpp installation is as follows: --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu122 llama-cpp-python

eriadhami avatar Apr 23 '25 19:04 eriadhami

Ok in that case then I would set n_gpu_layers in the OrpheusCpp class and see

freddyaboulton avatar Apr 23 '25 19:04 freddyaboulton

I added the n_gpu_layers=-1 parameter tts_model = OrpheusCpp(n_gpu_layers=-1, lang="it") and also changed hardware to 48 vCPU, 186 GB RAM, 96 GB VRAM but now when the audio is generated it keeps restarting the space.

eriadhami avatar Apr 23 '25 19:04 eriadhami

😱

freddyaboulton avatar Apr 23 '25 20:04 freddyaboulton