[Feature]: I created a Gradio UI for Vevo TTS and the other Vevo tasks. Feel free to use it.
Quick Start Guide for Vevo (Amphion) 🚀
Want to try out Vevo from Amphion (https://github.com/open-mmlab/Amphion/tree/main/models/vc/vevo)? It's still early days, but here's a fast track to get it running.
Key Steps & Fixes:

1. Dependencies: `requirements.txt` is incomplete. You'll likely need to update packages and install missing ones.
2. eSpeak-NG (crucial): Install eSpeak-NG v1.52.0 from https://github.com/espeak-ng/espeak-ng/releases/download/1.52.0/espeak-ng.msi. The default install path is `C:\Program Files\eSpeak NG`.
3. PyTorch & onnxruntime-gpu: Use the latest PyTorch. Uninstall onnxruntime and install the GPU build (`pip uninstall onnxruntime`, then `pip install onnxruntime-gpu`).
4. Encoding fix (resolving `UnicodeDecodeError`):
   Problem: You might get a `UnicodeDecodeError` when running Vevo, because the code reads the `vocab.json` files without specifying UTF-8 encoding.
   Solution: Modify these two files (a quick sanity check for this fix follows the list):
   - `models/tts/maskgct/g2p/g2p/g2p_generation.py`
     Search for: `with open("./models/tts/maskgct/g2p/g2p/vocab.json", "r") as f:`
     Replace with: `with open("./models/tts/maskgct/g2p/g2p/vocab.json", "r", encoding='utf-8') as f:`
   - `models/tts/maskgct/g2p/g2p/__init__.py`
     Search for: `with open(vacab_path, "r") as f:`
     Replace with: `with open(vacab_path, "r", encoding='utf-8') as f:`
   Adding `encoding='utf-8'` tells Python to read the `vocab.json` files correctly.
5. eSpeak-NG Paths in gardio.py: Verify/update these lines at the top of gardio.py to match your eSpeak-NG install path:
os.environ['PHONEMIZER_ESPEAK_LIBRARY'] = "c:\\Program Files\\eSpeak NG\\libespeak-ng.dll"
os.environ['ESPEAK_DATA_PATH'] = "c:\\Program Files\\eSpeak NG\\espeak-ng-data"
os.environ['PATH'] = os.environ['PATH'] + ";C:\\Program Files\\eSpeak NG"
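Before launching the full UI, you can optionally confirm that steps 4 and 5 took effect with a small sanity-check script. This is only a sketch: it assumes `phonemizer` is installed, that it is run from the Amphion repo root, and that eSpeak-NG sits at the default path (adjust if yours differs).

```python
import json
import os

# Same eSpeak NG paths as in gardio.py; adjust if you installed elsewhere.
os.environ['PHONEMIZER_ESPEAK_LIBRARY'] = "c:\\Program Files\\eSpeak NG\\libespeak-ng.dll"
os.environ['ESPEAK_DATA_PATH'] = "c:\\Program Files\\eSpeak NG\\espeak-ng-data"
os.environ['PATH'] = os.environ['PATH'] + ";C:\\Program Files\\eSpeak NG"

# 1) vocab.json should parse cleanly once the UTF-8 fix from step 4 is applied.
with open("./models/tts/maskgct/g2p/g2p/vocab.json", "r", encoding="utf-8") as f:
    vocab = json.load(f)
print(f"vocab.json OK ({len(vocab)} top-level entries)")

# 2) phonemizer should be able to reach the eSpeak NG backend set up in step 5.
from phonemizer import phonemize
print("phonemize('hello') ->", phonemize("hello", language="en-us", backend="espeak"))
```

If both prints appear without a `UnicodeDecodeError` or a missing-backend error, the setup above is working.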
That's it for setup! Now you can run the Gradio UI with the gardio.py code below (place it in the root directory of the Amphion repo).
import sys
import os
import time
import gc
import gradio as gr
import torch
from huggingface_hub import snapshot_download
from models.vc.vevo.vevo_utils import *
import tempfile
import whisper
process_active = False
should_stop = False
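# Note: should_stop is left over from an earlier stop-button design; nothing in
# this UI currently sets it to True, so the "stopped by user" branches are inert.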
current_process_name = None
current_pipeline_name = None # Track currently loaded pipeline
OUTPUT_DIR_WAV = "Path to Output"  # <-- change this to the folder where generated .wav files should be saved
os.makedirs(OUTPUT_DIR_WAV, exist_ok=True)
project_root = os.path.dirname(os.path.abspath(__file__))  # gardio.py lives in the Amphion repo root
if project_root not in sys.path:
sys.path.insert(0, project_root)
print("1. Current Working Directory:", os.getcwd())
print("2. Python Path:", sys.path)
os.environ['PHONEMIZER_ESPEAK_LIBRARY'] = "c:\\Program Files\\eSpeak NG\\libespeak-ng.dll"
os.environ['ESPEAK_DATA_PATH'] = "c:\\Program Files\\eSpeak NG\\espeak-ng-data"
os.environ['PATH'] = os.environ['PATH'] + ";C:\\Program Files\\eSpeak NG"
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
whisper_model_local = None
inference_pipeline_tts = None
inference_pipeline_vc_style = None
inference_pipeline_timbre = None
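# Each task keeps its own pipeline global. Pipelines are created lazily on first
# use and torn down when the user switches tabs, to keep VRAM usage bounded.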
def transcribe_audio(audio_filepath):
"""Transcribes audio using Whisper ASR."""
global whisper_model_local
if not audio_filepath:
return "", "No audio file provided for transcription."
if whisper_model_local is None:
try:
print("3. Loading Whisper model for transcription...")
whisper_model_local = whisper.load_model("base") # Load whisper only when needed
print("4. Whisper model loaded successfully.")
except Exception as e:
whisper_model_local = None
print(f"5. Error loading Whisper model: {e}")
return None, "Whisper model failed to load."
status_message = "Transcribing audio..."
try:
print("6. Whisper model is being used for transcription...")
result = whisper_model_local.transcribe(audio_filepath)
transcription_text = result["text"]
return transcription_text, status_message
except Exception as e:
print(f"7. Whisper transcription error: {e}")
return "Transcription failed", f"Transcription failed! Error: {e}"
finally:
unload_whisper_model() # Unload whisper immediately after use
print("Whisper model unloaded after transcription.") # added log to confirm unload
def load_vevo_tts_pipeline():
"""Loads the Vevo TTS inference pipeline."""
global inference_pipeline_tts
if inference_pipeline_tts is not None:
print("8. Vevo TTS pipeline already loaded, skipping reload.")
return inference_pipeline_tts
status_message = "Loading Vevo TTS pipeline..."
print("9. Loading Vevo TTS pipeline components...")
try:
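        # The TTS pipeline is assembled from four checkpoints in the amphion/Vevo
        # Hugging Face repo: the vq8192 content-style tokenizer, a phone-to-vq8192
        # AR model, a vq8192-to-mels flow-matching model, and the vocoder.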
local_dir_tokenizer = snapshot_download(
repo_id="amphion/Vevo",
repo_type="model",
cache_dir="./ckpts/Vevo",
allow_patterns=["tokenizer/vq8192/*"],
)
content_style_tokenizer_ckpt_path = os.path.join(local_dir_tokenizer, "tokenizer/vq8192")
local_dir_ar = snapshot_download(
repo_id="amphion/Vevo",
repo_type="model",
cache_dir="./ckpts/Vevo",
allow_patterns=["contentstyle_modeling/PhoneToVq8192/*"],
)
ar_cfg_path_tts = "./models/vc/vevo/config/PhoneToVq8192.json"
ar_ckpt_path_tts = os.path.join(local_dir_ar, "contentstyle_modeling/PhoneToVq8192")
local_dir_fmt = snapshot_download(
repo_id="amphion/Vevo",
repo_type="model",
cache_dir="./ckpts/Vevo",
allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
)
fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
fmt_ckpt_path = os.path.join(local_dir_fmt, "acoustic_modeling/Vq8192ToMels")
local_dir_vocoder = snapshot_download(
repo_id="amphion/Vevo",
repo_type="model",
cache_dir="./ckpts/Vevo",
allow_patterns=["acoustic_modeling/Vocoder/*"],
)
vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
vocoder_ckpt_path = os.path.join(local_dir_vocoder, "acoustic_modeling/Vocoder")
inference_pipeline_tts = VevoInferencePipeline(
content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
ar_cfg_path=ar_cfg_path_tts,
ar_ckpt_path=ar_ckpt_path_tts,
fmt_cfg_path=fmt_cfg_path,
fmt_ckpt_path=fmt_ckpt_path,
vocoder_cfg_path=vocoder_cfg_path,
vocoder_ckpt_path=vocoder_ckpt_path,
device=device,
)
print("10. Vevo TTS pipeline loaded successfully.")
return inference_pipeline_tts
except Exception as e:
print(f"11. Error loading Vevo TTS pipeline: {e}")
return None
def vevo_tts_gradio(
src_text,
ref_wav_path,
timbre_ref_wav_path=None,
src_language="en",
ref_language="en",
):
"""Generates speech using Vevo TTS."""
global inference_pipeline_tts, whisper_model_local, process_active, should_stop, current_process_name, current_pipeline_name
current_process_name = "TTS"
process_active = True
should_stop = False
    audio_output_path = None
    transcribed_text = ""  # initialized so the except branch can return it safely
if current_pipeline_name != "TTS": # Unload models if switching tabs
unload_all_models()
current_pipeline_name = None # Reset pipeline name after unloading
if inference_pipeline_tts is None: # Load TTS model only if not already loaded
inference_pipeline_tts = load_vevo_tts_pipeline()
if inference_pipeline_tts is None:
return None, None, "Failed to load Vevo TTS pipeline."
current_pipeline_name = "TTS" # Set current pipeline name after loading
try:
if should_stop:
status_message = "TTS Generation stopped by user before processing."
print(status_message)
return None, None, status_message
if not timbre_ref_wav_path:
timbre_ref_wav_path = ref_wav_path
original_filename_base = os.path.splitext(os.path.basename(ref_wav_path))[0] if ref_wav_path else "audiofile"
target_voice_name = os.path.splitext(os.path.basename(ref_wav_path))[0] if ref_wav_path else "target_voice" # Using ref_wav_path for target voice name
output_filename = f"{original_filename_base} as {target_voice_name}_1.wav" # Modified output filename - _1 suffix, no timestamp
output_path = os.path.join(OUTPUT_DIR_WAV, output_filename)
audio_output_path = output_path
status_message = "Transcribing reference audio..."
transcribed_text, transcription_status = transcribe_audio(ref_wav_path) # whisper model will be loaded and unloaded inside
status_message = transcription_status
if "failed" in transcription_status.lower():
return None, transcribed_text, status_message
print(f"12. Debug (Transcription): Transcribed Reference Text: {transcribed_text}")
print(f"13. Debug (TTS): Before Vevo TTS Inference...")
print(f"14. Debug (TTS): src_language = {src_language}, ref_language = {ref_language}, ref_text = {transcribed_text}")
status_message = "Generating speech with Vevo TTS..."
if should_stop:
status_message = "TTS Generation stopped by user during processing."
print(status_message)
return None, transcribed_text, status_message
print("15. Starting Vevo TTS inference...")
gen_audio = inference_pipeline_tts.inference_ar_and_fm(
src_wav_path=None,
src_text=src_text,
style_ref_wav_path=ref_wav_path,
timbre_ref_wav_path=timbre_ref_wav_path,
style_ref_wav_text=transcribed_text,
src_text_language=src_language,
style_ref_wav_text_language=ref_language,
)
print("16. Debug (TTS): After Vevo TTS Inference...")
save_audio(gen_audio, output_path=output_path)
status_message = "Processing complete! Saved to: " + output_path
print("17. TTS processing complete. Output saved to:", output_path)
return output_path, transcribed_text, status_message
except Exception as e:
if "aborted by user" in str(e).lower():
status_message = "TTS Generation stopped by user."
print("TTS Generation stopped by user.")
else:
status_message = f"TTS Generation failed! Error: {e}"
print(f"18. TTS Generation failed with error: {e}")
return None, transcribed_text, status_message
finally:
process_active = False
current_process_name = None
# unload_all_models() # No need to unload here anymore, unloaded on tab switch or unload button
def load_vevo_voice_style_pipeline():
"""Loads the Vevo Voice Cloning and Style Transfer inference pipeline."""
global inference_pipeline_vc_style
if inference_pipeline_vc_style is not None:
print("21. Vevo VC/Style pipeline already loaded, skipping reload.")
return inference_pipeline_vc_style
status_message = "Loading Vevo VC/Style pipeline..."
print("22. Loading Vevo VC/Style pipeline components...")
try:
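        # VC/style reuses the TTS acoustic stack (vq8192 tokenizer, vq8192-to-mels
        # model, vocoder) but swaps the front end: a vq32 HuBERT content tokenizer
        # feeding a vq32-to-vq8192 AR model instead of the phone-to-vq8192 model.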
local_dir_content_tokenizer = snapshot_download(
repo_id="amphion/Vevo",
repo_type="model",
cache_dir="./ckpts/Vevo",
allow_patterns=["tokenizer/vq32/*"],
)
content_tokenizer_ckpt_path = os.path.join(
local_dir_content_tokenizer, "tokenizer/vq32/hubert_large_l18_c32.pkl"
)
local_dir_tokenizer = snapshot_download(
repo_id="amphion/Vevo",
repo_type="model",
cache_dir="./ckpts/Vevo",
allow_patterns=["tokenizer/vq8192/*"],
)
content_style_tokenizer_ckpt_path = os.path.join(local_dir_tokenizer, "tokenizer/vq8192")
local_dir_ar = snapshot_download(
repo_id="amphion/Vevo",
repo_type="model",
cache_dir="./ckpts/Vevo",
allow_patterns=["contentstyle_modeling/Vq32ToVq8192/*"],
)
ar_cfg_path_vc_style = "./models/vc/vevo/config/Vq32ToVq8192.json"
ar_ckpt_path_vc_style = os.path.join(local_dir_ar, "contentstyle_modeling/Vq32ToVq8192")
local_dir_fmt = snapshot_download(
repo_id="amphion/Vevo",
repo_type="model",
cache_dir="./ckpts/Vevo",
allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
)
fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
fmt_ckpt_path = os.path.join(local_dir_fmt, "acoustic_modeling/Vq8192ToMels")
local_dir_vocoder = snapshot_download(
repo_id="amphion/Vevo",
repo_type="model",
cache_dir="./ckpts/Vevo",
allow_patterns=["acoustic_modeling/Vocoder/*"],
)
vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
vocoder_ckpt_path = os.path.join(local_dir_vocoder, "acoustic_modeling/Vocoder")
inference_pipeline_vc_style = VevoInferencePipeline(
content_tokenizer_ckpt_path=content_tokenizer_ckpt_path,
content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
ar_cfg_path=ar_cfg_path_vc_style,
ar_ckpt_path=ar_ckpt_path_vc_style,
fmt_cfg_path=fmt_cfg_path,
fmt_ckpt_path=fmt_ckpt_path,
vocoder_cfg_path=vocoder_cfg_path,
vocoder_ckpt_path=vocoder_ckpt_path,
device=device,
)
print("23. Vevo VC/Style pipeline loaded successfully.")
return inference_pipeline_vc_style
except Exception as e:
print(f"24. Error loading Vevo VC/Style pipeline: {e}")
return None
def vevo_voice_gradio(content_wav_path, reference_wav_path):
"""Generates voice clone using Vevo."""
global inference_pipeline_vc_style, process_active, should_stop, current_process_name, current_pipeline_name
current_process_name = "Voice Cloning"
process_active = True
should_stop = False
audio_output_path = None
if current_pipeline_name != "VC": # Unload models if switching tabs
unload_all_models()
current_pipeline_name = None # Reset pipeline name after unloading
if inference_pipeline_vc_style is None: # Load VC model only if not already loaded
inference_pipeline_vc_style = load_vevo_voice_style_pipeline()
if inference_pipeline_vc_style is None:
return None, "Failed to load Vevo VC/Style pipeline."
current_pipeline_name = "VC" # Set current pipeline name after loading
try:
if should_stop:
status_message = "Voice Cloning stopped by user before processing."
print(status_message)
return None, status_message
original_filename_base = os.path.splitext(os.path.basename(content_wav_path))[0] if content_wav_path else "audiofile"
target_voice_name = os.path.splitext(os.path.basename(reference_wav_path))[0] if reference_wav_path else "target_voice" # Using reference_wav_path for target voice name
output_filename = f"{original_filename_base} as {target_voice_name}_1.wav" # Modified output filename - _1 suffix, no timestamp
output_path = os.path.join(OUTPUT_DIR_WAV, output_filename)
audio_output_path = output_path
status_message = "Generating voice clone..."
if should_stop:
status_message = "Voice Cloning stopped by user during processing."
print(status_message)
return None, status_message
print("25. Starting Vevo Voice Cloning inference...")
gen_audio = inference_pipeline_vc_style.inference_ar_and_fm(
src_wav_path=content_wav_path,
src_text=None,
style_ref_wav_path=reference_wav_path,
timbre_ref_wav_path=reference_wav_path,
)
save_audio(gen_audio, output_path=output_path)
status_message = "Processing complete! Saved to: " + output_path
print("26. Voice Cloning processing complete. Output saved to:", output_path)
return output_path, status_message
except Exception as e:
if "aborted by user" in str(e).lower():
status_message = "Voice Cloning stopped by user."
print("Voice Cloning stopped by user.")
else:
status_message = f"Voice Cloning failed! Error: {e}"
print(f"27. Voice Cloning failed with error: {e}")
return None, status_message
finally:
process_active = False
current_process_name = None
# unload_all_models() # No need to unload here anymore, unloaded on tab switch or unload button
def vevo_style_gradio(content_wav_path, style_wav_path):
"""Generates style transferred audio using Vevo."""
global inference_pipeline_vc_style, process_active, should_stop, current_process_name, current_pipeline_name
current_process_name = "Style Transfer"
process_active = True
should_stop = False
audio_output_path = None
if current_pipeline_name != "Style": # Unload models if switching tabs
unload_all_models()
current_pipeline_name = None # Reset pipeline name after unloading
if inference_pipeline_vc_style is None: # Load Style model only if not already loaded
inference_pipeline_vc_style = load_vevo_voice_style_pipeline()
if inference_pipeline_vc_style is None:
return None, "Failed to load Vevo VC/Style pipeline."
current_pipeline_name = "Style" # Set current pipeline name after loading
try:
if should_stop:
status_message = "Style Transfer stopped by user before processing."
print(status_message)
return None, status_message
original_filename_base = os.path.splitext(os.path.basename(content_wav_path))[0] if content_wav_path else "audiofile"
target_voice_name = os.path.splitext(os.path.basename(style_wav_path))[0] if style_wav_path else "target_style" # Using style_wav_path for target style name
output_filename = f"{original_filename_base} as {target_voice_name}_1.wav" # Modified output filename - _1 suffix, no timestamp
output_path = os.path.join(OUTPUT_DIR_WAV, output_filename)
audio_output_path = output_path
status_message = "Generating style transfer..."
if should_stop:
status_message = "Style Transfer stopped by user during processing."
print(status_message)
return None, status_message
print("30. Starting Vevo Style Transfer inference...")
gen_audio = inference_pipeline_vc_style.inference_ar_and_fm(
src_wav_path=content_wav_path,
src_text=None,
style_ref_wav_path=style_wav_path,
timbre_ref_wav_path=content_wav_path, # Timbre ref is content for style transfer
)
save_audio(gen_audio, output_path=output_path)
status_message = "Processing complete! Saved to: " + output_path
print("31. Style Transfer processing complete. Output saved to:", output_path)
return output_path, status_message
except Exception as e:
if "aborted by user" in str(e).lower():
status_message = "Style Transfer stopped by user."
print("Style Transfer stopped by user.")
else:
status_message = f"Style Transfer failed! Error: {e}"
print(f"32. Style Transfer failed with error: {e}")
return None, status_message
finally:
process_active = False
current_process_name = None
# unload_all_models() # No need to unload here anymore, unloaded on tab switch or unload button
def load_vevo_timbre_pipeline():
"""Loads the Vevo Timbre Transfer inference pipeline."""
global inference_pipeline_timbre
if inference_pipeline_timbre is not None:
print("35. Vevo Timbre pipeline already loaded, skipping reload.")
return inference_pipeline_timbre
status_message = "Loading Vevo Timbre pipeline..."
print("36. Loading Vevo Timbre pipeline components...")
try:
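        # Timbre transfer skips the AR stage entirely: only the vq8192 tokenizer,
        # the flow-matching model, and the vocoder are needed (hence inference_fm
        # rather than inference_ar_and_fm below).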
local_dir_tokenizer = snapshot_download(
repo_id="amphion/Vevo",
repo_type="model",
cache_dir="./ckpts/Vevo",
allow_patterns=["tokenizer/vq8192/*"],
)
tokenizer_ckpt_path = os.path.join(local_dir_tokenizer, "tokenizer/vq8192")
local_dir_fmt = snapshot_download(
repo_id="amphion/Vevo",
repo_type="model",
cache_dir="./ckpts/Vevo",
allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
)
fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
fmt_ckpt_path = os.path.join(local_dir_fmt, "acoustic_modeling/Vq8192ToMels")
local_dir_vocoder = snapshot_download(
repo_id="amphion/Vevo",
repo_type="model",
cache_dir="./ckpts/Vevo",
allow_patterns=["acoustic_modeling/Vocoder/*"],
)
vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
vocoder_ckpt_path = os.path.join(local_dir_vocoder, "acoustic_modeling/Vocoder")
inference_pipeline_timbre = VevoInferencePipeline(
content_style_tokenizer_ckpt_path=tokenizer_ckpt_path,
fmt_cfg_path=fmt_cfg_path,
fmt_ckpt_path=fmt_ckpt_path,
vocoder_cfg_path=vocoder_cfg_path,
vocoder_ckpt_path=vocoder_ckpt_path,
device=device,
)
print("37. Vevo Timbre pipeline loaded successfully.")
return inference_pipeline_timbre
except Exception as e:
print(f"38. Error loading Vevo Timbre pipeline: {e}")
return None
def vevo_timbre_gradio(content_wav_path, reference_wav_path):
"""Generates timbre transferred audio using Vevo."""
global inference_pipeline_timbre, process_active, should_stop, current_process_name, current_pipeline_name
current_process_name = "Timbre Transfer"
process_active = True
should_stop = False
audio_output_path = None
if current_pipeline_name != "Timbre": # Unload models if switching tabs
unload_all_models()
current_pipeline_name = None # Reset pipeline name after unloading
if inference_pipeline_timbre is None: # Load Timbre model only if not already loaded
inference_pipeline_timbre = load_vevo_timbre_pipeline()
if inference_pipeline_timbre is None:
return None, "Failed to load Vevo Timbre pipeline."
current_pipeline_name = "Timbre" # Set current pipeline name after loading
try:
if should_stop:
status_message = "Timbre Transfer stopped by user before processing."
print(status_message)
return None, status_message
original_filename_base = os.path.splitext(os.path.basename(content_wav_path))[0] if content_wav_path else "audiofile"
target_voice_name = os.path.splitext(os.path.basename(reference_wav_path))[0] if reference_wav_path else "target_timbre" # Using reference_wav_path for target timbre name
output_filename = f"{original_filename_base} as {target_voice_name}_1.wav" # Modified output filename for Timbre Transfer - _1 suffix, no timestamp - Corrected order
output_path = os.path.join(OUTPUT_DIR_WAV, output_filename)
audio_output_path = output_path
status_message = "Generating timbre transfer..."
if should_stop:
status_message = "Timbre Transfer stopped by user during processing."
print(status_message)
return None, status_message
print("39. Starting Vevo Timbre Transfer inference...")
gen_audio = inference_pipeline_timbre.inference_fm(
src_wav_path=content_wav_path,
timbre_ref_wav_path=reference_wav_path,
            flow_matching_steps=32,  # number of ODE steps for the flow-matching decoder
)
save_audio(gen_audio, output_path=output_path)
status_message = "Processing complete! Saved to: " + output_path
print("40. Timbre Transfer processing complete. Output saved to:", output_path)
return output_path, status_message
except Exception as e:
if "aborted by user" in str(e).lower():
status_message = "Timbre Transfer stopped by user."
print("Timbre Transfer stopped by user.")
else:
status_message = f"Timbre Transfer failed! Error: {e}"
print(f"41. Timbre Transfer failed with error: {e}")
return None, status_message
finally:
process_active = False
current_process_name = None
# unload_all_models() # No need to unload here anymore, unloaded on tab switch or unload button
def unload_vevo_tts_model():
"""Unloads the Vevo TTS pipeline to release VRAM."""
global inference_pipeline_tts
if inference_pipeline_tts is not None:
print("44. Unloading Vevo TTS model...")
        inference_pipeline_tts = None
        gc.collect()  # drop the last reference before clearing the CUDA cache
        torch.cuda.empty_cache()
print("45. Vevo TTS model unloaded.")
else:
print("Vevo TTS model is not loaded.")
def unload_vevo_vc_style_model():
"""Unloads the Vevo VC/Style pipeline to release VRAM."""
global inference_pipeline_vc_style
if inference_pipeline_vc_style is not None:
print("46. Unloading Vevo VC/Style model...")
        inference_pipeline_vc_style = None
        gc.collect()  # drop the last reference before clearing the CUDA cache
        torch.cuda.empty_cache()
print("47. Vevo VC/Style model unloaded.")
else:
print("Vevo VC/Style model is not loaded.")
def unload_vevo_timbre_model():
"""Unloads the Vevo Timbre pipeline to release VRAM."""
global inference_pipeline_timbre
if inference_pipeline_timbre is not None:
print("48. Unloading Vevo Timbre model...")
        inference_pipeline_timbre = None
        gc.collect()  # drop the last reference before clearing the CUDA cache
        torch.cuda.empty_cache()
print("49. Vevo Timbre model unloaded.")
else:
print("Vevo Timbre model is not loaded.")
def unload_whisper_model():
"""Unloads the Whisper model to release VRAM."""
global whisper_model_local
if whisper_model_local is not None:
print("50. Unloading Whisper model...")
        whisper_model_local = None
        gc.collect()  # drop the last reference before clearing the CUDA cache
        torch.cuda.empty_cache()
        print("51. Whisper model unloaded.")
    else:
        print("Whisper model is not loaded.")
def unload_all_models():
"""Unloads all Vevo and Whisper models."""
global current_pipeline_name
unload_vevo_tts_model()
unload_vevo_vc_style_model()
unload_vevo_timbre_model()
unload_whisper_model()
current_pipeline_name = None # Reset current pipeline name when unloading all
print("52. All models unloaded via unload_all_models.")
gc.collect() # Explicit garbage collection to free memory
torch.cuda.empty_cache() # Clear CUDA cache
def unload_all_models_button_click():
"""Unloads all models when the button is clicked."""
unload_all_models()
return "All models unloaded."
with gr.Blocks() as iface:
gr.Markdown("# Vevo Voice Conversion and TTS Gradio Demo")
stop_btn = gr.Button("🚫 Unload All Models", variant="stop") # Renamed button to "Unload All Models" and icon changed
stop_status = gr.Textbox(label="Stop Status", interactive=False, visible=False) # Hide stop status textbox
status_display = gr.Label(label="Status", value="Waiting for input...")
print(f"DEBUG INTERFACE: [Definition] Type of status_display: {type(status_display)}, ID: {id(status_display)}")
gr.Markdown(f"Gradio version: {gr.__version__}")
stop_btn.click(
fn=unload_all_models_button_click,
outputs=stop_status # Keep output for potential debugging, but hide in UI
)
with gr.Tabs() as tabs:
with gr.Tab("TTS") as tts_tab:
gr.Markdown("## Zero-Shot Text-to-Speech (TTS)") # Added short description
with gr.Row():
with gr.Column():
tts_src_text_input = gr.Textbox(lines=2, label="Source Text", placeholder="Enter text to synthesize") # Reduced lines for compactness
tts_style_audio_input = gr.Audio(type="filepath", label="Style Reference Audio (Voice to Mimic)") # Added info to label
tts_timbre_audio_input = gr.Audio(type="filepath", label="Timbre Reference Audio (Optional, for fine-tuning timbre)") # Added info and made optional
tts_ref_text_output = gr.Textbox(lines=1, label="Reference Text (Auto-filled from Style Audio)", interactive=False, visible=False) # Hide ref text output
with gr.Row():
tts_src_lang_dropdown = gr.Dropdown(["en", "ar", "zh", "es", "fr", "de"], value="en", label="Source Language")
tts_ref_lang_dropdown = gr.Dropdown(["en", "ar", "zh", "es", "fr", "de"], value="en", label="Reference Language")
tts_generate_button = gr.Button("Generate Speech", variant="primary") # Removed unload button
with gr.Column():
tts_audio_output = gr.Audio(label="Generated Speech")
tts_generate_button.click(
fn=vevo_tts_gradio,
inputs=[
tts_src_text_input,
tts_style_audio_input,
tts_timbre_audio_input,
tts_src_lang_dropdown,
tts_ref_lang_dropdown,
],
outputs=[tts_audio_output, tts_ref_text_output, status_display],
)
with gr.Tab("Voice Cloning") as vc_tab:
gr.Markdown("## Voice Cloning (VC)") # Added short description
with gr.Row():
with gr.Column():
vc_content_audio_input = gr.Audio(type="filepath", label="Content Audio (Audio to be converted)") # Added info to label
vc_reference_audio_input = gr.Audio(type="filepath", label="Reference Voice Audio (Voice to clone)") # Added info to label
vc_generate_button = gr.Button("Clone Voice", variant="primary") # Removed unload button
with gr.Column():
vc_audio_output = gr.Audio(label="Generated Voice Clone")
vc_generate_button.click(
fn=vevo_voice_gradio,
inputs=[vc_content_audio_input, vc_reference_audio_input],
outputs=[vc_audio_output, status_display],
)
with gr.Tab("Style Transfer") as vs_tab:
gr.Markdown("## Style Transfer (VS)") # Added short description
with gr.Row():
with gr.Column():
vs_content_audio_input = gr.Audio(type="filepath", label="Content Audio (Audio to be styled)") # Added info to label
vs_style_audio_input = gr.Audio(type="filepath", label="Style Audio (Style to transfer)") # Added info to label
vs_generate_button = gr.Button("Transfer Style", variant="primary") # Removed unload button
with gr.Column():
vs_audio_output = gr.Audio(label="Generated Style Transferred Audio")
vs_generate_button.click(
fn=vevo_style_gradio,
inputs=[vs_content_audio_input, vs_style_audio_input],
outputs=[vs_audio_output, status_display],
)
with gr.Tab("Timbre Transfer") as vt_tab:
gr.Markdown("## Timbre Transfer (VT)") # Added short description
with gr.Row():
with gr.Column():
vt_content_audio_input = gr.Audio(type="filepath", label="Content Audio (Audio to be converted)") # Added info to label
vt_reference_audio_input = gr.Audio(type="filepath", label="Timbre Reference Audio (Timbre to transfer)") # Added info to label
vt_generate_button = gr.Button("Transfer Timbre", variant="primary") # Removed unload button
with gr.Column():
vt_audio_output = gr.Audio(label="Generated Timbre Transferred Audio")
vt_generate_button.click(
fn=vevo_timbre_gradio,
inputs=[vt_content_audio_input, vt_reference_audio_input],
outputs=[vt_audio_output, status_display],
)
iface.launch()
Run the Gradio UI:
python gardio.py
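By default the UI is served locally. If you also want a temporary public link (for example, to test from another machine), one option is Gradio's standard share flag, changing the last line of gardio.py:

```python
iface.launch(share=True)  # tunnels the local UI through a temporary public gradio.live URL
```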
Troubleshooting:
- `UnicodeDecodeError`: Double-check the encoding fixes in `g2p_generation.py` and `__init__.py`.
- eSpeak-NG not found: Verify the install paths in `gardio.py` and that eSpeak-NG is correctly installed.
- Dependency issues: Upgrade pip and, if errors persist, the offending packages.
Enjoy experimenting with Vevo! Let us know if you have questions or improvements.
@search620 Why don't you publish a demo on the HuggingFace platform? It would be really great. Thanks