# VoiceClone-TTS / app.py
import os
import shlex
import subprocess
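# flash-attn, mamba-ssm, and causal-conv1d ship CUDA extensions that are awkward to build on
# Hugging Face Spaces, so they are installed at runtime from prebuilt wheels pinned to
# CUDA 12 / torch 2.4 / Python 3.10. FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE tells flash-attn's
# setup to skip compiling its CUDA kernels.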
subprocess.run(
shlex.split("pip install flash-attn --no-build-isolation"),
env=os.environ | {"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
check=True,
)
subprocess.run(
shlex.split("pip install https://github.com/state-spaces/mamba/releases/download/v2.2.4/mamba_ssm-2.2.4+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"),
check=True,
)
subprocess.run(
shlex.split("pip install https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.5.0.post8/causal_conv1d-1.5.0.post8+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"),
check=True,
)
import spaces
import torch
import torchaudio
import gradio as gr
from os import getenv
from zonos.model import Zonos
from zonos.conditioning import make_cond_dict, supported_language_codes
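# Load both Zonos checkpoints once at startup and freeze them for inference; the UI switches
# between the transformer and hybrid variants without reloading weights.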
device = "cuda"
MODEL_NAMES = ["Zyphra/Zonos-v0.1-transformer", "Zyphra/Zonos-v0.1-hybrid"]
MODELS = {name: Zonos.from_pretrained(name, device=device) for name in MODEL_NAMES}
for model in MODELS.values():
model.requires_grad_(False).eval()
def update_ui(model_choice):
"""
Dynamically show/hide UI elements based on the model's conditioners.
We do NOT display 'language_id' or 'ctc_loss' even if they exist in the model.
"""
model = MODELS[model_choice]
cond_names = [c.name for c in model.prefix_conditioner.conditioners]
print("Conditioners in this model:", cond_names)
text_update = gr.update(visible=("espeak" in cond_names))
language_update = gr.update(visible=("espeak" in cond_names))
speaker_audio_update = gr.update(visible=("speaker" in cond_names))
prefix_audio_update = gr.update(visible=True)
emotion1_update = gr.update(visible=("emotion" in cond_names))
emotion2_update = gr.update(visible=("emotion" in cond_names))
emotion3_update = gr.update(visible=("emotion" in cond_names))
emotion4_update = gr.update(visible=("emotion" in cond_names))
emotion5_update = gr.update(visible=("emotion" in cond_names))
emotion6_update = gr.update(visible=("emotion" in cond_names))
emotion7_update = gr.update(visible=("emotion" in cond_names))
emotion8_update = gr.update(visible=("emotion" in cond_names))
vq_single_slider_update = gr.update(visible=("vqscore_8" in cond_names))
fmax_slider_update = gr.update(visible=("fmax" in cond_names))
pitch_std_slider_update = gr.update(visible=("pitch_std" in cond_names))
speaking_rate_slider_update = gr.update(visible=("speaking_rate" in cond_names))
dnsmos_slider_update = gr.update(visible=("dnsmos_ovrl" in cond_names))
speaker_noised_checkbox_update = gr.update(visible=("speaker_noised" in cond_names))
unconditional_keys_update = gr.update(
choices=[name for name in cond_names if name not in ("espeak", "language_id")]
)
return (
text_update,
language_update,
speaker_audio_update,
prefix_audio_update,
emotion1_update,
emotion2_update,
emotion3_update,
emotion4_update,
emotion5_update,
emotion6_update,
emotion7_update,
emotion8_update,
vq_single_slider_update,
fmax_slider_update,
pitch_std_slider_update,
speaking_rate_slider_update,
dnsmos_slider_update,
speaker_noised_checkbox_update,
unconditional_keys_update,
)
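# On ZeroGPU Spaces, @spaces.GPU attaches a GPU to this function call for up to 120 seconds.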
@spaces.GPU(duration=120)
def generate_audio(
model_choice,
text,
language,
speaker_audio,
prefix_audio,
e1,
e2,
e3,
e4,
e5,
e6,
e7,
e8,
vq_single,
fmax,
pitch_std,
speaking_rate,
dnsmos_ovrl,
speaker_noised,
cfg_scale,
min_p,
seed,
randomize_seed,
unconditional_keys,
progress=gr.Progress(),
):
"""
Generates audio based on the provided UI parameters.
We do NOT use language_id or ctc_loss even if the model has them.
"""
selected_model = MODELS[model_choice]
speaker_noised_bool = bool(speaker_noised)
fmax = float(fmax)
pitch_std = float(pitch_std)
speaking_rate = float(speaking_rate)
dnsmos_ovrl = float(dnsmos_ovrl)
cfg_scale = float(cfg_scale)
min_p = float(min_p)
seed = int(seed)
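# The autoencoder emits roughly 86 codec frames per second, so this caps output at about 30 s.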
max_new_tokens = 86 * 30
if randomize_seed:
seed = torch.randint(0, 2**32 - 1, (1,)).item()
torch.manual_seed(seed)
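# Voice cloning: derive a speaker embedding from the reference recording, unless "speaker"
# has been marked unconditional in the UI.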
speaker_embedding = None
if speaker_audio is not None and "speaker" not in unconditional_keys:
wav, sr = torchaudio.load(speaker_audio)
speaker_embedding = selected_model.make_speaker_embedding(wav, sr)
speaker_embedding = speaker_embedding.to(device, dtype=torch.bfloat16)
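# Optional prefix audio is downmixed to mono, resampled to the autoencoder's rate, and encoded
# into codes so that generation continues from it.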
audio_prefix_codes = None
if prefix_audio is not None:
wav_prefix, sr_prefix = torchaudio.load(prefix_audio)
wav_prefix = wav_prefix.mean(0, keepdim=True)
wav_prefix = torchaudio.functional.resample(wav_prefix, sr_prefix, selected_model.autoencoder.sampling_rate)
wav_prefix = wav_prefix.to(device, dtype=torch.float32)
with torch.autocast(device, dtype=torch.float32):
audio_prefix_codes = selected_model.autoencoder.encode(wav_prefix.unsqueeze(0))
emotion_tensor = torch.tensor(list(map(float, [e1, e2, e3, e4, e5, e6, e7, e8])), device=device)
vq_val = float(vq_single)
vq_tensor = torch.tensor([vq_val] * 8, device=device).unsqueeze(0)
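# Bundle every control into one conditioning dict; keys listed in unconditional_keys are made
# unconditional by make_cond_dict instead of being constrained to the UI values.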
cond_dict = make_cond_dict(
text=text,
language=language,
speaker=speaker_embedding,
emotion=emotion_tensor,
vqscore_8=vq_tensor,
fmax=fmax,
pitch_std=pitch_std,
speaking_rate=speaking_rate,
dnsmos_ovrl=dnsmos_ovrl,
speaker_noised=speaker_noised_bool,
device=device,
unconditional_keys=unconditional_keys,
)
conditioning = selected_model.prepare_conditioning(cond_dict)
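# Rough progress estimate: assume ~400 characters span 30 seconds of speech and ~86 generation
# steps per second of audio, then report progress against that step total.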
estimated_generation_duration = 30 * len(text) / 400
estimated_total_steps = int(estimated_generation_duration * 86)
def update_progress(_frame: torch.Tensor, step: int, _total_steps: int) -> bool:
progress((step, estimated_total_steps))
return True
codes = selected_model.generate(
prefix_conditioning=conditioning,
audio_prefix_codes=audio_prefix_codes,
max_new_tokens=max_new_tokens,
cfg_scale=cfg_scale,
batch_size=1,
sampling_params=dict(min_p=min_p),
callback=update_progress,
)
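# Decode the generated codes back to a waveform and keep a single channel for playback.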
wav_out = selected_model.autoencoder.decode(codes).cpu().detach()
sr_out = selected_model.autoencoder.sampling_rate
if wav_out.dim() == 2 and wav_out.size(0) > 1:
wav_out = wav_out[0:1, :]
return (sr_out, wav_out.squeeze().numpy()), seed
# Custom CSS for pastel gradient background and enhanced UI
custom_css = """
.gradio-container {
background: linear-gradient(135deg, #f3e7ff, #e6f0ff, #ffe6f2, #e6fff9);
background-size: 400% 400%;
animation: gradient 15s ease infinite;
}
@keyframes gradient {
0% {
background-position: 0% 50%;
}
50% {
background-position: 100% 50%;
}
100% {
background-position: 0% 50%;
}
}
.container {
max-width: 1200px;
margin: 0 auto;
padding: 20px;
}
.panel {
background-color: rgba(255, 255, 255, 0.7);
border-radius: 16px;
padding: 20px;
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
margin-bottom: 16px;
backdrop-filter: blur(5px);
transition: all 0.3s ease;
}
.panel:hover {
box-shadow: 0 6px 16px rgba(0, 0, 0, 0.12);
transform: translateY(-2px);
}
.title {
font-size: 1.2em;
font-weight: 600;
margin-bottom: 12px;
color: #6a3ea1;
border-bottom: 2px solid #f0e6ff;
padding-bottom: 8px;
}
.slider-container {
background-color: rgba(255, 255, 255, 0.5);
border-radius: 10px;
padding: 10px;
margin: 5px 0;
}
/* Make sliders more appealing */
input[type=range] {
height: 5px;
appearance: none;
width: 100%;
border-radius: 3px;
background: linear-gradient(90deg, #9c83e0, #83b1e0);
}
.generate-button {
background: linear-gradient(90deg, #a673ff, #7c4dff);
color: white;
border: none;
border-radius: 8px;
padding: 12px 24px;
font-size: 16px;
font-weight: 500;
cursor: pointer;
transition: all 0.3s ease;
box-shadow: 0 4px 10px rgba(124, 77, 255, 0.2);
display: block;
width: 100%;
margin: 20px 0;
}
.generate-button:hover {
background: linear-gradient(90deg, #9c5eff, #6a3aff);
box-shadow: 0 6px 15px rgba(124, 77, 255, 0.3);
transform: translateY(-2px);
}
/* Tabs styling */
.tabs {
display: flex;
border-bottom: 1px solid #e0e0e0;
margin-bottom: 20px;
}
.tab {
padding: 10px 20px;
cursor: pointer;
transition: all 0.3s ease;
background-color: transparent;
border: none;
color: #666;
}
.tab.active {
color: #7c4dff;
border-bottom: 3px solid #7c4dff;
font-weight: 600;
}
/* Emotion sliders container */
.emotion-grid {
display: grid;
grid-template-columns: repeat(4, 1fr);
gap: 12px;
}
/* Header styling */
.app-header {
text-align: center;
margin-bottom: 25px;
}
.app-header h1 {
font-size: 2.5em;
color: #6a3ea1;
margin-bottom: 8px;
font-weight: 700;
}
.app-header p {
font-size: 1.1em;
color: #666;
margin-bottom: 20px;
}
/* Audio player styling */
.audio-output {
margin-top: 20px;
}
/* Make output area more prominent */
.output-container {
background-color: rgba(255, 255, 255, 0.85);
border-radius: 16px;
padding: 24px;
box-shadow: 0 8px 18px rgba(0, 0, 0, 0.1);
margin-top: 20px;
}
"""
def build_interface():
# Build interface with enhanced visual elements and layout
with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
# Header section
with gr.Column(elem_classes="app-header"):
gr.Markdown("# ✨ Zonos Text-to-Speech Generator ✨")
gr.Markdown("Create natural-sounding speech with customizable voice characteristics")
# Main content container
with gr.Column(elem_classes="container"):
# First panel - Text & Model Selection
with gr.Column(elem_classes="panel"):
gr.Markdown('<div class="title">💬 Text & Model Configuration</div>')
with gr.Row():
with gr.Column(scale=2):
model_choice = gr.Dropdown(
choices=MODEL_NAMES,
value="Zyphra/Zonos-v0.1-transformer",
label="Zonos Model Type",
info="Select the model variant to use.",
)
text = gr.Textbox(
label="Text to Synthesize",
value="Zonos uses eSpeak for text to phoneme conversion!",
lines=4,
max_length=500,
)
language = gr.Dropdown(
choices=supported_language_codes,
value="en-us",
label="Language Code",
info="Select a language code.",
)
with gr.Column(scale=1):
prefix_audio = gr.Audio(
value="assets/silence_100ms.wav",
label="Optional Prefix Audio (continue from this audio)",
type="filepath",
)
# Second panel - Voice Characteristics
with gr.Column(elem_classes="panel"):
gr.Markdown('<div class="title">🎤 Voice Characteristics</div>')
with gr.Row():
with gr.Column(scale=1):
speaker_audio = gr.Audio(
label="Optional Speaker Audio (for voice cloning)",
type="filepath",
)
speaker_noised_checkbox = gr.Checkbox(label="Denoise Speaker?", value=False)
with gr.Column(scale=2):
with gr.Row():
with gr.Column():
dnsmos_slider = gr.Slider(1.0, 5.0, value=4.0, step=0.1, label="Voice Quality", elem_classes="slider-container")
fmax_slider = gr.Slider(0, 24000, value=24000, step=1, label="Frequency Max (Hz)", elem_classes="slider-container")
vq_single_slider = gr.Slider(0.5, 0.8, 0.78, 0.01, label="Voice Clarity", elem_classes="slider-container")
with gr.Column():
pitch_std_slider = gr.Slider(0.0, 300.0, value=45.0, step=1, label="Pitch Variation", elem_classes="slider-container")
speaking_rate_slider = gr.Slider(5.0, 30.0, value=15.0, step=0.5, label="Speaking Rate", elem_classes="slider-container")
# Third panel - Generation Parameters
with gr.Column(elem_classes="panel"):
gr.Markdown('<div class="title">⚙️ Generation Parameters</div>')
with gr.Row():
with gr.Column():
cfg_scale_slider = gr.Slider(1.0, 5.0, 2.0, 0.1, label="Guidance Scale", elem_classes="slider-container")
min_p_slider = gr.Slider(0.0, 1.0, 0.15, 0.01, label="Min P (Randomness)", elem_classes="slider-container")
with gr.Column():
seed_number = gr.Number(label="Seed", value=420, precision=0)
randomize_seed_toggle = gr.Checkbox(label="Randomize Seed (before generation)", value=True)
# Emotion Panel with Tabbed Interface
with gr.Accordion("🎭 Emotion Settings", open=False, elem_classes="panel"):
gr.Markdown(
"Adjust these sliders to control the emotional tone of the generated speech.\n"
"For a neutral voice, keep 'Neutral' high and other emotions low."
)
with gr.Row(elem_classes="emotion-grid"):
emotion1 = gr.Slider(0.0, 1.0, 1.0, 0.05, label="Happiness", elem_classes="slider-container")
emotion2 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Sadness", elem_classes="slider-container")
emotion3 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Disgust", elem_classes="slider-container")
emotion4 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Fear", elem_classes="slider-container")
with gr.Row(elem_classes="emotion-grid"):
emotion5 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Surprise", elem_classes="slider-container")
emotion6 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Anger", elem_classes="slider-container")
emotion7 = gr.Slider(0.0, 1.0, 0.1, 0.05, label="Other", elem_classes="slider-container")
emotion8 = gr.Slider(0.0, 1.0, 0.2, 0.05, label="Neutral", elem_classes="slider-container")
# Advanced Settings Panel
with gr.Accordion("⚡ Advanced Settings", open=False, elem_classes="panel"):
gr.Markdown(
"### Unconditional Toggles\n"
"Checking a box will make the model ignore the corresponding conditioning value and make it unconditional.\n"
'Practically this means the given conditioning feature will be unconstrained and "filled in automatically".'
)
unconditional_keys = gr.CheckboxGroup(
[
"speaker",
"emotion",
"vqscore_8",
"fmax",
"pitch_std",
"speaking_rate",
"dnsmos_ovrl",
"speaker_noised",
],
value=["emotion"],
label="Unconditional Keys",
)
# Generate Button and Output Area
with gr.Column(elem_classes="panel output-container"):
gr.Markdown('<div class="title">🔊 Generate & Output</div>')
generate_button = gr.Button("Generate Audio", elem_classes="generate-button")
output_audio = gr.Audio(label="Generated Audio", type="numpy", autoplay=True, elem_classes="audio-output")
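# Switching models re-runs update_ui so that only controls backed by the selected model's
# conditioners remain visible; the same refresh also runs once on page load.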
model_choice.change(
fn=update_ui,
inputs=[model_choice],
outputs=[
text,
language,
speaker_audio,
prefix_audio,
emotion1,
emotion2,
emotion3,
emotion4,
emotion5,
emotion6,
emotion7,
emotion8,
vq_single_slider,
fmax_slider,
pitch_std_slider,
speaking_rate_slider,
dnsmos_slider,
speaker_noised_checkbox,
unconditional_keys,
],
)
# On page load, trigger the same UI refresh
demo.load(
fn=update_ui,
inputs=[model_choice],
outputs=[
text,
language,
speaker_audio,
prefix_audio,
emotion1,
emotion2,
emotion3,
emotion4,
emotion5,
emotion6,
emotion7,
emotion8,
vq_single_slider,
fmax_slider,
pitch_std_slider,
speaking_rate_slider,
dnsmos_slider,
speaker_noised_checkbox,
unconditional_keys,
],
)
# Generate audio on button click
generate_button.click(
fn=generate_audio,
inputs=[
model_choice,
text,
language,
speaker_audio,
prefix_audio,
emotion1,
emotion2,
emotion3,
emotion4,
emotion5,
emotion6,
emotion7,
emotion8,
vq_single_slider,
fmax_slider,
pitch_std_slider,
speaking_rate_slider,
dnsmos_slider,
speaker_noised_checkbox,
cfg_scale_slider,
min_p_slider,
seed_number,
randomize_seed_toggle,
unconditional_keys,
],
outputs=[output_audio, seed_number],
)
return demo
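# Launch on 0.0.0.0:7860 (the standard Spaces port) with the MCP server enabled; a public
# share link can be requested via the environment, e.g. GRADIO_SHARE=true python app.py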
if __name__ == "__main__":
demo = build_interface()
share = getenv("GRADIO_SHARE", "False").lower() in ("true", "1", "t")
demo.launch(server_name="0.0.0.0", server_port=7860, share=share, mcp_server=True)