import os
import shlex
import subprocess

# Install GPU-only dependencies at startup: prebuilt wheels for mamba-ssm and
# causal-conv1d, and flash-attn with its CUDA build step skipped.
subprocess.run(
    shlex.split("pip install flash-attn --no-build-isolation"),
    env=os.environ | {"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    check=True,
)
subprocess.run(
    shlex.split("pip install https://github.com/state-spaces/mamba/releases/download/v2.2.4/mamba_ssm-2.2.4+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"),
    check=True,
)
subprocess.run(
    shlex.split("pip install https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.5.0.post8/causal_conv1d-1.5.0.post8+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"),
    check=True,
)

import spaces
import torch
import torchaudio
import gradio as gr
from os import getenv

from zonos.model import Zonos
from zonos.conditioning import make_cond_dict, supported_language_codes

device = "cuda"
MODEL_NAMES = ["Zyphra/Zonos-v0.1-transformer", "Zyphra/Zonos-v0.1-hybrid"]
MODELS = {name: Zonos.from_pretrained(name, device=device) for name in MODEL_NAMES}
for model in MODELS.values():
    model.requires_grad_(False).eval()


def update_ui(model_choice):
    """
    Dynamically show/hide UI elements based on the model's conditioners.
    We do NOT display 'language_id' or 'ctc_loss' even if they exist in the model.
    """
    model = MODELS[model_choice]
    cond_names = [c.name for c in model.prefix_conditioner.conditioners]
    print("Conditioners in this model:", cond_names)

    text_update = gr.update(visible=("espeak" in cond_names))
    language_update = gr.update(visible=("espeak" in cond_names))
    speaker_audio_update = gr.update(visible=("speaker" in cond_names))
    prefix_audio_update = gr.update(visible=True)
    emotion1_update = gr.update(visible=("emotion" in cond_names))
    emotion2_update = gr.update(visible=("emotion" in cond_names))
    emotion3_update = gr.update(visible=("emotion" in cond_names))
    emotion4_update = gr.update(visible=("emotion" in cond_names))
    emotion5_update = gr.update(visible=("emotion" in cond_names))
    emotion6_update = gr.update(visible=("emotion" in cond_names))
    emotion7_update = gr.update(visible=("emotion" in cond_names))
    emotion8_update = gr.update(visible=("emotion" in cond_names))
    vq_single_slider_update = gr.update(visible=("vqscore_8" in cond_names))
    fmax_slider_update = gr.update(visible=("fmax" in cond_names))
    pitch_std_slider_update = gr.update(visible=("pitch_std" in cond_names))
    speaking_rate_slider_update = gr.update(visible=("speaking_rate" in cond_names))
    dnsmos_slider_update = gr.update(visible=("dnsmos_ovrl" in cond_names))
    speaker_noised_checkbox_update = gr.update(visible=("speaker_noised" in cond_names))
    unconditional_keys_update = gr.update(
        choices=[name for name in cond_names if name not in ("espeak", "language_id")]
    )

    return (
        text_update,
        language_update,
        speaker_audio_update,
        prefix_audio_update,
        emotion1_update,
        emotion2_update,
        emotion3_update,
        emotion4_update,
        emotion5_update,
        emotion6_update,
        emotion7_update,
        emotion8_update,
        vq_single_slider_update,
        fmax_slider_update,
        pitch_std_slider_update,
        speaking_rate_slider_update,
        dnsmos_slider_update,
        speaker_noised_checkbox_update,
        unconditional_keys_update,
    )


@spaces.GPU(duration=120)
def generate_audio(
    model_choice,
    text,
    language,
    speaker_audio,
    prefix_audio,
    e1,
    e2,
    e3,
    e4,
    e5,
    e6,
    e7,
    e8,
    vq_single,
    fmax,
    pitch_std,
    speaking_rate,
    dnsmos_ovrl,
    speaker_noised,
    cfg_scale,
    min_p,
    seed,
    randomize_seed,
    unconditional_keys,
    progress=gr.Progress(),
):
    """
    Generates audio based on the provided UI parameters.
    We do NOT use language_id or ctc_loss even if the model has them.
""" selected_model = MODELS[model_choice] speaker_noised_bool = bool(speaker_noised) fmax = float(fmax) pitch_std = float(pitch_std) speaking_rate = float(speaking_rate) dnsmos_ovrl = float(dnsmos_ovrl) cfg_scale = float(cfg_scale) min_p = float(min_p) seed = int(seed) max_new_tokens = 86 * 30 if randomize_seed: seed = torch.randint(0, 2**32 - 1, (1,)).item() torch.manual_seed(seed) speaker_embedding = None if speaker_audio is not None and "speaker" not in unconditional_keys: wav, sr = torchaudio.load(speaker_audio) speaker_embedding = selected_model.make_speaker_embedding(wav, sr) speaker_embedding = speaker_embedding.to(device, dtype=torch.bfloat16) audio_prefix_codes = None if prefix_audio is not None: wav_prefix, sr_prefix = torchaudio.load(prefix_audio) wav_prefix = wav_prefix.mean(0, keepdim=True) wav_prefix = torchaudio.functional.resample(wav_prefix, sr_prefix, selected_model.autoencoder.sampling_rate) wav_prefix = wav_prefix.to(device, dtype=torch.float32) with torch.autocast(device, dtype=torch.float32): audio_prefix_codes = selected_model.autoencoder.encode(wav_prefix.unsqueeze(0)) emotion_tensor = torch.tensor(list(map(float, [e1, e2, e3, e4, e5, e6, e7, e8])), device=device) vq_val = float(vq_single) vq_tensor = torch.tensor([vq_val] * 8, device=device).unsqueeze(0) cond_dict = make_cond_dict( text=text, language=language, speaker=speaker_embedding, emotion=emotion_tensor, vqscore_8=vq_tensor, fmax=fmax, pitch_std=pitch_std, speaking_rate=speaking_rate, dnsmos_ovrl=dnsmos_ovrl, speaker_noised=speaker_noised_bool, device=device, unconditional_keys=unconditional_keys, ) conditioning = selected_model.prepare_conditioning(cond_dict) estimated_generation_duration = 30 * len(text) / 400 estimated_total_steps = int(estimated_generation_duration * 86) def update_progress(_frame: torch.Tensor, step: int, _total_steps: int) -> bool: progress((step, estimated_total_steps)) return True codes = selected_model.generate( prefix_conditioning=conditioning, audio_prefix_codes=audio_prefix_codes, max_new_tokens=max_new_tokens, cfg_scale=cfg_scale, batch_size=1, sampling_params=dict(min_p=min_p), callback=update_progress, ) wav_out = selected_model.autoencoder.decode(codes).cpu().detach() sr_out = selected_model.autoencoder.sampling_rate if wav_out.dim() == 2 and wav_out.size(0) > 1: wav_out = wav_out[0:1, :] return (sr_out, wav_out.squeeze().numpy()), seed # Custom CSS for pastel gradient background and enhanced UI custom_css = """ .gradio-container { background: linear-gradient(135deg, #f3e7ff, #e6f0ff, #ffe6f2, #e6fff9); background-size: 400% 400%; animation: gradient 15s ease infinite; } @keyframes gradient { 0% { background-position: 0% 50%; } 50% { background-position: 100% 50%; } 100% { background-position: 0% 50%; } } .container { max-width: 1200px; margin: 0 auto; padding: 20px; } .panel { background-color: rgba(255, 255, 255, 0.7); border-radius: 16px; padding: 20px; box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08); margin-bottom: 16px; backdrop-filter: blur(5px); transition: all 0.3s ease; } .panel:hover { box-shadow: 0 6px 16px rgba(0, 0, 0, 0.12); transform: translateY(-2px); } .title { font-size: 1.2em; font-weight: 600; margin-bottom: 12px; color: #6a3ea1; border-bottom: 2px solid #f0e6ff; padding-bottom: 8px; } .slider-container { background-color: rgba(255, 255, 255, 0.5); border-radius: 10px; padding: 10px; margin: 5px 0; } /* Make sliders more appealing */ input[type=range] { height: 5px; appearance: none; width: 100%; border-radius: 3px; background: linear-gradient(90deg, #9c83e0, 
    background: linear-gradient(90deg, #9c83e0, #83b1e0);
}

.generate-button {
    background: linear-gradient(90deg, #a673ff, #7c4dff);
    color: white;
    border: none;
    border-radius: 8px;
    padding: 12px 24px;
    font-size: 16px;
    font-weight: 500;
    cursor: pointer;
    transition: all 0.3s ease;
    box-shadow: 0 4px 10px rgba(124, 77, 255, 0.2);
    display: block;
    width: 100%;
    margin: 20px 0;
}

.generate-button:hover {
    background: linear-gradient(90deg, #9c5eff, #6a3aff);
    box-shadow: 0 6px 15px rgba(124, 77, 255, 0.3);
    transform: translateY(-2px);
}

/* Tabs styling */
.tabs {
    display: flex;
    border-bottom: 1px solid #e0e0e0;
    margin-bottom: 20px;
}

.tab {
    padding: 10px 20px;
    cursor: pointer;
    transition: all 0.3s ease;
    background-color: transparent;
    border: none;
    color: #666;
}

.tab.active {
    color: #7c4dff;
    border-bottom: 3px solid #7c4dff;
    font-weight: 600;
}

/* Emotion sliders container */
.emotion-grid {
    display: grid;
    grid-template-columns: repeat(4, 1fr);
    gap: 12px;
}

/* Header styling */
.app-header {
    text-align: center;
    margin-bottom: 25px;
}

.app-header h1 {
    font-size: 2.5em;
    color: #6a3ea1;
    margin-bottom: 8px;
    font-weight: 700;
}

.app-header p {
    font-size: 1.1em;
    color: #666;
    margin-bottom: 20px;
}

/* Audio player styling */
.audio-output {
    margin-top: 20px;
}

/* Make output area more prominent */
.output-container {
    background-color: rgba(255, 255, 255, 0.85);
    border-radius: 16px;
    padding: 24px;
    box-shadow: 0 8px 18px rgba(0, 0, 0, 0.1);
    margin-top: 20px;
}
"""


def build_interface():
    # Build interface with enhanced visual elements and layout
    with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
        # Header section
        with gr.Column(elem_classes="app-header"):
            gr.Markdown("# ✨ Zonos Text-to-Speech Generator ✨")
            gr.Markdown("Create natural-sounding speech with customizable voice characteristics")

        # Main content container
        with gr.Column(elem_classes="container"):
            # First panel - Text & Model Selection
            with gr.Column(elem_classes="panel"):
                gr.Markdown('