# VoiceClone-TTS
#
# Running on Zero
#
# File size: 21,232 Bytes

import os
import sys
import subprocess

# Emergency flash-attn installation if not found.
#
# If `flash_attn` cannot be imported, install it with pip and re-exec this
# script so the freshly installed package is importable. The environment
# variable below marks that one attempt has already been made, so a package
# that installs but still fails to import cannot cause an endless restart loop.
_FLASH_ATTN_RETRY_FLAG = "ZONOS_FLASH_ATTN_INSTALL_ATTEMPTED"

try:
    import flash_attn
except ImportError:
    if os.environ.get(_FLASH_ATTN_RETRY_FLAG) == "1":
        # We already installed and restarted once; do not loop. Any later
        # import that truly requires flash_attn will raise its own error.
        print("flash_attn still not importable after installation attempt, continuing without it.")
    else:
        print("flash_attn not found, attempting to install...")
        try:
            # Try installing pre-built wheel first (fastest)
            subprocess.run([
                sys.executable, "-m", "pip", "install",
                "https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.2.post1/flash_attn-2.7.2.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"
            ], check=True)
        except (subprocess.CalledProcessError, OSError):
            # Only installation failures trigger the fallback; Ctrl-C and
            # SystemExit are no longer swallowed as they were by a bare except.
            # Fallback: install without CUDA build (slower but more compatible)
            env = os.environ.copy()
            env["FLASH_ATTENTION_SKIP_CUDA_BUILD"] = "TRUE"
            subprocess.run([
                sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"
            ], env=env, check=True)

        # Restart the script after installation; the flag is inherited by the
        # replacement process through the environment.
        os.environ[_FLASH_ATTN_RETRY_FLAG] = "1"
        os.execv(sys.executable, [sys.executable] + sys.argv)

import spaces
import torch
import torchaudio
import gradio as gr
from os import getenv

from zonos.model import Zonos
from zonos.conditioning import make_cond_dict, supported_language_codes

# Run on the GPU when one is visible to torch, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_NAMES = ["Zyphra/Zonos-v0.1-transformer", "Zyphra/Zonos-v0.1-hybrid"]
# Maps model name -> loaded Zonos model; only fully initialised models are stored.
MODELS = {}

# Load models with error handling: every name is attempted, a failure of one
# model does not prevent the others from loading, and start-up only aborts
# when none of them could be loaded.
_last_load_error = None
for name in MODEL_NAMES:
    try:
        _model = Zonos.from_pretrained(name, device=device)
        _model.requires_grad_(False).eval()
    except Exception as e:
        print(f"Failed to load model {name}: {e}")
        _last_load_error = e
    else:
        # Register only after the model is completely set up, so MODELS never
        # contains a half-initialised entry.
        MODELS[name] = _model
        print(f"Successfully loaded model: {name}")

if not MODELS and _last_load_error is not None:  # If no models loaded at all
    raise _last_load_error

def update_ui(model_choice):
    """
    Compute visibility updates for the conditioning widgets of the chosen model.

    Each widget is shown only when the selected model has the matching
    conditioner; the prefix-audio widget is always shown. 'espeak' and
    'language_id' are left out of the unconditional-keys choices.

    Returns a 19-tuple of ``gr.update`` objects ordered as: text, language,
    speaker audio, prefix audio, the eight emotion sliders, vq score, fmax,
    pitch std, speaking rate, dnsmos, speaker-noised checkbox, unconditional keys.
    """
    conditioners = MODELS[model_choice].prefix_conditioner.conditioners
    cond_names = [conditioner.name for conditioner in conditioners]
    print("Conditioners in this model:", cond_names)

    def shown_if(conditioner_name):
        # Visible exactly when the model exposes this conditioner.
        return gr.update(visible=(conditioner_name in cond_names))

    selectable_keys = [
        key for key in cond_names if key not in ("espeak", "language_id")
    ]

    return (
        shown_if("espeak"),  # text
        shown_if("espeak"),  # language
        shown_if("speaker"),  # speaker audio
        gr.update(visible=True),  # prefix audio
        *(shown_if("emotion") for _ in range(8)),  # emotion1 .. emotion8
        shown_if("vqscore_8"),
        shown_if("fmax"),
        shown_if("pitch_std"),
        shown_if("speaking_rate"),
        shown_if("dnsmos_ovrl"),
        shown_if("speaker_noised"),
        gr.update(choices=selectable_keys),
    )


@spaces.GPU(duration=120)
def generate_audio(
    model_choice,
    text,
    language,
    speaker_audio,
    prefix_audio,
    e1,
    e2,
    e3,
    e4,
    e5,
    e6,
    e7,
    e8,
    vq_single,
    fmax,
    pitch_std,
    speaking_rate,
    dnsmos_ovrl,
    speaker_noised,
    cfg_scale,
    min_p,
    seed,
    randomize_seed,
    unconditional_keys,
    progress=gr.Progress(),
):
    """
    Generates audio based on the provided UI parameters.
    We do NOT use language_id or ctc_loss even if the model has them.

    Args:
        model_choice: Key into ``MODELS`` selecting the model to run.
        text: Text to synthesize.
        language: Language code forwarded to ``make_cond_dict``.
        speaker_audio: Optional path of a reference recording; when given and
            "speaker" is not in ``unconditional_keys`` a speaker embedding is
            computed from it.
        prefix_audio: Optional path of audio that is encoded and passed to the
            model as ``audio_prefix_codes``.
        e1..e8: Emotion weights, packed in this order into one tensor (the UI
            labels them Happiness, Sadness, Disgust, Fear, Surprise, Anger,
            Other, Neutral).
        vq_single: Single value repeated 8 times to form ``vqscore_8``.
        fmax, pitch_std, speaking_rate, dnsmos_ovrl: Numeric conditioning
            values, converted to ``float``.
        speaker_noised: Converted to ``bool`` and forwarded as conditioning.
        cfg_scale: Guidance scale passed to ``generate``.
        min_p: Sampling parameter passed as ``sampling_params["min_p"]``.
        seed: Seed used when ``randomize_seed`` is false; ignored otherwise.
        randomize_seed: When true a fresh random seed is drawn.
        unconditional_keys: Conditioning names forwarded to ``make_cond_dict``
            as unconditional.
        progress: Gradio progress tracker updated during generation.

    Returns:
        ``((sample_rate, waveform), seed)`` where ``waveform`` is a NumPy array
        and ``seed`` is the seed that was actually used.
    """
    selected_model = MODELS[model_choice]

    speaker_noised_bool = bool(speaker_noised)
    fmax = float(fmax)
    pitch_std = float(pitch_std)
    speaking_rate = float(speaking_rate)
    dnsmos_ovrl = float(dnsmos_ovrl)
    cfg_scale = float(cfg_scale)
    min_p = float(min_p)
    # NOTE(review): presumably ~86 codec frames per second x 30 seconds — confirm.
    max_new_tokens = 86 * 30

    # Only parse the user-supplied seed when it is actually used, so an empty
    # or invalid seed field cannot break generation while randomizing.
    if randomize_seed:
        seed = torch.randint(0, 2**32 - 1, (1,)).item()
    else:
        seed = int(seed)
    torch.manual_seed(seed)

    speaker_embedding = None
    if speaker_audio is not None and "speaker" not in unconditional_keys:
        wav, sr = torchaudio.load(speaker_audio)
        speaker_embedding = selected_model.make_speaker_embedding(wav, sr)
        speaker_embedding = speaker_embedding.to(device, dtype=torch.bfloat16)

    audio_prefix_codes = None
    if prefix_audio is not None:
        wav_prefix, sr_prefix = torchaudio.load(prefix_audio)
        # Down-mix to mono and resample to the autoencoder's rate before encoding.
        wav_prefix = wav_prefix.mean(0, keepdim=True)
        wav_prefix = torchaudio.functional.resample(wav_prefix, sr_prefix, selected_model.autoencoder.sampling_rate)
        wav_prefix = wav_prefix.to(device, dtype=torch.float32)
        with torch.autocast(device, dtype=torch.float32):
            audio_prefix_codes = selected_model.autoencoder.encode(wav_prefix.unsqueeze(0))

    emotion_tensor = torch.tensor(list(map(float, [e1, e2, e3, e4, e5, e6, e7, e8])), device=device)

    vq_val = float(vq_single)
    vq_tensor = torch.tensor([vq_val] * 8, device=device).unsqueeze(0)

    cond_dict = make_cond_dict(
        text=text,
        language=language,
        speaker=speaker_embedding,
        emotion=emotion_tensor,
        vqscore_8=vq_tensor,
        fmax=fmax,
        pitch_std=pitch_std,
        speaking_rate=speaking_rate,
        dnsmos_ovrl=dnsmos_ovrl,
        speaker_noised=speaker_noised_bool,
        device=device,
        unconditional_keys=unconditional_keys,
    )
    conditioning = selected_model.prepare_conditioning(cond_dict)

    # Rough step estimate for the progress bar, derived from the text length.
    # Clamp it to [1, max_new_tokens]: it must never be zero (empty text) and
    # generation is asked for at most max_new_tokens new tokens.
    estimated_generation_duration = 30 * len(text) / 400
    estimated_total_steps = int(estimated_generation_duration * 86)
    estimated_total_steps = max(1, min(estimated_total_steps, max_new_tokens))

    def update_progress(_frame: torch.Tensor, step: int, _total_steps: int) -> bool:
        # Cap the reported step so the bar never goes past 100% when the
        # estimate turns out to be too low.
        progress((min(step, estimated_total_steps), estimated_total_steps))
        return True

    codes = selected_model.generate(
        prefix_conditioning=conditioning,
        audio_prefix_codes=audio_prefix_codes,
        max_new_tokens=max_new_tokens,
        cfg_scale=cfg_scale,
        batch_size=1,
        sampling_params=dict(min_p=min_p),
        callback=update_progress,
    )

    wav_out = selected_model.autoencoder.decode(codes).cpu().detach()
    sr_out = selected_model.autoencoder.sampling_rate
    # For a 2-D result with several rows keep only the first row.
    if wav_out.dim() == 2 and wav_out.size(0) > 1:
        wav_out = wav_out[0:1, :]
    return (sr_out, wav_out.squeeze().numpy()), seed


# Custom CSS for pastel gradient background and enhanced UI.
# Passed to gr.Blocks(css=...) in build_interface(). Styles the classes that
# build_interface() references: container, panel, title, slider-container,
# generate-button, emotion-grid, app-header, audio-output, output-container.
# NOTE(review): nothing in this file assigns the "tabs"/"tab" classes, so those
# rules look unused unless they happen to match Gradio's own markup — confirm.
custom_css = """
.gradio-container {
    background: linear-gradient(135deg, #f3e7ff, #e6f0ff, #ffe6f2, #e6fff9);
    background-size: 400% 400%;
    animation: gradient 15s ease infinite;
}

@keyframes gradient {
    0% {
        background-position: 0% 50%;
    }
    50% {
        background-position: 100% 50%;
    }
    100% {
        background-position: 0% 50%;
    }
}

.container {
    max-width: 1200px;
    margin: 0 auto;
    padding: 20px;
}

.panel {
    background-color: rgba(255, 255, 255, 0.7);
    border-radius: 16px;
    padding: 20px;
    box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
    margin-bottom: 16px;
    backdrop-filter: blur(5px);
    transition: all 0.3s ease;
}

.panel:hover {
    box-shadow: 0 6px 16px rgba(0, 0, 0, 0.12);
    transform: translateY(-2px);
}

.title {
    font-size: 1.2em;
    font-weight: 600;
    margin-bottom: 12px;
    color: #6a3ea1;
    border-bottom: 2px solid #f0e6ff;
    padding-bottom: 8px;
}

.slider-container {
    background-color: rgba(255, 255, 255, 0.5);
    border-radius: 10px;
    padding: 10px;
    margin: 5px 0;
}

/* Make sliders more appealing */
input[type=range] {
    height: 5px;
    appearance: none;
    width: 100%;
    border-radius: 3px;
    background: linear-gradient(90deg, #9c83e0, #83b1e0);
}

.generate-button {
    background: linear-gradient(90deg, #a673ff, #7c4dff);
    color: white;
    border: none;
    border-radius: 8px;
    padding: 12px 24px;
    font-size: 16px;
    font-weight: 500;
    cursor: pointer;
    transition: all 0.3s ease;
    box-shadow: 0 4px 10px rgba(124, 77, 255, 0.2);
    display: block;
    width: 100%;
    margin: 20px 0;
}

.generate-button:hover {
    background: linear-gradient(90deg, #9c5eff, #6a3aff);
    box-shadow: 0 6px 15px rgba(124, 77, 255, 0.3);
    transform: translateY(-2px);
}

/* Tabs styling */
.tabs {
    display: flex;
    border-bottom: 1px solid #e0e0e0;
    margin-bottom: 20px;
}

.tab {
    padding: 10px 20px;
    cursor: pointer;
    transition: all 0.3s ease;
    background-color: transparent;
    border: none;
    color: #666;
}

.tab.active {
    color: #7c4dff;
    border-bottom: 3px solid #7c4dff;
    font-weight: 600;
}

/* Emotion sliders container */
.emotion-grid {
    display: grid;
    grid-template-columns: repeat(4, 1fr);
    gap: 12px;
}

/* Header styling */
.app-header {
    text-align: center;
    margin-bottom: 25px;
}

.app-header h1 {
    font-size: 2.5em;
    color: #6a3ea1;
    margin-bottom: 8px;
    font-weight: 700;
}

.app-header p {
    font-size: 1.1em;
    color: #666;
    margin-bottom: 20px;
}

/* Audio player styling */
.audio-output {
    margin-top: 20px;
}

/* Make output area more prominent */
.output-container {
    background-color: rgba(255, 255, 255, 0.85);
    border-radius: 16px;
    padding: 24px;
    box-shadow: 0 8px 18px rgba(0, 0, 0, 0.1);
    margin-top: 20px;
}
"""


def build_interface():
    """
    Build the Gradio Blocks UI and wire its events.

    The layout consists of a badge banner, a header, and panels for text/model
    selection, voice characteristics, generation parameters, emotion settings,
    advanced (unconditional) settings and the generate/output area. When at
    least one model is loaded, the model dropdown and page load refresh widget
    visibility through ``update_ui`` and the button runs ``generate_audio``.

    Returns:
        The configured ``gr.Blocks`` instance (not launched).
    """
    # Build interface with enhanced visual elements and layout
    with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
        gr.HTML(
            """
            <div class='container' style='display:flex; justify-content:center; gap:12px;'>
                <a href="https://huggingface.co/spaces/openfree/Best-AI" target="_blank">
                    <img src="https://img.shields.io/static/v1?label=OpenFree&message=BEST%20AI%20Services&color=%230000ff&labelColor=%23000080&logo=huggingface&logoColor=%23ffa500&style=for-the-badge" alt="OpenFree badge">
                </a>
        
                <a href="https://discord.gg/openfreeai" target="_blank">
                    <img src="https://img.shields.io/static/v1?label=Discord&message=Openfree%20AI&color=%230000ff&labelColor=%23800080&logo=discord&logoColor=white&style=for-the-badge" alt="Discord badge">
                </a>
            </div>
            """
        )

        # Header section
        with gr.Column(elem_classes="app-header"):
            gr.Markdown("# ✨ Zonos Text-to-Speech Generator ✨")
            gr.Markdown("Create natural-sounding speech with customizable voice characteristics")

        # Main content container
        with gr.Column(elem_classes="container"):
            # First panel - Text & Model Selection
            with gr.Column(elem_classes="panel"):
                gr.Markdown('<div class="title">💬 Text & Model Configuration</div>')
                with gr.Row():
                    with gr.Column(scale=2):
                        model_choice = gr.Dropdown(
                            choices=list(MODELS),
                            # First loaded model, or None when nothing is loaded.
                            value=next(iter(MODELS), None),
                            label="Zonos Model Type",
                            info="Select the model variant to use.",
                        )
                        text = gr.Textbox(
                            label="Text to Synthesize",
                            value="Zonos uses eSpeak for text to phoneme conversion!",
                            lines=4,
                            max_length=500,
                        )
                        language = gr.Dropdown(
                            choices=supported_language_codes,
                            value="en-us",
                            label="Language Code",
                            info="Select a language code.",
                        )
                    with gr.Column(scale=1):
                        prefix_audio = gr.Audio(
                            # Pre-fill with the bundled silence clip only if it exists.
                            value="assets/silence_100ms.wav" if os.path.exists("assets/silence_100ms.wav") else None,
                            label="Optional Prefix Audio (continue from this audio)",
                            type="filepath",
                        )

            # Second panel - Voice Characteristics
            # Slider `value` and `step` are always passed by keyword below.
            # NOTE(review): a positional step may be mis-bound or dropped by
            # Gradio's Slider signature — confirm against the installed version.
            with gr.Column(elem_classes="panel"):
                gr.Markdown('<div class="title">🎤 Voice Characteristics</div>')
                with gr.Row():
                    with gr.Column(scale=1):
                        speaker_audio = gr.Audio(
                            label="Optional Speaker Audio (for voice cloning)",
                            type="filepath",
                        )
                        speaker_noised_checkbox = gr.Checkbox(label="Denoise Speaker?", value=False)

                    with gr.Column(scale=2):
                        with gr.Row():
                            with gr.Column():
                                dnsmos_slider = gr.Slider(1.0, 5.0, value=4.0, step=0.1, label="Voice Quality", elem_classes="slider-container")
                                fmax_slider = gr.Slider(0, 24000, value=24000, step=1, label="Frequency Max (Hz)", elem_classes="slider-container")
                                vq_single_slider = gr.Slider(0.5, 0.8, value=0.78, step=0.01, label="Voice Clarity", elem_classes="slider-container")
                            with gr.Column():
                                pitch_std_slider = gr.Slider(0.0, 300.0, value=45.0, step=1, label="Pitch Variation", elem_classes="slider-container")
                                speaking_rate_slider = gr.Slider(5.0, 30.0, value=15.0, step=0.5, label="Speaking Rate", elem_classes="slider-container")

            # Third panel - Generation Parameters
            with gr.Column(elem_classes="panel"):
                gr.Markdown('<div class="title">⚙️ Generation Parameters</div>')
                with gr.Row():
                    with gr.Column():
                        cfg_scale_slider = gr.Slider(1.0, 5.0, value=2.0, step=0.1, label="Guidance Scale", elem_classes="slider-container")
                        min_p_slider = gr.Slider(0.0, 1.0, value=0.15, step=0.01, label="Min P (Randomness)", elem_classes="slider-container")
                    with gr.Column():
                        seed_number = gr.Number(label="Seed", value=420, precision=0)
                        randomize_seed_toggle = gr.Checkbox(label="Randomize Seed (before generation)", value=True)

            # Emotion panel (collapsible accordion), two rows of four sliders
            with gr.Accordion("🎭 Emotion Settings", open=False, elem_classes="panel"):
                gr.Markdown(
                    "Adjust these sliders to control the emotional tone of the generated speech.\n"
                    "For a neutral voice, keep 'Neutral' high and other emotions low."
                )
                # (label, default) in the order generate_audio expects e1..e8.
                emotion_specs = [
                    ("Happiness", 1.0),
                    ("Sadness", 0.05),
                    ("Disgust", 0.05),
                    ("Fear", 0.05),
                    ("Surprise", 0.05),
                    ("Anger", 0.05),
                    ("Other", 0.1),
                    ("Neutral", 0.2),
                ]
                emotion_sliders = []
                for row_start in (0, 4):
                    with gr.Row(elem_classes="emotion-grid"):
                        for emotion_label, emotion_default in emotion_specs[row_start:row_start + 4]:
                            emotion_sliders.append(
                                gr.Slider(
                                    0.0,
                                    1.0,
                                    value=emotion_default,
                                    step=0.05,
                                    label=emotion_label,
                                    elem_classes="slider-container",
                                )
                            )

            # Advanced Settings Panel
            with gr.Accordion("⚡ Advanced Settings", open=False, elem_classes="panel"):
                gr.Markdown(
                    "### Unconditional Toggles\n"
                    "Checking a box will make the model ignore the corresponding conditioning value and make it unconditional.\n"
                    'Practically this means the given conditioning feature will be unconstrained and "filled in automatically".'
                )
                unconditional_keys = gr.CheckboxGroup(
                    [
                        "speaker",
                        "emotion",
                        "vqscore_8",
                        "fmax",
                        "pitch_std",
                        "speaking_rate",
                        "dnsmos_ovrl",
                        "speaker_noised",
                    ],
                    value=["emotion"],
                    label="Unconditional Keys",
                )

            # Generate Button and Output Area
            with gr.Column(elem_classes="panel output-container"):
                gr.Markdown('<div class="title">🔊 Generate & Output</div>')
                generate_button = gr.Button("Generate Audio", elem_classes="generate-button")
                output_audio = gr.Audio(label="Generated Audio", type="numpy", autoplay=True, elem_classes="audio-output")

        if MODELS:  # Only set up callbacks if models loaded successfully
            # Widgets refreshed by update_ui, in the exact order of its return tuple.
            ui_refresh_outputs = [
                text,
                language,
                speaker_audio,
                prefix_audio,
                *emotion_sliders,
                vq_single_slider,
                fmax_slider,
                pitch_std_slider,
                speaking_rate_slider,
                dnsmos_slider,
                speaker_noised_checkbox,
                unconditional_keys,
            ]

            model_choice.change(
                fn=update_ui,
                inputs=[model_choice],
                outputs=ui_refresh_outputs,
            )

            # On page load, trigger the same UI refresh
            demo.load(
                fn=update_ui,
                inputs=[model_choice],
                outputs=ui_refresh_outputs,
            )

            # Generate audio on button click; the input order mirrors the
            # positional parameters of generate_audio.
            generate_button.click(
                fn=generate_audio,
                inputs=[
                    model_choice,
                    text,
                    language,
                    speaker_audio,
                    prefix_audio,
                    *emotion_sliders,
                    vq_single_slider,
                    fmax_slider,
                    pitch_std_slider,
                    speaking_rate_slider,
                    dnsmos_slider,
                    speaker_noised_checkbox,
                    cfg_scale_slider,
                    min_p_slider,
                    seed_number,
                    randomize_seed_toggle,
                    unconditional_keys,
                ],
                outputs=[output_audio, seed_number],
            )

    return demo


if __name__ == "__main__":
    # Script entry point: build the UI, then serve it on all interfaces at port
    # 7860. A public share link is requested only when GRADIO_SHARE is set to
    # "true", "1" or "t" (case-insensitive); it defaults to off.
    demo = build_interface()
    share_setting = getenv("GRADIO_SHARE", "False")
    share = share_setting.lower() in ("true", "1", "t")
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=share,
    )