import os
import shlex
import subprocess

# Install the GPU-only dependencies at runtime. FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE
# tells the flash-attn build to skip compiling its CUDA extension; the pinned
# mamba-ssm and causal-conv1d wheels are prebuilt for CUDA 12 / torch 2.4 / cp310.
subprocess.run(shlex.split("pip install flash-attn --no-build-isolation"), env=os.environ | {"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"}, check=True)
subprocess.run(shlex.split("pip install https://github.com/state-spaces/mamba/releases/download/v2.2.4/mamba_ssm-2.2.4+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"), check=True)
subprocess.run(shlex.split("pip install https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.5.0.post8/causal_conv1d-1.5.0.post8+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"), check=True)
|
|
import spaces
import gradio as gr
import numpy as np

from typing import Tuple, Dict, Any, Optional

from taproot import Task

is_hf_spaces = os.getenv("SYSTEM", "") == "spaces"
max_characters = 2000
|
header_markdown = """
# Zonos v0.1

State-of-the-art text-to-speech model [[model]](https://huggingface.co/collections/Zyphra/zonos-v01-67ac661c85e1898670823b4f) [[blog]](https://www.zyphra.com/post/beta-release-of-zonos-v0-1) [[Zyphra Audio (hosted service)]](https://maia.zyphra.com/sign-in?redirect_url=https%3A%2F%2Fmaia.zyphra.com%2Faudio)

## Unleashed

Use this space to generate long-form speech up to roughly two minutes in length. To generate speech of unlimited length, clone this space and run it locally.

### Tips

- When providing prefix audio, include the text of the prefix audio in your speech text to ensure a smooth transition.
- The appropriate ranges for Speaking Rate and Pitch STD depend heavily on the speaker audio. Start with the defaults and adjust as needed.
- The emotion sliders do not behave entirely intuitively and require some experimentation to achieve the desired effect.
""".strip()
|
|
# Download all required model weights up front so the first generation request
# does not stall on downloads.
speech_enhancement = Task.get("speech-enhancement", model="deep-filter-net-v3", available_only=False)
speech_enhancement.download_required_files(text_callback=print)

hybrid_task = Task.get("speech-synthesis", model="zonos-hybrid", available_only=False)
hybrid_task.download_required_files(text_callback=print)
hybrid_pipe = hybrid_task()
hybrid_pipe.load(allow_optional=True)

transformer_task = Task.get(
    "speech-synthesis", model="zonos-transformer", available_only=False
)
transformer_task.download_required_files(text_callback=print)
transformer_pipe = transformer_task()

if is_hf_spaces:
    transformer_pipe.load(allow_optional=True)
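# On Spaces both pipelines are kept resident, so no model swapping happens
# between requests; when running locally, update_ui() below loads only the
# currently selected variant to conserve VRAM.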
|
|
|
|
|
pipelines = {
    "Zonos Transformer v0.1": transformer_pipe,
    "Zonos Hybrid v0.1": hybrid_pipe,
}
pipeline_names = list(pipelines.keys())
supported_language_codes = hybrid_pipe.supported_languages
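# Both variants are assumed to share the same language list, so the hybrid
# pipeline's supported languages populate the dropdown for either choice.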
|
|
|
|
|
def update_ui(pipeline_choice: str) -> Tuple[Dict[str, Any], ...]:
    """
    Dynamically show/hide UI elements based on the model's conditioners.
    """
    if not is_hf_spaces:
        # Keep only the selected pipeline in memory when running locally.
        for pipeline_name, pipeline in pipelines.items():
            if pipeline_name == pipeline_choice:
                pipeline.load()
            else:
                pipeline.unload()

    pipe = pipelines[pipeline_choice]
    cond_names = [c.name for c in pipe.pretrained.model.prefix_conditioner.conditioners]

    vqscore_update = gr.update(visible=("vqscore_8" in cond_names))
    emotion_update = gr.update(visible=("emotion" in cond_names))
    fmax_update = gr.update(visible=("fmax" in cond_names))
    pitch_update = gr.update(visible=("pitch_std" in cond_names))
    speaking_rate_update = gr.update(visible=("speaking_rate" in cond_names))
    dnsmos_update = gr.update(visible=("dnsmos_ovrl" in cond_names))
    speaker_noised_update = gr.update(visible=("speaker_noised" in cond_names))

    return (
        vqscore_update,
        emotion_update,
        fmax_update,
        pitch_update,
        speaking_rate_update,
        dnsmos_update,
        speaker_noised_update,
    )
|
|
@spaces.GPU(duration=180)
def generate_audio(
    pipeline_choice: str,
    text: str,
    language: str,
    speaker_audio: Optional[str],
    prefix_audio: Optional[str],
    e1: float,
    e2: float,
    e3: float,
    e4: float,
    e5: float,
    e6: float,
    e7: float,
    e8: float,
    vq_single: float,
    fmax: float,
    pitch_std: float,
    speaking_rate: float,
    dnsmos_ovrl: float,
    speaker_noised: bool,
    cfg_scale: float,
    min_p: float,
    seed: int,
    max_chunk_length: int,
    cross_fade_duration: float,
    punctuation_pause_duration: float,
    target_rms: float,
    randomize_seed: bool,
    skip_dnsmos: bool,
    skip_vqscore: bool,
    skip_fmax: bool,
    skip_pitch: bool,
    skip_speaking_rate: bool,
    skip_emotion: bool,
    skip_speaker: bool,
    speaker_pitch_shift: float,
    speaker_equalize: bool,
    speaker_enhance: bool,
    prefix_equalize: bool,
    prefix_enhance: bool,
    enhance: bool,
    progress=gr.Progress(),  # injected by Gradio when the event runs
) -> Tuple[Tuple[int, np.ndarray[Any, Any]], int]:
    """
    Generates audio based on the provided UI parameters.
    """
    selected_pipeline = pipelines[pipeline_choice]
    if randomize_seed:
        # Draw a fresh 32-bit seed; it is returned to the UI below.
        seed = np.random.randint(0, 2**32)

    def on_progress(step: int, total: int) -> None:
        progress((step, total))

    print(f"{speaker_audio=}")  # debug: log the uploaded reference path
    selected_pipeline.on_progress(on_progress)
    try:
        wav_out = selected_pipeline(
            text=text,
            enhance=enhance,
            language=language,
            reference_audio=speaker_audio,
            reference_audio_pitch_shift=speaker_pitch_shift,
            equalize_reference_audio=speaker_equalize,
            enhance_reference_audio=speaker_enhance,
            prefix_audio=prefix_audio,
            equalize_prefix_audio=prefix_equalize,
            enhance_prefix_audio=prefix_enhance,
            seed=seed,
            max_chunk_length=max_chunk_length,
            cross_fade_duration=cross_fade_duration,
            punctuation_pause_duration=punctuation_pause_duration,
            target_rms=target_rms,
            cfg_scale=cfg_scale,
            min_p=min_p,
            fmax=fmax,
            pitch_std=pitch_std,
            emotion_happiness=e1,
            emotion_sadness=e2,
            emotion_disgust=e3,
            emotion_fear=e4,
            emotion_surprise=e5,
            emotion_anger=e6,
            emotion_other=e7,
            emotion_neutral=e8,
            speaking_rate=speaking_rate,
            vq_score=vq_single,
            speaker_noised=speaker_noised,
            dnsmos=dnsmos_ovrl,
            skip_speaker=skip_speaker,
            skip_dnsmos=skip_dnsmos,
            skip_vq_score=skip_vqscore,
            skip_fmax=skip_fmax,
            skip_pitch=skip_pitch,
            skip_speaking_rate=skip_speaking_rate,
            skip_emotion=skip_emotion,
            output_format="float",
        )
        # DeepFilterNet enhancement resamples to 48 kHz; otherwise the raw
        # model output is 44.1 kHz.
        return (
            (
                48000 if enhance else 44100,
                wav_out.squeeze().numpy(),
            ),
            seed,
        )
    finally:
        selected_pipeline.off_progress()
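# Minimal sketch of driving a pipeline outside the UI, using only keyword
# arguments exercised above (other conditioning values presumably fall back to
# the pipeline's defaults):
#
#     wav = hybrid_pipe(text="Hello from Zonos.", language="en-us", seed=42, output_format="float")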
|
|
|
|
|
if __name__ == "__main__":
    with gr.Blocks() as demo:
        with gr.Row():
            with gr.Column(scale=3):
                gr.Markdown(header_markdown)
            gr.Image(
                value="https://raw.githubusercontent.com/Zyphra/Zonos/refs/heads/main/assets/ZonosHeader.png",
                container=False,
                interactive=False,
                show_label=False,
                show_share_button=False,
                show_fullscreen_button=False,
                show_download_button=False,
            )
        with gr.Row(equal_height=True):
            pipeline_choice = gr.Dropdown(
                choices=pipeline_names,
                value=pipeline_names[0],
                label="Zonos Model Variant",
            )
            language = gr.Dropdown(
                choices=supported_language_codes,
                value="en-us",
                label="Language",
            )
            enhanced_checkbox = gr.Checkbox(
                value=True,
                label="Enhance Output with DeepFilterNet",
            )
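            # When enhancement is enabled the output is returned at 48 kHz
            # (DeepFilterNet's rate) instead of the model's native 44.1 kHz.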
|
|
|
        with gr.Row():
            if not is_hf_spaces:
                limit_text = "Unlimited"
            else:
                limit_text = f"Up to {max_characters}"

            text = gr.Textbox(
                label=f"Speech Text ({limit_text} Characters)",
                value="Zonos is a state-of-the-art text-to-speech model that generates expressive and natural-sounding audio with robust customization options.",
                lines=4,
                max_lines=20,
                max_length=max_characters if is_hf_spaces else None,
            )
        with gr.Row():
            generate_button = gr.Button("Generate Audio")

        with gr.Row():
            output_audio = gr.Audio(label="Generated Audio", type="numpy", autoplay=True)

        with gr.Row():
            gr.Markdown("## Long-Form Parameters")
        with gr.Column(variant="panel"):
            with gr.Row(equal_height=True):
                max_chunk_length = gr.Slider(
                    1, 300, 150, 1, label="Max Chunk Length (Characters)",
                    info="The maximum number of characters to generate in a single chunk. Zonos itself has a much higher limit than this, but consistency breaks down past roughly 200 characters."
                )
                target_rms = gr.Slider(
                    0.0, 1.0, 0.10, 0.01, label="Target RMS",
                    info="The target RMS (root-mean-square) amplitude for the generated audio. Each chunk is loudness-normalized to this value to keep volume levels consistent."
                )
            with gr.Row(equal_height=True):
                punctuation_pause_duration = gr.Slider(
                    0, 1, 0.10, 0.01, label="Punctuation Pause Duration (Seconds)",
                    info="Pause added after a chunk that ends with punctuation. Full-stop punctuation (periods) gets the entire duration; lighter punctuation gets half."
                )
                cross_fade_duration = gr.Slider(
                    0, 1, 0.15, 0.01, label="Chunk Cross-Fade Duration (Seconds)",
                    info="The duration of the cross-fade between chunks, which smooths transitions. In general, set this to a value greater than the pause duration."
                )
        with gr.Row():
            gr.Markdown("## Generation Parameters")

        with gr.Row(variant="panel", equal_height=True):
            with gr.Column():
                prefix_audio = gr.Audio(
                    label="Optional Prefix Audio (continue from this audio)",
                    type="filepath",
                )
                prefix_equalize_checkbox = gr.Checkbox(label="Equalize Prefix Audio", value=True)
                prefix_enhance_checkbox = gr.Checkbox(label="Enhance Prefix Audio with DeepFilterNet", value=True)

            with gr.Column(scale=3):
                cfg_scale_slider = gr.Slider(1.0, 5.0, 2.0, 0.1, label="CFG Scale")
                min_p_slider = gr.Slider(0.0, 1.0, 0.15, 0.01, label="Min P")
                seed_number = gr.Number(label="Seed", value=6475309, precision=0)
                randomize_seed_toggle = gr.Checkbox(label="Randomize Seed", value=True)

        with gr.Row():
            gr.Markdown(
                "## Conditioning Parameters\nAll of these types of conditioning are optional and can be disabled."
            )
        with gr.Row(variant="panel", equal_height=True) as speaker_row:
            with gr.Column():
                speaker_uncond = gr.Checkbox(label="Skip Speaker")
                speaker_noised_checkbox = gr.Checkbox(
                    label="Speaker Noised",
                    value=False,
                    interactive=False,
                    info="'Speaker Noised' is a conditioning value that the model understands, not a processing step. Check this box if your input audio is noisy."
                )
                speaker_equalize_checkbox = gr.Checkbox(label="Equalize Speaker Audio", value=True)
                speaker_enhance_checkbox = gr.Checkbox(label="Enhance Speaker Audio with DeepFilterNet", value=True)

                def on_enhanced_change(use_enhance: bool) -> Dict[str, Any]:
                    update_dict = {"interactive": not use_enhance}
                    if use_enhance:
                        update_dict["value"] = False
                    return gr.update(**update_dict)

                speaker_enhance_checkbox.change(
                    fn=on_enhanced_change,
                    inputs=[speaker_enhance_checkbox],
                    outputs=[speaker_noised_checkbox]
                )
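                # Rationale: DeepFilterNet already denoises the reference clip,
                # so while "Enhance Speaker Audio" is checked the "Speaker Noised"
                # conditioning flag is forced off and made non-interactive.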
|
                speaker_pitch_shift = gr.Slider(
                    -1200, 1200, -44.99, 0.01, label="Speaker Pitch Shift (Cents)",
                    info="A pitch shift applied to the speaker audio before extracting embeddings. A slight down-shift of ~45 cents tends to produce more accurate voice cloning."
                )

            speaker_audio = gr.Audio(
                label="Optional Speaker Audio (for cloning)",
                type="filepath",
                scale=3,
            )
        with gr.Row(variant="panel", equal_height=True) as emotion_row:
            emotion_uncond = gr.Checkbox(label="Skip Emotion")
            with gr.Column(scale=3):
                with gr.Row():
                    emotion1 = gr.Slider(0.0, 1.0, 0.307, 0.001, label="Happiness")
                    emotion2 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Sadness")
                    emotion3 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Disgust")
                    emotion4 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Fear")
                with gr.Row():
                    emotion5 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Surprise")
                    emotion6 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Anger")
                    emotion7 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Other")
                    emotion8 = gr.Slider(0.0, 1.0, 0.307, 0.001, label="Neutral")
        with gr.Row(variant="panel", equal_height=True) as dnsmos_row:
            dnsmos_uncond = gr.Checkbox(label="Skip DNSMOS")
            dnsmos_slider = gr.Slider(
                1.0,
                5.0,
                value=4.0,
                step=0.1,
                label="Deep Noise Suppression Mean Opinion Score [arXiv 2010.15258]",
                scale=3,
            )

        with gr.Row(variant="panel", equal_height=True) as vq_score_row:
            vq_uncond = gr.Checkbox(label="Skip VQScore")
            vq_single_slider = gr.Slider(
                0.5, 0.8, 0.78, 0.01, label="VQScore [arXiv 2402.16321]", scale=3
            )

        with gr.Row(variant="panel", equal_height=True) as fmax_row:
            fmax_uncond = gr.Checkbox(label="Skip Fmax")
            fmax_slider = gr.Slider(
                0, 22050, value=22050, step=1, label="Fmax (Hz)", scale=3
            )

        with gr.Row(variant="panel", equal_height=True) as pitch_row:
            pitch_uncond = gr.Checkbox(label="Skip Pitch")
            pitch_std_slider = gr.Slider(
                0.0, 300.0, value=20.0, step=1, label="Pitch Standard Deviation", scale=3
            )

        with gr.Row(variant="panel", equal_height=True) as speaking_rate_row:
            speaking_rate_uncond = gr.Checkbox(label="Skip Speaking Rate")
            speaking_rate_slider = gr.Slider(
                5.0, 30.0, value=15.0, step=0.5, label="Speaking Rate", scale=3
            )
        pipeline_choice.change(
            fn=update_ui,
            inputs=[pipeline_choice],
            outputs=[
                vq_score_row,
                emotion_row,
                fmax_row,
                pitch_row,
                speaking_rate_row,
                dnsmos_row,
                speaker_noised_checkbox,
            ],
        )
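        # demo.load() runs the same visibility sync once at startup so the rows
        # match the default model variant before any user interaction.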
|
|
|
|
|
        demo.load(
            fn=update_ui,
            inputs=[pipeline_choice],
            outputs=[
                vq_score_row,
                emotion_row,
                fmax_row,
                pitch_row,
                speaking_rate_row,
                dnsmos_row,
                speaker_noised_checkbox,
            ],
        )
|
|
|
|
|
        # Note: the order of `inputs` must match generate_audio's positional
        # parameter order exactly.
        generate_button.click(
            fn=generate_audio,
            inputs=[
                pipeline_choice,
                text,
                language,
                speaker_audio,
                prefix_audio,
                emotion1,
                emotion2,
                emotion3,
                emotion4,
                emotion5,
                emotion6,
                emotion7,
                emotion8,
                vq_single_slider,
                fmax_slider,
                pitch_std_slider,
                speaking_rate_slider,
                dnsmos_slider,
                speaker_noised_checkbox,
                cfg_scale_slider,
                min_p_slider,
                seed_number,
                max_chunk_length,
                cross_fade_duration,
                punctuation_pause_duration,
                target_rms,
                randomize_seed_toggle,
                dnsmos_uncond,
                vq_uncond,
                fmax_uncond,
                pitch_uncond,
                speaking_rate_uncond,
                emotion_uncond,
                speaker_uncond,
                speaker_pitch_shift,
                speaker_equalize_checkbox,
                speaker_enhance_checkbox,
                prefix_equalize_checkbox,
                prefix_enhance_checkbox,
                enhanced_checkbox,
            ],
            outputs=[output_audio, seed_number],
        )
|
|
|
    demo.launch()