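# Gradio demo for the Zyphra/Zonos-v0.1-transformer text-to-speech model:
# optional zero-shot voice cloning from a reference clip, eight emotion
# weights, and adjustable pitch variation / speaking rate / guidance scale.
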
import torch
import torchaudio
import gradio as gr
import spaces
from functools import lru_cache

from zonos.model import Zonos
from zonos.conditioning import make_cond_dict, supported_language_codes

MODEL_ID = "Zyphra/Zonos-v0.1-transformer"

@lru_cache(maxsize=1)
def get_model():
    # Cache the model so it is loaded from the Hub only once per process.
    return Zonos.from_pretrained(MODEL_ID, device="cuda").eval().requires_grad_(False)

@spaces.GPU  # request GPU time per call on ZeroGPU hardware (why `spaces` is imported)
def synthesize(
    text, language, ref_audio, pitch_std, speaking_rate, guidance_scale, seed,
    happiness, sadness, disgust, fear, surprise, anger, other, neutral,
):
    model = get_model()
    if seed == 0:
        # Seed 0 means "pick a random seed"; the chosen value is returned to the UI.
        seed = int(torch.randint(0, 2**32 - 1, (1,)).item())
    torch.manual_seed(seed)

    # Optional zero-shot voice cloning: embed the reference clip if one was given.
    speaker = None
    if ref_audio:
        wav, sr = torchaudio.load(ref_audio)
        speaker = model.make_speaker_embedding(wav, sr).to("cuda", dtype=torch.bfloat16)

    # Eight emotion weights, in the order the conditioning expects.
    emotion = torch.tensor(
        [happiness, sadness, disgust, fear, surprise, anger, other, neutral],
        device="cuda",
    )
    cond = make_cond_dict(
        text=text,
        language=language,
        speaker=speaker,
        emotion=emotion,
        fmax=24_000.0,
        pitch_std=float(pitch_std),
        speaking_rate=float(speaking_rate),
        device="cuda",
    )
    conditioning = model.prepare_conditioning(cond)
    codes = model.generate(conditioning, cfg_scale=float(guidance_scale), max_new_tokens=2_590)
    # Decode the generated codes back into a waveform for Gradio's audio component.
    wav_out = model.autoencoder.decode(codes).cpu().squeeze()
    return (model.autoencoder.sampling_rate, wav_out.numpy()), seed
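
# Hypothetical direct call, bypassing the UI (values mirror the slider defaults
# below; all emotion weights zero except neutral):
#   (sr, audio), used_seed = synthesize(
#       "Hello world.", "en-us", None, 45.0, 15.0, 2.0, 0,
#       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0,
#   )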

with gr.Blocks() as demo:
    gr.Markdown("## Zonos v0.1 TTS")
    text = gr.Textbox(label="Text to synthesize", lines=3)
    language = gr.Dropdown(supported_language_codes, value="en-us", label="Language")
    ref_audio = gr.Audio(label="Reference audio (zero-shot voice cloning)", type="filepath")
    pitch_std = gr.Slider(0.0, 300.0, 45.0, step=1, label="Pitch variation")
    speaking_rate = gr.Slider(5.0, 30.0, 15.0, step=0.5, label="Speaking rate")
    guidance_scale = gr.Slider(1.0, 5.0, 2.0, step=0.1, label="Guidance scale")
    seed = gr.Number(value=0, label="Seed (0 = random)", precision=0)

    gr.Markdown("### Emotion settings")
    happiness = gr.Slider(0, 1, 0.0, step=0.01, label="Happiness")
    sadness = gr.Slider(0, 1, 0.0, step=0.01, label="Sadness")
    disgust = gr.Slider(0, 1, 0.0, step=0.01, label="Disgust")
    fear = gr.Slider(0, 1, 0.0, step=0.01, label="Fear")
    surprise = gr.Slider(0, 1, 0.0, step=0.01, label="Surprise")
    anger = gr.Slider(0, 1, 0.0, step=0.01, label="Anger")
    other = gr.Slider(0, 1, 0.0, step=0.01, label="Other")
    neutral = gr.Slider(0, 1, 1.0, step=0.01, label="Neutral")

    btn = gr.Button("Synthesize")
    out_audio = gr.Audio(label="Output")
    out_seed = gr.Number(label="Used seed", interactive=False)
    btn.click(
        synthesize,
        inputs=[
            text, language, ref_audio, pitch_std, speaking_rate, guidance_scale, seed,
            happiness, sadness, disgust, fear, surprise, anger, other, neutral,
        ],
        outputs=[out_audio, out_seed],
    )

demo.launch()
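# If requests can pile up on shared GPU hardware, Gradio's built-in queue can be
# enabled instead of a plain launch: demo.queue().launch()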