# Zero-5 / app.py
import torch, torchaudio, gradio as gr, spaces
from functools import lru_cache
from zonos.model import Zonos
from zonos.conditioning import make_cond_dict, supported_language_codes
MODEL_ID = "Zyphra/Zonos-v0.1-transformer"
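
# Load the Zonos model once and cache it; lru_cache(maxsize = 1) keeps a single GPU copy alive across calls.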
@lru_cache(maxsize = 1)
def get_model():
    return Zonos.from_pretrained(MODEL_ID, device = "cuda").eval().requires_grad_(False)
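
# ZeroGPU: a GPU is attached only for the duration of this call (at most 120 seconds).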
@spaces.GPU(duration = 120)
def synthesize(
    text,
    language,
    ref_audio,
    pitch_std,
    speaking_rate,
    guidance_scale,
    seed,
    happiness,
    sadness,
    disgust,
    fear,
    surprise,
    anger,
    other,
    neutral
):
    model = get_model()
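    # A seed of 0 means "random": draw a fresh seed so the run stays reproducible from the returned value.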
    if seed == 0:
        seed = int(torch.randint(0, 2 ** 32 - 1, (1,)).item())
    torch.manual_seed(seed)
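    # Optional zero-shot voice cloning: build a speaker embedding from the uploaded reference clip.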
    speaker = None
    if ref_audio:
        wav, sr = torchaudio.load(ref_audio)
        speaker = model.make_speaker_embedding(wav, sr).to("cuda", dtype = torch.bfloat16)
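    # Pack the eight emotion sliders into the emotion vector used for conditioning.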
    emotion = torch.tensor(
        [happiness, sadness, disgust, fear, surprise, anger, other, neutral],
        device = "cuda"
    )
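    # Assemble the conditioning: text, language, optional speaker, emotion mix, and prosody controls.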
    cond = make_cond_dict(
        text = text,
        language = language,
        speaker = speaker,
        emotion = emotion,
        fmax = 24_000.0,
        pitch_std = float(pitch_std),
        speaking_rate = float(speaking_rate),
        device = "cuda"
    )
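    # Generate audio codes with classifier-free guidance; max_new_tokens caps the clip length.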
    conditioning = model.prepare_conditioning(cond)
    codes = model.generate(conditioning, cfg_scale = float(guidance_scale), max_new_tokens = 2_590)
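    # Decode the codes back to a waveform and return it with the autoencoder's sample rate, plus the seed used.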
    wav_out = model.autoencoder.decode(codes).cpu().squeeze()
    return (model.autoencoder.sampling_rate, wav_out.numpy()), seed
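
# Gradio UI: text, language, optional reference audio, prosody/guidance controls, seed, and emotion sliders.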
with gr.Blocks() as demo:
gr.Markdown("## zonos v0.1 tts")
text = gr.Textbox(label = "text to synthesize", lines = 3)
language = gr.Dropdown(supported_language_codes, value = "en-us", label = "language")
ref_audio = gr.Audio(label = "reference audio (zeroshot tts)", type = "filepath")
pitch_std = gr.Slider(0.0, 300.0, 45.0, step = 1, label = "pitch variation")
speaking_rate = gr.Slider(5.0, 30.0, 15.0, step = 0.5, label = "speaking rate")
guidance_scale = gr.Slider(1.0, 5.0, 2.0, step = 0.1, label = "guidance scale")
seed = gr.Number(value = 0, label = "seed (0 = random)", precision = 0)
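    # Emotion mix for the synthesized speech; defaults to fully neutral.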
gr.Markdown("### emotion settings")
happiness = gr.Slider(0, 1, 0.0, step = 0.01, label = "happiness")
sadness = gr.Slider(0, 1, 0.0, step = 0.01, label = "sadness")
disgust = gr.Slider(0, 1, 0.0, step = 0.01, label = "disgust")
fear = gr.Slider(0, 1, 0.0, step = 0.01, label = "fear")
surprise = gr.Slider(0, 1, 0.0, step = 0.01, label = "surprise")
anger = gr.Slider(0, 1, 0.0, step = 0.01, label = "anger")
other = gr.Slider(0, 1, 0.0, step = 0.01, label = "other")
neutral = gr.Slider(0, 1, 1.0, step = 0.01, label = "neutral")
btn = gr.Button("synthesize")
out_audio = gr.Audio(label = "output")
out_seed = gr.Number(label = "used seed", interactive = False)
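    # Run synthesis on click; show the generated audio and the seed that was actually used.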
    btn.click(
        synthesize,
        [
            text,
            language,
            ref_audio,
            pitch_std,
            speaking_rate,
            guidance_scale,
            seed,
            happiness,
            sadness,
            disgust,
            fear,
            surprise,
            anger,
            other,
            neutral
        ],
        [out_audio, out_seed]
    )
demo.launch()