# Zero-5 / app.py
# NOTE(review): the lines below are residue from a Hugging Face file-viewer
# scrape, preserved here as comments so the module parses as valid Python.
# Original header: Staticaliza — "Update app.py" — commit b5cb70e (verified)
# — raw / history / blame — 1.79 kB
# app.py
import gradio as gr
import torch, torchaudio, spaces
from zonos.model import Zonos
from zonos.conditioning import make_cond_dict
# Load the model onto CPU at import time; the @spaces.GPU handler below moves
# it to CUDA only for the duration of each request (ZeroGPU allocates a GPU
# per call, so nothing may live on CUDA between requests).
cpu = "cpu"
model = Zonos.from_pretrained("Zyphra/Zonos-v0.1-hybrid", device=cpu)
def _speaker_embed(audio):
if audio is None:
return None
sr, wav = audio
wav = torch.tensor(wav).unsqueeze(0)
return model.make_speaker_embedding(wav, sr)
@spaces.GPU
def tts(text, language, speaker_audio, emotion_vec, speaking_rate, pitch_std):
    """Synthesize speech from *text* on a ZeroGPU-allocated CUDA device.

    Parameters
    ----------
    text : str
        Text to synthesize.
    language : str
        Language code (one of the dropdown choices, e.g. ``"en-us"``).
    speaker_audio : tuple | None
        Optional ``(sample_rate, waveform)`` reference clip for voice cloning.
    emotion_vec : str
        Comma-separated floats (8 values per the UI default); empty string
        means "no emotion conditioning" (``None`` is passed through).
    speaking_rate, pitch_std : float-like
        Prosody controls, coerced to ``float``.

    Returns
    -------
    tuple
        ``(sampling_rate, numpy_waveform)`` suitable for ``gr.Audio`` output.
    """
    speaker = _speaker_embed(speaker_audio)
    emotion = [float(x) for x in emotion_vec.split(",")] if emotion_vec else None
    model.to("cuda")
    try:
        with torch.no_grad():
            cond = make_cond_dict(
                text=text,
                language=language,
                speaker=speaker,
                emotion=emotion,
                speaking_rate=float(speaking_rate),
                pitch_std=float(pitch_std),
            )
            conditioning = model.prepare_conditioning(cond)
            codes = model.generate(conditioning)
            wav = model.autoencoder.decode(codes)[0].cpu()
    finally:
        # Always return the model to CPU and release cached CUDA memory, even
        # when generation raises — otherwise the model is left stranded on a
        # GPU that ZeroGPU has already reclaimed, breaking the next request.
        model.to(cpu)
        torch.cuda.empty_cache()
    return (model.autoencoder.sampling_rate, wav.numpy())
langs = ["en-us", "fr-fr", "ja", "de-de", "zh"]

# Controls map positionally onto tts()'s parameters.
_inputs = [
    gr.Textbox(label="text"),
    gr.Dropdown(langs, value="en-us", label="language"),
    gr.Audio(type="numpy", label="speaker reference (optional)"),
    gr.Textbox(value="0.3,0,0,0,0,0,0.2,0.5", label="emotion (8 comma-sep floats)"),
    gr.Slider(0, 40, value=15, step=1, label="speaking_rate"),
    gr.Slider(0, 400, value=20, step=1, label="pitch_std"),
]

# One-shot interface: output is the (sample_rate, ndarray) tuple from tts().
demo = gr.Interface(
    fn=tts,
    inputs=_inputs,
    outputs=gr.Audio(label="generated speech"),
    title="zonos-v0.1 zerogpu tts",
)

if __name__ == "__main__":
    demo.launch()