# app.py — Gradio demo for Zonos v0.1 TTS (HuggingFace ZeroGPU Space)
import gradio as gr | |
import torch, torchaudio, spaces | |
from zonos.model import Zonos | |
from zonos.conditioning import make_cond_dict | |
cpu = "cpu"  # device the model idles on between requests
# Hybrid Zonos checkpoint, loaded once at startup onto the CPU; it is moved
# to CUDA only for the duration of a generation call (see tts below).
model = Zonos.from_pretrained("Zyphra/Zonos-v0.1-hybrid", device=cpu)
def _speaker_embed(audio): | |
if audio is None: | |
return None | |
sr, wav = audio | |
wav = torch.tensor(wav).unsqueeze(0) | |
return model.make_speaker_embedding(wav, sr) | |
@spaces.GPU  # ZeroGPU: a GPU is attached only for the duration of this call
def tts(text, language, speaker_audio, emotion_vec, speaking_rate, pitch_std):
    """Synthesize speech and return ``(sample_rate, waveform)`` for gr.Audio.

    Parameters
    ----------
    text : str
        Text to synthesize.
    language : str
        Language code from the dropdown (e.g. ``"en-us"``).
    speaker_audio : tuple | None
        Optional reference clip for voice cloning.
    emotion_vec : str
        Eight comma-separated floats, or empty for the model default.
    speaking_rate, pitch_std : float
        Prosody controls forwarded to ``make_cond_dict``.

    Raises
    ------
    gr.Error
        If *emotion_vec* does not parse into exactly 8 floats.
    """
    # Embed on CPU before claiming the GPU — keeps the GPU window short.
    speaker = _speaker_embed(speaker_audio)
    emotion = None
    if emotion_vec:
        try:
            emotion = [float(x) for x in emotion_vec.split(",")]
        except ValueError:
            raise gr.Error("emotion must be comma-separated floats")
        if len(emotion) != 8:
            raise gr.Error("emotion must contain exactly 8 values")
    model.to("cuda")
    try:
        with torch.no_grad():
            cond = make_cond_dict(
                text=text,
                language=language,
                speaker=speaker,
                emotion=emotion,
                speaking_rate=float(speaking_rate),
                pitch_std=float(pitch_std),
            )
            conditioning = model.prepare_conditioning(cond)
            codes = model.generate(conditioning)
            wav = model.autoencoder.decode(codes)[0].cpu()
    finally:
        # Always park the model back on CPU and release cached VRAM,
        # even when generation fails — otherwise the next request finds
        # the model stranded on a GPU it no longer holds.
        model.to(cpu)
        torch.cuda.empty_cache()
    return (model.autoencoder.sampling_rate, wav.numpy())
# Language codes exposed in the UI dropdown.
langs = ["en-us", "fr-fr", "ja", "de-de", "zh"]

# Widgets mirror the tts() signature, in positional order.
_input_widgets = [
    gr.Textbox(label="text"),
    gr.Dropdown(langs, value="en-us", label="language"),
    gr.Audio(type="numpy", label="speaker reference (optional)"),
    gr.Textbox(value="0.3,0,0,0,0,0,0.2,0.5", label="emotion (8 comma-sep floats)"),
    gr.Slider(0, 40, value=15, step=1, label="speaking_rate"),
    gr.Slider(0, 400, value=20, step=1, label="pitch_std"),
]

# Single-function demo wired straight to the tts() entry point.
demo = gr.Interface(
    fn=tts,
    inputs=_input_widgets,
    outputs=gr.Audio(label="generated speech"),
    title="zonos-v0.1 zerogpu tts",
)

if __name__ == "__main__":
    demo.launch()