# app.py
import gradio as gr
import spaces
import torch

from zonos.model import Zonos
from zonos.conditioning import make_cond_dict

cpu = "cpu"

# Load once at startup on CPU; the model is only moved to the GPU inside the
# ZeroGPU-decorated handler below.
model = Zonos.from_pretrained("Zyphra/Zonos-v0.1-hybrid", device=cpu)


def _speaker_embed(audio):
    """Build a speaker embedding from a Gradio (sample_rate, ndarray) tuple."""
    if audio is None:
        return None
    sr, wav = audio
    # Gradio delivers int16 PCM; convert to float32 in [-1, 1].
    wav = torch.tensor(wav, dtype=torch.float32)
    if wav.abs().max() > 1.0:
        wav = wav / 32768.0
    # Mix stereo (samples, channels) down to mono, then add a channel dim.
    if wav.ndim == 2:
        wav = wav.mean(dim=1)
    return model.make_speaker_embedding(wav.unsqueeze(0), sr)


@spaces.GPU
def tts(text, language, speaker_audio, emotion_vec, speaking_rate, pitch_std):
    speaker = _speaker_embed(speaker_audio)
    emotion = [float(x) for x in emotion_vec.split(",")] if emotion_vec.strip() else None
    model.to("cuda")
    try:
        with torch.no_grad():
            cond = make_cond_dict(
                text=text,
                language=language,
                speaker=speaker,
                emotion=emotion,
                speaking_rate=float(speaking_rate),
                pitch_std=float(pitch_std),
            )
            conditioning = model.prepare_conditioning(cond)
            codes = model.generate(conditioning)
            # decode() yields a batch of waveforms; take the first and drop
            # any singleton channel dim so Gradio gets a 1-D array.
            wav = model.autoencoder.decode(codes)[0].cpu().squeeze()
    finally:
        # Return the model to CPU and free VRAM even if generation fails,
        # so the worker is left in a clean state for the next request.
        model.to(cpu)
        torch.cuda.empty_cache()
    return (model.autoencoder.sampling_rate, wav.numpy())


langs = ["en-us", "fr-fr", "ja", "de-de", "zh"]

demo = gr.Interface(
    fn=tts,
    inputs=[
        gr.Textbox(label="text"),
        gr.Dropdown(langs, value="en-us", label="language"),
        gr.Audio(type="numpy", label="speaker reference (optional)"),
        gr.Textbox(value="0.3,0,0,0,0,0,0.2,0.5", label="emotion (8 comma-sep floats)"),
        gr.Slider(0, 40, value=15, step=1, label="speaking_rate"),
        gr.Slider(0, 400, value=20, step=1, label="pitch_std"),
    ],
    outputs=gr.Audio(label="generated speech"),
    title="Zonos-v0.1 ZeroGPU TTS",
)

if __name__ == "__main__":
    demo.launch()
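
# Running locally (a sketch of the assumed setup, not pinned requirements):
#   pip install gradio spaces torch
#   pip install -e .   # from a checkout of https://github.com/Zyphra/Zonos
#   python app.py
# Zonos also expects the espeak-ng system package for phonemization.
#
# On a Hugging Face ZeroGPU Space, @spaces.GPU attaches a GPU only for the
# duration of each tts() call; outside Spaces the decorator is a no-op, so a
# local run needs a CUDA device for the model.to("cuda") call.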