import torch
import torchaudio
import gradio as gr
import spaces
from functools import lru_cache

from zonos.model import Zonos
from zonos.conditioning import make_cond_dict, supported_language_codes

MODEL_ID = "Zyphra/Zonos-v0.1-transformer"


@lru_cache(maxsize=1)
def get_model():
    # Load the model once per process and keep it cached on the GPU in eval mode.
    return Zonos.from_pretrained(MODEL_ID, device="cuda").eval().requires_grad_(False)


@spaces.GPU(duration=120)
def synthesize(
    text, language, ref_audio, pitch_std, speaking_rate, guidance_scale, seed,
    happiness, sadness, disgust, fear, surprise, anger, other, neutral,
):
    model = get_model()

    # seed == 0 means "pick a random seed"; the seed actually used is returned to the UI.
    if seed == 0:
        seed = int(torch.randint(0, 2**32 - 1, (1,)).item())
    torch.manual_seed(seed)

    # Optional zero-shot voice cloning from a reference recording.
    speaker = None
    if ref_audio:
        wav, sr = torchaudio.load(ref_audio)
        speaker = model.make_speaker_embedding(wav, sr).to("cuda", dtype=torch.bfloat16)

    # Eight-dimensional emotion vector taken directly from the sliders.
    emotion = torch.tensor(
        [happiness, sadness, disgust, fear, surprise, anger, other, neutral],
        device="cuda",
    )

    cond = make_cond_dict(
        text=text,
        language=language,
        speaker=speaker,
        emotion=emotion,
        fmax=24_000.0,
        pitch_std=float(pitch_std),
        speaking_rate=float(speaking_rate),
        device="cuda",
    )
    conditioning = model.prepare_conditioning(cond)

    # Generate audio codes, then decode them back to a waveform on the CPU.
    codes = model.generate(conditioning, cfg_scale=float(guidance_scale), max_new_tokens=2_590)
    wav_out = model.autoencoder.decode(codes).cpu().squeeze()

    return (model.autoencoder.sampling_rate, wav_out.numpy()), seed


with gr.Blocks() as demo:
    gr.Markdown("## zonos v0.1 tts")

    text = gr.Textbox(label="text to synthesize", lines=3)
    language = gr.Dropdown(supported_language_codes, value="en-us", label="language")
    ref_audio = gr.Audio(label="reference audio (zero-shot tts)", type="filepath")

    pitch_std = gr.Slider(0.0, 300.0, 45.0, step=1, label="pitch variation")
    speaking_rate = gr.Slider(5.0, 30.0, 15.0, step=0.5, label="speaking rate")
    guidance_scale = gr.Slider(1.0, 5.0, 2.0, step=0.1, label="guidance scale")
    seed = gr.Number(value=0, label="seed (0 = random)", precision=0)

    gr.Markdown("### emotion settings")
    happiness = gr.Slider(0, 1, 0.0, step=0.01, label="happiness")
    sadness = gr.Slider(0, 1, 0.0, step=0.01, label="sadness")
    disgust = gr.Slider(0, 1, 0.0, step=0.01, label="disgust")
    fear = gr.Slider(0, 1, 0.0, step=0.01, label="fear")
    surprise = gr.Slider(0, 1, 0.0, step=0.01, label="surprise")
    anger = gr.Slider(0, 1, 0.0, step=0.01, label="anger")
    other = gr.Slider(0, 1, 0.0, step=0.01, label="other")
    neutral = gr.Slider(0, 1, 1.0, step=0.01, label="neutral")

    btn = gr.Button("synthesize")
    out_audio = gr.Audio(label="output")
    out_seed = gr.Number(label="used seed", interactive=False)

    btn.click(
        synthesize,
        [
            text, language, ref_audio, pitch_std, speaking_rate, guidance_scale, seed,
            happiness, sadness, disgust, fear, surprise, anger, other, neutral,
        ],
        [out_audio, out_seed],
    )

demo.launch()
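
# ---------------------------------------------------------------------------
# Minimal client sketch (not part of the app): once the demo is running it can
# be driven programmatically with gradio_client. The URL and api_name below
# are assumptions; Gradio normally names the endpoint after the callback
# function, so check the running app's API page for the exact values.
#
# from gradio_client import Client
#
# client = Client("http://127.0.0.1:7860")  # or the Space id if deployed on Hugging Face
# audio_out, used_seed = client.predict(
#     "hello from zonos",        # text
#     "en-us",                   # language
#     None,                      # ref_audio (skip voice cloning)
#     45.0, 15.0, 2.0, 0,        # pitch_std, speaking_rate, guidance_scale, seed
#     0, 0, 0, 0, 0, 0, 0, 1,    # emotions: happiness .. other, neutral = 1
#     api_name="/synthesize",
# )
# print("seed used:", used_seed, "audio file:", audio_out)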