import torch
import torchaudio
import gradio as gr
import spaces

from functools import lru_cache

from zonos.model import Zonos
from zonos.conditioning import make_cond_dict, supported_language_codes

# Zonos v0.1 transformer checkpoint, loaded once and cached for the whole session.
MODEL_ID = "Zyphra/Zonos-v0.1-transformer"


@lru_cache(maxsize=1)
def get_model():
    # Load the model onto the GPU, switch to eval mode, and disable autograd for inference.
    return Zonos.from_pretrained(MODEL_ID, device="cuda").eval().requires_grad_(False)

@spaces.GPU(duration=120)
def synthesize(
    text,
    language,
    ref_audio,
    pitch_std,
    speaking_rate,
    guidance_scale,
    seed,
    happiness,
    sadness,
    disgust,
    fear,
    surprise,
    anger,
    other,
    neutral,
):
    model = get_model()

    # A seed of 0 means "pick one at random"; the seed actually used is returned to the UI.
    if seed == 0:
        seed = int(torch.randint(0, 2**32 - 1, (1,)).item())
    torch.manual_seed(seed)

    # Optional zero-shot voice cloning: build a speaker embedding from the reference clip.
    speaker = None
    if ref_audio:
        wav, sr = torchaudio.load(ref_audio)
        speaker = model.make_speaker_embedding(wav, sr).to("cuda", dtype=torch.bfloat16)

    # Eight-dimensional emotion vector, in the same order as the sliders below.
    emotion = torch.tensor(
        [happiness, sadness, disgust, fear, surprise, anger, other, neutral],
        device="cuda",
    )

    cond = make_cond_dict(
        text=text,
        language=language,
        speaker=speaker,
        emotion=emotion,
        fmax=24_000.0,
        pitch_std=float(pitch_std),
        speaking_rate=float(speaking_rate),
        device="cuda",
    )
    conditioning = model.prepare_conditioning(cond)

    # Generate audio codes under classifier-free guidance, then decode them to a waveform.
    codes = model.generate(conditioning, cfg_scale=float(guidance_scale), max_new_tokens=2_590)
    wav_out = model.autoencoder.decode(codes).cpu().squeeze()
    return (model.autoencoder.sampling_rate, wav_out.numpy()), seed

# Gradio UI: text, voice, and emotion controls in; synthesized audio and the seed used out.
with gr.Blocks() as demo:
    gr.Markdown("## zonos v0.1 tts")
    text = gr.Textbox(label="text to synthesize", lines=3)
    language = gr.Dropdown(supported_language_codes, value="en-us", label="language")
    ref_audio = gr.Audio(label="reference audio (zeroshot tts)", type="filepath")
    pitch_std = gr.Slider(0.0, 300.0, 45.0, step=1, label="pitch variation")
    speaking_rate = gr.Slider(5.0, 30.0, 15.0, step=0.5, label="speaking rate")
    guidance_scale = gr.Slider(1.0, 5.0, 2.0, step=0.1, label="guidance scale")
    seed = gr.Number(value=0, label="seed (0 = random)", precision=0)

    gr.Markdown("### emotion settings")
    happiness = gr.Slider(0, 1, 0.0, step=0.01, label="happiness")
    sadness = gr.Slider(0, 1, 0.0, step=0.01, label="sadness")
    disgust = gr.Slider(0, 1, 0.0, step=0.01, label="disgust")
    fear = gr.Slider(0, 1, 0.0, step=0.01, label="fear")
    surprise = gr.Slider(0, 1, 0.0, step=0.01, label="surprise")
    anger = gr.Slider(0, 1, 0.0, step=0.01, label="anger")
    other = gr.Slider(0, 1, 0.0, step=0.01, label="other")
    neutral = gr.Slider(0, 1, 1.0, step=0.01, label="neutral")

    btn = gr.Button("synthesize")
    out_audio = gr.Audio(label="output")
    out_seed = gr.Number(label="used seed", interactive=False)

    # Inputs are passed positionally, so they must match the order of synthesize()'s parameters.
    btn.click(
        synthesize,
        [
            text,
            language,
            ref_audio,
            pitch_std,
            speaking_rate,
            guidance_scale,
            seed,
            happiness,
            sadness,
            disgust,
            fear,
            surprise,
            anger,
            other,
            neutral,
        ],
        [out_audio, out_seed],
    )

demo.launch()