import torch
import torchaudio
import gradio as gr
import spaces
from functools import lru_cache
from zonos.model import Zonos
from zonos.conditioning import make_cond_dict, supported_language_codes

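# Hugging Face repo ID of the Zonos transformer checkpoint.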
MODEL_ID = "Zyphra/Zonos-v0.1-transformer"

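# Load the model once and cache it so repeated requests reuse the same weights.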
@lru_cache(maxsize=1)
def get_model():
    return Zonos.from_pretrained(MODEL_ID, device="cuda").eval().requires_grad_(False)

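# Synthesize speech from text, optionally cloning the voice in the reference audio.
# The decorator allots up to 120 s of GPU time per call on ZeroGPU Spaces.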
@spaces.GPU(duration=120)
def synthesize(
    text,
    language,
    ref_audio,
    pitch_std,
    speaking_rate,
    guidance_scale,
    seed,
    happiness,
    sadness,
    disgust,
    fear,
    surprise,
    anger,
    other,
    neutral
):
    model = get_model()
    # Seed 0 means "pick a random seed"; the seed actually used is returned for reproducibility.
    if seed == 0:
        seed = int(torch.randint(0, 2**32 - 1, (1,)).item())
    torch.manual_seed(seed)
    speaker = None
    if ref_audio:
        # Build a speaker embedding from the reference clip for zero-shot voice cloning.
        wav, sr = torchaudio.load(ref_audio)
        speaker = model.make_speaker_embedding(wav, sr).to("cuda", dtype=torch.bfloat16)
    emotion = torch.tensor(
        [happiness, sadness, disgust, fear, surprise, anger, other, neutral],
        device="cuda",
    )
    cond = make_cond_dict(
        text=text,
        language=language,
        speaker=speaker,
        emotion=emotion,
        fmax=24_000.0,
        pitch_std=float(pitch_std),
        speaking_rate=float(speaking_rate),
        device="cuda",
    )
    conditioning = model.prepare_conditioning(cond)
    codes = model.generate(conditioning, cfg_scale=float(guidance_scale), max_new_tokens=2_590)
    # Decode the generated audio codes back to a waveform and return it with the sample rate.
    wav_out = model.autoencoder.decode(codes).cpu().squeeze()
    return (model.autoencoder.sampling_rate, wav_out.numpy()), seed

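# Minimal Gradio UI exposing the text, voice, and emotion conditioning controls.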
with gr.Blocks() as demo:
    gr.Markdown("## zonos v0.1 tts")
    text = gr.Textbox(label="text to synthesize", lines=3)
    language = gr.Dropdown(supported_language_codes, value="en-us", label="language")
    ref_audio = gr.Audio(label="reference audio (zero-shot tts)", type="filepath")
    pitch_std = gr.Slider(0.0, 300.0, 45.0, step=1, label="pitch variation")
    speaking_rate = gr.Slider(5.0, 30.0, 15.0, step=0.5, label="speaking rate")
    guidance_scale = gr.Slider(1.0, 5.0, 2.0, step=0.1, label="guidance scale")
    seed = gr.Number(value=0, label="seed (0 = random)", precision=0)
    gr.Markdown("### emotion settings")
    happiness = gr.Slider(0, 1, 0.0, step=0.01, label="happiness")
    sadness = gr.Slider(0, 1, 0.0, step=0.01, label="sadness")
    disgust = gr.Slider(0, 1, 0.0, step=0.01, label="disgust")
    fear = gr.Slider(0, 1, 0.0, step=0.01, label="fear")
    surprise = gr.Slider(0, 1, 0.0, step=0.01, label="surprise")
    anger = gr.Slider(0, 1, 0.0, step=0.01, label="anger")
    other = gr.Slider(0, 1, 0.0, step=0.01, label="other")
    neutral = gr.Slider(0, 1, 1.0, step=0.01, label="neutral")
    btn = gr.Button("synthesize")
    out_audio = gr.Audio(label="output")
    out_seed = gr.Number(label="used seed", interactive=False)
    btn.click(
        synthesize,
        [
            text,
            language,
            ref_audio,
            pitch_std,
            speaking_rate,
            guidance_scale,
            seed,
            happiness,
            sadness,
            disgust,
            fear,
            surprise,
            anger,
            other,
            neutral
        ],
        [out_audio, out_seed]
    )

demo.launch()