Update app.py
app.py
CHANGED
@@ -1,99 +1,54 @@
-
-
from zonos.model import Zonos
-from zonos.conditioning import make_cond_dict

-

-
-
-

-@spaces.GPU
-def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-)
-
-
-    seed = int(torch.randint(0, 2 ** 32 - 1, (1,)).item())
-    torch.manual_seed(seed)
-    speaker = None
-    if ref_audio:
-        wav, sr = torchaudio.load(ref_audio)
-        speaker = model.make_speaker_embedding(wav, sr).to("cuda", dtype=torch.bfloat16)
-    emotion = torch.tensor(
-        [happiness, sadness, disgust, fear, surprise, anger, other, neutral],
-        device="cuda"
-    )
-    cond = make_cond_dict(
-        text=text,
-        language=language,
-        speaker=speaker,
-        emotion=emotion,
-        fmax=24_000.0,
-        pitch_std=float(pitch_std),
-        speaking_rate=float(speaking_rate),
-        device="cuda"
-    )
-    conditioning = model.prepare_conditioning(cond)
-    codes = model.generate(conditioning, cfg_scale=float(guidance_scale), max_new_tokens=2_590)
-    wav_out = model.autoencoder.decode(codes).cpu().squeeze()
-    return (model.autoencoder.sampling_rate, wav_out.numpy()), seed

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    other = gr.Slider(0, 1, 0.0, step=0.01, label="other")
-    neutral = gr.Slider(0, 1, 1.0, step=0.01, label="neutral")
-    btn = gr.Button("synthesize")
-    out_audio = gr.Audio(label="output")
-    out_seed = gr.Number(label="used seed", interactive=False)
-    btn.click(
-        synthesize,
-        [
-            text,
-            language,
-            ref_audio,
-            pitch_std,
-            speaking_rate,
-            guidance_scale,
-            seed,
-            happiness,
-            sadness,
-            disgust,
-            fear,
-            surprise,
-            anger,
-            other,
-            neutral
-        ],
-        [out_audio, out_seed]
-    )
-demo.launch()
+# app.py
+import gradio as gr
+import torch, torchaudio, spaces
from zonos.model import Zonos
+from zonos.conditioning import make_cond_dict

+cpu = "cpu"
+model = Zonos.from_pretrained("Zyphra/Zonos-v0.1-hybrid", device=cpu)

+def _speaker_embed(audio):
+    if audio is None:
+        return None
+    sr, wav = audio
+    # gr.Audio(type="numpy") typically yields int16 PCM; scale it to
+    # float32 in [-1, 1] and fold stereo to mono so the tensor has the
+    # (channels, samples) layout make_speaker_embedding expects.
+    wav = torch.tensor(wav, dtype=torch.float32)
+    if wav.abs().max() > 1.0:
+        wav = wav / 32768.0
+    if wav.ndim == 2:
+        wav = wav.mean(dim=1)
+    return model.make_speaker_embedding(wav.unsqueeze(0), sr)

+@spaces.GPU
+def tts(text, language, speaker_audio, emotion_vec, speaking_rate, pitch_std):
+    speaker = _speaker_embed(speaker_audio)
+    emotion = [float(x) for x in emotion_vec.split(",")] if emotion_vec else None
+    # ZeroGPU: move the weights to the GPU only for the duration of the request.
+    model.to("cuda")
+    with torch.no_grad():
+        cond = make_cond_dict(
+            text=text,
+            language=language,
+            speaker=speaker,
+            emotion=emotion,
+            speaking_rate=float(speaking_rate),
+            pitch_std=float(pitch_std),
+        )
+        conditioning = model.prepare_conditioning(cond)
+        codes = model.generate(conditioning)
+        # decode() returns (batch, channels, samples); squeeze to 1-D so
+        # gradio reads the array as mono samples rather than channels.
+        wav = model.autoencoder.decode(codes)[0].squeeze(0).cpu()
+    model.to(cpu)
+    torch.cuda.empty_cache()
+    return (model.autoencoder.sampling_rate, wav.numpy())

+langs = ["en-us", "fr-fr", "ja", "de-de", "zh"]
+demo = gr.Interface(
+    fn=tts,
+    inputs=[
+        gr.Textbox(label="text"),
+        gr.Dropdown(langs, value="en-us", label="language"),
+        gr.Audio(type="numpy", label="speaker reference (optional)"),
+        gr.Textbox(value="0.3,0,0,0,0,0,0.2,0.5", label="emotion (8 comma-sep floats)"),
+        gr.Slider(0, 40, value=15, step=1, label="speaking_rate"),
+        gr.Slider(0, 400, value=20, step=1, label="pitch_std"),
+    ],
+    outputs=gr.Audio(label="generated speech"),
+    title="zonos-v0.1 zerogpu tts",
+)
+
+if __name__ == "__main__":
+    demo.launch()
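A quick way to sanity-check the new handler is to call tts directly. A minimal sketch, not part of app.py: it assumes a CUDA device is available (tts moves the model to "cuda" itself; outside Spaces the @spaces.GPU decorator is a no-op) and uses soundfile purely to persist the output.

# Hypothetical local smoke test for tts(); not part of app.py.
# Assumes a CUDA GPU, since tts() calls model.to("cuda") internally.
import soundfile as sf

sr, wav = tts(
    text="Hello from Zonos.",
    language="en-us",
    speaker_audio=None,                   # no reference speaker
    emotion_vec="0.3,0,0,0,0,0,0.2,0.5",  # happiness..neutral weights
    speaking_rate=15,
    pitch_std=20,
)
sf.write("sample.wav", wav, sr)           # 1-D float32 samples at `sr` Hz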
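Once the Space is running, the same endpoint can be exercised remotely with gradio_client. Another sketch under stated assumptions: the Space id below is a placeholder, and /predict is the default route a single-function gr.Interface exposes.

# Hypothetical remote smoke test via gradio_client; substitute the real Space id.
from gradio_client import Client

client = Client("your-username/zonos-tts")  # placeholder Space id
audio_path = client.predict(
    "Hello from Zonos.",          # text
    "en-us",                      # language
    None,                         # speaker reference (optional)
    "0.3,0,0,0,0,0,0.2,0.5",      # emotion: 8 comma-separated floats
    15,                           # speaking_rate
    20,                           # pitch_std
    api_name="/predict",
)
print(audio_path)                 # local path to the downloaded audio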