Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1 |
-
# app.py
|
2 |
import gradio as gr
|
3 |
-
import torch,
|
4 |
from zonos.model import Zonos
|
5 |
from zonos.conditioning import make_cond_dict
|
6 |
|
@@ -8,16 +7,23 @@ cpu = "cpu"
|
|
8 |
model = Zonos.from_pretrained("Zyphra/Zonos-v0.1-transformer", device=cpu)
|
9 |
|
10 |
def _speaker_embed(audio):
|
11 |
-
if audio is None:
|
12 |
-
return None
|
13 |
sr, wav = audio
|
14 |
-
wav
|
|
|
|
|
|
|
|
|
15 |
return model.make_speaker_embedding(wav, sr)
|
16 |
|
17 |
@spaces.GPU
|
18 |
-
def tts(
|
|
|
|
|
|
|
|
|
19 |
speaker = _speaker_embed(speaker_audio)
|
20 |
-
emotion = [
|
21 |
model.to("cuda")
|
22 |
with torch.no_grad():
|
23 |
cond = make_cond_dict(
|
@@ -42,13 +48,20 @@ demo = gr.Interface(
|
|
42 |
gr.Textbox(label="text"),
|
43 |
gr.Dropdown(langs, value="en-us", label="language"),
|
44 |
gr.Audio(type="numpy", label="speaker reference (optional)"),
|
45 |
-
gr.
|
46 |
-
gr.Slider(0,
|
47 |
-
gr.Slider(0,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
],
|
49 |
outputs=gr.Audio(label="generated speech"),
|
50 |
title="zonos-v0.1 zerogpu tts",
|
51 |
)
|
52 |
|
53 |
if __name__ == "__main__":
|
54 |
-
demo.launch()
|
|
|
|
|
1 |
import gradio as gr
|
2 |
+
import torch, numpy as np, spaces
|
3 |
from zonos.model import Zonos
|
4 |
from zonos.conditioning import make_cond_dict
|
5 |
|
|
|
7 |
model = Zonos.from_pretrained("Zyphra/Zonos-v0.1-transformer", device=cpu)
|
8 |
|
9 |
def _speaker_embed(audio):
|
10 |
+
if audio is None: return None
|
|
|
11 |
sr, wav = audio
|
12 |
+
if wav.dtype.kind in "iu":
|
13 |
+
wav = wav.astype(np.float32) / np.iinfo(wav.dtype).max
|
14 |
+
else:
|
15 |
+
wav = wav.astype(np.float32)
|
16 |
+
wav = torch.from_numpy(wav).unsqueeze(0)
|
17 |
return model.make_speaker_embedding(wav, sr)
|
18 |
|
19 |
@spaces.GPU
|
20 |
+
def tts(
|
21 |
+
text, language, speaker_audio,
|
22 |
+
happy, sad, disgust, fear, surprise, anger, other, neutral,
|
23 |
+
speaking_rate, pitch_std
|
24 |
+
):
|
25 |
speaker = _speaker_embed(speaker_audio)
|
26 |
+
emotion = [happy, sad, disgust, fear, surprise, anger, other, neutral]
|
27 |
model.to("cuda")
|
28 |
with torch.no_grad():
|
29 |
cond = make_cond_dict(
|
|
|
48 |
gr.Textbox(label="text"),
|
49 |
gr.Dropdown(langs, value="en-us", label="language"),
|
50 |
gr.Audio(type="numpy", label="speaker reference (optional)"),
|
51 |
+
gr.Slider(0, 1, 0.3, 0.05, label="happiness"),
|
52 |
+
gr.Slider(0, 1, 0.0, 0.05, label="sadness"),
|
53 |
+
gr.Slider(0, 1, 0.0, 0.05, label="disgust"),
|
54 |
+
gr.Slider(0, 1, 0.0, 0.05, label="fear"),
|
55 |
+
gr.Slider(0, 1, 0.0, 0.05, label="surprise"),
|
56 |
+
gr.Slider(0, 1, 0.0, 0.05, label="anger"),
|
57 |
+
gr.Slider(0, 1, 0.2, 0.05, label="other"),
|
58 |
+
gr.Slider(0, 1, 0.5, 0.05, label="neutral"),
|
59 |
+
gr.Slider(0, 40, 15, 1, label="speaking_rate"),
|
60 |
+
gr.Slider(0, 400, 20, 1, label="pitch_std"),
|
61 |
],
|
62 |
outputs=gr.Audio(label="generated speech"),
|
63 |
title="zonos-v0.1 zerogpu tts",
|
64 |
)
|
65 |
|
66 |
if __name__ == "__main__":
    # NOTE(review): the commit added share=True, but on Hugging Face Spaces
    # (where this app runs) gradio ignores share=True and logs a warning at
    # startup; a plain launch() is correct both locally and on Spaces.
    demo.launch()
|