Staticaliza committed on
Commit
ad81304
·
verified ·
1 Parent(s): bacbf22

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -11
app.py CHANGED
@@ -1,6 +1,5 @@
1
- # app.py
2
  import gradio as gr
3
- import torch, torchaudio, spaces
4
  from zonos.model import Zonos
5
  from zonos.conditioning import make_cond_dict
6
 
@@ -8,16 +7,23 @@ cpu = "cpu"
8
  model = Zonos.from_pretrained("Zyphra/Zonos-v0.1-transformer", device=cpu)
9
 
10
  def _speaker_embed(audio):
11
- if audio is None:
12
- return None
13
  sr, wav = audio
14
- wav = torch.tensor(wav).unsqueeze(0)
 
 
 
 
15
  return model.make_speaker_embedding(wav, sr)
16
 
17
  @spaces.GPU
18
- def tts(text, language, speaker_audio, emotion_vec, speaking_rate, pitch_std):
 
 
 
 
19
  speaker = _speaker_embed(speaker_audio)
20
- emotion = [float(x) for x in emotion_vec.split(",")] if emotion_vec else None
21
  model.to("cuda")
22
  with torch.no_grad():
23
  cond = make_cond_dict(
@@ -42,13 +48,20 @@ demo = gr.Interface(
42
  gr.Textbox(label="text"),
43
  gr.Dropdown(langs, value="en-us", label="language"),
44
  gr.Audio(type="numpy", label="speaker reference (optional)"),
45
- gr.Textbox(value="0.3,0,0,0,0,0,0.2,0.5", label="emotion (8 comma-sep floats)"),
46
- gr.Slider(0, 40, value=15, step=1, label="speaking_rate"),
47
- gr.Slider(0, 400, value=20, step=1, label="pitch_std"),
 
 
 
 
 
 
 
48
  ],
49
  outputs=gr.Audio(label="generated speech"),
50
  title="zonos-v0.1 zerogpu tts",
51
  )
52
 
53
  if __name__ == "__main__":
54
- demo.launch()
 
 
1
  import gradio as gr
2
+ import torch, numpy as np, spaces
3
  from zonos.model import Zonos
4
  from zonos.conditioning import make_cond_dict
5
 
 
7
  model = Zonos.from_pretrained("Zyphra/Zonos-v0.1-transformer", device=cpu)
8
 
9
  def _speaker_embed(audio):
10
+ if audio is None: return None
 
11
  sr, wav = audio
12
+ if wav.dtype.kind in "iu":
13
+ wav = wav.astype(np.float32) / np.iinfo(wav.dtype).max
14
+ else:
15
+ wav = wav.astype(np.float32)
16
+ wav = torch.from_numpy(wav).unsqueeze(0)
17
  return model.make_speaker_embedding(wav, sr)
18
 
19
  @spaces.GPU
20
+ def tts(
21
+ text, language, speaker_audio,
22
+ happy, sad, disgust, fear, surprise, anger, other, neutral,
23
+ speaking_rate, pitch_std
24
+ ):
25
  speaker = _speaker_embed(speaker_audio)
26
+ emotion = [happy, sad, disgust, fear, surprise, anger, other, neutral]
27
  model.to("cuda")
28
  with torch.no_grad():
29
  cond = make_cond_dict(
 
48
  gr.Textbox(label="text"),
49
  gr.Dropdown(langs, value="en-us", label="language"),
50
  gr.Audio(type="numpy", label="speaker reference (optional)"),
51
+ gr.Slider(0, 1, 0.3, 0.05, label="happiness"),
52
+ gr.Slider(0, 1, 0.0, 0.05, label="sadness"),
53
+ gr.Slider(0, 1, 0.0, 0.05, label="disgust"),
54
+ gr.Slider(0, 1, 0.0, 0.05, label="fear"),
55
+ gr.Slider(0, 1, 0.0, 0.05, label="surprise"),
56
+ gr.Slider(0, 1, 0.0, 0.05, label="anger"),
57
+ gr.Slider(0, 1, 0.2, 0.05, label="other"),
58
+ gr.Slider(0, 1, 0.5, 0.05, label="neutral"),
59
+ gr.Slider(0, 40, 15, 1, label="speaking_rate"),
60
+ gr.Slider(0, 400, 20, 1, label="pitch_std"),
61
  ],
62
  outputs=gr.Audio(label="generated speech"),
63
  title="zonos-v0.1 zerogpu tts",
64
  )
65
 
66
  if __name__ == "__main__":
67
+ demo.launch(share=True)