Staticaliza committed
Commit 2f05b19 · verified · 1 parent: d9a9c0c

Update app.py

Files changed (1)
app.py +42 -59
app.py CHANGED
@@ -1,23 +1,19 @@
-import torch
-import torchaudio
-import gradio as gr
-import spaces
+import torch, torchaudio, gradio as gr, spaces
 from functools import lru_cache
 from zonos.model import Zonos
 from zonos.conditioning import make_cond_dict, supported_language_codes
 
 MODEL_ID = "Zyphra/Zonos-v0.1-transformer"
 
-@lru_cache(maxsize=1)
-def load_model():
-    return Zonos.from_pretrained(MODEL_ID, device="cuda").eval().requires_grad_(False)
+@lru_cache(maxsize = 1)
+def get_model():
+    return Zonos.from_pretrained(MODEL_ID, device = "cuda").eval().requires_grad_(False)
 
-@spaces.GPU(duration=120)
-def tts(
+@spaces.GPU(duration = 120)
+def synthesize(
     text,
     language,
     ref_audio,
-    fmax,
     pitch_std,
     speaking_rate,
     guidance_scale,
@@ -29,73 +25,62 @@ def tts(
     surprise,
     anger,
     other,
-    neutral,
+    neutral
 ):
-    model = load_model()
+    model = get_model()
     if seed == 0:
-        seed = int(torch.randint(0, 2**32 - 1, (1,)).item())
+        seed = int(torch.randint(0, 2 ** 32 - 1, (1,)).item())
     torch.manual_seed(seed)
-
     speaker = None
-    if ref_audio is not None:
+    if ref_audio:
         wav, sr = torchaudio.load(ref_audio)
-        speaker = model.make_speaker_embedding(wav, sr).to("cuda", dtype=torch.bfloat16)
-
+        speaker = model.make_speaker_embedding(wav, sr).to("cuda", dtype = torch.bfloat16)
     emotion = torch.tensor(
-        [happiness, sadness, disgust, fear, surprise, anger, other, neutral], device="cuda"
+        [happiness, sadness, disgust, fear, surprise, anger, other, neutral],
+        device = "cuda"
     )
     cond = make_cond_dict(
-        text=text,
-        language=language,
-        speaker=speaker,
-        emotion=emotion,
-        fmax=float(fmax),
-        pitch_std=float(pitch_std),
-        speaking_rate=float(speaking_rate),
-        device="cuda",
+        text = text,
+        language = language,
+        speaker = speaker,
+        emotion = emotion,
+        fmax = 24_000.0,
+        pitch_std = float(pitch_std),
+        speaking_rate = float(speaking_rate),
+        device = "cuda"
     )
     conditioning = model.prepare_conditioning(cond)
-    codes = model.generate(conditioning, cfg_scale=float(guidance_scale), max_new_tokens=2590)
+    codes = model.generate(conditioning, cfg_scale = float(guidance_scale), max_new_tokens = 2_590)
     wav_out = model.autoencoder.decode(codes).cpu().squeeze()
     return (model.autoencoder.sampling_rate, wav_out.numpy()), seed
 
-
 with gr.Blocks() as demo:
     gr.Markdown("## zonos v0.1 tts")
-    text = gr.Textbox(label="text to synthesize", lines=3)
-    language = gr.Dropdown(
-        choices=supported_language_codes, value="en-us", label="language"
-    )
-    ref_audio = gr.Audio(label="reference audio (zeroshot tts)", type="filepath")
-
-    fmax = gr.Slider(0, 24000, 24000, step=1, label="frequency max (Hz)")
-    pitch_std = gr.Slider(0.0, 300.0, 45.0, step=1, label="pitch variation")
-    speaking_rate = gr.Slider(5.0, 30.0, 15.0, step=0.5, label="speaking rate")
-
-    guidance_scale = gr.Slider(1.0, 5.0, 2.0, step=0.1, label="guidance scale")
-    seed = gr.Number(value=0, label="seed (0 = random)", precision=0)
-
+    text = gr.Textbox(label = "text to synthesize", lines = 3)
+    language = gr.Dropdown(supported_language_codes, value = "en-us", label = "language")
+    ref_audio = gr.Audio(label = "reference audio (zeroshot tts)", type = "filepath")
+    pitch_std = gr.Slider(0.0, 300.0, 45.0, step = 1, label = "pitch variation")
+    speaking_rate = gr.Slider(5.0, 30.0, 15.0, step = 0.5, label = "speaking rate")
+    guidance_scale = gr.Slider(1.0, 5.0, 2.0, step = 0.1, label = "guidance scale")
+    seed = gr.Number(value = 0, label = "seed (0 = random)", precision = 0)
     gr.Markdown("### emotion settings")
-    happiness = gr.Slider(0, 1, 0.0, step=0.01, label="happiness")
-    sadness = gr.Slider(0, 1, 0.0, step=0.01, label="sadness")
-    disgust = gr.Slider(0, 1, 0.0, step=0.01, label="disgust")
-    fear = gr.Slider(0, 1, 0.0, step=0.01, label="fear")
-    surprise = gr.Slider(0, 1, 0.0, step=0.01, label="surprise")
-    anger = gr.Slider(0, 1, 0.0, step=0.01, label="anger")
-    other = gr.Slider(0, 1, 0.0, step=0.01, label="other")
-    neutral = gr.Slider(0, 1, 1.0, step=0.01, label="neutral")
-
+    happiness = gr.Slider(0, 1, 0.0, step = 0.01, label = "happiness")
+    sadness = gr.Slider(0, 1, 0.0, step = 0.01, label = "sadness")
+    disgust = gr.Slider(0, 1, 0.0, step = 0.01, label = "disgust")
+    fear = gr.Slider(0, 1, 0.0, step = 0.01, label = "fear")
+    surprise = gr.Slider(0, 1, 0.0, step = 0.01, label = "surprise")
+    anger = gr.Slider(0, 1, 0.0, step = 0.01, label = "anger")
+    other = gr.Slider(0, 1, 0.0, step = 0.01, label = "other")
+    neutral = gr.Slider(0, 1, 1.0, step = 0.01, label = "neutral")
     btn = gr.Button("synthesize")
-    out_audio = gr.Audio(label="output")
-    out_seed = gr.Number(label="used seed", interactive=False)
-
+    out_audio = gr.Audio(label = "output")
+    out_seed = gr.Number(label = "used seed", interactive = False)
     btn.click(
-        tts,
+        synthesize,
        [
            text,
            language,
            ref_audio,
-            fmax,
            pitch_std,
            speaking_rate,
            guidance_scale,
@@ -107,10 +92,8 @@ with gr.Blocks() as demo:
            surprise,
            anger,
            other,
-            neutral,
+            neutral
        ],
-        [out_audio, out_seed],
+        [out_audio, out_seed]
    )
-
-if __name__ == "__main__":
 demo.launch()
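
For reference, the post-commit synthesis path can also be exercised outside the Gradio UI. The snippet below is a minimal, hypothetical sketch, not part of the commit: it assumes the zonos package and a CUDA device are available, "speaker_ref.wav" is a placeholder reference clip, and the parameter values mirror the UI defaults in app.py above.

import torch, torchaudio
from zonos.model import Zonos
from zonos.conditioning import make_cond_dict

# Load and freeze the model once, as get_model() does in app.py.
model = Zonos.from_pretrained("Zyphra/Zonos-v0.1-transformer", device="cuda").eval().requires_grad_(False)

# Optional zero-shot speaker cloning from a reference clip ("speaker_ref.wav" is a placeholder).
wav, sr = torchaudio.load("speaker_ref.wav")
speaker = model.make_speaker_embedding(wav, sr).to("cuda", dtype=torch.bfloat16)

cond = make_cond_dict(
    text="hello from zonos",
    language="en-us",
    speaker=speaker,
    # Emotion order matches app.py: happiness, sadness, disgust, fear, surprise, anger, other, neutral.
    emotion=torch.tensor([0, 0, 0, 0, 0, 0, 0, 1.0], device="cuda"),
    fmax=24_000.0,        # pinned by this commit; the UI slider was removed
    pitch_std=45.0,       # UI default
    speaking_rate=15.0,   # UI default
    device="cuda",
)
codes = model.generate(model.prepare_conditioning(cond), cfg_scale=2.0, max_new_tokens=2_590)
wav_out = model.autoencoder.decode(codes).cpu().squeeze()
torchaudio.save("output.wav", wav_out.unsqueeze(0), model.autoencoder.sampling_rate)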