Staticaliza committed on
Commit a527f65 · verified · 1 Parent(s): d778d44

Update app.py

Files changed (1)
  1. app.py  +71 -131
app.py CHANGED
@@ -1,71 +1,46 @@
- import os, torch, torchaudio, gradio as gr
-
  import spaces

  from zonos.model import Zonos
  from zonos.conditioning import make_cond_dict, supported_language_codes

- device = "cuda"
- MODEL_NAME = "Zyphra/Zonos-v0.1-transformer"
- MODEL = Zonos.from_pretrained(MODEL_NAME, device=device)

- # def _patch_cuda_props():
- #     if torch.cuda.is_available():
- #         for i in range(torch.cuda.device_count()):
- #             p = torch.cuda.get_device_properties(i)
- #             if not hasattr(p, "regs_per_multiprocessor"):
- #                 setattr(p, "regs_per_multiprocessor", 65536)
- #             if not hasattr(p, "max_threads_per_multi_processor"):
- #                 setattr(p, "max_threads_per_multi_processor", 2048)


- # _patch_cuda_props()

  @spaces.GPU
- def generate_audio(
-     text,
-     language,
-     speaker_audio,
-     e1,
-     e2,
-     e3,
-     e4,
-     e5,
-     e6,
-     e7,
-     e8,
-     clarity,
-     fmax,
-     pitch_std,
-     speaking_rate,
-     dnsmos_ovrl,
-     cfg_scale,
-     min_p,
-     steps,
-     seed,
-     randomize_seed,
-     progress=gr.Progress(),
- ):
-     if randomize_seed:
-         seed = torch.randint(0, 2**32 - 1, (1,)).item()
-     torch.manual_seed(int(seed))

      speaker_embedding = None
      if speaker_audio is not None:
          wav, sr = torchaudio.load(speaker_audio)
-         speaker_embedding = (
-             MODEL.make_speaker_embedding(wav, sr).to(device, dtype=torch.bfloat16)
-         )

-     emotion_tensor = torch.tensor(
-         [e1, e2, e3, e4, e5, e6, e7, e8], device=device, dtype=torch.float32
-     )
-     vq_tensor = torch.tensor([clarity] * 8, device=device, dtype=torch.float32).unsqueeze(
-         0
-     )

      cond_dict = make_cond_dict(
-         text=text,
          language=language,
          speaker=speaker_embedding,
          emotion=emotion_tensor,
@@ -76,15 +51,10 @@ def generate_audio(
          dnsmos_ovrl=float(dnsmos_ovrl),
          device=device,
      )
-     conditioning = MODEL.prepare_conditioning(cond_dict)
-
-     estimated_total_steps = int(steps)

-     def cb(_, step, __):
-         progress((step, estimated_total_steps))
-         return True
-
-     codes = MODEL.generate(
          prefix_conditioning=conditioning,
          max_new_tokens=int(steps),
          cfg_scale=float(cfg_scale),
@@ -93,78 +63,48 @@
          callback=cb,
      )

-     wav_out = MODEL.autoencoder.decode(codes).cpu().detach()
-     sr_out = MODEL.autoencoder.sampling_rate
-     if wav_out.dim() == 2 and wav_out.size(0) > 1:
-         wav_out = wav_out[0:1, :]
-     return (sr_out, wav_out.squeeze().numpy()), seed
-
-
- def build_interface():
-     with gr.Blocks() as demo:
-         text = gr.Textbox(label="text", value="hello, world!", lines=4, max_length=500)
-         language = gr.Dropdown(choices=supported_language_codes, value="en-us", label="language")
-         speaker_audio = gr.Audio(label="voice reference", type="filepath")
-
-         clarity_slider = gr.Slider(0.5, 0.8, 0.8, 0.01, label="clarity")
-         steps_slider = gr.Slider(1, 3000, 300, 1, label="steps")
-
-         dnsmos_slider = gr.Slider(1.0, 5.0, 5.0, 0.1, label="quality")
-         fmax_slider = gr.Slider(0, 24000, 24000, 1, label="fmax")
-         pitch_std_slider = gr.Slider(0.0, 300.0, 30.0, 1, label="pitch std")
-         speaking_rate_slider = gr.Slider(5.0, 30.0, 15.0, 0.1, label="rate")
-
-         cfg_scale_slider = gr.Slider(1.0, 5.0, 2.5, 0.1, label="guidance")
-         min_p_slider = gr.Slider(0.0, 1.0, 0.05, 0.01, label="min p")
-
-         with gr.Row():
-             e1 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="happy")
-             e2 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="sad")
-             e3 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="disgust")
-             e4 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="fear")
-
-         with gr.Row():
-             e5 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="surprise")
-             e6 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="anger")
-             e7 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="other")
-             e8 = gr.Slider(0.0, 1.0, 1.0, 0.01, label="neutral")
-
-         seed_number = gr.Number(label="seed", value=420, precision=0)
-         randomize_seed_toggle = gr.Checkbox(label="randomize seed", value=True)
-
-         generate_button = gr.Button("generate")
-         output_audio = gr.Audio(label="output", type="numpy", autoplay=True)
-
-         generate_button.click(
-             fn=generate_audio,
-             inputs=[
-                 text,
-                 language,
-                 speaker_audio,
-                 e1,
-                 e2,
-                 e3,
-                 e4,
-                 e5,
-                 e6,
-                 e7,
-                 e8,
-                 clarity_slider,
-                 fmax_slider,
-                 pitch_std_slider,
-                 speaking_rate_slider,
-                 dnsmos_slider,
-                 cfg_scale_slider,
-                 min_p_slider,
-                 steps_slider,
-                 seed_number,
-                 randomize_seed_toggle,
-             ],
-             outputs=[output_audio, seed_number],
-         )

-     return demo


- if __name__ == "__main__":
-     build_interface().launch()
 
@@ -1,71 +1,46 @@
+ # Imports
+ import gradio as gr
  import spaces
+ import os
+ import torch
+ import torchaudio

  from zonos.model import Zonos
  from zonos.conditioning import make_cond_dict, supported_language_codes

+ # Variables
+ HF_TOKEN = os.environ.get("HF_TOKEN", "")

+ device = "cuda"

+ REPO = "Zyphra/Zonos-v0.1-transformer"
+ model = Zonos.from_pretrained(REPO, device=device)

+ # Functions
+ def patch_cuda():
+     if torch.cuda.is_available():
+         for i in range(torch.cuda.device_count()):
+             p = torch.cuda.get_device_properties(i)
+             if not hasattr(p, "regs_per_multiprocessor"):
+                 setattr(p, "regs_per_multiprocessor", 65536)
+             if not hasattr(p, "max_threads_per_multi_processor"):
+                 setattr(p, "max_threads_per_multi_processor", 2048)

  @spaces.GPU
+ def generate(input, language, speaker_audio, emotion_happy, emotion_sad, emotion_disgust, emotion_fear, emotion_surprise, emotion_anger, emotion_other, emotion_neutral, clarity, fmax, pitch_std, speaking_rate, dnsmos_ovrl, cfg_scale, min_p, steps, seed, randomize_seed):
+     if randomize_seed: seed = int(time.time())
+     torch.manual_seed(seed)

      speaker_embedding = None
      if speaker_audio is not None:
          wav, sr = torchaudio.load(speaker_audio)
+         speaker_embedding = (model.make_speaker_embedding(wav, sr).to(device, dtype=torch.bfloat16))

+     emotion_tensor = torch.tensor([emotion_happy, emotion_sad, emotion_disgust, emotion_fear, emotion_surprise, emotion_anger, emotion_other, emotion_neutral], device=device, dtype=torch.bfloat16)
+     vq_tensor = torch.tensor([clarity] * 8, device=device, dtype=torch.bfloat16).unsqueeze(0)

      cond_dict = make_cond_dict(
+         text=input,
          language=language,
          speaker=speaker_embedding,
          emotion=emotion_tensor,
@@ -76,15 +51,10 @@ def generate_audio(
          dnsmos_ovrl=float(dnsmos_ovrl),
          device=device,
      )
+
+     conditioning = model.prepare_conditioning(cond_dict)

+     codes = model.generate(
          prefix_conditioning=conditioning,
          max_new_tokens=int(steps),
          cfg_scale=float(cfg_scale),
@@ -93,78 +63,48 @@
          callback=cb,
      )

+     wav_out = model.autoencoder.decode(codes).cpu().detach()
+     sr_out = model.autoencoder.sampling_rate
+
+     if wav_out.dim() == 2 and wav_out.size(0) > 1: wav_out = wav_out[0:1, :]

+     return (sr_out, wav_out.squeeze().numpy())
+
+ # Initialize
+ patch_cuda()
+
+ with gr.Blocks() as main:
+     text = gr.Textbox(label="text", value="hello, world!")
+     language = gr.Dropdown(choices=supported_language_codes, value="en-us", label="language")
+     speaker_audio = gr.Audio(label="voice reference", type="filepath")
+
+     clarity_slider = gr.Slider(0.5, 0.8, 0.8, 0.01, label="clarity")
+     steps_slider = gr.Slider(1, 3000, 300, 1, label="steps")
+
+     dnsmos_slider = gr.Slider(1.0, 5.0, 5.0, 0.1, label="quality")
+     fmax_slider = gr.Slider(0, 24000, 24000, 1, label="fmax")
+     pitch_std_slider = gr.Slider(0.0, 300.0, 30.0, 1, label="pitch std")
+     speaking_rate_slider = gr.Slider(5.0, 30.0, 15.0, 0.1, label="rate")
+
+     cfg_scale_slider = gr.Slider(1.0, 5.0, 2.5, 0.1, label="guidance")
+     min_p_slider = gr.Slider(0.0, 1.0, 0.05, 0.01, label="min p")
+
+     with gr.Row():
+         e1 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="happy")
+         e2 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="sad")
+         e3 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="disgust")
+         e4 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="fear")
+         e5 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="surprise")
+         e6 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="anger")
+         e7 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="other")
+         e8 = gr.Slider(0.0, 1.0, 1.0, 0.01, label="neutral")
+
+     seed_number = gr.Number(label="seed", value=42, precision=0)
+     randomize_seed_toggle = gr.Checkbox(label="randomize seed", value=True)
+
+     generate_button = gr.Button("generate")
+     output_audio = gr.Audio(label="output", type="numpy", autoplay=True)

+     generate_button.click(fn=generate, inputs=[text, language, speaker_audio, e1, e2, e3, e4, e5, e6, e7, e8, clarity_slider, fmax_slider, pitch_std_slider, speaking_rate_slider, dnsmos_slider, cfg_scale_slider, min_p_slider, steps_slider, seed_number, randomize_seed_toggle], outputs=output_audio)

+ main.launch()
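The new side of the diff calls int(time.time()) and still passes callback=cb to model.generate(), but neither a time import nor a cb definition appears among the displayed lines. The sketch below is one way those pieces could look; the names and the no-op behavior are assumptions carried over from the removed generate_audio callback, not code shown in this commit.

# Sketch only: pieces generate() references but that the shown hunks do not define.
import time  # generate() seeds with int(time.time()) when "randomize seed" is checked

def cb(_, step, __):
    # Same shape as the callback the removed generate_audio passed to model.generate();
    # without a gr.Progress handle it simply keeps returning True, as the original did.
    return True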