ginipick committed
Commit 5374cdd · verified · 1 Parent(s): 74e330e

Delete app-backup.py

Files changed (1)
  app-backup.py  +0 -370
app-backup.py DELETED
@@ -1,370 +0,0 @@
import os
import shlex
import subprocess

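# Install prebuilt CUDA wheels at startup: flash-attn without its CUDA build step,
# plus pinned mamba-ssm and causal-conv1d wheels (cp310, torch 2.4, cu12).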
subprocess.run(shlex.split("pip install flash-attn --no-build-isolation"), env=os.environ | {"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"}, check=True)
subprocess.run(shlex.split("pip install https://github.com/state-spaces/mamba/releases/download/v2.2.4/mamba_ssm-2.2.4+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"), check=True)
subprocess.run(shlex.split("pip install https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.5.0.post8/causal_conv1d-1.5.0.post8+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"), check=True)

import spaces
import torch
import torchaudio
import gradio as gr
from os import getenv

from zonos.model import Zonos
from zonos.conditioning import make_cond_dict, supported_language_codes

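# Load both Zonos checkpoints onto the GPU once at import time and freeze them for inference.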
device = "cuda"
MODEL_NAMES = ["Zyphra/Zonos-v0.1-transformer", "Zyphra/Zonos-v0.1-hybrid"]
MODELS = {name: Zonos.from_pretrained(name, device=device) for name in MODEL_NAMES}
for model in MODELS.values():
    model.requires_grad_(False).eval()


def update_ui(model_choice):
    """
    Dynamically show/hide UI elements based on the model's conditioners.
    We do NOT display 'language_id' or 'ctc_loss' even if they exist in the model.
    """
    model = MODELS[model_choice]
    cond_names = [c.name for c in model.prefix_conditioner.conditioners]
    print("Conditioners in this model:", cond_names)

    text_update = gr.update(visible=("espeak" in cond_names))
    language_update = gr.update(visible=("espeak" in cond_names))
    speaker_audio_update = gr.update(visible=("speaker" in cond_names))
    prefix_audio_update = gr.update(visible=True)
    emotion1_update = gr.update(visible=("emotion" in cond_names))
    emotion2_update = gr.update(visible=("emotion" in cond_names))
    emotion3_update = gr.update(visible=("emotion" in cond_names))
    emotion4_update = gr.update(visible=("emotion" in cond_names))
    emotion5_update = gr.update(visible=("emotion" in cond_names))
    emotion6_update = gr.update(visible=("emotion" in cond_names))
    emotion7_update = gr.update(visible=("emotion" in cond_names))
    emotion8_update = gr.update(visible=("emotion" in cond_names))
    vq_single_slider_update = gr.update(visible=("vqscore_8" in cond_names))
    fmax_slider_update = gr.update(visible=("fmax" in cond_names))
    pitch_std_slider_update = gr.update(visible=("pitch_std" in cond_names))
    speaking_rate_slider_update = gr.update(visible=("speaking_rate" in cond_names))
    dnsmos_slider_update = gr.update(visible=("dnsmos_ovrl" in cond_names))
    speaker_noised_checkbox_update = gr.update(visible=("speaker_noised" in cond_names))
    unconditional_keys_update = gr.update(
        choices=[name for name in cond_names if name not in ("espeak", "language_id")]
    )

    return (
        text_update,
        language_update,
        speaker_audio_update,
        prefix_audio_update,
        emotion1_update,
        emotion2_update,
        emotion3_update,
        emotion4_update,
        emotion5_update,
        emotion6_update,
        emotion7_update,
        emotion8_update,
        vq_single_slider_update,
        fmax_slider_update,
        pitch_std_slider_update,
        speaking_rate_slider_update,
        dnsmos_slider_update,
        speaker_noised_checkbox_update,
        unconditional_keys_update,
    )


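# ZeroGPU: allocate a GPU for this call, with a 120-second duration budget.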
@spaces.GPU(duration=120)
def generate_audio(
    model_choice,
    text,
    language,
    speaker_audio,
    prefix_audio,
    e1,
    e2,
    e3,
    e4,
    e5,
    e6,
    e7,
    e8,
    vq_single,
    fmax,
    pitch_std,
    speaking_rate,
    dnsmos_ovrl,
    speaker_noised,
    cfg_scale,
    min_p,
    seed,
    randomize_seed,
    unconditional_keys,
    progress=gr.Progress(),
):
    """
    Generates audio based on the provided UI parameters.
    We do NOT use language_id or ctc_loss even if the model has them.
    """
    selected_model = MODELS[model_choice]

    speaker_noised_bool = bool(speaker_noised)
    fmax = float(fmax)
    pitch_std = float(pitch_std)
    speaking_rate = float(speaking_rate)
    dnsmos_ovrl = float(dnsmos_ovrl)
    cfg_scale = float(cfg_scale)
    min_p = float(min_p)
    seed = int(seed)
    max_new_tokens = 86 * 30  # ~86 codec frames per second (cf. the progress estimate below), capping output at ~30 s

    if randomize_seed:
        seed = torch.randint(0, 2**32 - 1, (1,)).item()
    torch.manual_seed(seed)

    speaker_embedding = None
    if speaker_audio is not None and "speaker" not in unconditional_keys:
        wav, sr = torchaudio.load(speaker_audio)
        speaker_embedding = selected_model.make_speaker_embedding(wav, sr)
        speaker_embedding = speaker_embedding.to(device, dtype=torch.bfloat16)

    audio_prefix_codes = None
    if prefix_audio is not None:
        wav_prefix, sr_prefix = torchaudio.load(prefix_audio)
        wav_prefix = wav_prefix.mean(0, keepdim=True)
        wav_prefix = torchaudio.functional.resample(wav_prefix, sr_prefix, selected_model.autoencoder.sampling_rate)
        wav_prefix = wav_prefix.to(device, dtype=torch.float32)
        with torch.autocast(device, dtype=torch.float32):
            audio_prefix_codes = selected_model.autoencoder.encode(wav_prefix.unsqueeze(0))

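    # Build conditioning tensors: one value per emotion slider, and the single VQ score repeated 8 times.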
142
- emotion_tensor = torch.tensor(list(map(float, [e1, e2, e3, e4, e5, e6, e7, e8])), device=device)
143
-
144
- vq_val = float(vq_single)
145
- vq_tensor = torch.tensor([vq_val] * 8, device=device).unsqueeze(0)
146
-
147
- cond_dict = make_cond_dict(
148
- text=text,
149
- language=language,
150
- speaker=speaker_embedding,
151
- emotion=emotion_tensor,
152
- vqscore_8=vq_tensor,
153
- fmax=fmax,
154
- pitch_std=pitch_std,
155
- speaking_rate=speaking_rate,
156
- dnsmos_ovrl=dnsmos_ovrl,
157
- speaker_noised=speaker_noised_bool,
158
- device=device,
159
- unconditional_keys=unconditional_keys,
160
- )
    conditioning = selected_model.prepare_conditioning(cond_dict)

    # Rough progress heuristic: ~400 characters of text per 30 s of audio, at 86 generation steps per second.
    estimated_generation_duration = 30 * len(text) / 400
    estimated_total_steps = int(estimated_generation_duration * 86)

    def update_progress(_frame: torch.Tensor, step: int, _total_steps: int) -> bool:
        progress((step, estimated_total_steps))
        return True

    codes = selected_model.generate(
        prefix_conditioning=conditioning,
        audio_prefix_codes=audio_prefix_codes,
        max_new_tokens=max_new_tokens,
        cfg_scale=cfg_scale,
        batch_size=1,
        sampling_params=dict(min_p=min_p),
        callback=update_progress,
    )

    wav_out = selected_model.autoencoder.decode(codes).cpu().detach()
    sr_out = selected_model.autoencoder.sampling_rate
    if wav_out.dim() == 2 and wav_out.size(0) > 1:
        wav_out = wav_out[0:1, :]
    return (sr_out, wav_out.squeeze().numpy()), seed


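# Gradio Blocks UI; update_ui (above) decides which of these controls are visible for the selected model.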
def build_interface():
    with gr.Blocks(theme='ParityError/Interstellar') as demo:
        with gr.Row():
            with gr.Column():
                model_choice = gr.Dropdown(
                    choices=MODEL_NAMES,
                    value="Zyphra/Zonos-v0.1-transformer",
                    label="Zonos Model Type",
                    info="Select the model variant to use.",
                )
                text = gr.Textbox(
                    label="Text to Synthesize",
                    value="Zonos uses eSpeak for text to phoneme conversion!",
                    lines=4,
                    max_length=500,  # approximately
                )
                language = gr.Dropdown(
                    choices=supported_language_codes,
                    value="en-us",
                    label="Language Code",
                    info="Select a language code.",
                )
                prefix_audio = gr.Audio(
                    value="assets/silence_100ms.wav",
                    label="Optional Prefix Audio (continue from this audio)",
                    type="filepath",
                )
            with gr.Column():
                speaker_audio = gr.Audio(
                    label="Optional Speaker Audio (for cloning)",
                    type="filepath",
                )
                speaker_noised_checkbox = gr.Checkbox(label="Denoise Speaker?", value=False)

        with gr.Row():
            with gr.Column():
                gr.Markdown("## Conditioning Parameters")
                dnsmos_slider = gr.Slider(1.0, 5.0, value=4.0, step=0.1, label="DNSMOS Overall")
                fmax_slider = gr.Slider(0, 24000, value=24000, step=1, label="Fmax (Hz)")
                vq_single_slider = gr.Slider(0.5, 0.8, 0.78, 0.01, label="VQ Score")
                pitch_std_slider = gr.Slider(0.0, 300.0, value=45.0, step=1, label="Pitch Std")
                speaking_rate_slider = gr.Slider(5.0, 30.0, value=15.0, step=0.5, label="Speaking Rate")

            with gr.Column():
                gr.Markdown("## Generation Parameters")
                cfg_scale_slider = gr.Slider(1.0, 5.0, 2.0, 0.1, label="CFG Scale")
                min_p_slider = gr.Slider(0.0, 1.0, 0.15, 0.01, label="Min P")
                seed_number = gr.Number(label="Seed", value=420, precision=0)
                randomize_seed_toggle = gr.Checkbox(label="Randomize Seed (before generation)", value=True)

        with gr.Accordion("Advanced Parameters", open=False):
            gr.Markdown(
                "### Unconditional Toggles\n"
                "Checking a box will make the model ignore the corresponding conditioning value and make it unconditional.\n"
                'Practically this means the given conditioning feature will be unconstrained and "filled in automatically".'
            )
            with gr.Row():
                unconditional_keys = gr.CheckboxGroup(
                    [
                        "speaker",
                        "emotion",
                        "vqscore_8",
                        "fmax",
                        "pitch_std",
                        "speaking_rate",
                        "dnsmos_ovrl",
                        "speaker_noised",
                    ],
                    value=["emotion"],
                    label="Unconditional Keys",
                )

            gr.Markdown(
                "### Emotion Sliders\n"
                "Warning: The way these sliders work is not intuitive and may require some trial and error to get the desired effect.\n"
                "Certain configurations can cause the model to become unstable. Setting emotion to unconditional may help."
            )
            with gr.Row():
                emotion1 = gr.Slider(0.0, 1.0, 1.0, 0.05, label="Happiness")
                emotion2 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Sadness")
                emotion3 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Disgust")
                emotion4 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Fear")
            with gr.Row():
                emotion5 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Surprise")
                emotion6 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Anger")
                emotion7 = gr.Slider(0.0, 1.0, 0.1, 0.05, label="Other")
                emotion8 = gr.Slider(0.0, 1.0, 0.2, 0.05, label="Neutral")

        with gr.Column():
            generate_button = gr.Button("Generate Audio")
            output_audio = gr.Audio(label="Generated Audio", type="numpy", autoplay=True)

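        # Refresh control visibility whenever the selected model changes.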
        model_choice.change(
            fn=update_ui,
            inputs=[model_choice],
            outputs=[
                text,
                language,
                speaker_audio,
                prefix_audio,
                emotion1,
                emotion2,
                emotion3,
                emotion4,
                emotion5,
                emotion6,
                emotion7,
                emotion8,
                vq_single_slider,
                fmax_slider,
                pitch_std_slider,
                speaking_rate_slider,
                dnsmos_slider,
                speaker_noised_checkbox,
                unconditional_keys,
            ],
        )

        # On page load, trigger the same UI refresh
        demo.load(
            fn=update_ui,
            inputs=[model_choice],
            outputs=[
                text,
                language,
                speaker_audio,
                prefix_audio,
                emotion1,
                emotion2,
                emotion3,
                emotion4,
                emotion5,
                emotion6,
                emotion7,
                emotion8,
                vq_single_slider,
                fmax_slider,
                pitch_std_slider,
                speaking_rate_slider,
                dnsmos_slider,
                speaker_noised_checkbox,
                unconditional_keys,
            ],
        )

        # Generate audio on button click
        generate_button.click(
            fn=generate_audio,
            inputs=[
                model_choice,
                text,
                language,
                speaker_audio,
                prefix_audio,
                emotion1,
                emotion2,
                emotion3,
                emotion4,
                emotion5,
                emotion6,
                emotion7,
                emotion8,
                vq_single_slider,
                fmax_slider,
                pitch_std_slider,
                speaking_rate_slider,
                dnsmos_slider,
                speaker_noised_checkbox,
                cfg_scale_slider,
                min_p_slider,
                seed_number,
                randomize_seed_toggle,
                unconditional_keys,
            ],
            outputs=[output_audio, seed_number],
        )

    return demo


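# Bind to 0.0.0.0:7860; setting GRADIO_SHARE to "true"/"1"/"t" enables a public share link.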
if __name__ == "__main__":
    demo = build_interface()
    share = getenv("GRADIO_SHARE", "False").lower() in ("true", "1", "t")
    demo.launch(server_name="0.0.0.0", server_port=7860, share=share)