ginipick committed
Commit ab9b56e · verified · 1 Parent(s): 075fd65

Delete app-backup.py

Files changed (1)
app-backup.py +0 -381
app-backup.py DELETED
@@ -1,381 +0,0 @@
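# Bootstrap GPU-only dependencies at startup. The wheels below are prebuilt
# for CUDA 12 / torch 2.4 / Python 3.10; FLASH_ATTENTION_SKIP_CUDA_BUILD tells
# flash-attn's installer to skip compiling its CUDA kernels from source.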
import os
import shlex
import subprocess

subprocess.run(
    shlex.split("pip install flash-attn --no-build-isolation"),
    env=os.environ | {"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    check=True,
)
subprocess.run(
    shlex.split("pip install https://github.com/state-spaces/mamba/releases/download/v2.2.4/mamba_ssm-2.2.4+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"),
    check=True,
)
subprocess.run(
    shlex.split("pip install https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.5.0.post8/causal_conv1d-1.5.0.post8+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"),
    check=True,
)

import spaces
import torch
import torchaudio
import gradio as gr
from os import getenv

from zonos.model import Zonos
from zonos.conditioning import make_cond_dict, supported_language_codes

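# Preload both Zonos variants once at startup so switching models in the UI
# is instant; requires_grad_(False) + eval() keeps them inference-only.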
device = "cuda"
MODEL_NAMES = ["Zyphra/Zonos-v0.1-transformer", "Zyphra/Zonos-v0.1-hybrid"]
MODELS = {name: Zonos.from_pretrained(name, device=device) for name in MODEL_NAMES}
for model in MODELS.values():
    model.requires_grad_(False).eval()


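# NOTE: update_ui returns 19 gr.update objects; their order must match the
# `outputs` lists wired up in build_interface below.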
def update_ui(model_choice):
    """
    Dynamically show/hide UI elements based on the model's conditioners.
    We do NOT display 'language_id' or 'ctc_loss' even if they exist in the model.
    """
    model = MODELS[model_choice]
    cond_names = [c.name for c in model.prefix_conditioner.conditioners]
    print("Conditioners in this model:", cond_names)

    text_update = gr.update(visible=("espeak" in cond_names))
    language_update = gr.update(visible=("espeak" in cond_names))
    speaker_audio_update = gr.update(visible=("speaker" in cond_names))
    prefix_audio_update = gr.update(visible=True)
    emotion1_update = gr.update(visible=("emotion" in cond_names))
    emotion2_update = gr.update(visible=("emotion" in cond_names))
    emotion3_update = gr.update(visible=("emotion" in cond_names))
    emotion4_update = gr.update(visible=("emotion" in cond_names))
    emotion5_update = gr.update(visible=("emotion" in cond_names))
    emotion6_update = gr.update(visible=("emotion" in cond_names))
    emotion7_update = gr.update(visible=("emotion" in cond_names))
    emotion8_update = gr.update(visible=("emotion" in cond_names))
    vq_single_slider_update = gr.update(visible=("vqscore_8" in cond_names))
    fmax_slider_update = gr.update(visible=("fmax" in cond_names))
    pitch_std_slider_update = gr.update(visible=("pitch_std" in cond_names))
    speaking_rate_slider_update = gr.update(visible=("speaking_rate" in cond_names))
    dnsmos_slider_update = gr.update(visible=("dnsmos_ovrl" in cond_names))
    speaker_noised_checkbox_update = gr.update(visible=("speaker_noised" in cond_names))
    unconditional_keys_update = gr.update(
        choices=[name for name in cond_names if name not in ("espeak", "language_id")]
    )

    return (
        text_update,
        language_update,
        speaker_audio_update,
        prefix_audio_update,
        emotion1_update,
        emotion2_update,
        emotion3_update,
        emotion4_update,
        emotion5_update,
        emotion6_update,
        emotion7_update,
        emotion8_update,
        vq_single_slider_update,
        fmax_slider_update,
        pitch_std_slider_update,
        speaking_rate_slider_update,
        dnsmos_slider_update,
        speaker_noised_checkbox_update,
        unconditional_keys_update,
    )


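# On Hugging Face Spaces, @spaces.GPU requests a ZeroGPU device for the
# duration of each call (here allowing up to ~120 seconds).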
@spaces.GPU(duration=120)
def generate_audio(
    model_choice,
    text,
    language,
    speaker_audio,
    prefix_audio,
    e1,
    e2,
    e3,
    e4,
    e5,
    e6,
    e7,
    e8,
    vq_single,
    fmax,
    pitch_std,
    speaking_rate,
    dnsmos_ovrl,
    speaker_noised,
    cfg_scale,
    min_p,
    seed,
    randomize_seed,
    unconditional_keys,
    progress=gr.Progress(),
):
    """
    Generates audio based on the provided UI parameters.
    We do NOT use language_id or ctc_loss even if the model has them.
    """
    selected_model = MODELS[model_choice]

    speaker_noised_bool = bool(speaker_noised)
    fmax = float(fmax)
    pitch_std = float(pitch_std)
    speaking_rate = float(speaking_rate)
    dnsmos_ovrl = float(dnsmos_ovrl)
    cfg_scale = float(cfg_scale)
    min_p = float(min_p)
    seed = int(seed)
    max_new_tokens = 86 * 30

    if randomize_seed:
        seed = torch.randint(0, 2**32 - 1, (1,)).item()
    torch.manual_seed(seed)

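    # Embed the reference clip for voice cloning unless "speaker" was marked
    # unconditional.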
    speaker_embedding = None
    if speaker_audio is not None and "speaker" not in unconditional_keys:
        wav, sr = torchaudio.load(speaker_audio)
        speaker_embedding = selected_model.make_speaker_embedding(wav, sr)
        speaker_embedding = speaker_embedding.to(device, dtype=torch.bfloat16)

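    # Downmix any prefix audio to mono, resample it to the autoencoder's rate,
    # and encode it into codes the model can continue from.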
    audio_prefix_codes = None
    if prefix_audio is not None:
        wav_prefix, sr_prefix = torchaudio.load(prefix_audio)
        wav_prefix = wav_prefix.mean(0, keepdim=True)
        wav_prefix = torchaudio.functional.resample(wav_prefix, sr_prefix, selected_model.autoencoder.sampling_rate)
        wav_prefix = wav_prefix.to(device, dtype=torch.float32)
        with torch.autocast(device, dtype=torch.float32):
            audio_prefix_codes = selected_model.autoencoder.encode(wav_prefix.unsqueeze(0))

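    # The eight emotion sliders form a single conditioning vector; the one VQ
    # slider value is replicated 8x to match the vqscore_8 conditioner's shape.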
    emotion_tensor = torch.tensor(list(map(float, [e1, e2, e3, e4, e5, e6, e7, e8])), device=device)

    vq_val = float(vq_single)
    vq_tensor = torch.tensor([vq_val] * 8, device=device).unsqueeze(0)

    cond_dict = make_cond_dict(
        text=text,
        language=language,
        speaker=speaker_embedding,
        emotion=emotion_tensor,
        vqscore_8=vq_tensor,
        fmax=fmax,
        pitch_std=pitch_std,
        speaking_rate=speaking_rate,
        dnsmos_ovrl=dnsmos_ovrl,
        speaker_noised=speaker_noised_bool,
        device=device,
        unconditional_keys=unconditional_keys,
    )
    conditioning = selected_model.prepare_conditioning(cond_dict)

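    # Rough progress estimate: ~400 characters per 30 s of speech at ~86 steps
    # per second. Returning True from the callback lets generation continue.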
    estimated_generation_duration = 30 * len(text) / 400
    estimated_total_steps = int(estimated_generation_duration * 86)

    def update_progress(_frame: torch.Tensor, step: int, _total_steps: int) -> bool:
        progress((step, estimated_total_steps))
        return True

    codes = selected_model.generate(
        prefix_conditioning=conditioning,
        audio_prefix_codes=audio_prefix_codes,
        max_new_tokens=max_new_tokens,
        cfg_scale=cfg_scale,
        batch_size=1,
        sampling_params=dict(min_p=min_p),
        callback=update_progress,
    )

    wav_out = selected_model.autoencoder.decode(codes).cpu().detach()
    sr_out = selected_model.autoencoder.sampling_rate
    if wav_out.dim() == 2 and wav_out.size(0) > 1:
        wav_out = wav_out[0:1, :]
    return (sr_out, wav_out.squeeze().numpy()), seed


def build_interface():
    # Use the gr.themes.Soft() theme for a more polished UI.
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        with gr.Row():
            with gr.Column():
                model_choice = gr.Dropdown(
                    choices=MODEL_NAMES,
                    value="Zyphra/Zonos-v0.1-transformer",
                    label="Zonos Model Type",
                    info="Select the model variant to use.",
                )
                text = gr.Textbox(
                    label="Text to Synthesize",
                    value="Zonos uses eSpeak for text to phoneme conversion!",
                    lines=4,
                    max_length=500,
                )
                language = gr.Dropdown(
                    choices=supported_language_codes,
                    value="en-us",
                    label="Language Code",
                    info="Select a language code.",
                )
                prefix_audio = gr.Audio(
                    value="assets/silence_100ms.wav",
                    label="Optional Prefix Audio (continue from this audio)",
                    type="filepath",
                )
            with gr.Column():
                speaker_audio = gr.Audio(
                    label="Optional Speaker Audio (for cloning)",
                    type="filepath",
                )
                speaker_noised_checkbox = gr.Checkbox(label="Denoise Speaker?", value=False)

        with gr.Row():
            with gr.Column():
                gr.Markdown("## Conditioning Parameters")
                dnsmos_slider = gr.Slider(1.0, 5.0, value=4.0, step=0.1, label="DNSMOS Overall")
                fmax_slider = gr.Slider(0, 24000, value=24000, step=1, label="Fmax (Hz)")
                vq_single_slider = gr.Slider(0.5, 0.8, 0.78, 0.01, label="VQ Score")
                pitch_std_slider = gr.Slider(0.0, 300.0, value=45.0, step=1, label="Pitch Std")
                speaking_rate_slider = gr.Slider(5.0, 30.0, value=15.0, step=0.5, label="Speaking Rate")

            with gr.Column():
                gr.Markdown("## Generation Parameters")
                cfg_scale_slider = gr.Slider(1.0, 5.0, 2.0, 0.1, label="CFG Scale")
                min_p_slider = gr.Slider(0.0, 1.0, 0.15, 0.01, label="Min P")
                seed_number = gr.Number(label="Seed", value=420, precision=0)
                randomize_seed_toggle = gr.Checkbox(label="Randomize Seed (before generation)", value=True)

        with gr.Accordion("Advanced Parameters", open=False):
            gr.Markdown(
                "### Unconditional Toggles\n"
                "Checking a box will make the model ignore the corresponding conditioning value and make it unconditional.\n"
                'Practically this means the given conditioning feature will be unconstrained and "filled in automatically".'
            )
            with gr.Row():
                unconditional_keys = gr.CheckboxGroup(
                    [
                        "speaker",
                        "emotion",
                        "vqscore_8",
                        "fmax",
                        "pitch_std",
                        "speaking_rate",
                        "dnsmos_ovrl",
                        "speaker_noised",
                    ],
                    value=["emotion"],
                    label="Unconditional Keys",
                )

            gr.Markdown(
                "### Emotion Sliders\n"
                "Warning: The way these sliders work is not intuitive and may require some trial and error to get the desired effect.\n"
                "Certain configurations can cause the model to become unstable. Setting emotion to unconditional may help."
            )
            with gr.Row():
                emotion1 = gr.Slider(0.0, 1.0, 1.0, 0.05, label="Happiness")
                emotion2 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Sadness")
                emotion3 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Disgust")
                emotion4 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Fear")
            with gr.Row():
                emotion5 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Surprise")
                emotion6 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Anger")
                emotion7 = gr.Slider(0.0, 1.0, 0.1, 0.05, label="Other")
                emotion8 = gr.Slider(0.0, 1.0, 0.2, 0.05, label="Neutral")

        with gr.Column():
            generate_button = gr.Button("Generate Audio")
            output_audio = gr.Audio(label="Generated Audio", type="numpy", autoplay=True)

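        # Keep the `outputs` lists below in the same order as update_ui's
        # 19-element return tuple.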
        model_choice.change(
            fn=update_ui,
            inputs=[model_choice],
            outputs=[
                text,
                language,
                speaker_audio,
                prefix_audio,
                emotion1,
                emotion2,
                emotion3,
                emotion4,
                emotion5,
                emotion6,
                emotion7,
                emotion8,
                vq_single_slider,
                fmax_slider,
                pitch_std_slider,
                speaking_rate_slider,
                dnsmos_slider,
                speaker_noised_checkbox,
                unconditional_keys,
            ],
        )

        # On page load, trigger the same UI refresh
        demo.load(
            fn=update_ui,
            inputs=[model_choice],
            outputs=[
                text,
                language,
                speaker_audio,
                prefix_audio,
                emotion1,
                emotion2,
                emotion3,
                emotion4,
                emotion5,
                emotion6,
                emotion7,
                emotion8,
                vq_single_slider,
                fmax_slider,
                pitch_std_slider,
                speaking_rate_slider,
                dnsmos_slider,
                speaker_noised_checkbox,
                unconditional_keys,
            ],
        )

        # Generate audio on button click
        generate_button.click(
            fn=generate_audio,
            inputs=[
                model_choice,
                text,
                language,
                speaker_audio,
                prefix_audio,
                emotion1,
                emotion2,
                emotion3,
                emotion4,
                emotion5,
                emotion6,
                emotion7,
                emotion8,
                vq_single_slider,
                fmax_slider,
                pitch_std_slider,
                speaking_rate_slider,
                dnsmos_slider,
                speaker_noised_checkbox,
                cfg_scale_slider,
                min_p_slider,
                seed_number,
                randomize_seed_toggle,
                unconditional_keys,
            ],
            outputs=[output_audio, seed_number],
        )

    return demo


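# GRADIO_SHARE toggles a public Gradio share link; binding to 0.0.0.0:7860 is
# the usual setup for a containerized Space.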
if __name__ == "__main__":
    demo = build_interface()
    share = getenv("GRADIO_SHARE", "False").lower() in ("true", "1", "t")
    demo.launch(server_name="0.0.0.0", server_port=7860, share=share)