yl4579 committed
Commit 1b8d1f0 (verified) · 1 Parent(s): 2598aa3

Update app.py

Files changed (1):
  1. app.py +279 -401
app.py CHANGED
@@ -2,471 +2,349 @@ import gradio as gr
  import torch
  import torchaudio
  import numpy as np
- from pathlib import Path
  import tempfile

- # Import the DMOInference class (assuming it's in a file called dmo_inference.py)
  from infer import DMOInference

- def initialize_model(student_checkpoint, duration_predictor_checkpoint, model_type, device, cuda_device_id):
-     """Initialize the DMOSpeech 2 model with given checkpoints."""
      try:
          model = DMOInference(
-             student_checkpoint_path=student_checkpoint,
-             duration_predictor_path=duration_predictor_checkpoint,
              device=device,
-             model_type=model_type,
-             tokenizer="pinyin",
-             dataset_name="Emilia_ZH_EN",
-             cuda_device_id=str(cuda_device_id)
          )
-         return model, "Model initialized successfully!"
      except Exception as e:
-         return None, f"Error initializing model: {str(e)}"

  def generate_speech(
-     model,
-     generation_mode,
      prompt_audio,
      prompt_text,
      target_text,
-     # Duration settings
-     duration_mode,
-     manual_duration,
-     dp_softmax_range,
-     dp_temperature,
-     # Teacher-student settings
-     teacher_steps,
-     teacher_stopping_time,
-     student_start_step,
      # Advanced settings
-     eta,
-     cfg_strength,
-     sway_coefficient,
-     # Teacher-guided specific
-     tg_switch_time,
-     tg_teacher_steps,
-     tg_student_steps
  ):
-     """Generate speech using the selected mode and parameters."""

-     if model is None:
-         return None, "Please initialize the model first!"

      if prompt_audio is None:
-         return None, "Please upload a reference audio!"

      if not target_text:
-         return None, "Please enter target text to generate!"

      try:
-         # Convert prompt_text to None if empty (for ASR)
-         prompt_text = prompt_text.strip() if prompt_text else None

-         # Determine duration
-         if duration_mode == "automatic":
-             duration = None
-         else:
-             duration = int(manual_duration)

-         # Generate based on selected mode
-         if generation_mode == "Student-Only (4 steps)":
-             # Standard DMOSpeech 2 generation
-             generated_wave = model.generate(
-                 gen_text=target_text,
-                 audio_path=prompt_audio,
-                 prompt_text=prompt_text,
-                 teacher_steps=0,  # No teacher guidance
-                 student_start_step=1,
-                 duration=duration,
-                 dp_softmax_range=dp_softmax_range,
-                 temperature=dp_temperature,
-                 eta=eta,
-                 cfg_strength=cfg_strength,
-                 sway_coefficient=sway_coefficient,
-                 verbose=True
-             )
-
-         elif generation_mode == "Teacher-Student Distillation":
-             # Full teacher-student distillation
-             generated_wave = model.generate(
-                 gen_text=target_text,
-                 audio_path=prompt_audio,
-                 prompt_text=prompt_text,
-                 teacher_steps=teacher_steps,
-                 teacher_stopping_time=teacher_stopping_time,
-                 student_start_step=student_start_step,
-                 duration=duration,
-                 dp_softmax_range=dp_softmax_range,
-                 temperature=dp_temperature,
-                 eta=eta,
-                 cfg_strength=cfg_strength,
-                 sway_coefficient=sway_coefficient,
-                 verbose=True
-             )
-
-         elif generation_mode == "Teacher-Only":
-             # Teacher-only generation
-             generated_wave = model.generate_teacher_only(
-                 gen_text=target_text,
-                 audio_path=prompt_audio,
-                 prompt_text=prompt_text,
-                 teacher_steps=teacher_steps,
-                 duration=duration,
-                 eta=eta,
-                 cfg_strength=cfg_strength,
-                 sway_coefficient=sway_coefficient
-             )
-
-         elif generation_mode == "Teacher-Guided Sampling":
-             # Implement teacher-guided sampling
-             # This would require implementing the teacher-guided sampling algorithm
-             # For now, we'll use the regular generation with specific parameters
-             total_teacher_steps = tg_teacher_steps
-
-             generated_wave = model.generate(
-                 gen_text=target_text,
-                 audio_path=prompt_audio,
-                 prompt_text=prompt_text,
-                 teacher_steps=total_teacher_steps,
-                 teacher_stopping_time=tg_switch_time,
-                 student_start_step=1,
-                 duration=duration,
-                 dp_softmax_range=dp_softmax_range,
-                 temperature=dp_temperature,
-                 eta=eta,
-                 cfg_strength=cfg_strength,
-                 sway_coefficient=sway_coefficient,
-                 verbose=True
-             )

-         # Save generated audio
          with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
              output_path = tmp_file.name

-         # Convert to tensor and save
-         if isinstance(generated_wave, np.ndarray):
-             generated_wave = torch.from_numpy(generated_wave)

-         if generated_wave.dim() == 1:
-             generated_wave = generated_wave.unsqueeze(0)

-         torchaudio.save(output_path, generated_wave, 24000)
-
-         return output_path, "Speech generated successfully!"
-
-     except Exception as e:
-         return None, f"Error generating speech: {str(e)}"
-
- def predict_duration_only(
-     model,
-     prompt_audio,
-     prompt_text,
-     target_text,
-     dp_softmax_range,
-     dp_temperature
- ):
-     """Predict duration for the target text."""
-     if model is None:
-         return "Please initialize the model first!"
-
-     if prompt_audio is None:
-         return "Please upload a reference audio!"
-
-     if not target_text:
-         return "Please enter target text!"
-
-     try:
-         prompt_text = prompt_text.strip() if prompt_text else None

-         predicted_duration = model.predict_duration(
-             pmt_wav_path=prompt_audio,
-             tar_text=target_text,
-             pmt_text=prompt_text,
-             dp_softmax_range=dp_softmax_range,
-             temperature=dp_temperature
-         )

-         return f"Predicted duration: {predicted_duration} frames (~{predicted_duration/100:.2f} seconds)"

      except Exception as e:
-         return f"Error predicting duration: {str(e)}"

  # Create Gradio interface
- with gr.Blocks(title="DMOSpeech 2: Advanced Zero-Shot TTS") as demo:
-     gr.Markdown("""
-     # DMOSpeech 2: Reinforcement Learning for Duration Prediction in Metric-Optimized Speech Synthesis

-     This demo showcases DMOSpeech 2, which features:
-     - **Direct metric optimization** for speaker similarity and intelligibility
-     - **RL-optimized duration prediction** for better speech quality
-     - **Teacher-guided sampling** for improved diversity
-     - **Efficient 4-step generation** while maintaining high quality
-     """)

-     # Model state
-     model_state = gr.State(None)

-     with gr.Tab("Model Setup"):
-         gr.Markdown("### Initialize Model")
-         with gr.Row():
-             student_checkpoint = gr.Textbox(
-                 label="Student Model Checkpoint Path",
-                 placeholder="/path/to/student_checkpoint.pt"
              )
-             duration_checkpoint = gr.Textbox(
-                 label="Duration Predictor Checkpoint Path",
-                 placeholder="/path/to/duration_predictor.pt"
-             )
-
-         with gr.Row():
-             model_type = gr.Dropdown(
-                 choices=["F5TTS_Base", "E2TTS_Base"],
-                 value="F5TTS_Base",
-                 label="Model Type"
              )
-             device = gr.Dropdown(
-                 choices=["cuda", "cpu"],
-                 value="cuda",
-                 label="Device"
              )
-             cuda_device_id = gr.Number(
-                 value=0,
-                 label="CUDA Device ID",
-                 precision=0
              )
-
-         init_button = gr.Button("Initialize Model", variant="primary")
-         init_status = gr.Textbox(label="Initialization Status", interactive=False)
-
-     with gr.Tab("Speech Generation"):
-         with gr.Row():
-             with gr.Column(scale=1):
-                 gr.Markdown("### Input Settings")
-
-                 prompt_audio = gr.Audio(
-                     label="Reference Audio",
-                     type="filepath",
-                     sources=["upload", "microphone"]
-                 )
-
-                 prompt_text = gr.Textbox(
-                     label="Reference Text (optional - will use ASR if empty)",
-                     placeholder="The text spoken in the reference audio..."
-                 )
-
-                 target_text = gr.Textbox(
-                     label="Target Text to Generate",
-                     placeholder="Enter the text you want to synthesize...",
-                     lines=3
-                 )
-
-                 generation_mode = gr.Radio(
-                     choices=[
-                         "Student-Only (4 steps)",
-                         "Teacher-Student Distillation",
-                         "Teacher-Only",
-                         "Teacher-Guided Sampling"
-                     ],
-                     value="Student-Only (4 steps)",
-                     label="Generation Mode"
-                 )
-
-             with gr.Column(scale=1):
-                 gr.Markdown("### Duration Settings")
-
-                 duration_mode = gr.Radio(
-                     choices=["automatic", "manual"],
-                     value="automatic",
-                     label="Duration Mode"
-                 )
-
-                 manual_duration = gr.Slider(
-                     minimum=100,
-                     maximum=3000,
-                     value=500,
-                     step=10,
-                     label="Manual Duration (frames)",
-                     visible=False
-                 )
-
-                 dp_softmax_range = gr.Slider(
-                     minimum=0.1,
-                     maximum=1.0,
-                     value=0.7,
-                     step=0.1,
-                     label="Duration Predictor Softmax Range"
-                 )

-                 dp_temperature = gr.Slider(
                      minimum=0.0,
                      maximum=2.0,
                      value=0.0,
                      step=0.1,
-                     label="Duration Predictor Temperature (0=argmax)"
-                 )
-
-                 predict_duration_btn = gr.Button("Predict Duration Only")
-                 duration_output = gr.Textbox(label="Predicted Duration", interactive=False)
-
-         with gr.Accordion("Advanced Settings", open=False):
-             with gr.Tab("Teacher-Student Settings"):
-                 teacher_steps = gr.Slider(
-                     minimum=0,
-                     maximum=32,
-                     value=16,
-                     step=1,
-                     label="Teacher Steps"
-                 )
-
-                 teacher_stopping_time = gr.Slider(
-                     minimum=0.0,
-                     maximum=1.0,
-                     value=0.07,
-                     step=0.01,
-                     label="Teacher Stopping Time"
-                 )
-
-                 student_start_step = gr.Slider(
-                     minimum=1,
-                     maximum=4,
-                     value=1,
-                     step=1,
-                     label="Student Start Step"
-                 )
-
-             with gr.Tab("Sampling Settings"):
-                 eta = gr.Slider(
-                     minimum=0.0,
-                     maximum=1.0,
-                     value=1.0,
-                     step=0.1,
-                     label="Eta (Stochasticity: 0=DDIM, 1=DDPM)"
-                 )
-
-                 cfg_strength = gr.Slider(
-                     minimum=0.0,
-                     maximum=5.0,
-                     value=2.0,
-                     step=0.1,
-                     label="CFG Strength"
-                 )
-
-                 sway_coefficient = gr.Slider(
-                     minimum=-2.0,
-                     maximum=2.0,
-                     value=-1.0,
-                     step=0.1,
-                     label="Sway Sampling Coefficient"
-                 )
-
-             with gr.Tab("Teacher-Guided Settings"):
-                 tg_switch_time = gr.Slider(
-                     minimum=0.1,
-                     maximum=0.5,
-                     value=0.25,
-                     step=0.05,
-                     label="Switch Time (when to transition to student)"
-                 )
-
-                 tg_teacher_steps = gr.Slider(
-                     minimum=6,
-                     maximum=20,
-                     value=14,
-                     step=1,
-                     label="Teacher Steps"
                  )

-                 tg_student_steps = gr.Slider(
-                     minimum=1,
-                     maximum=4,
-                     value=2,
-                     step=1,
-                     label="Student Steps"
                  )
-
-         generate_button = gr.Button("Generate Speech", variant="primary")
-
-         with gr.Row():
-             output_audio = gr.Audio(label="Generated Speech", type="filepath")
-             generation_status = gr.Textbox(label="Generation Status", interactive=False)

-     with gr.Tab("Examples & Info"):
-         gr.Markdown("""
-         ### Usage Tips:
-
-         1. **Generation Modes:**
-            - **Student-Only (4 steps)**: Fastest, uses the distilled model with direct metric optimization
-            - **Teacher-Student Distillation**: Uses teacher guidance for initial steps
-            - **Teacher-Only**: Full quality but slower (32 steps)
-            - **Teacher-Guided Sampling**: Best balance of quality and diversity
-
-         2. **Duration Settings:**
-            - **Automatic**: Uses RL-optimized duration predictor
-            - **Manual**: Specify exact duration in frames (100 frames ≈ 1 second)
-
-         3. **Advanced Parameters:**
-            - **Eta**: Controls sampling stochasticity (0 = deterministic, 1 = fully stochastic)
-            - **CFG Strength**: Higher values = stronger adherence to text
-            - **Sway Coefficient**: Negative values focus on early denoising steps
-
-         ### Key Features:
-         - ✅ 5× faster than teacher model
-         - ✅ Better WER and speaker similarity
-         - ✅ RL-optimized duration prediction
-         - ✅ Maintains prosodic diversity with teacher-guided sampling
-         """)

-     # Event handlers
-     duration_mode.change(
-         lambda x: gr.update(visible=(x == "manual")),
-         inputs=[duration_mode],
-         outputs=[manual_duration]
-     )

-     init_button.click(
-         lambda sc, dc, mt, d, cid: initialize_model(sc, dc, mt, d, cid),
-         inputs=[student_checkpoint, duration_checkpoint, model_type, device, cuda_device_id],
-         outputs=[model_state, init_status]
-     )

-     generate_button.click(
          generate_speech,
          inputs=[
-             model_state,
-             generation_mode,
              prompt_audio,
              prompt_text,
              target_text,
-             duration_mode,
-             manual_duration,
-             dp_softmax_range,
-             dp_temperature,
-             teacher_steps,
-             teacher_stopping_time,
-             student_start_step,
-             eta,
-             cfg_strength,
-             sway_coefficient,
-             tg_switch_time,
-             tg_teacher_steps,
-             tg_student_steps
          ],
-         outputs=[output_audio, generation_status]
      )

-     predict_duration_btn.click(
-         predict_duration_only,
-         inputs=[
-             model_state,
-             prompt_audio,
-             prompt_text,
-             target_text,
-             dp_softmax_range,
-             dp_temperature
-         ],
-         outputs=[duration_output]
      )

  if __name__ == "__main__":
-     demo.launch(share=True)
  import torch
  import torchaudio
  import numpy as np
  import tempfile
+ import time
+ from pathlib import Path
+ from huggingface_hub import hf_hub_download
+ import os

+ # Import the inference module (assuming it's named 'infer.py' based on the notebook)
  from infer import DMOInference

+ # Global model instance
+ model = None
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ def download_models():
+     """Download models from HuggingFace Hub."""
+     try:
+         print("Downloading models from HuggingFace...")
+
+         # Download student model
+         student_path = hf_hub_download(
+             repo_id="yl4579/DMOSpeech2",
+             filename="model_85000.pt",
+             cache_dir="./models"
+         )
+
+         # Download duration predictor
+         duration_path = hf_hub_download(
+             repo_id="yl4579/DMOSpeech2",
+             filename="model_1500.pt",
+             cache_dir="./models"
+         )
+
+         print(f"Student model: {student_path}")
+         print(f"Duration model: {duration_path}")
+
+         return student_path, duration_path
+
+     except Exception as e:
+         print(f"Error downloading models: {e}")
+         return None, None
+
+ def initialize_model():
+     """Initialize the model on startup."""
+     global model
+
      try:
+         # Download models
+         student_path, duration_path = download_models()
+
+         if not student_path or not duration_path:
+             return False, "Failed to download models from HuggingFace"
+
+         # Initialize model
          model = DMOInference(
+             student_checkpoint_path=student_path,
+             duration_predictor_path=duration_path,
              device=device,
+             model_type="F5TTS_Base"
          )
+
+         return True, f"Model loaded successfully on {device.upper()}"
+
      except Exception as e:
+         return False, f"Error initializing model: {str(e)}"
+
+ # Initialize model on startup
+ model_loaded, status_message = initialize_model()

  def generate_speech(
      prompt_audio,
      prompt_text,
      target_text,
+     mode,
      # Advanced settings
+     custom_teacher_steps,
+     custom_teacher_stopping_time,
+     custom_student_start_step,
+     temperature,
+     verbose
  ):
+     """Generate speech with different configurations."""

+     if not model_loaded or model is None:
+         return None, "Model not loaded! Please refresh the page.", "", ""

      if prompt_audio is None:
+         return None, "Please upload a reference audio!", "", ""

      if not target_text:
+         return None, "Please enter text to generate!", "", ""

      try:
+         start_time = time.time()

+         # Configure parameters based on mode
+         if mode == "Student Only (4 steps)":
+             teacher_steps = 0
+             student_start_step = 0
+             teacher_stopping_time = 1.0
+         elif mode == "Teacher-Guided (8 steps)":
+             # Default configuration from the notebook
+             teacher_steps = 16
+             teacher_stopping_time = 0.07
+             student_start_step = 1
+         elif mode == "High Diversity (16 steps)":
+             teacher_steps = 24
+             teacher_stopping_time = 0.3
+             student_start_step = 2
+         else:  # Custom
+             teacher_steps = custom_teacher_steps
+             teacher_stopping_time = custom_teacher_stopping_time
+             student_start_step = custom_student_start_step

+         # Generate speech
+         generated_audio = model.generate(
+             gen_text=target_text,
+             audio_path=prompt_audio,
+             prompt_text=prompt_text if prompt_text else None,
+             teacher_steps=teacher_steps,
+             teacher_stopping_time=teacher_stopping_time,
+             student_start_step=student_start_step,
+             temperature=temperature,
+             verbose=verbose
+         )

+         end_time = time.time()
+
+         # Calculate metrics
+         processing_time = end_time - start_time
+         audio_duration = generated_audio.shape[-1] / 24000
+         rtf = processing_time / audio_duration
+
+         # Save audio
          with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
              output_path = tmp_file.name

+         if isinstance(generated_audio, np.ndarray):
+             generated_audio = torch.from_numpy(generated_audio)

+         if generated_audio.dim() == 1:
+             generated_audio = generated_audio.unsqueeze(0)

+         torchaudio.save(output_path, generated_audio, 24000)

+         # Format metrics
+         metrics = f"RTF: {rtf:.2f}x ({1/rtf:.2f}x speed) | Processing: {processing_time:.2f}s for {audio_duration:.2f}s audio"

+         return output_path, "Success!", metrics, f"Mode: {mode}"

      except Exception as e:
+         return None, f"Error: {str(e)}", "", ""

  # Create Gradio interface
+ with gr.Blocks(title="DMOSpeech 2 - Zero-Shot TTS", theme=gr.themes.Soft()) as demo:
+     gr.Markdown(f"""
+     # 🎙️ DMOSpeech 2: Zero-Shot Text-to-Speech

+     Generate natural speech in any voice with just a short reference audio!

+     **Model Status:** {status_message} | **Device:** {device.upper()}
+     """)

+     with gr.Row():
+         with gr.Column(scale=1):
+             # Reference audio input
+             prompt_audio = gr.Audio(
+                 label="📎 Reference Audio",
+                 type="filepath",
+                 sources=["upload", "microphone"]
              )
+
+             prompt_text = gr.Textbox(
+                 label="📝 Reference Text (optional - will auto-transcribe if empty)",
+                 placeholder="The text spoken in the reference audio...",
+                 lines=2
              )
+
+             target_text = gr.Textbox(
+                 label="✍️ Text to Generate",
+                 placeholder="Enter the text you want to synthesize...",
+                 lines=4
              )
+
+             # Generation mode
+             mode = gr.Radio(
+                 choices=[
+                     "Student Only (4 steps)",
+                     "Teacher-Guided (8 steps)",
+                     "High Diversity (16 steps)",
+                     "Custom"
+                 ],
+                 value="Teacher-Guided (8 steps)",
+                 label="🚀 Generation Mode",
+                 info="Choose speed vs quality/diversity tradeoff"
              )
+
+             # Advanced settings (collapsible)
+             with gr.Accordion("⚙️ Advanced Settings", open=False):
+                 with gr.Row():
+                     custom_teacher_steps = gr.Slider(
+                         minimum=0,
+                         maximum=32,
+                         value=16,
+                         step=1,
+                         label="Teacher Steps",
+                         info="More steps = higher quality"
+                     )
+
+                     custom_teacher_stopping_time = gr.Slider(
+                         minimum=0.0,
+                         maximum=1.0,
+                         value=0.07,
+                         step=0.01,
+                         label="Teacher Stopping Time",
+                         info="When to switch to student"
+                     )
+
+                     custom_student_start_step = gr.Slider(
+                         minimum=0,
+                         maximum=4,
+                         value=1,
+                         step=1,
+                         label="Student Start Step",
+                         info="Which student step to start from"
+                     )

+                 temperature = gr.Slider(
                      minimum=0.0,
                      maximum=2.0,
                      value=0.0,
                      step=0.1,
+                     label="Duration Temperature",
+                     info="0 = deterministic, >0 = more variation in speech rhythm"
                  )

+                 verbose = gr.Checkbox(
+                     value=False,
+                     label="Verbose Output",
+                     info="Show detailed generation steps"
                  )
+
+             generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
+
+         with gr.Column(scale=1):
+             # Output
+             output_audio = gr.Audio(
+                 label="🔊 Generated Speech",
+                 type="filepath",
+                 autoplay=True
+             )
+
+             status = gr.Textbox(
+                 label="Status",
+                 interactive=False
+             )
+
+             metrics = gr.Textbox(
+                 label="Performance Metrics",
+                 interactive=False
+             )
+
+             info = gr.Textbox(
+                 label="Generation Info",
+                 interactive=False
+             )
+
+             # Tips
+             gr.Markdown("""
+             ### 💡 Quick Tips:
+
+             - **Student Only**: Fastest (4 steps), good quality
+             - **Teacher-Guided**: Best balance (8 steps), recommended
+             - **High Diversity**: More natural prosody (16 steps)
+             - **Temperature**: Add randomness to speech rhythm
+
+             ### 📊 Expected RTF (Real-Time Factor):
+             - Student Only: ~0.05x (20x faster than real-time)
+             - Teacher-Guided: ~0.10x (10x faster)
+             - High Diversity: ~0.20x (5x faster)
+             """)

+     # Examples section
+     gr.Markdown("### 🎯 Examples")

+     examples = [
+         [
+             None,  # Will be replaced with actual audio path
+             "Some call me nature, others call me mother nature.",
+             "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring.",
+             "Teacher-Guided (8 steps)",
+             16, 0.07, 1, 0.0, False
+         ],
+         [
+             None,  # Will be replaced with actual audio path
+             "对,这就是我,万人敬仰的太乙真人。",
+             '突然,身边一阵笑声。我看着他们,意气风发地挺直了胸膛,甩了甩那稍显肉感的双臂,轻笑道:"我身上的肉,是为了掩饰我爆棚的魅力,否则,岂不吓坏了你们呢?"',
+             "Teacher-Guided (8 steps)",
+             16, 0.07, 1, 0.0, False
+         ],
+         [
+             None,
+             "对,这就是我,万人敬仰的太乙真人。",
+             '突然,身边一阵笑声。我看着他们,意气风发地挺直了胸膛,甩了甩那稍显肉感的双臂,轻笑道:"我身上的肉,是为了掩饰我爆棚的魅力,否则,岂不吓坏了你们呢?"',
+             "High Diversity (16 steps)",
+             24, 0.3, 2, 0.8, False
+         ]
+     ]

+     # Note about example audio files
+     gr.Markdown("""
+     *Note: Example audio files should be uploaded to the Space. The examples above show the text configurations used in the original notebook.*
+     """)

+     # Event handler
+     generate_btn.click(
          generate_speech,
          inputs=[
              prompt_audio,
              prompt_text,
              target_text,
+             mode,
+             custom_teacher_steps,
+             custom_teacher_stopping_time,
+             custom_student_start_step,
+             temperature,
+             verbose
          ],
+         outputs=[output_audio, status, metrics, info]
      )

+     # Update visibility of custom settings based on mode
+     def update_custom_visibility(mode):
+         return gr.update(visible=(mode == "Custom"))
+
+     mode.change(
+         lambda x: [gr.update(interactive=(x == "Custom"))] * 3,
+         inputs=[mode],
+         outputs=[custom_teacher_steps, custom_teacher_stopping_time, custom_student_start_step]
      )

+ # Launch the app
  if __name__ == "__main__":
+     if not model_loaded:
+         print(f"Warning: Model failed to load - {status_message}")
+
+     demo.launch()
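
For reference, a minimal sketch of driving the same inference path outside Gradio, using only the calls the updated app.py makes (hf_hub_download, DMOInference, model.generate). The repo id, checkpoint filenames, generate() keyword arguments, and 24 kHz sample rate are taken from this diff; the prompt file, output path, and example text are placeholders, and it assumes infer.py from this repo is importable:

    # Standalone sketch of the app's "Teacher-Guided (8 steps)" preset.
    import torch
    import torchaudio
    from huggingface_hub import hf_hub_download
    from infer import DMOInference

    student = hf_hub_download("yl4579/DMOSpeech2", "model_85000.pt", cache_dir="./models")
    duration = hf_hub_download("yl4579/DMOSpeech2", "model_1500.pt", cache_dir="./models")

    model = DMOInference(
        student_checkpoint_path=student,
        duration_predictor_path=duration,
        device="cuda" if torch.cuda.is_available() else "cpu",
        model_type="F5TTS_Base",
    )

    wav = model.generate(
        gen_text="Hello from DMOSpeech 2.",   # placeholder text
        audio_path="prompt.wav",              # placeholder reference audio
        prompt_text=None,                     # None -> auto-transcribe, per the app
        teacher_steps=16,
        teacher_stopping_time=0.07,
        student_start_step=1,
        temperature=0.0,
        verbose=False,
    )

    # Same tensor handling and sample rate as generate_speech() in app.py.
    if not torch.is_tensor(wav):
        wav = torch.from_numpy(wav)
    torchaudio.save("out.wav", wav.unsqueeze(0) if wav.dim() == 1 else wav, 24000)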
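The RTF figures quoted in the Quick Tips follow from the metric computed in generate_speech(): RTF = processing time / audio duration, and the quoted speed-up is its reciprocal. A worked example with hypothetical timings that match the Teacher-Guided estimate:

    processing_time = 0.8                    # seconds spent generating (hypothetical)
    audio_duration = 8.0                     # seconds of audio produced (hypothetical)
    rtf = processing_time / audio_duration   # 0.10 -> reported as "RTF: 0.10x"
    speedup = 1 / rtf                        # 10.0 -> "(10.00x speed)", i.e. 10x faster than real time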