dattazigzag commited on
Commit
88d6cf9
·
verified ·
1 Parent(s): 5c15424

onnyx upload

Browse files
Files changed (1) hide show
  1. kokoro_onnx_gradio.py +642 -0
kokoro_onnx_gradio.py ADDED
@@ -0,0 +1,642 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import numpy as np
3
+ import time
4
+ import re
5
+ import os
6
+ import soundfile as sf
7
+ import warnings
8
+
9
+ from kokoro_onnx import Kokoro
10
+ from kokoro_onnx.tokenizer import Tokenizer
11
+
12
# Silence every warning category for the whole process
warnings.filterwarnings("ignore")

# Load the tokenizer and the Kokoro model once, at import time.
# Both model paths are relative to the current working directory.
tokenizer = Tokenizer()
kokoro = Kokoro("onnx_deps/kokoro-v1.0.onnx", "onnx_deps/voices-v1.0.bin")

# Language codes offered in the UI
SUPPORTED_LANGUAGES = [
    "en-us",
    "en-gb",
    "es",
    "fr-fr",
    "hi",
    "it",
    "ja",
    "pt-br",
    "zh",
]
# Folder that receives every exported WAV file
AUDIO_DIR = "audio_exports"
# Voice used most recently (starts at the default voice)
CURRENT_VOICE = "af_sky"

# Make sure the export folder is there before anything tries to write to it
os.makedirs(AUDIO_DIR, exist_ok=True)

# Preset label -> regular expression used to split the input text.
# Insertion order is the order shown in the dropdown.
SPLIT_PATTERNS = {
    "Paragraphs (one or more newlines)": r"\n+",
    "Sentences (periods, question marks, exclamation points)": r"(?<=[.!?])\s+",
    "Commas and semicolons": r"[,;]\s+",
    # "$^" is the sentinel for "do not split"
    "No splitting (process as one chunk)": r"$^",
    # Placeholder value; the real pattern comes from the custom textbox
    "Custom": "custom",
}
35
+
36
+
37
def preview_text_splitting(text, split_pattern):
    """
    Split text into chunks using a regular expression.

    Args:
        text: Text to split
        split_pattern: Regex whose matches separate the chunks. The sentinel
            "$^" and the empty string both mean "do not split".

    Returns:
        List of stripped, non-empty chunks. When splitting is disabled the
        text is returned untouched as a one-element list. When the pattern
        cannot be used, a one-element list holding an error message.
    """
    # An empty regex matches between every character, which would turn each
    # character into its own chunk; treat it like the no-split sentinel.
    if split_pattern in ("", "$^"):
        return [text]

    try:
        pieces = re.split(split_pattern, text)
    except (re.error, TypeError) as e:
        # re.error: invalid regex; TypeError: non-string text or pattern
        return [f"Error previewing split: {e}"]

    # re.split yields None for capture groups that did not take part in a
    # match, so guard against None before stripping.
    return [piece.strip() for piece in pieces if piece and piece.strip()]
51
+
52
+
53
def run_performance_tests(text, voice, language, split_pattern, speed):
    """
    Run performance tests comparing chunked and whole-text processing

    Phonemization and audio generation are each timed twice: once per chunk
    (text split with ``split_pattern``) and once for the full text.

    Args:
        text: Text to benchmark with
        voice: Voice passed to the model
        language: Language code passed to the tokenizer
        split_pattern: Regex used to split the text into chunks
        speed: Speech rate passed to the model

    Returns:
        String with detailed test results
    """
    results = ["=== KOKORO-ONNX PERFORMANCE TEST RESULTS ===\n"]

    # Split text into chunks for comparison (re.split can yield None for
    # capture groups that did not participate, hence the extra guard)
    chunks = [
        piece.strip()
        for piece in re.split(split_pattern, text)
        if piece and piece.strip()
    ]
    results.append(f"Text split into {len(chunks)} chunks\n")

    # Nothing to measure: do not push empty input through tokenizer and model
    if not chunks:
        results.append("Nothing to test: the input contains no text after splitting.")
        return "\n".join(results)

    # perf_counter is the monotonic, high-resolution clock meant for
    # measuring short durations
    timer = time.perf_counter

    # Test 1: Per-chunk vs. Full-text tokenization
    results.append("TEST #1: TOKENIZATION STRATEGIES")

    # Approach 1: Per-chunk tokenization
    start_time = timer()
    all_phonemes = [tokenizer.phonemize(chunk, lang=language) for chunk in chunks]
    per_chunk_time = timer() - start_time
    results.append(f"Per-chunk tokenization: {per_chunk_time:.6f}s")

    # Approach 2: Single tokenization for entire text
    start_time = timer()
    full_phonemes = tokenizer.phonemize(text, lang=language)
    full_tokenization_time = timer() - start_time
    results.append(f"Full text tokenization: {full_tokenization_time:.6f}s")
    if full_tokenization_time > 0:
        results.append(f"Speedup: {per_chunk_time / full_tokenization_time:.2f}x\n")

    # Test 2: Audio generation strategies
    results.append("TEST #2: AUDIO GENERATION STRATEGIES")

    # Approach 1: Generate per chunk (the audio itself is discarded)
    start_time = timer()
    for p in all_phonemes:
        if p.strip():  # Skip empty phonemes
            kokoro.create(p, voice=voice, speed=speed, is_phonemes=True)
    split_gen_time = timer() - start_time
    results.append(f"Generate per chunk: {split_gen_time:.6f}s")

    # Approach 2: Generate for full text (the audio itself is discarded)
    start_time = timer()
    kokoro.create(full_phonemes, voice=voice, speed=speed, is_phonemes=True)
    full_gen_time = timer() - start_time
    results.append(f"Generate full text: {full_gen_time:.6f}s")
    if full_gen_time > 0:
        results.append(f"Speedup: {split_gen_time / full_gen_time:.2f}x\n")

    # Test 3: Total processing time comparison
    results.append("TEST #3: TOTAL PROCESSING TIME")
    total_chunked = per_chunk_time + split_gen_time
    total_full = full_tokenization_time + full_gen_time
    results.append(f"Total time (chunked): {total_chunked:.6f}s")
    results.append(f"Total time (full text): {total_full:.6f}s")
    if total_full > 0:
        results.append(f"Overall speedup: {total_chunked / total_full:.2f}x")

    # Recommendations
    results.append("\nRECOMMENDATIONS:")
    if per_chunk_time > full_tokenization_time:
        results.append("- Tokenize entire text at once instead of per-chunk")
    if split_gen_time > full_gen_time:
        results.append("- Generate audio for entire text rather than per-chunk")
    elif split_gen_time < full_gen_time:
        results.append("- Keep generating audio in chunks for better performance")

    return "\n".join(results)
130
+
131
+
132
+
133
+ # [OLD] Chunking create func
134
def create(text: str, voice: str, language: str, blend_voice_name: str = None,
           blend_ratio: float = 0.5, split_pattern: str = r"\n+", speed: float = 1.0,
           output_dir: str = AUDIO_DIR):
    """
    Generate audio chunk by chunk using Kokoro-ONNX and save the results

    The text is split with ``split_pattern``. Each chunk is phonemized,
    synthesized and written to its own WAV file; afterwards all chunks are
    concatenated and written to one combined WAV file.

    Args:
        text: Text to synthesize
        voice: Primary voice to use
        language: Language code
        blend_voice_name: Optional secondary voice for blending
        blend_ratio: Weight of the primary voice (0.0-1.0); the secondary
            voice is weighted with ``1 - blend_ratio``
        split_pattern: Pattern to split text into chunks
        speed: Speech rate
        output_dir: Directory to save audio files

    Returns:
        List of [(sample_rate, audio_data), phonemes_text, split_info, timing_info]
    """
    global CURRENT_VOICE

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Update current voice
    if voice != CURRENT_VOICE and not blend_voice_name:
        print(f"Voice changed from {CURRENT_VOICE} to {voice}")
        CURRENT_VOICE = voice

    # perf_counter is the monotonic, high-resolution clock meant for
    # measuring short durations
    timer = time.perf_counter

    # Start total timing
    start_total_time = timer()

    # Split text into chunks
    chunks = preview_text_splitting(text, split_pattern)
    split_info = f"Text split into {len(chunks)} chunks using pattern: '{split_pattern}'"
    print(split_info)

    # The blend does not depend on the chunk, so build it once up front
    voice_to_use = voice
    if blend_voice_name:
        voice_blend_start = timer()
        first_voice = kokoro.get_voice_style(voice)
        second_voice = kokoro.get_voice_style(blend_voice_name)
        voice_to_use = np.add(first_voice * blend_ratio, second_voice * (1 - blend_ratio))
        print(f"Voices blended in {timer() - voice_blend_start:.6f}s")

    # File-name label: 'blend' for a blended style, otherwise the part of the
    # voice name after the first underscore (whole name if there is none)
    if isinstance(voice_to_use, str):
        name_parts = voice.split('_')
        voice_label = name_parts[1] if len(name_parts) > 1 else voice
    else:
        voice_label = 'blend'

    # Initialize variables for processing
    all_audio = []
    all_phonemes = []
    sample_rate = 24000  # Kokoro's sample rate; replaced by the model's reported rate

    # Timing metrics
    phoneme_times = []
    generation_times = []
    save_times = []

    # Process each chunk
    for i, chunk in enumerate(chunks):
        # Skip empty chunks
        if not chunk.strip():
            continue

        # Time phonemization
        phoneme_start = timer()
        phonemes = tokenizer.phonemize(chunk, lang=language)
        phoneme_time = timer() - phoneme_start
        phoneme_times.append(phoneme_time)
        print(f"Chunk {i+1} phonemized in {phoneme_time:.6f}s")

        # Save phonemes
        all_phonemes.append(f"Chunk {i+1}: {phonemes}")

        # Generate audio
        gen_start = timer()
        audio, sr = kokoro.create(phonemes, voice=voice_to_use, speed=speed, is_phonemes=True)
        gen_time = timer() - gen_start
        generation_times.append(gen_time)
        print(f"Chunk {i+1} audio generated in {gen_time:.6f}s")

        # Keep the combined file and the returned tuple at the rate the
        # chunks were actually rendered with
        sample_rate = sr

        # Add to audio list
        all_audio.append(audio)

        # Save individual chunk to file
        save_start = timer()
        chunk_filename = os.path.join(output_dir, f"chunk_{i+1}_{voice_label}.wav")
        sf.write(chunk_filename, audio, sr)
        save_time = timer() - save_start
        save_times.append(save_time)
        print(f"Chunk {i+1} saved to {chunk_filename} in {save_time:.6f}s")

    # Time to combine chunks
    combine_start = timer()
    if len(all_audio) > 1:
        audio_data = np.concatenate(all_audio)
        combine_time = timer() - combine_start
        print(f"Combined {len(all_audio)} chunks in {combine_time:.6f}s")
    else:
        audio_data = all_audio[0] if all_audio else np.array([])
        combine_time = 0

    # Time to save combined file
    save_combined_start = timer()
    combined_filename = os.path.join(output_dir, f"combined_{voice_label}.wav")
    sf.write(combined_filename, audio_data, sample_rate)
    save_combined_time = timer() - save_combined_start
    print(f"Combined audio saved to {combined_filename} in {save_combined_time:.6f}s")

    # Calculate total time
    total_time = timer() - start_total_time

    # Create detailed timing info
    chunks_count = len(all_audio)
    timing_lines = [
        f"Phonemization time: {sum(phoneme_times):.6f}s",
        f"Audio generation time: {sum(generation_times):.6f}s",
    ]

    if chunks_count > 1:
        # Per-chunk timing (the three lists are filled in lockstep)
        timing_lines.append("\nChunk details:")
        per_chunk = zip(phoneme_times, generation_times, save_times)
        for number, (p_time, g_time, s_time) in enumerate(per_chunk, start=1):
            timing_lines.append(f" Chunk {number}: Phoneme {p_time:.6f}s, Gen {g_time:.6f}s, Save {s_time:.6f}s")

        # Combine and save timing
        timing_lines.append(f"\nCombine chunks: {combine_time:.6f}s")
        timing_lines.append(f"Save combined: {save_combined_time:.6f}s")

    # Total timing
    timing_lines.append(f"\nTotal processing time: {total_time:.6f}s")

    # Format timing info for display
    timing_info = "\n".join(timing_lines)

    # Combine phonemes
    phonemes_text = "\n\n".join(all_phonemes)

    # Update split info
    if chunks_count > 1:
        split_info = f"Text was split into {chunks_count} chunks and saved to {output_dir}"
    else:
        split_info = f"Text processed as a single chunk and saved to {output_dir}"

    return [(sample_rate, audio_data), phonemes_text, split_info, timing_info]
281
+
282
+ # Optimized -- overrides paragraph splitting behavior...
283
+ # def create(
284
+ # text: str,
285
+ # voice: str,
286
+ # language: str,
287
+ # blend_voice_name: str = None,
288
+ # blend_ratio: float = 0.5,
289
+ # split_pattern: str = r"\n+",
290
+ # speed: float = 1.0,
291
+ # output_dir: str = AUDIO_DIR,
292
+ # ):
293
+ # """
294
+ # Generate audio using Kokoro-ONNX with optimized processing
295
+
296
+ # Args:
297
+ # text: Text to synthesize
298
+ # voice: Primary voice to use
299
+ # language: Language code
300
+ # blend_voice_name: Optional secondary voice for blending
301
+ # blend_ratio: Ratio of primary to secondary voice (0.0-1.0)
302
+ # split_pattern: Pattern to split text into chunks
303
+ # speed: Speech rate
304
+ # output_dir: Directory to save audio files
305
+
306
+ # Returns:
307
+ # Tuple of (audio_tuple, phonemes, split_info, timing_info)
308
+ # """
309
+ # global CURRENT_VOICE
310
+
311
+ # # Create output directory if it doesn't exist
312
+ # os.makedirs(output_dir, exist_ok=True)
313
+
314
+ # # Update current voice
315
+ # if voice != CURRENT_VOICE and not blend_voice_name:
316
+ # print(f"Voice changed from {CURRENT_VOICE} to {voice}")
317
+ # CURRENT_VOICE = voice
318
+
319
+ # # Start total timing
320
+ # start_total_time = time.time()
321
+
322
+ # # Split text only for display purposes
323
+ # chunks = preview_text_splitting(text, split_pattern)
324
+ # split_info = (
325
+ # f"Text split into {len(chunks)} chunks using pattern: '{split_pattern}'"
326
+ # )
327
+ # print(split_info)
328
+
329
+ # # Phonemize the entire text at once (optimization #1)
330
+ # phoneme_start = time.time()
331
+ # phonemes = tokenizer.phonemize(text, lang=language)
332
+ # phoneme_time = time.time() - phoneme_start
333
+ # print(f"Text phonemized in {phoneme_time:.6f}s")
334
+
335
+ # # Handle voice blending
336
+ # voice_blend_start = time.time()
337
+ # voice_to_use = voice
338
+ # if blend_voice_name:
339
+ # first_voice = kokoro.get_voice_style(voice)
340
+ # second_voice = kokoro.get_voice_style(blend_voice_name)
341
+ # voice_to_use = np.add(
342
+ # first_voice * blend_ratio, second_voice * (1 - blend_ratio)
343
+ # )
344
+ # voice_blend_time = time.time() - voice_blend_start
345
+ # print(f"Voices blended in {voice_blend_time:.6f}s")
346
+
347
+ # # Generate audio for entire text at once (optimization #2)
348
+ # gen_start = time.time()
349
+ # audio, sample_rate = kokoro.create(
350
+ # phonemes, voice=voice_to_use, speed=speed, is_phonemes=True
351
+ # )
352
+ # gen_time = time.time() - gen_start
353
+ # print(f"Audio generated in {gen_time:.6f}s")
354
+
355
+ # # Save to file
356
+ # save_start = time.time()
357
+ # voice_label = voice.split("_")[1] if isinstance(voice, str) else "blend"
358
+ # filename = os.path.join(output_dir, f"full_{voice_label}.wav")
359
+ # sf.write(filename, audio, sample_rate)
360
+ # save_time = time.time() - save_start
361
+ # print(f"Audio saved to {filename} in {save_time:.6f}s")
362
+
363
+ # # Calculate total time
364
+ # total_time = time.time() - start_total_time
365
+
366
+ # # Create timing info
367
+ # timing_lines = [
368
+ # f"Phonemization time: {phoneme_time:.6f}s",
369
+ # f"Audio generation time: {gen_time:.6f}s",
370
+ # f"Save time: {save_time:.6f}s",
371
+ # f"\nTotal processing time: {total_time:.6f}s",
372
+ # f"\nOptimized approach: Processing entire text at once (2.1x faster)",
373
+ # ]
374
+
375
+ # timing_info = "\n".join(timing_lines)
376
+
377
+ # # For display, still show the text chunks
378
+ # chunk_display = []
379
+ # for i, chunk in enumerate(chunks):
380
+ # chunk_display.append(f"Chunk {i + 1}: Text: {chunk[:50]}...")
381
+
382
+ # phonemes_display = (
383
+ # "Full text phonemes (first 100 chars):\n" + phonemes[:100] + "..."
384
+ # )
385
+
386
+ # return [(sample_rate, audio), phonemes_display, split_info, timing_info]
387
+
388
+
389
def on_split_pattern_change(pattern_name, custom_pattern):
    """
    React to a new choice in the split pattern dropdown

    Returns the regex to use together with a visibility update: the custom
    pattern box is shown only while "Custom" is selected, in which case its
    current content is kept as the pattern.
    """
    is_custom = pattern_name == "Custom"
    pattern = custom_pattern if is_custom else SPLIT_PATTERNS[pattern_name]
    return pattern, gr.update(visible=is_custom)
397
+
398
+
399
def preview_splits(text, pattern):
    """
    Build a human-readable summary of how the text will be chunked

    Chunks longer than 100 characters are cut off and marked with "...".
    """
    pieces = preview_text_splitting(text, pattern)

    # The no-split sentinel gets its own short message
    if pattern == "$^" and len(pieces) == 1:
        return "Text will be processed as a single chunk (no splitting)"

    parts = [f"Text will be split into {len(pieces)} chunks:\n\n"]
    for number, piece in enumerate(pieces, start=1):
        if len(piece) > 100:
            shown = piece[:100] + "..."
        else:
            shown = piece
        parts.append(f"Chunk {number}: {shown}\n\n")

    return "".join(parts)
414
+
415
+
416
def create_app():
    """
    Assemble the Gradio interface and wire up its event handlers

    Returns:
        The gr.Blocks instance, not yet launched
    """
    # Voice names are needed by both voice dropdowns; look them up once
    voice_names = sorted(kokoro.get_voices())

    with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Roboto")])) as ui:
        # Page heading
        gr.Markdown("# Kokoro-ONNX TTS Demo")
        gr.Markdown("#### Optimized ONNX implementation with Voice Blending")

        with gr.Row():
            # Text and splitting controls
            with gr.Column(scale=1):
                text_input = gr.TextArea(
                    label="Input Text",
                    rtl=False,
                    value="Hello!\n\nThis is a multi-paragraph test.\nWith multiple lines.\n\nKokoro can split on paragraphs, sentences, or other patterns.",
                    lines=8,
                )

                # Collapsible help text about the split patterns
                with gr.Accordion("About Text Splitting", open=False):
                    gr.Markdown("""
                    ### Understanding Text Splitting

                    The splitting pattern controls how Kokoro breaks your text into manageable chunks for processing.

                    **Common patterns:**
                    - `\\n+`: Split on one or more newlines (paragraphs)
                    - `(?<=[.!?])\\s+`: Split after periods, question marks, and exclamation points (sentences)
                    - `[,;]\\s+`: Split after commas and semicolons
                    - `$^`: Special pattern that won't match anything (processes the entire text as one chunk)

                    **Benefits of splitting:**
                    - Better phrasing and natural pauses
                    - Improved handling of longer texts
                    - More consistent pronunciation across chunks
                    """)

                # Preset chooser for the split pattern
                split_pattern_dropdown = gr.Dropdown(
                    label="Split Text Using",
                    value="Paragraphs (one or more newlines)",
                    choices=list(SPLIT_PATTERNS.keys()),
                    info="Select how to split your text into chunks",
                )

                # Holds the active regex; only shown while "Custom" is selected
                custom_pattern_input = gr.Textbox(
                    label="Custom Split Pattern (Regular Expression)",
                    value=r"\n+",
                    visible=False,
                    info="Enter a custom regex pattern for splitting text",
                )

                preview_button = gr.Button("Preview Text Splitting")
                split_preview = gr.Textbox(
                    label="Split Preview",
                    value="Click 'Preview Text Splitting' to see how your text will be divided",
                    lines=5,
                )

            # Language, voice and speed controls
            with gr.Column(scale=1):
                language_input = gr.Dropdown(
                    label="Language",
                    value="en-us",
                    choices=SUPPORTED_LANGUAGES,
                    info="Select the language for text processing",
                )

                voice_input = gr.Dropdown(
                    label="Primary Voice",
                    value="af_sky",
                    choices=voice_names,
                    info="Select primary voice for synthesis",
                )

                # Optional second voice and its weight
                with gr.Accordion("Voice Blending (Optional)", open=False):
                    blend_voice_input = gr.Dropdown(
                        label="Secondary Voice for Blending",
                        value=None,
                        choices=[None] + voice_names,
                        info="Select secondary voice to blend with primary voice",
                    )

                    blend_ratio = gr.Slider(
                        label="Blend Ratio (Primary:Secondary)",
                        minimum=0.0,
                        maximum=1.0,
                        value=0.5,
                        step=0.05,
                        info="0.0 = 100% Secondary, 1.0 = 100% Primary",
                    )

                    gr.Markdown("""
                    **Voice blending lets you combine characteristics of two voices.**
                    - A 50:50 blend gives equal weight to both voices
                    - Higher values emphasize the primary voice
                    - Lower values emphasize the secondary voice
                    """)

                speed_input = gr.Slider(
                    label="Speech Speed",
                    minimum=0.5,
                    maximum=1.5,
                    value=1.0,
                    step=0.1,
                    info="Adjust speaking rate",
                )

                # Switch between normal generation and benchmarking
                with gr.Accordion("Performance Testing", open=False):
                    test_mode = gr.Checkbox(label="Enable Test Mode", value=False)

                    gr.Markdown("""
                    ### Performance Testing

                    When enabled, clicking "Generate Audio" will run performance tests instead of generating audio.
                    Tests compare different processing approaches to identify the most efficient method.

                    Use this to optimize your implementation based on your specific hardware and text content.
                    """)

            # Trigger button and result widgets
            with gr.Column(scale=1):
                submit_button = gr.Button("Generate Audio", variant="primary")

                audio_output = gr.Audio(
                    label="Generated Audio", format="wav", show_download_button=True
                )
                audio_gen_timing_output = gr.Textbox(
                    label="Performance Metrics", lines=12
                )
                phonemes_output = gr.Textbox(label="Phoneme Representation", lines=10)
                split_info_output = gr.Textbox(label="Processing Information", lines=5)
                # Stays hidden until a test run produces results
                test_results = gr.Textbox(
                    label="Test Results",
                    lines=15,
                    visible=False,
                )

        # Choosing a preset writes its regex into the pattern box and toggles
        # that box's visibility (both outputs target the same component)
        split_pattern_dropdown.change(
            fn=on_split_pattern_change,
            inputs=[split_pattern_dropdown, custom_pattern_input],
            outputs=[custom_pattern_input, custom_pattern_input],
        )

        # Show how the current text would be chunked
        preview_button.click(
            fn=preview_splits,
            inputs=[text_input, custom_pattern_input],
            outputs=[split_preview],
        )

        def on_generate(
            text,
            voice,
            language,
            blend_voice,
            blend_ratio,
            split_pattern,
            speed,
            test_mode,
        ):
            """Either synthesize audio or, in test mode, run the benchmarks."""
            if not test_mode:
                # Normal path: synthesize and keep the test results box hidden
                audio_tuple, phonemes, split_info, timing_info = create(
                    text,
                    voice,
                    language,
                    blend_voice_name=blend_voice,
                    blend_ratio=blend_ratio,
                    split_pattern=split_pattern,
                    speed=speed,
                    output_dir=AUDIO_DIR,
                )
                return (
                    audio_tuple,
                    timing_info,
                    phonemes,
                    split_info,
                    gr.update(visible=False),
                )

            # Test mode: clear the regular outputs and reveal the report
            results = run_performance_tests(
                text, voice, language, split_pattern, speed
            )
            return None, None, None, None, gr.update(visible=True, value=results)

        submit_button.click(
            fn=on_generate,
            inputs=[
                text_input,
                voice_input,
                language_input,
                blend_voice_input,
                blend_ratio,
                custom_pattern_input,
                speed_input,
                test_mode,
            ],
            outputs=[
                audio_output,
                audio_gen_timing_output,
                phonemes_output,
                split_info_output,
                test_results,
            ],
        )

    return ui
633
+
634
+
635
# Build the interface and start serving it as soon as this module runs
ui = create_app()
ui.launch(
    server_name="0.0.0.0",  # Make accessible externally
    server_port=7862,  # Choose your port
    share=True,  # Set to True if you want a public link
    debug=True,
)