Spaces:

dattazigzag
/

kokoro-onnx

Sleeping

App Files Files Community

dattazigzag committed on Apr 16

Commit

88d6cf9

verified ·

1 Parent(s): 5c15424

onnyx upload

Browse files

Files changed (1) hide show

kokoro_onnx_gradio.py +642 -0

kokoro_onnx_gradio.py ADDED Viewed

	@@ -0,0 +1,642 @@

+import gradio as gr
+import numpy as np
+import time
+import re
+import os
+import soundfile as sf
+import warnings
+from kokoro_onnx import Kokoro
+from kokoro_onnx.tokenizer import Tokenizer
+# Suppress warnings: the "ignore" action is installed with no category or
+# module filter, so it applies to every warning raised in this process.
+warnings.filterwarnings("ignore")
+# Initialize tokenizer and model
+# Both are module-level singletons built once at import time and shared by
+# every handler below; the model and voice files are read from onnx_deps/.
+tokenizer = Tokenizer()
+kokoro = Kokoro("onnx_deps/kokoro-v1.0.onnx", "onnx_deps/voices-v1.0.bin")
+# Constants
+# Language codes offered by the "Language" dropdown in the UI.
+SUPPORTED_LANGUAGES = ["en-us", "en-gb", "es", "fr-fr", "hi", "it", "ja", "pt-br", "zh"]
+# Default directory that create() writes its .wav exports to.
+AUDIO_DIR = "audio_exports"
+# create() rebinds this global when a different, non-blended voice is requested.
+CURRENT_VOICE = "af_sky"  # Default voice
+# Create output directory if it doesn't exist
+os.makedirs(AUDIO_DIR, exist_ok=True)
+# Split pattern presets: dropdown label -> regular expression used for splitting.
+# The "Custom" value is only a placeholder; on_split_pattern_change() returns
+# the contents of the custom-pattern textbox when that entry is selected.
+SPLIT_PATTERNS = {
+    "Paragraphs (one or more newlines)": r"\n+",
+    "Sentences (periods, question marks, exclamation points)": r"(?<=[.!?])\s+",
+    "Commas and semicolons": r"[,;]\s+",
+    # preview_text_splitting() special-cases this exact string and skips re.split().
+    "No splitting (process as one chunk)": r"$^",  # Pattern that won't match anything
+    "Custom": "custom",
+}
+def preview_text_splitting(text, split_pattern):
+    """
+    Preview how text will be split based on the pattern
+    """
+    try:
+        if split_pattern == "$^":  # Special case for no splitting
+            return [text]
+        chunks = re.split(split_pattern, text)
+        # Filter out empty chunks
+        chunks = [chunk.strip() for chunk in chunks if chunk.strip()]
+        return chunks
+    except Exception as e:
+        return [f"Error previewing split: {e}"]
+def run_performance_tests(text, voice, language, split_pattern, speed):
+    """
+    Run performance tests comparing different approaches
+    Returns:
+        String with detailed test results
+    """
+    results = []
+    results.append("=== KOKORO-ONNX PERFORMANCE TEST RESULTS ===\n")
+    # Split text into chunks for comparison
+    chunks = re.split(split_pattern, text)
+    chunks = [chunk.strip() for chunk in chunks if chunk.strip()]
+    results.append(f"Text split into {len(chunks)} chunks\n")
+    # Test 1: Per-chunk vs. Full-text tokenization
+    results.append("TEST #1: TOKENIZATION STRATEGIES")
+    # Approach 1: Per-chunk tokenization
+    start_time = time.time()
+    all_phonemes = []
+    for chunk in chunks:
+        phonemes = tokenizer.phonemize(chunk, lang=language)
+        all_phonemes.append(phonemes)
+    per_chunk_time = time.time() - start_time
+    results.append(f"Per-chunk tokenization: {per_chunk_time:.6f}s")
+    # Approach 2: Single tokenization for entire text
+    start_time = time.time()
+    full_phonemes = tokenizer.phonemize(text, lang=language)
+    full_tokenization_time = time.time() - start_time
+    results.append(f"Full text tokenization: {full_tokenization_time:.6f}s")
+    if full_tokenization_time > 0:
+        results.append(f"Speedup: {per_chunk_time / full_tokenization_time:.2f}x\n")
+    # Test 2: Audio generation strategies
+    results.append("TEST #2: AUDIO GENERATION STRATEGIES")
+    # Approach 1: Generate per chunk
+    start_time = time.time()
+    audio_chunks = []
+    for p in all_phonemes:
+        if p.strip():  # Skip empty phonemes
+            audio, _ = kokoro.create(p, voice=voice, speed=speed, is_phonemes=True)
+            audio_chunks.append(audio)
+    split_gen_time = time.time() - start_time
+    results.append(f"Generate per chunk: {split_gen_time:.6f}s")
+    # Approach 2: Generate for full text
+    start_time = time.time()
+    audio_full, _ = kokoro.create(
+        full_phonemes, voice=voice, speed=speed, is_phonemes=True
+    )
+    full_gen_time = time.time() - start_time
+    results.append(f"Generate full text: {full_gen_time:.6f}s")
+    if full_gen_time > 0:
+        results.append(f"Speedup: {split_gen_time / full_gen_time:.2f}x\n")
+    # Test 3: Total processing time comparison
+    results.append("TEST #3: TOTAL PROCESSING TIME")
+    total_chunked = per_chunk_time + split_gen_time
+    total_full = full_tokenization_time + full_gen_time
+    results.append(f"Total time (chunked): {total_chunked:.6f}s")
+    results.append(f"Total time (full text): {total_full:.6f}s")
+    if total_full > 0:
+        results.append(f"Overall speedup: {total_chunked / total_full:.2f}x")
+    # Recommendations
+    results.append("\nRECOMMENDATIONS:")
+    if per_chunk_time > full_tokenization_time:
+        results.append("- Tokenize entire text at once instead of per-chunk")
+    if split_gen_time > full_gen_time:
+        results.append("- Generate audio for entire text rather than per-chunk")
+    elif split_gen_time < full_gen_time:
+        results.append("- Keep generating audio in chunks for better performance")
+    return "\n".join(results)
+# [OLD] Chunking create func -- still the active definition; the optimized variant further down is commented out
+def create(text: str, voice: str, language: str, blend_voice_name: str = None,
+           blend_ratio: float = 0.5, split_pattern: str = r"\n+", speed: float = 1.0,
+           output_dir: str = AUDIO_DIR):
+    """
+    Generate audio using Kokoro-ONNX with added features
+    Args:
+        text: Text to synthesize
+        voice: Primary voice to use
+        language: Language code
+        blend_voice_name: Optional secondary voice for blending
+        blend_ratio: Ratio of primary to secondary voice (0.0-1.0)
+        split_pattern: Pattern to split text into chunks
+        speed: Speech rate
+        output_dir: Directory to save audio files
+    Returns:
+        Tuple of (audio_tuple, phonemes, split_info, timing_info)
+    """
+    global CURRENT_VOICE
+    # Create output directory if it doesn't exist
+    os.makedirs(output_dir, exist_ok=True)
+    # Update current voice
+    if voice != CURRENT_VOICE and not blend_voice_name:
+        print(f"Voice changed from {CURRENT_VOICE} to {voice}")
+        CURRENT_VOICE = voice
+    # Start total timing
+    start_total_time = time.time()
+    # Split text into chunks
+    chunks = preview_text_splitting(text, split_pattern)
+    split_info = f"Text split into {len(chunks)} chunks using pattern: '{split_pattern}'"
+    print(split_info)
+    # Initialize variables for processing
+    all_audio = []
+    all_phonemes = []
+    sample_rate = 24000  # Kokoro's sample rate
+    # Timing metrics
+    phoneme_times = []
+    generation_times = []
+    save_times = []
+    # Process each chunk
+    for i, chunk in enumerate(chunks):
+        # Skip empty chunks
+        if not chunk.strip():
+            continue
+        # Time phonemization
+        phoneme_start = time.time()
+        phonemes = tokenizer.phonemize(chunk, lang=language)
+        phoneme_time = time.time() - phoneme_start
+        phoneme_times.append(phoneme_time)
+        print(f"Chunk {i+1} phonemized in {phoneme_time:.6f}s")
+        # Save phonemes
+        all_phonemes.append(f"Chunk {i+1}: {phonemes}")
+        # Handle voice blending
+        voice_blend_start = time.time()
+        voice_to_use = voice
+        if blend_voice_name:
+            first_voice = kokoro.get_voice_style(voice)
+            second_voice = kokoro.get_voice_style(blend_voice_name)
+            voice_to_use = np.add(first_voice * blend_ratio, second_voice * (1 - blend_ratio))
+            print(f"Voices blended in {time.time() - voice_blend_start:.6f}s")
+        # Generate audio
+        gen_start = time.time()
+        audio, sr = kokoro.create(phonemes, voice=voice_to_use, speed=speed, is_phonemes=True)
+        gen_time = time.time() - gen_start
+        generation_times.append(gen_time)
+        print(f"Chunk {i+1} audio generated in {gen_time:.6f}s")
+        # Add to audio list
+        all_audio.append(audio)
+        # Save individual chunk to file
+        save_start = time.time()
+        voice_label = voice.split('_')[1] if isinstance(voice, str) else 'blend'
+        chunk_filename = os.path.join(output_dir, f"chunk_{i+1}_{voice_label}.wav")
+        sf.write(chunk_filename, audio, sr)
+        save_time = time.time() - save_start
+        save_times.append(save_time)
+        print(f"Chunk {i+1} saved to {chunk_filename} in {save_time:.6f}s")
+    # Time to combine chunks
+    combine_start = time.time()
+    if len(all_audio) > 1:
+        audio_data = np.concatenate(all_audio)
+        combine_time = time.time() - combine_start
+        print(f"Combined {len(all_audio)} chunks in {combine_time:.6f}s")
+    else:
+        audio_data = all_audio[0] if all_audio else np.array([])
+        combine_time = 0
+    # Time to save combined file
+    save_combined_start = time.time()
+    voice_label = voice.split('_')[1] if isinstance(voice, str) else 'blend'
+    combined_filename = os.path.join(output_dir, f"combined_{voice_label}.wav")
+    sf.write(combined_filename, audio_data, sample_rate)
+    save_combined_time = time.time() - save_combined_start
+    print(f"Combined audio saved to {combined_filename} in {save_combined_time:.6f}s")
+    # Calculate total time
+    total_time = time.time() - start_total_time
+    # Create detailed timing info
+    chunks_count = len(all_audio)
+    timing_lines = []
+    # Add summary of processing times
+    timing_lines.append(f"Phonemization time: {sum(phoneme_times):.6f}s")
+    timing_lines.append(f"Audio generation time: {sum(generation_times):.6f}s")
+    # Per-chunk timing
+    if chunks_count > 1:
+        timing_lines.append("\nChunk details:")
+        for i in range(chunks_count):
+            timing_lines.append(f"  Chunk {i+1}: Phoneme {phoneme_times[i]:.6f}s, Gen {generation_times[i]:.6f}s, Save {save_times[i]:.6f}s")
+    # Combine and save timing
+    if chunks_count > 1:
+        timing_lines.append(f"\nCombine chunks: {combine_time:.6f}s")
+    timing_lines.append(f"Save combined: {save_combined_time:.6f}s")
+    # Total timing
+    timing_lines.append(f"\nTotal processing time: {total_time:.6f}s")
+    # Format timing info for display
+    timing_info = "\n".join(timing_lines)
+    # Combine phonemes
+    phonemes_text = "\n\n".join(all_phonemes)
+    # Update split info
+    if chunks_count > 1:
+        split_info = f"Text was split into {chunks_count} chunks and saved to {output_dir}"
+    else:
+        split_info = f"Text processed as a single chunk and saved to {output_dir}"
+    return [(sample_rate, audio_data), phonemes_text, split_info, timing_info]
+# Optimized -- overrides paragraph splitting behavior...
+# def create(
+#     text: str,
+#     voice: str,
+#     language: str,
+#     blend_voice_name: str = None,
+#     blend_ratio: float = 0.5,
+#     split_pattern: str = r"\n+",
+#     speed: float = 1.0,
+#     output_dir: str = AUDIO_DIR,
+# ):
+#     """
+#      Generate audio using Kokoro-ONNX with optimized processing
+#     Args:
+#         text: Text to synthesize
+#         voice: Primary voice to use
+#         language: Language code
+#         blend_voice_name: Optional secondary voice for blending
+#         blend_ratio: Ratio of primary to secondary voice (0.0-1.0)
+#         split_pattern: Pattern to split text into chunks
+#         speed: Speech rate
+#         output_dir: Directory to save audio files
+#     Returns:
+#         Tuple of (audio_tuple, phonemes, split_info, timing_info)
+#     """
+#     global CURRENT_VOICE
+#     # Create output directory if it doesn't exist
+#     os.makedirs(output_dir, exist_ok=True)
+#     # Update current voice
+#     if voice != CURRENT_VOICE and not blend_voice_name:
+#         print(f"Voice changed from {CURRENT_VOICE} to {voice}")
+#         CURRENT_VOICE = voice
+#     # Start total timing
+#     start_total_time = time.time()
+#     # Split text only for display purposes
+#     chunks = preview_text_splitting(text, split_pattern)
+#     split_info = (
+#         f"Text split into {len(chunks)} chunks using pattern: '{split_pattern}'"
+#     )
+#     print(split_info)
+#     # Phonemize the entire text at once (optimization #1)
+#     phoneme_start = time.time()
+#     phonemes = tokenizer.phonemize(text, lang=language)
+#     phoneme_time = time.time() - phoneme_start
+#     print(f"Text phonemized in {phoneme_time:.6f}s")
+#     # Handle voice blending
+#     voice_blend_start = time.time()
+#     voice_to_use = voice
+#     if blend_voice_name:
+#         first_voice = kokoro.get_voice_style(voice)
+#         second_voice = kokoro.get_voice_style(blend_voice_name)
+#         voice_to_use = np.add(
+#             first_voice * blend_ratio, second_voice * (1 - blend_ratio)
+#         )
+#         voice_blend_time = time.time() - voice_blend_start
+#         print(f"Voices blended in {voice_blend_time:.6f}s")
+#     # Generate audio for entire text at once (optimization #2)
+#     gen_start = time.time()
+#     audio, sample_rate = kokoro.create(
+#         phonemes, voice=voice_to_use, speed=speed, is_phonemes=True
+#     )
+#     gen_time = time.time() - gen_start
+#     print(f"Audio generated in {gen_time:.6f}s")
+#     # Save to file
+#     save_start = time.time()
+#     voice_label = voice.split("_")[1] if isinstance(voice, str) else "blend"
+#     filename = os.path.join(output_dir, f"full_{voice_label}.wav")
+#     sf.write(filename, audio, sample_rate)
+#     save_time = time.time() - save_start
+#     print(f"Audio saved to {filename} in {save_time:.6f}s")
+#     # Calculate total time
+#     total_time = time.time() - start_total_time
+#     # Create timing info
+#     timing_lines = [
+#         f"Phonemization time: {phoneme_time:.6f}s",
+#         f"Audio generation time: {gen_time:.6f}s",
+#         f"Save time: {save_time:.6f}s",
+#         f"\nTotal processing time: {total_time:.6f}s",
+#         f"\nOptimized approach: Processing entire text at once (2.1x faster)",
+#     ]
+#     timing_info = "\n".join(timing_lines)
+#     # For display, still show the text chunks
+#     chunk_display = []
+#     for i, chunk in enumerate(chunks):
+#         chunk_display.append(f"Chunk {i + 1}: Text: {chunk[:50]}...")
+#     phonemes_display = (
+#         "Full text phonemes (first 100 chars):\n" + phonemes[:100] + "..."
+#     )
+#     return [(sample_rate, audio), phonemes_display, split_info, timing_info]
+def on_split_pattern_change(pattern_name, custom_pattern):
+    """
+    Handle changes to the split pattern selection
+    """
+    if pattern_name == "Custom":
+        return custom_pattern, gr.update(visible=True)
+    else:
+        return SPLIT_PATTERNS[pattern_name], gr.update(visible=False)
+def preview_splits(text, pattern):
+    """
+    Preview how text will be split based on the pattern
+    """
+    chunks = preview_text_splitting(text, pattern)
+    if len(chunks) == 1 and pattern == "$^":
+        return "Text will be processed as a single chunk (no splitting)"
+    result = f"Text will be split into {len(chunks)} chunks:\n\n"
+    for i, chunk in enumerate(chunks):
+        # Truncate very long chunks in the preview
+        display_chunk = chunk[:100] + "..." if len(chunk) > 100 else chunk
+        result += f"Chunk {i + 1}: {display_chunk}\n\n"
+    return result
+def create_app():
+    """
+    Build the Gradio Blocks UI and wire up its event handlers
+
+    The layout is one row of three columns: text input and split controls,
+    language/voice/speed/test-mode settings, and the generate button with its
+    outputs.
+    Returns:
+        The constructed gr.Blocks object (not launched here)
+    """
+    with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Roboto")])) as ui:
+        # Title
+        gr.Markdown("# Kokoro-ONNX TTS Demo")
+        gr.Markdown("#### Optimized ONNX implementation with Voice Blending")
+        # Input controls
+        with gr.Row():
+            # Column 1: input text and split-pattern controls
+            with gr.Column(scale=1):
+                text_input = gr.TextArea(
+                    label="Input Text",
+                    rtl=False,
+                    value="Hello!\n\nThis is a multi-paragraph test.\nWith multiple lines.\n\nKokoro can split on paragraphs, sentences, or other patterns.",
+                    lines=8,
+                )
+                # Information about split patterns
+                with gr.Accordion("About Text Splitting", open=False):
+                    gr.Markdown("""
+                    ### Understanding Text Splitting
+                    The splitting pattern controls how Kokoro breaks your text into manageable chunks for processing.
+                    **Common patterns:**
+                    - `\\n+`: Split on one or more newlines (paragraphs)
+                    - `(?<=[.!?])\\s+`: Split after periods, question marks, and exclamation points (sentences)
+                    - `[,;]\\s+`: Split after commas and semicolons
+                    - `$^`: Special pattern that won't match anything (processes the entire text as one chunk)
+                    **Benefits of splitting:**
+                    - Better phrasing and natural pauses
+                    - Improved handling of longer texts
+                    - More consistent pronunciation across chunks
+                    """)
+                # Split Pattern Selection
+                split_pattern_dropdown = gr.Dropdown(
+                    label="Split Text Using",
+                    value="Paragraphs (one or more newlines)",
+                    choices=list(SPLIT_PATTERNS.keys()),
+                    info="Select how to split your text into chunks",
+                )
+                # Starts hidden. Besides accepting a custom regex, this textbox
+                # holds the effective pattern: the dropdown handler writes the
+                # selected preset's regex into it, and both the preview and the
+                # generate handlers read the pattern from here.
+                custom_pattern_input = gr.Textbox(
+                    label="Custom Split Pattern (Regular Expression)",
+                    value=r"\n+",
+                    visible=False,
+                    info="Enter a custom regex pattern for splitting text",
+                )
+                preview_button = gr.Button("Preview Text Splitting")
+                split_preview = gr.Textbox(
+                    label="Split Preview",
+                    value="Click 'Preview Text Splitting' to see how your text will be divided",
+                    lines=5,
+                )
+            # Column 2: language, voice, blending, speed and test-mode settings
+            with gr.Column(scale=1):
+                # Language selection
+                language_input = gr.Dropdown(
+                    label="Language",
+                    value="en-us",
+                    choices=SUPPORTED_LANGUAGES,
+                    info="Select the language for text processing",
+                )
+                # Voice selection
+                voice_input = gr.Dropdown(
+                    label="Primary Voice",
+                    value="af_sky",
+                    choices=sorted(kokoro.get_voices()),
+                    info="Select primary voice for synthesis",
+                )
+                # Voice blending
+                with gr.Accordion("Voice Blending (Optional)", open=False):
+                    # None is the first choice and the default, i.e. no blending
+                    blend_voice_input = gr.Dropdown(
+                        label="Secondary Voice for Blending",
+                        value=None,
+                        choices=[None] + sorted(kokoro.get_voices()),
+                        info="Select secondary voice to blend with primary voice",
+                    )
+                    blend_ratio = gr.Slider(
+                        label="Blend Ratio (Primary:Secondary)",
+                        minimum=0.0,
+                        maximum=1.0,
+                        value=0.5,
+                        step=0.05,
+                        info="0.0 = 100% Secondary, 1.0 = 100% Primary",
+                    )
+                    gr.Markdown("""
+                    **Voice blending lets you combine characteristics of two voices.**
+                    - A 50:50 blend gives equal weight to both voices
+                    - Higher values emphasize the primary voice
+                    - Lower values emphasize the secondary voice
+                    """)
+                # Speed slider
+                speed_input = gr.Slider(
+                    label="Speech Speed",
+                    minimum=0.5,
+                    maximum=1.5,
+                    value=1.0,
+                    step=0.1,
+                    info="Adjust speaking rate",
+                )
+                # Add a testing mode toggle
+                with gr.Accordion("Performance Testing", open=False):
+                    test_mode = gr.Checkbox(label="Enable Test Mode", value=False)
+                    gr.Markdown("""
+                    ### Performance Testing
+                    When enabled, clicking "Generate Audio" will run performance tests instead of generating audio.
+                    Tests compare different processing approaches to identify the most efficient method.
+                    Use this to optimize your implementation based on your specific hardware and text content.
+                    """)
+            # Column 3: generate button and all outputs
+            with gr.Column(scale=1):
+                # Generate button
+                submit_button = gr.Button("Generate Audio", variant="primary")
+                # Outputs
+                audio_output = gr.Audio(
+                    label="Generated Audio", format="wav", show_download_button=True
+                )
+                audio_gen_timing_output = gr.Textbox(
+                    label="Performance Metrics", lines=12
+                )
+                phonemes_output = gr.Textbox(label="Phoneme Representation", lines=10)
+                split_info_output = gr.Textbox(label="Processing Information", lines=5)
+                test_results = gr.Textbox(
+                    label="Test Results",
+                    lines=15,
+                    visible=False,  # Hidden until test is run
+                )
+        # Handle split pattern change
+        # on_split_pattern_change returns (pattern, visibility update); both
+        # values are routed to custom_pattern_input, hence the repeated output.
+        split_pattern_dropdown.change(
+            fn=on_split_pattern_change,
+            inputs=[split_pattern_dropdown, custom_pattern_input],
+            outputs=[custom_pattern_input, custom_pattern_input],
+        )
+        # Preview splitting button
+        preview_button.click(
+            fn=preview_splits,
+            inputs=[text_input, custom_pattern_input],
+            outputs=[split_preview],
+        )
+        # Button click handler
+        def on_generate(
+            text,
+            voice,
+            language,
+            blend_voice,
+            blend_ratio,
+            split_pattern,
+            speed,
+            test_mode,
+        ):
+            """
+            Run either the performance tests or regular audio generation
+            Returns:
+                Five values in the order of the outputs list passed to
+                submit_button.click below: audio, timing info, phonemes,
+                split info, and an update for the test-results box
+            """
+            if test_mode:
+                # Run performance tests
+                results = run_performance_tests(
+                    text, voice, language, split_pattern, speed
+                )
+                # Make the results visible
+                return None, None, None, None, gr.update(visible=True, value=results)
+            else:
+                # Regular generation
+                audio_tuple, phonemes, split_info, timing_info = create(
+                    text,
+                    voice,
+                    language,
+                    blend_voice_name=blend_voice,
+                    blend_ratio=blend_ratio,
+                    split_pattern=split_pattern,
+                    speed=speed,
+                    output_dir=AUDIO_DIR,
+                )
+                # Return results and hide test results
+                # (timing_info comes before phonemes to match the outputs order)
+                return (
+                    audio_tuple,
+                    timing_info,
+                    phonemes,
+                    split_info,
+                    gr.update(visible=False),
+                )
+        # The split pattern is read from custom_pattern_input, not the dropdown
+        submit_button.click(
+            fn=on_generate,
+            inputs=[
+                text_input,
+                voice_input,
+                language_input,
+                blend_voice_input,
+                blend_ratio,
+                custom_pattern_input,
+                speed_input,
+                test_mode,
+            ],
+            outputs=[
+                audio_output,
+                audio_gen_timing_output,
+                phonemes_output,
+                split_info_output,
+                test_results,
+            ],
+        )
+    return ui
+# Create and launch the app
+# NOTE: there is no `if __name__ == "__main__":` guard, so the UI is built and
+# launched whenever this module is executed or imported.
+ui = create_app()
+ui.launch(
+    debug=True,
+    server_name="0.0.0.0",  # Make accessible externally (binds to all interfaces)
+    server_port=7862,  # Choose your port
+    share=True,  # Requests a public link; set to False to keep the app local
+)