Spaces:

Hassan-16
/

TTS

Running

App Files Files Community

Hassan-16 committed on Jun 28

Commit

db34aa6

verified ·

1 Parent(s): 21977f5

Update app.py

Browse files

Files changed (1) hide show

app.py +83 -179

app.py CHANGED Viewed

@@ -3,7 +3,6 @@ import os
 import torch
 import logging
 import soundfile as sf
-import time
 from kokoro import KModel, KPipeline
 # Configure logging
@@ -25,13 +24,7 @@ device = "cuda" if CUDA_AVAILABLE else "cpu"
 logger.info(f"Using hardware: {device}")
 # Load a single model instance
-try:
-    start_time = time.time()
-    model = KModel("hexgrad/Kokoro-82M").to(device).eval()
-    logger.info(f"Model loading time: {time.time() - start_time} seconds")
-except Exception as e:
-    logger.error(f"Failed to load model: {e}")
-    raise
 # Define pipelines for American ('a') and British ('b') English
 pipelines = {
@@ -46,81 +39,33 @@ try:
 except AttributeError as e:
     logger.warning(f"Could not set custom pronunciations: {e}")
-# Cache voice choices to avoid repeated file scanning
-VOICE_CHOICES = None
-def load_voice_choices():
-    global VOICE_CHOICES
-    if VOICE_CHOICES is not None:
-        return VOICE_CHOICES
-    voice_files = [f for f in os.listdir(VOICE_DIR) if f.endswith('.pt')]
-    choices = {}
-    for voice_file in voice_files:
-        prefix = voice_file[:2]
-        if prefix == 'af':
-            label = f"🇺🇸 Female: {voice_file[3:-3].capitalize()}"
-        elif prefix == 'am':
-            label = f"🇺🇸 Male: {voice_file[3:-3].capitalize()}"
-        elif prefix == 'bf':
-            label = f"🇬🇧 Female: {voice_file[3:-3].capitalize()}"
-        elif prefix == 'bm':
-            label = f"🇬🇧 Male: {voice_file[3:-3].capitalize()}"
-        else:
-            label = f"Unknown: {voice_file[:-3]}"
-        choices[label] = voice_file
-    if not choices:
-        logger.warning("No voice files found in VOICE_DIR. Adding a placeholder.")
-        choices = {"🇺🇸 Female: Bella": "af_bella.pt"}
-    VOICE_CHOICES = choices
-    return choices
-CHOICES = load_voice_choices()
-# Log available voices
-for label, voice_path in CHOICES.items():
-    full_path = os.path.join(VOICE_DIR, voice_path)
-    if not os.path.exists(full_path):
-        logger.warning(f"Voice file not found: {full_path}")
-    else:
-        logger.info(f"Loaded voice: {label} ({voice_path})")
 def generate_first(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
-    start_time = time.time()
-    if len(text) > 510:
-        text = text[:510]
-        gr.Warning("Text truncated to 510 characters for faster processing.")
     voice_path = os.path.join(VOICE_DIR, voice)
     if not os.path.exists(voice_path):
-        raise gr.Error(f"Voice file not found: {voice_path}")
     pipeline = pipelines[voice[0]]
     use_gpu = use_gpu and CUDA_AVAILABLE
     try:
-        if not use_gpu and model.device.type != "cpu":
-            model.to("cpu")
         generator = pipeline(text, voice=voice_path, speed=speed)
         for _, ps, audio in generator:
-            logger.info(f"Generation time: {time.time() - start_time} seconds")
             return (24000, audio.numpy()), ps
     except gr.exceptions.Error as e:
         if use_gpu:
             gr.Warning(str(e))
-            gr.Info("Retrying with CPU.")
             model.to("cpu")
             generator = pipeline(text, voice=voice_path, speed=speed)
             for _, ps, audio in generator:
-                logger.info(f"Generation time (CPU retry): {time.time() - start_time} seconds")
                 return (24000, audio.numpy()), ps
         else:
             raise gr.Error(e)
     return None, ""
 def tokenize_first(text, voice="af_bella.pt"):
-    if len(text) > 510:
-        text = text[:510]
-        gr.Warning("Text truncated to 510 characters for faster processing.")
     voice_path = os.path.join(VOICE_DIR, voice)
     if not os.path.exists(voice_path):
-        raise gr.Error(f"Voice file not found: {voice_path}")
     pipeline = pipelines[voice[0]]
     generator = pipeline(text, voice=voice_path)
@@ -129,146 +74,105 @@ def tokenize_first(text, voice="af_bella.pt"):
     return ""
 def generate_all(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
-    start_time = time.time()
-    if len(text) > 510:
-        text = text[:510]
-        gr.Warning("Text truncated to 510 characters for faster processing.")
     voice_path = os.path.join(VOICE_DIR, voice)
     if not os.path.exists(voice_path):
-        raise gr.Error(f"Voice file not found: {voice_path}")
     pipeline = pipelines[voice[0]]
     use_gpu = use_gpu and CUDA_AVAILABLE
-    if not use_gpu and model.device.type != "cpu":
-        model.to("cpu")
     first = True
     generator = pipeline(text, voice=voice_path, speed=speed)
     for _, _, audio in generator:
         yield 24000, audio.numpy()
         if first:
             first = False
             yield 24000, torch.zeros(1).numpy()
-    logger.info(f"Streaming generation time: {time.time() - start_time} seconds")
-TOKEN_NOTE = '''
-**How to Customize Pronunciation**
-- Use Markdown link syntax, e.g., `[Kokoro](/kˈOkəɹO/)` for custom pronunciation.
-- Adjust intonation with punctuation: `;:,.!?—…"()“”`.
-- Control stress: `[word](-1)` or `[word](-2)` to lower, `[word](+1)` or `[word](+2)` to raise stress.
-'''
-with gr.Blocks(theme="huggingface", css=".gr-button-primary {background-color: #1e88e5 !important; color: white !important;}") as app:
-    gr.Markdown("# Kokoro TTS: Text-to-Speech Generator")
-    gr.Markdown("Enter text and select a voice to generate high-quality audio. Adjust speed for faster or slower speech.")
-    with gr.Column():
-        text = gr.Textbox(
-            label="Input Text",
-            value=TEXT,
-            placeholder="Type your text here (max 510 characters)",
-            lines=3,
-            max_lines=5,
-            info="Enter text to convert to speech."
-        )
-        with gr.Row():
-            voice = gr.Dropdown(
-                choices=list(CHOICES.items()),
-                value="af_bella.pt" if "af_bella.pt" in CHOICES.values() else list(CHOICES.values())[0],
-                label="Voice",
-                info="Choose a voice for the audio output."
-            )
-            use_gpu = gr.Dropdown(
-                choices=[("GPU 🚀 (Faster)", True), ("CPU 🐌 (Slower)", False)],
-                value=CUDA_AVAILABLE,
-                label="Hardware",
-                info="GPU is faster but requires CUDA support.",
-                interactive=CUDA_AVAILABLE
-            )
-        speed = gr.Slider(
-            minimum=0.5,
-            maximum=2,
-            value=1,
-            step=0.1,
-            label="Speech Speed",
-            info="Adjust the speed of the generated audio (0.5 = slower, 2 = faster)."
-        )
-        with gr.Tabs():
-            with gr.Tab(label="Generate Audio"):
-                out_audio = gr.Audio(
-                    label="Generated Audio",
-                    interactive=False,
-                    streaming=False,
-                    autoplay=True
-                )
-                status = gr.Textbox(
-                    value="Ready to generate audio.",
-                    label="Status",
-                    interactive=False
-                )
-                generate_btn = gr.Button("Generate Audio", variant="primary")
-                with gr.Accordion("Pronunciation Tokens", open=False):
-                    out_ps = gr.Textbox(
-                        interactive=False,
-                        show_label=False,
-                        info="Tokens used to generate the audio."
-                    )
-                    tokenize_btn = gr.Button("Show Tokens", variant="secondary")
-                    gr.Markdown(TOKEN_NOTE)
-            with gr.Tab(label="Stream Audio"):
-                out_stream = gr.Audio(
-                    label="Streaming Audio",
-                    interactive=False,
-                    streaming=True,
-                    autoplay=True
-                )
-                status_stream = gr.Textbox(
-                    value="Ready to stream audio.",
-                    label="Status",
-                    interactive=False
-                )
-                with gr.Row():
-                    stream_btn = gr.Button("Start Streaming", variant="primary")
-                    stop_btn = gr.Button("Stop Streaming", variant="stop")
-                gr.Markdown("⚠️ Streaming may have slight delays due to processing.")
-    def update_status_generate(text, voice, speed, use_gpu):
-        status.value = "Generating audio..."
-        result, ps = generate_first(text, voice, speed, use_gpu)
-        status.value = "Audio generated successfully!" if result else "Failed to generate audio."
-        return result, ps
-    def update_status_tokenize(text, voice):
-        status.value = "Tokenizing text..."
-        result = tokenize_first(text, voice)
-        status.value = "Tokenization complete!" if result else "Failed to tokenize."
-        return result
-    def update_status_stream(text, voice, speed, use_gpu):
-        status_stream.value = "Starting audio stream..."
-        for audio in generate_all(text, voice, speed, use_gpu):
-            yield audio
-        status_stream.value = "Streaming complete!"
-    generate_btn.click(
-        fn=update_status_generate,
-        inputs=[text, voice, speed, use_gpu],
-        outputs=[out_audio, out_ps, status]
-    )
-    tokenize_btn.click(
-        fn=update_status_tokenize,
-        inputs=[text, voice],
-        outputs=[out_ps, status]
-    )
-    stream_event = stream_btn.click(
-        fn=update_status_stream,
-        inputs=[text, voice, speed, use_gpu],
-        outputs=[out_stream, status_stream]
-    )
     stop_btn.click(fn=None, cancels=[stream_event])
 if __name__ == "__main__":
-    logger.info("Starting Gradio app...")
-    app.launch(queue=False)
-    logger.info("Gradio app started.")

 import torch
 import logging
 import soundfile as sf
 from kokoro import KModel, KPipeline
 # Configure logging
 logger.info(f"Using hardware: {device}")
 # Load a single model instance
+model = KModel("hexgrad/Kokoro-82M").to(device).eval()
 # Define pipelines for American ('a') and British ('b') English
 pipelines = {
 except AttributeError as e:
     logger.warning(f"Could not set custom pronunciations: {e}")
 def generate_first(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
     voice_path = os.path.join(VOICE_DIR, voice)
     if not os.path.exists(voice_path):
+        raise FileNotFoundError(f"Voice file not found: {voice_path}")
     pipeline = pipelines[voice[0]]
     use_gpu = use_gpu and CUDA_AVAILABLE
     try:
         generator = pipeline(text, voice=voice_path, speed=speed)
         for _, ps, audio in generator:
             return (24000, audio.numpy()), ps
     except gr.exceptions.Error as e:
         if use_gpu:
             gr.Warning(str(e))
+            gr.Info("Retrying with CPU. To avoid this error, change Hardware to CPU.")
             model.to("cpu")
             generator = pipeline(text, voice=voice_path, speed=speed)
             for _, ps, audio in generator:
                 return (24000, audio.numpy()), ps
         else:
             raise gr.Error(e)
     return None, ""
 def tokenize_first(text, voice="af_bella.pt"):
     voice_path = os.path.join(VOICE_DIR, voice)
     if not os.path.exists(voice_path):
+        raise FileNotFoundError(f"Voice file not found: {voice_path}")
     pipeline = pipelines[voice[0]]
     generator = pipeline(text, voice=voice_path)
     return ""
 def generate_all(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
     voice_path = os.path.join(VOICE_DIR, voice)
     if not os.path.exists(voice_path):
+        raise FileNotFoundError(f"Voice file not found: {voice_path}")
     pipeline = pipelines[voice[0]]
     use_gpu = use_gpu and CUDA_AVAILABLE
     first = True
+    if not use_gpu:
+        model.to("cpu")
     generator = pipeline(text, voice=voice_path, speed=speed)
     for _, _, audio in generator:
         yield 24000, audio.numpy()
         if first:
             first = False
             yield 24000, torch.zeros(1).numpy()
+# Dynamically load .pt voice files from VOICE_DIR
+def load_voice_choices():
+    voice_files = [f for f in os.listdir(VOICE_DIR) if f.endswith('.pt')]
+    choices = {}
+    for voice_file in voice_files:
+        prefix = voice_file[:2]
+        if prefix == 'af':
+            label = f"🇺🇸 🚺 {voice_file[3:-3].capitalize()}"
+        elif prefix == 'am':
+            label = f"🇺🇸 🚹 {voice_file[3:-3].capitalize()}"
+        elif prefix == 'bf':
+            label = f"🇬🇧 🚺 {voice_file[3:-3].capitalize()}"
+        elif prefix == 'bm':
+            label = f"🇬🇧 🚹 {voice_file[3:-3].capitalize()}"
+        else:
+            label = f"Unknown {voice_file[:-3]}"
+        choices[label] = voice_file
+    return choices
+CHOICES = load_voice_choices()
+# Log available voices
+for label, voice_path in CHOICES.items():
+    full_path = os.path.join(VOICE_DIR, voice_path)
+    if not os.path.exists(full_path):
+        logger.warning(f"Voice file not found: {full_path}")
+    else:
+        logger.info(f"Loaded voice: {label} ({voice_path})")
+# If no voices are found, add a default fallback
+if not CHOICES:
+    logger.warning("No voice files found in VOICE_DIR. Adding a placeholder.")
+    CHOICES = {"🇺🇸 🚺 Bella 🔥": "af_bella.pt"}
+TOKEN_NOTE = '''
+💡 Customize pronunciation with Markdown link syntax and /slashes/ like [Kokoro](/kˈOkəɹO/)
+💬 To adjust intonation, try punctuation ;:,.!?—…"()“” or stress ˈ and ˌ
+⬇️ Lower stress [1 level](-1) or [2 levels](-2)
+⬆️ Raise stress 1 level [or](+2) 2 levels (only works on less stressed, usually short words)
+'''
+with gr.Blocks() as generate_tab:
+    out_audio = gr.Audio(label="Output Audio", interactive=False, streaming=False, autoplay=True)
+    generate_btn = gr.Button("Generate", variant="primary")
+    with gr.Accordion("Output Tokens", open=True):
+        out_ps = gr.Textbox(interactive=False, show_label=False,
+                            info="Tokens used to generate the audio, up to 510 context length.")
+        tokenize_btn = gr.Button("Tokenize", variant="secondary")
+        gr.Markdown(TOKEN_NOTE)
+with gr.Blocks() as stream_tab:
+    out_stream = gr.Audio(label="Output Audio Stream", interactive=False, streaming=True, autoplay=True)
+    with gr.Row():
+        stream_btn = gr.Button("Stream", variant="primary")
+        stop_btn = gr.Button("Stop", variant="stop")
+    with gr.Accordion("Note", open=True):
+        gr.Markdown("⚠️ There may be delays in streaming audio due to processing limitations.")
+with gr.Blocks() as app:
+    with gr.Row():
+        with gr.Column():
+            text = gr.Textbox(label="Input Text", info="Arbitrarily many characters supported")
+            with gr.Row():
+                voice = gr.Dropdown(list(CHOICES.items()), value="af_bella.pt" if "af_bella.pt" in CHOICES.values() else list(CHOICES.values())[0], label="Voice",
+                                    info="Quality and availability vary by language")
+                use_gpu = gr.Dropdown(
+                    [("GPU 🚀", True), ("CPU 🐌", False)],
+                    value=CUDA_AVAILABLE,
+                    label="Hardware",
+                    info="GPU is usually faster, but may require CUDA support",
+                    interactive=CUDA_AVAILABLE
+                )
+            speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label="Speed")
+        with gr.Column():
+            gr.TabbedInterface([generate_tab, stream_tab], ["Generate", "Stream"])
+    generate_btn.click(fn=generate_first, inputs=[text, voice, speed, use_gpu],
+                       outputs=[out_audio, out_ps])
+    tokenize_btn.click(fn=tokenize_first, inputs=[text, voice], outputs=[out_ps])
+    stream_event = stream_btn.click(fn=generate_all, inputs=[text, voice, speed, use_gpu], outputs=[out_stream])
     stop_btn.click(fn=None, cancels=[stream_event])
 if __name__ == "__main__":
+    app.queue().launch()