Spaces:

Hassan-16
/

TTS

Running

App Files Files Community

Hassan-16 committed on Jun 28

Commit

fa5c1e1

verified ·

1 Parent(s): 0774a70

Update app.py

Browse files

Files changed (1) hide show

app.py +120 -31

app.py CHANGED Viewed

@@ -1,19 +1,23 @@
 from kokoro import KModel, KPipeline
 import gradio as gr
 import os
-import random
 import torch
 import logging
-# Configuration
-VOICE_DIR = "model/voices"
-OUTPUT_DIR = "output_audio"
-TEXT = "Hello, this is a test of the Kokoro TTS system."
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 # Device setup
 CUDA_AVAILABLE = torch.cuda.is_available()
 device = "cuda" if CUDA_AVAILABLE else "cpu"
@@ -35,10 +39,9 @@ try:
 except AttributeError as e:
     logger.warning(f"Could not set custom pronunciations: {e}")
-# Core functions for voice generation
 def forward_gpu(text, voice_path, speed):
     pipeline = pipelines[voice_path[0]]
-    pipeline.model = models[True]  # Use GPU model
     generator = pipeline(text, voice=voice_path, speed=speed)
     for _, _, audio in generator:
         return audio
@@ -55,52 +58,138 @@ def generate_first(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
         if use_gpu:
             audio = forward_gpu(text, voice_path, speed)
         else:
-            pipeline.model = models[False]
             generator = pipeline(text, voice=voice_path, speed=speed)
             for _, ps, audio in generator:
                 return (24000, audio.numpy()), ps
     except gr.exceptions.Error as e:
         if use_gpu:
             gr.Warning(str(e))
-            pipeline.model = models[False]
             generator = pipeline(text, voice=voice_path, speed=speed)
-for _, ps, audio in generator:
-    return (24000, audio.numpy()), ps
         else:
             raise gr.Error(e)
     return None, ""
-# Load available voices
 def load_voice_choices():
-    if not os.path.exists(VOICE_DIR):
-        os.makedirs(VOICE_DIR)
     voice_files = [f for f in os.listdir(VOICE_DIR) if f.endswith('.pt')]
-    choices = {voice_file: voice_file for voice_file in voice_files}
     return choices
 CHOICES = load_voice_choices()
 if not CHOICES:
     logger.warning("No voice files found in VOICE_DIR. Adding a placeholder.")
-    CHOICES = {"Bella": "af_bella.pt"}
 TOKEN_NOTE = '''
 💡 Customize pronunciation with Markdown link syntax and /slashes/ like [Kokoro](/kˈOkəɹO/)
-⬆️ Adjust stress levels using special notations.
 '''
-# Gradio Interface
-with gr.Blocks() as app:
     with gr.Row():
-        text = gr.Textbox(label="Input Text", value=TEXT)
-        voice = gr.Dropdown(list(CHOICES.values()), label="Voice", value=list(CHOICES.values())[0])
-        speed = gr.Slider(0.5, 2, value=1, label="Speed")
-    output_audio = gr.Audio(label="Output Audio", interactive=False)
-    generate_btn = gr.Button("Generate")
-    generate_btn.click(fn=generate_first, inputs=[text, voice, speed], outputs=[output_audio])
-# Run the app
-if __name__ == "__main__":
-    app.launch(server_name="0.0.0.0", server_port=7860)

 from kokoro import KModel, KPipeline
 import gradio as gr
 import os
 import torch
 import logging
+import soundfile as sf
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+# Configuration
+VOICE_DIR = os.path.join(os.path.dirname(__file__), "voices")
+OUTPUT_DIR = os.path.join(os.path.dirname(__file__), "output_audio")
+TEXT = "Hello, this is a test of the Kokoro TTS system."
+# Ensure directories exist
+os.makedirs(VOICE_DIR, exist_ok=True)
+os.makedirs(OUTPUT_DIR, exist_ok=True)
 # Device setup
 CUDA_AVAILABLE = torch.cuda.is_available()
 device = "cuda" if CUDA_AVAILABLE else "cpu"
 except AttributeError as e:
     logger.warning(f"Could not set custom pronunciations: {e}")
 def forward_gpu(text, voice_path, speed):
     pipeline = pipelines[voice_path[0]]
+    pipeline.model = models[True]  # Switch to GPU model
     generator = pipeline(text, voice=voice_path, speed=speed)
     for _, _, audio in generator:
         return audio
         if use_gpu:
             audio = forward_gpu(text, voice_path, speed)
         else:
+            pipeline.model = models[False]  # Ensure CPU model is used
             generator = pipeline(text, voice=voice_path, speed=speed)
             for _, ps, audio in generator:
                 return (24000, audio.numpy()), ps
     except gr.exceptions.Error as e:
         if use_gpu:
             gr.Warning(str(e))
+            gr.Info("Retrying with CPU. To avoid this error, change Hardware to CPU.")
+            pipeline.model = models[False]  # Switch to CPU model
             generator = pipeline(text, voice=voice_path, speed=speed)
+            for _, ps, audio in generator:
+                return (24000, audio.numpy()), ps
         else:
             raise gr.Error(e)
     return None, ""
+def predict(text, voice="af_bella.pt", speed=1):
+    return generate_first(text, voice, speed, use_gpu=False)[0]
+def tokenize_first(text, voice="af_bella.pt"):
+    voice_path = os.path.join(VOICE_DIR, voice)
+    if not os.path.exists(voice_path):
+        raise FileNotFoundError(f"Voice file not found: {voice_path}")
+    pipeline = pipelines[voice[0]]
+    generator = pipeline(text, voice=voice_path)
+    for _, ps, _ in generator:
+        return ps
+    return ""
+def generate_all(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
+    voice_path = os.path.join(VOICE_DIR, voice)
+    if not os.path.exists(voice_path):
+        raise FileNotFoundError(f"Voice file not found: {voice_path}")
+    pipeline = pipelines[voice[0]]
+    use_gpu = use_gpu and CUDA_AVAILABLE
+    first = True
+    if use_gpu:
+        pipeline.model = models[True]  # Switch to GPU model
+    else:
+        pipeline.model = models[False]  # Switch to CPU model
+    generator = pipeline(text, voice=voice_path, speed=speed)
+    for _, _, audio in generator:
+        yield 24000, audio.numpy()
+        if first:
+            first = False
+            yield 24000, torch.zeros(1).numpy()
+# Dynamically load all .pt voice files from VOICE_DIR
 def load_voice_choices():
     voice_files = [f for f in os.listdir(VOICE_DIR) if f.endswith('.pt')]
+    choices = {}
+    for voice_file in voice_files:
+        prefix = voice_file[:2]
+        if prefix == 'af':
+            label = f"🇺🇸 🚺 {voice_file[3:-3].capitalize()}"
+        elif prefix == 'am':
+            label = f"🇺🇸 🚹 {voice_file[3:-3].capitalize()}"
+        elif prefix == 'bf':
+            label = f"🇬🇧 🚺 {voice_file[3:-3].capitalize()}"
+        elif prefix == 'bm':
+            label = f"🇬🇧 🚹 {voice_file[3:-3].capitalize()}"
+        else:
+            label = f"Unknown {voice_file[:-3]}"
+        choices[label] = voice_file
     return choices
 CHOICES = load_voice_choices()
+# Log available voices
+for label, voice_path in CHOICES.items():
+    full_path = os.path.join(VOICE_DIR, voice_path)
+    if not os.path.exists(full_path):
+        logger.warning(f"Voice file not found: {full_path}")
+    else:
+        logger.info(f"Loaded voice: {label} ({voice_path})")
+# If no voices are found, add a default fallback
 if not CHOICES:
     logger.warning("No voice files found in VOICE_DIR. Adding a placeholder.")
+    CHOICES = {"🇺🇸 🚺 Bella 🔥": "af_bella.pt"}
 TOKEN_NOTE = '''
 💡 Customize pronunciation with Markdown link syntax and /slashes/ like [Kokoro](/kˈOkəɹO/)
+💬 To adjust intonation, try punctuation ;:,.!?—…"()“” or stress ˈ and ˌ
+⬇️ Lower stress [1 level](-1) or [2 levels](-2)
+⬆️ Raise stress 1 level [or](+2) 2 levels (only works on less stressed, usually short words)
 '''
+with gr.Blocks() as generate_tab:
+    out_audio = gr.Audio(label="Output Audio", interactive=False, streaming=False, autoplay=True)
+    generate_btn = gr.Button("Generate", variant="primary")
+    with gr.Accordion("Output Tokens", open=True):
+        out_ps = gr.Textbox(interactive=False, show_label=False,
+                            info="Tokens used to generate the audio, up to 510 context length.")
+        tokenize_btn = gr.Button("Tokenize", variant="secondary")
+        gr.Markdown(TOKEN_NOTE)
+        predict_btn = gr.Button("Predict", variant="secondary", visible=False)
+with gr.Blocks() as stream_tab:
+    out_stream = gr.Audio(label="Output Audio Stream", interactive=False, streaming=True, autoplay=True)
     with gr.Row():
+        stream_btn = gr.Button("Stream", variant="primary")
+        stop_btn = gr.Button("Stop", variant="stop")
+    with gr.Accordion("Note", open=True):
+        gr.Markdown("⚠️ There is an unknown Gradio bug that might yield no audio the first time you click Stream.")
+        gr.DuplicateButton()
+with gr.Blocks() as app:
+    with gr.Row():
+        with gr.Column():
+            text = gr.Textbox(label="Input Text", info="Arbitrarily many characters supported")
+            with gr.Row():
+                voice = gr.Dropdown(list(CHOICES.items()), value="af_bella.pt" if "af_bella.pt" in CHOICES.values() else list(CHOICES.values())[0], label="Voice",
+                                    info="Quality and availability vary by language")
+                use_gpu = gr.Dropdown(
+                    [("GPU 🚀", True), ("CPU 🐌", False)],
+                    value=CUDA_AVAILABLE,
+                    label="Hardware",
+                    info="GPU is usually faster, but may require CUDA support",
+                    interactive=CUDA_AVAILABLE
+                )
+            speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label="Speed")
+        with gr.Column():
+            gr.TabbedInterface([generate_tab, stream_tab], ["Generate", "Stream"])
+    generate_btn.click(fn=generate_first, inputs=[text, voice, speed, use_gpu],
+                       outputs=[out_audio, out_ps])
+    tokenize_btn.click(fn=tokenize_first, inputs=[text, voice], outputs=[out_ps])
+    stream_event = stream_btn.click(fn=generate_all, inputs=[text, voice, speed, use_gpu], outputs=[out_stream])
+    stop_btn.click(fn=None, cancels=[stream_event])
+    predict_btn.click(fn=predict, inputs=[text, voice, speed], outputs=[out_audio])