Spaces:

fdaudens
/

kokoro-mcp

Running

App Files Files Community

fdaudens HF Staff committed on Apr 30

Commit

11bfd4b

verified ·

1 Parent(s): ed91a1d

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -35

app.py CHANGED Viewed

@@ -1,50 +1,49 @@
 import gradio as gr
 import torch
-from transformers import AutoModelForTextToWaveform, AutoProcessor
-# Load model and processor
-model_name = "hexgrad/Kokoro-82M"
-processor = AutoProcessor.from_pretrained(model_name)
-model = AutoModelForTextToWaveform.from_pretrained(model_name, torch_dtype=torch.float16)
-# Move to GPU if available
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model = model.to(device)
 def text_to_audio(text, speed=1.0):
     """Convert text to audio using Kokoro model"""
-    # Process the input text
-    inputs = processor(text=text, return_tensors="pt")
-    inputs = {k: v.to(device) for k, v in inputs.items()}
-    # Set generation parameters
-    gen_kwargs = {
-        "do_sample": True,
-        "temperature": 0.7,
-        "length_penalty": 1.0,
-        "repetition_penalty": 2.0,
-        "top_p": 0.9,
-    }
-    # Generate waveform
-    with torch.no_grad():
-        waveform = model.generate(**inputs, **gen_kwargs).cpu().numpy()[0]
-    # Create a sample rate (typical for audio is 24000)
-    sample_rate = 24000
-    # Apply speed factor if needed
-    if speed != 1.0:
-        import numpy as np
-        import librosa
-        waveform = librosa.effects.time_stretch(waveform.astype(np.float32), rate=speed)
-    return sample_rate, waveform
 # Create Gradio interface
 with gr.Blocks(title="Kokoro Text-to-Audio") as app:
     gr.Markdown("# 🎵 Kokoro Text-to-Audio Converter")
-    gr.Markdown("Convert text to speech using hexgrad/Kokoro-82M model")
     with gr.Row():
         with gr.Column():
@@ -55,7 +54,7 @@ with gr.Blocks(title="Kokoro Text-to-Audio") as app:
             )
             speed_slider = gr.Slider(
                 minimum=0.5,
-                maximum=1.5,
                 value=1.0,
                 step=0.1,
                 label="Speech Speed"
@@ -72,10 +71,10 @@ with gr.Blocks(title="Kokoro Text-to-Audio") as app:
     )
     gr.Markdown("### Usage Tips")
-    gr.Markdown("- For best results, keep your text reasonably short")
     gr.Markdown("- Adjust the speed slider to modify the pace of speech")
     gr.Markdown("- The model may take a moment to load on first use")
 # Launch the app
 if __name__ == "__main__":
-    app.launch()

 import gradio as gr
 import torch
+import numpy as np
+from kokoro import KModel, KPipeline
+# Check if CUDA is available
+CUDA_AVAILABLE = torch.cuda.is_available()
+# Initialize the model
+model = KModel().to('cuda' if CUDA_AVAILABLE else 'cpu').eval()
+# Initialize pipelines for different language codes (using 'a' for English)
+pipelines = {'a': KPipeline(lang_code='a', model=False)}
+# Custom pronunciation for "kokoro"
+pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO'
 def text_to_audio(text, speed=1.0):
     """Convert text to speech audio using the Kokoro model.

     Every segment the English pipeline yields for ``text`` is synthesized
     and the pieces are joined in order, so input that the pipeline splits
     into several segments is spoken in full rather than cut off after the
     first one.

     Args:
         text: The text to speak. ``None``, an empty string, or a string
             containing only whitespace produces no audio.
         speed: Speech speed factor forwarded to both the pipeline and the
             model (1.0 is the default pace).

     Returns:
         A ``(sample_rate, waveform)`` tuple with a 24000 Hz sample rate and
         a NumPy array, or ``None`` when there is nothing to synthesize.

     Raises:
         gr.Error: If the model fails while generating a segment; the
             original exception is attached as the cause.
     """
     # Nothing to say: skip the pipeline and model entirely.
     if not text or not text.strip():
         return None

     pipeline = pipelines['a']  # Use English pipeline
     voice = "af_heart"  # Default voice (US English, female, Heart)
     pack = pipeline.load_voice(voice)

     segments = []
     for _, ps, _ in pipeline(text, voice, speed):
         # An empty phoneme string would index pack[-1]; skip it instead.
         if not ps:
             continue
         # Select the voice reference entry matching this segment's phoneme count
         ref_s = pack[len(ps) - 1]
         try:
             audio = model(ps, ref_s, speed)
         except Exception as e:
             # Surface the failure in the Gradio UI while keeping the traceback chain
             raise gr.Error(f"Error generating audio: {e}") from e
         # detach()/cpu() are no-ops for a plain CPU tensor, but keep .numpy()
         # from failing if the model hands back a tensor on the GPU.
         segments.append(audio.detach().cpu().numpy())

     if not segments:
         return None
     # A single segment is returned untouched; several are joined end to end.
     waveform = segments[0] if len(segments) == 1 else np.concatenate(segments)
     # Return the audio with 24kHz sample rate
     return 24000, waveform
 # Create Gradio interface
 with gr.Blocks(title="Kokoro Text-to-Audio") as app:
     gr.Markdown("# 🎵 Kokoro Text-to-Audio Converter")
+    gr.Markdown("Convert text to speech using the Kokoro-82M model")
     with gr.Row():
         with gr.Column():
             )
             speed_slider = gr.Slider(
                 minimum=0.5,
+                maximum=2.0,
                 value=1.0,
                 step=0.1,
                 label="Speech Speed"
     )
     gr.Markdown("### Usage Tips")
+    gr.Markdown("- For best results, keep your text reasonably short (up to ~500 characters)")
     gr.Markdown("- Adjust the speed slider to modify the pace of speech")
     gr.Markdown("- The model may take a moment to load on first use")
 # Launch the app
 if __name__ == "__main__":
+    app.launch()