Spaces:

NeuralFalcon
/

Kitten-TTS

Running

App Files Files Community

NeuralFalcon committed 19 days ago

Commit

618f849

verified ·

1 Parent(s): ee9e698

Create app.py

Browse files

Files changed (1) hide show

app.py +162 -0

app.py ADDED Viewed

	@@ -0,0 +1,162 @@

+import gradio as gr
+import tempfile
+import uuid
+import os
+import re
+import numpy as np
+import soundfile as sf
+from kittentts import KittenTTS
+from tqdm.auto import tqdm
# Load the TTS model once at import time; generate_speech() and
# get_available_voices() both use this module-level instance.
_MODEL_ID = "KittenML/kitten-tts-nano-0.1"
model = KittenTTS(_MODEL_ID)
def split_text_into_chunks(text, chunk_size=400):
    """
    Split long text into chunks of at most `chunk_size` characters.

    The text is first cut at sentence boundaries (``.``, ``!`` or ``?``
    followed by one or more spaces). Consecutive sentences are then packed
    greedily, joined by a single space, into chunks that never exceed
    `chunk_size` characters.

    A single sentence that is itself longer than `chunk_size` is wrapped at
    word boundaries (and, for a word longer than `chunk_size`, inside the
    word) so that the length limit always holds. Whitespace inside such an
    oversize sentence is normalised by the wrapping step.

    Args:
        text: The text to split.
        chunk_size: Maximum length of each returned chunk; must be >= 1.

    Returns:
        A list of non-empty, stripped chunks. Empty or whitespace-only
        input yields an empty list.

    Raises:
        ValueError: If `chunk_size` is smaller than 1.
    """
    # Local import: only needed for the oversize-sentence fallback, and it
    # keeps this function self-contained with respect to the file's imports.
    import textwrap

    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # Split by punctuation followed by space (preserves sentence boundaries),
    # then break up any sentence that could not fit in a chunk on its own.
    units = []
    for sentence in re.split(r'(?<=[.!?]) +', text):
        if len(sentence) > chunk_size:
            units.extend(
                textwrap.wrap(sentence, width=chunk_size, break_on_hyphens=False)
            )
        else:
            units.append(sentence)

    chunks = []
    current_chunk = ""
    for unit in units:
        # current_chunk always ends with the joining space, so this check
        # guarantees the stripped chunk stays within chunk_size.
        if current_chunk and len(current_chunk) + len(unit) > chunk_size:
            finished = current_chunk.strip()
            if finished:
                chunks.append(finished)
            current_chunk = ""
        current_chunk += unit + " "

    # Flush the tail, skipping it when it is only whitespace.
    tail = current_chunk.strip()
    if tail:
        chunks.append(tail)
    return chunks
def generate_speech(text, voice, speed):
    """
    Generate speech from long text in a memory-efficient way.

    The text is split into chunks of at most 400 characters. Each chunk is
    synthesized on its own and written straight to one WAV file on disk, so
    the complete waveform is never accumulated in memory.

    Args:
        text: Text to synthesize.
        voice: Voice identifier forwarded to ``model.generate``.
        speed: Speed factor forwarded to ``model.generate``.

    Returns:
        Path of the generated WAV file (opened as 24 kHz, mono, PCM_16)
        inside ``./saved_audio``.

    Raises:
        gr.Error: If `text` is empty/blank, or if anything fails while
            preparing the output file or synthesizing. The function is bound
            to a single Audio output, so failures are raised rather than
            returned as an extra value that would not match that one output.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to generate speech.")

    output_path = None
    try:
        # Break text into manageable chunks
        chunks = split_text_into_chunks(text, chunk_size=400)

        # Shared output directory (update this path to your shared disk)
        shared_dir = "./saved_audio"
        os.makedirs(shared_dir, exist_ok=True)
        unique_filename = f"kitten_tts_{uuid.uuid4()}.wav"
        output_path = os.path.join(shared_dir, unique_filename)

        # Open the WAV file for writing and append each chunk as it is produced.
        with sf.SoundFile(output_path, mode='w', samplerate=24000, channels=1, subtype='PCM_16') as f:
            for chunk in tqdm(chunks, desc="Streaming audio to disk", unit="chunk"):
                # NOTE(review): assumes model.generate returns a 1-D sample
                # array at 24 kHz that SoundFile.write accepts - confirm.
                audio = model.generate(chunk, voice=voice, speed=speed)
                f.write(audio)  # Write audio directly to disk
        return output_path
    except Exception as e:
        # Do not leave a truncated WAV file behind when synthesis fails.
        if output_path is not None:
            try:
                os.remove(output_path)
            except OSError:
                pass  # best effort: the file may never have been created
        raise gr.Error(f"Error during TTS generation: {str(e)}") from e
def get_available_voices():
    """Get list of available voices from the model.

    Returns:
        ``model.available_voices`` when that attribute can be read and is
        non-empty; otherwise the single-entry fallback
        ``["expr-voice-5-m"]``.
    """
    fallback = ["expr-voice-5-m"]
    try:
        voices = model.available_voices
    except Exception:
        # Best effort: any failure reading the attribute falls back to the
        # default voice. Catching Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        return fallback
    return voices if voices else fallback
# Look the voices up a single time at import; the dropdown and the examples reuse them.
available_voices = get_available_voices()

# Assemble the Gradio interface.
with gr.Blocks(title="KittenTTS - Text to Speech", theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🐱 KittenTTS - Text to Speech Generator")
    gr.Markdown("Convert your text to high-quality speech using the KittenTTS nano model!")

    # The first available voice is the default for the dropdown and every example.
    default_voice = available_voices[0]

    with gr.Row():
        # Wide column: text entry, voice/speed controls and the trigger button.
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text to Convert",
                placeholder="Enter the text you want to convert to speech...",
                lines=4,
                max_lines=10,
            )
            with gr.Row():
                voice_dropdown = gr.Dropdown(
                    choices=available_voices,
                    value=default_voice,
                    label="Voice Selection",
                    info="Choose the voice for speech generation",
                )
                speed_slider = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    step=0.01,
                    value=1.25,
                    label="Speech Speed",
                    info="Adjust the speed of speech (0.5x to 2.0x)",
                )
            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

        # Narrow column: player for the generated file.
        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
                interactive=False,
                autoplay=True,
            )

    gr.Markdown("## 📝 Example Texts")
    # (sample text, speed) pairs; each one is paired with the default voice below.
    example_rows = [
        ("Hello! This is a test of the KittenTTS model.", 1.25),
        ("The quick brown fox jumps over the lazy dog.", 1.5),
        ("Welcome to the world of high-quality text-to-speech synthesis!", 1.0),
    ]
    gr.Examples(
        examples=[[sample, default_voice, rate] for sample, rate in example_rows],
        inputs=[text_input, voice_dropdown, speed_slider],
        outputs=[audio_output],
        fn=generate_speech,
        label="Click on an example to try it out",
        # cache_examples="lazy"
    )

    with gr.Accordion("ℹ️ Model Information", open=False):
        gr.Markdown("""
        **Model:** `KittenML/kitten-tts-nano-0.1`
        **Features:**
        - High-quality text-to-speech synthesis
        - Works without GPU acceleration
        - Multiple voice options
        - Adjustable speech speed
        - 24kHz audio output
        **Usage Instructions:**
        1. Enter your text
        2. Select a voice
        3. Adjust the speech speed if needed
        4. Click "Generate Speech"
        """)

    # Event bindings: the button click and pressing Enter in the textbox
    # both run the same generation call with the same inputs and output.
    for register in (generate_btn.click, text_input.submit):
        register(
            fn=generate_speech,
            inputs=[text_input, voice_dropdown, speed_slider],
            outputs=[audio_output],
        )
# Launch the app, with its queue enabled, only when this file is run as a script.
if __name__ == "__main__":
    queued_app = app.queue()
    queued_app.launch()