Spaces:

NeuralFalcon
/

Kitten-TTS

Running

File size: 5,493 Bytes

# Copied from https://huggingface.co/spaces/KingNish/Kitten-TTS and modified to handle large text input.
import gradio as gr
import tempfile
import uuid
import os
import re
import numpy as np
import soundfile as sf
from kittentts import KittenTTS
from tqdm.auto import tqdm
# Load the KittenTTS model once at import time. The module-level `model` is
# shared by generate_speech() and get_available_voices().
model = KittenTTS("KittenML/kitten-tts-nano-0.1")

def split_text_into_chunks(text, chunk_size=400):
    """
    Split long text into chunks that are each at most `chunk_size` characters.

    The text is first cut at sentence boundaries (whitespace that follows
    '.', '!' or '?'). Consecutive sentences are then packed greedily, joined
    by a single space, into the current chunk until the next sentence would
    push it past `chunk_size`.

    A single sentence that is longer than `chunk_size` is broken further,
    at word boundaries where possible and in the middle of a word otherwise,
    so that the length limit always holds.

    Args:
        text: The text to split.
        chunk_size: Maximum number of characters per chunk (must be >= 1).

    Returns:
        A list of non-empty chunks with no leading or trailing whitespace.
        Empty or whitespace-only input yields an empty list.

    Raises:
        ValueError: If `chunk_size` is smaller than 1.
    """
    # Local import keeps this function self-contained; textwrap is stdlib.
    import textwrap

    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    chunks = []
    current_chunk = ""

    # Split on any whitespace after sentence punctuation, so sentences that
    # are separated only by a newline are recognised as well.
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        sentence = sentence.strip()
        if not sentence:
            # Skip empty fragments (e.g. trailing whitespace) so that no
            # empty chunk is ever produced.
            continue

        if len(sentence) > chunk_size:
            # The sentence alone exceeds the limit: break it into pieces
            # that each fit. break_long_words covers words longer than the
            # limit itself.
            pieces = textwrap.wrap(
                sentence,
                width=chunk_size,
                break_long_words=True,
                break_on_hyphens=False,
            )
        else:
            pieces = [sentence]

        for piece in pieces:
            candidate = f"{current_chunk} {piece}" if current_chunk else piece
            if len(candidate) > chunk_size:
                # Adding this piece would overflow: close the current chunk
                # and start a new one with the piece.
                chunks.append(current_chunk)
                current_chunk = piece
            else:
                current_chunk = candidate

    if current_chunk:
        chunks.append(current_chunk)

    return chunks

def generate_speech(text, voice, speed):
    """
    Synthesize `text` into a single WAV file and return the path to that file.

    The text is split with `split_text_into_chunks`, each chunk is synthesized
    separately, and the audio of every chunk is appended to one WAV file as
    soon as it is produced, instead of collecting all audio in memory first.

    The function always returns exactly one value (the file path). Problems
    are reported by raising `gr.Error`, which Gradio displays to the user.
    This matches event bindings that declare a single output component.

    Args:
        text: The text to speak.
        voice: Voice identifier, forwarded to `model.generate`.
        speed: Speech speed, forwarded to `model.generate`.

    Returns:
        Path of the generated WAV file inside ``./saved_audio``.

    Raises:
        gr.Error: If `text` is empty or whitespace-only, or if anything fails
            while generating or writing the audio.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to generate speech.")

    output_path = None
    try:
        # Break text into manageable chunks
        chunks = split_text_into_chunks(text, chunk_size=400)

        # Shared output directory (update this path to your shared disk)
        shared_dir = "./saved_audio"
        os.makedirs(shared_dir, exist_ok=True)

        # A random UUID keeps the files of different requests from colliding.
        unique_filename = f"kitten_tts_{uuid.uuid4()}.wav"
        output_path = os.path.join(shared_dir, unique_filename)

        # Open the WAV file for writing: 24 kHz, mono, 16-bit PCM.
        with sf.SoundFile(output_path, mode='w', samplerate=24000, channels=1, subtype='PCM_16') as f:
            for chunk in tqdm(chunks, desc="Streaming audio to disk", unit="chunk"):
                # NOTE(review): the " ...." suffix is presumably padding so the
                # end of a chunk is not cut off by the model - confirm.
                audio = model.generate(chunk + " ....", voice=voice, speed=speed)
                f.write(audio)  # Write audio directly to disk
    except Exception as e:
        # Do not leave a truncated WAV file behind when generation fails.
        if output_path and os.path.exists(output_path):
            try:
                os.remove(output_path)
            except OSError:
                # Best-effort cleanup; the original failure is what matters.
                pass
        raise gr.Error(f"Error during TTS generation: {str(e)}") from e

    return output_path

def get_available_voices():
    """
    Get the list of available voices from the model.

    Returns:
        The value of `model.available_voices` when it can be read and is
        non-empty; otherwise a one-element list with the fallback voice
        "expr-voice-5-m".
    """
    fallback = ["expr-voice-5-m"]
    try:
        voices = model.available_voices
        return voices if voices else fallback
    except Exception:
        # Deliberately best-effort: any failure while reading the voices
        # falls back to the default. `Exception` (not a bare `except`) lets
        # KeyboardInterrupt and SystemExit propagate.
        return fallback

# Query the voice list once at import time. The UI below uses it for the
# dropdown choices and uses available_voices[0] as the default voice.
available_voices = get_available_voices()

# Create Gradio UI
# Layout: a wide left column with the inputs (text, voice, speed, button) and a
# narrower right column with the audio player, followed by example inputs,
# a collapsible model-information panel and the event bindings.
with gr.Blocks(title="KittenTTS - Text to Speech", theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🐱 KittenTTS - Text to Speech Generator")
    gr.Markdown("Convert your text to high-quality speech using the KittenTTS nano model!")

    with gr.Row():
        # Input column (scale=2, twice the scale of the output column).
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text to Convert",
                placeholder="Enter the text you want to convert to speech...",
                lines=4,
                max_lines=10
            )
            
            with gr.Row():
                # Choices come from the model's voice list; the first entry is preselected.
                voice_dropdown = gr.Dropdown(
                    choices=available_voices,
                    value=available_voices[0],
                    label="Voice Selection",
                    info="Choose the voice for speech generation"
                )
                
                # Value is passed to generate_speech as `speed`.
                speed_slider = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    step=0.01,
                    value=1,
                    label="Speech Speed",
                    info="Adjust the speed of speech (0.5x to 2.0x)"
                )
            
            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
        
        # Output column (scale=1).
        with gr.Column(scale=1):
            # type="filepath": on success generate_speech returns the path of a WAV file.
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
                interactive=False,
                autoplay=True
            )

    gr.Markdown("## 📝 Example Texts")
    # Each example row is [text, voice, speed], in the same order as `inputs`.
    gr.Examples(
        examples=[
            ["Hello! This is a test of the KittenTTS model.", available_voices[0], 1],
            ["The quick brown fox jumps over the lazy dog.", available_voices[0], 1.25],
            ["Welcome to the world of high-quality text-to-speech synthesis!", available_voices[0], 1.5],
        ],
        inputs=[text_input, voice_dropdown, speed_slider],
        outputs=[audio_output],
        fn=generate_speech,
        label="Click on an example to try it out",
        # Example caching is left disabled; the option is kept here for reference:
        # cache_examples="lazy"
    )

    # Collapsed-by-default panel with static information about the model.
    with gr.Accordion("ℹ️ Model Information", open=False):
        gr.Markdown("""
        **Model:** `KittenML/kitten-tts-nano-0.1`  
        **Features:**  
        - High-quality text-to-speech synthesis  
        - Works without GPU acceleration  
        - Multiple voice options  
        - Adjustable speech speed  
        - 24kHz audio output  

        **Usage Instructions:**  
        1. Enter your text  
        2. Select a voice  
        3. Adjust the speech speed if needed  
        4. Click "Generate Speech"  
        """)

    # Event Bindings
    # The button's click event and the textbox's submit event run the same
    # handler with the same inputs and output.
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, voice_dropdown, speed_slider],
        outputs=[audio_output]
    )

    text_input.submit(
        fn=generate_speech,
        inputs=[text_input, voice_dropdown, speed_slider],
        outputs=[audio_output]
    )

# Run the app
# Only when this file is executed as a script (not when it is imported):
# call queue() on the Blocks app, then launch() it.
if __name__ == "__main__":
    app.queue().launch()