# Hugging Face Space: humair025/kittenTTS (status: Running)
# File size: 8,049 bytes
# Revision: fada17e

import gradio as gr
import tempfile
import uuid
import os
import soundfile as sf
import numpy as np
from kittentts import KittenTTS

# Initialize the TTS model once at import time; this single module-level
# instance is the one used by generate_speech() and get_available_voices().
model = KittenTTS("KittenML/kitten-tts-nano-0.1")

def _split_long_sentence(sentence, max_length):
    """Break one over-long sentence into word-packed parts of at most max_length characters."""
    parts, current, current_length = [], [], 0

    for word in sentence.split():
        # A single token longer than the limit (e.g. a URL) is hard-split so
        # that no emitted part can ever exceed max_length.
        for start in range(0, len(word), max_length):
            fragment = word[start:start + max_length]
            # Count the space that ' '.join() will insert before this fragment.
            added = len(fragment) + (1 if current else 0)
            if current and current_length + added > max_length:
                parts.append(' '.join(current))
                current, current_length = [], 0
                added = len(fragment)
            current.append(fragment)
            current_length += added

    if current:
        parts.append(' '.join(current))
    return parts


def chunk_text(text, max_length=500):
    """
    Split long text into smaller chunks to optimize processing.

    The text is whitespace-normalized, split into sentences on ". ", and the
    sentences are packed greedily into chunks. A sentence that is itself
    longer than ``max_length`` is split on word boundaries (and, for a single
    over-long word, by character count). Original punctuation is preserved:
    a period is only restored where the ". " split removed one.

    Args:
        text (str): Input text to be split
        max_length (int): Maximum length of each chunk in characters

    Returns:
        list: List of non-empty text chunks, each at most ``max_length``
            characters long, in original order. Empty or whitespace-only
            input yields an empty list.

    Raises:
        ValueError: If ``max_length`` is less than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")

    # Collapse newlines, tabs and repeated spaces so sentence boundaries are
    # always exactly ". " and chunk lengths are not inflated by whitespace.
    normalized = ' '.join(text.split())
    if not normalized:
        return []

    pieces = normalized.split('. ')
    last_index = len(pieces) - 1
    sentences = []
    for index, piece in enumerate(pieces):
        piece = piece.strip()
        if not piece:
            continue
        # split('. ') consumed the period of every piece except the last one;
        # the last piece keeps whatever punctuation the author wrote.
        if index < last_index:
            piece += '.'
        sentences.append(piece)

    chunks, current_chunk, current_length = [], [], 0

    for sentence in sentences:
        # Count the space that ' '.join() will insert before this sentence.
        added = len(sentence) + (1 if current_chunk else 0)

        if current_chunk and current_length + added > max_length:
            chunks.append(' '.join(current_chunk))
            current_chunk, current_length = [], 0
            added = len(sentence)

        if len(sentence) > max_length:
            # current_chunk was flushed above, so ordering is preserved.
            chunks.extend(_split_long_sentence(sentence, max_length))
            continue

        current_chunk.append(sentence)
        current_length += added

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

def generate_speech(text, voice, speed):
    """
    Generate speech from text using KittenTTS, optimized for long text.

    The text is split with chunk_text(), each chunk is synthesized with the
    module-level model, the resulting audio segments are concatenated, and
    the result is written as a WAV file in the system temporary directory.

    Args:
        text (str): Text to convert to speech
        voice (str): Voice to use for generation
        speed (float): Speed of speech generation

    Returns:
        tuple: ``(path, error)`` where ``path`` is the generated audio file
            path (or None on failure) and ``error`` is a message describing
            the failure (or None on success).
    """
    # Validate before doing any work. A missing (None) or non-string value is
    # treated like blank input so it yields the friendly message instead of
    # an unhandled AttributeError from .strip().
    if not isinstance(text, str) or not text.strip():
        return None, "Please enter some text to generate speech."

    # Sample rate passed to the WAV writer.
    sample_rate = 24000

    try:
        # Chunk text for long inputs and synthesize chunk by chunk
        audio_segments = []
        for chunk in chunk_text(text, max_length=500):
            try:
                audio_segments.append(model.generate(chunk, voice=voice, speed=speed))
            except Exception as e:
                # Abort on the first failing chunk rather than emit partial audio.
                return None, f"Error processing chunk: {e}"

        if not audio_segments:
            return None, "No audio generated."

        # Concatenate audio segments (a single segment is used as-is)
        if len(audio_segments) == 1:
            combined_audio = audio_segments[0]
        else:
            combined_audio = np.concatenate(audio_segments)

        if combined_audio is None:
            return None, "No audio generated."

        # Build the output path only once there is audio to write; the UUID
        # keeps concurrent requests from overwriting each other's files.
        output_path = os.path.join(
            tempfile.gettempdir(), f"kitten_tts_{uuid.uuid4()}.wav"
        )
        sf.write(output_path, combined_audio, sample_rate)
        return output_path, None

    except Exception as e:
        return None, f"Error generating speech: {e}"

def get_available_voices():
    """
    Return the voice names the model reports, or a one-item default list.

    The default is used both when the model reports no voices and when
    reading the attribute fails for any reason.

    Returns:
        list: Available voice names, or ["expr-voice-5-m"] as a fallback
    """
    try:
        # `or` keeps the truthiness check inside the try, so a failure while
        # evaluating the reported value also falls through to the default.
        return model.available_voices or ["expr-voice-5-m"]
    except Exception:
        # Best-effort lookup: any failure falls back to the default voice.
        return ["expr-voice-5-m"]

# Get available voices once at import time; this list supplies the voice
# dropdown's choices and default value and the voices used in the example rows.
available_voices = get_available_voices()

# Fallback voice for any place that wants a specific voice index which the
# model does not provide. It is taken from the dropdown's own choices so the
# selected value is always one the dropdown accepts.
default_voice = available_voices[0] if available_voices else "expr-voice-5-m"

# Create Gradio interface: inputs on the left, audio and status on the right,
# followed by clickable examples and a collapsible model description.
with gr.Blocks(title="KittenTTS - Text to Speech", theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🐱 KittenTTS - Text to Speech Generator")
    gr.Markdown("Convert your text to high-quality speech using KittenTTS nano model! Optimized for long text inputs.")

    with gr.Row():
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text to Convert",
                placeholder="Enter the text you want to convert to speech (supports long text)...",
                lines=10,
                max_lines=50
            )
            with gr.Row():
                voice_dropdown = gr.Dropdown(
                    choices=available_voices,
                    value=default_voice,
                    label="Voice Selection",
                    info="Choose the voice for speech generation"
                )
                speed_slider = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    step=0.01,
                    value=1.25,
                    label="Speech Speed",
                    info="Adjust the speed of speech (0.5x to 2.0x)"
                )
            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
                interactive=False,
                autoplay=True
            )
            # Visible so that the messages generate_speech returns (empty
            # input, chunk failures, write errors) actually reach the user;
            # a hidden textbox would receive the value but never display it.
            error_output = gr.Textbox(
                label="Status",
                visible=True,
                interactive=False
            )

    # generate_speech takes (text, voice, speed) and returns (audio path,
    # status message); every trigger below shares this wiring.
    speech_inputs = [text_input, voice_dropdown, speed_slider]
    speech_outputs = [audio_output, error_output]

    # Example inputs
    gr.Markdown("## 📝 Example Texts")
    examples = gr.Examples(
        examples=[
            ["Hello! This is a test of the KittenTTS model with a short sentence.",
             available_voices[2] if len(available_voices) > 2 else default_voice, 1.25],
            ["This is a longer text example to demonstrate how KittenTTS handles extended content. "
             "It includes multiple sentences to test the chunking mechanism and ensure smooth audio generation. "
             "The quick brown fox jumps over the lazy dog, and the story continues with more details.",
             available_voices[1] if len(available_voices) > 1 else default_voice, 1.5],
            ["Welcome to the world of high-quality text-to-speech synthesis! This example showcases "
             "the ability to process varied text lengths efficiently.",
             available_voices[5] if len(available_voices) > 5 else default_voice, 1],
        ],
        inputs=speech_inputs,
        outputs=speech_outputs,
        fn=generate_speech,
        label="Click an example to try it out",
        cache_examples="lazy"
    )

    # Model information
    with gr.Accordion("ℹ️ Model Information", open=False):
        gr.Markdown("""
        **Model**: KittenML/kitten-tts-nano-0.1
        
        **Features**:
        - High-quality text-to-speech synthesis
        - Optimized for long text inputs through chunking
        - Works without GPU acceleration
        - Multiple voice options
        - Adjustable speech speed (0.5x to 2.0x)
        - 24kHz audio output
        
        **Usage**:
        1. Enter or paste your text in the text box (long texts supported)
        2. Select a voice from the dropdown
        3. Adjust the speech speed if needed
        4. Click "Generate Speech" to create audio
        
        Generated files are saved in a temporary directory with unique UUID filenames.
        Long texts are automatically split into manageable chunks for efficient processing.
        """)

    # Event handlers: the button click and submitting the textbox both
    # trigger the same generation call.
    generate_btn.click(
        fn=generate_speech,
        inputs=speech_inputs,
        outputs=speech_outputs
    )

    text_input.submit(
        fn=generate_speech,
        inputs=speech_inputs,
        outputs=speech_outputs
    )

# Start the app only when this file is executed directly as a script
if __name__ == "__main__":
    # Configure the request queue first, then launch whatever queue() returned.
    queued_app = app.queue(default_concurrency_limit=50)
    queued_app.launch()