"""Gradio front end for KittenTTS text-to-speech.

Long inputs are split into sentence-aligned chunks, each chunk is synthesized
separately, and the resulting audio segments are concatenated into one WAV file.
"""

import os
import tempfile
import uuid

import gradio as gr
import numpy as np
import soundfile as sf
from kittentts import KittenTTS

MODEL_ID = "KittenML/kitten-tts-nano-0.1"
# Voice used whenever the model does not report any voices.
DEFAULT_VOICE = "expr-voice-5-m"
# Sample rate (Hz) used when writing the generated audio to disk.
SAMPLE_RATE = 24000
# Maximum number of characters sent to the model in one generate() call.
MAX_CHUNK_CHARS = 500

# Characters that already terminate a sentence; no extra period is added after them.
_SENTENCE_ENDINGS = (".", "!", "?")

MODEL_INFO_MARKDOWN = """
**Model**: KittenML/kitten-tts-nano-0.1

**Features**:
- High-quality text-to-speech synthesis
- Optimized for long text inputs through chunking
- Works without GPU acceleration
- Multiple voice options
- Adjustable speech speed (0.5x to 2.0x)
- 24kHz audio output

**Usage**:
1. Enter or paste your text in the text box (long texts supported)
2. Select a voice from the dropdown
3. Adjust the speech speed if needed
4. Click "Generate Speech" to create audio

Generated files are saved in a temporary directory with unique UUID filenames.
Long texts are automatically split into manageable chunks for efficient processing.
"""

# Initialize the TTS model
model = KittenTTS(MODEL_ID)


def _split_long_sentence(sentence, max_length):
    """Break one oversized sentence into word-aligned pieces of at most max_length chars."""
    pieces = []
    current, current_length = [], 0
    for word in sentence.split():
        # A single token longer than the limit is sliced so the bound always holds.
        for start in range(0, len(word), max_length):
            part = word[start:start + max_length]
            # +1 accounts for the space that will join this part to the previous one.
            needed = len(part) + (1 if current else 0)
            if current and current_length + needed > max_length:
                pieces.append(" ".join(current))
                current, current_length = [], 0
                needed = len(part)
            current.append(part)
            current_length += needed
    if current:
        pieces.append(" ".join(current))
    return pieces


def chunk_text(text, max_length=500):
    """
    Split long text into smaller chunks to optimize processing.

    Newlines are treated as spaces and the text is split on ". ". Whole
    sentences are packed into a chunk until adding the next one (including the
    joining space) would exceed ``max_length``. A sentence that is itself
    longer than ``max_length`` is split on word boundaries.

    Args:
        text (str): Input text to be split
        max_length (int): Maximum length of each chunk in characters

    Returns:
        list: List of non-empty text chunks, each at most ``max_length`` characters

    Raises:
        ValueError: If ``max_length`` is smaller than 1
    """
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")

    chunks = []
    current, current_length = [], 0

    for fragment in text.replace("\n", " ").split(". "):
        sentence = fragment.strip()
        if not sentence:
            continue
        # split(". ") removes the period from every sentence except the last,
        # so only restore it where terminal punctuation is actually missing.
        if not sentence.endswith(_SENTENCE_ENDINGS):
            sentence += "."

        if len(sentence) > max_length:
            # Flush what has been collected so far to keep the original order.
            if current:
                chunks.append(" ".join(current))
                current, current_length = [], 0
            chunks.extend(_split_long_sentence(sentence, max_length))
            continue

        needed = len(sentence) + (1 if current else 0)
        if current and current_length + needed > max_length:
            chunks.append(" ".join(current))
            current, current_length = [], 0
            needed = len(sentence)
        current.append(sentence)
        current_length += needed

    if current:
        chunks.append(" ".join(current))
    return chunks


def generate_speech(text, voice, speed):
    """
    Generate speech from text using KittenTTS, optimized for long text.

    The text is chunked with ``chunk_text``, every chunk is synthesized with
    ``model.generate``, and the segments are concatenated and written to a
    uniquely named WAV file in the system temporary directory.

    Args:
        text (str): Text to convert to speech
        voice (str): Voice to use for generation
        speed (float): Speed of speech generation

    Returns:
        tuple: ``(output_path, None)`` on success, or ``(None, error_message)``
        when the input is empty or generation fails
    """
    if not text or not text.strip():
        return None, "Please enter some text to generate speech."

    # This function is a UI callback: failures are reported as a message
    # rather than raised, hence the broad handlers.
    try:
        audio_segments = []
        for chunk in chunk_text(text, max_length=MAX_CHUNK_CHARS):
            try:
                audio_segments.append(model.generate(chunk, voice=voice, speed=speed))
            except Exception as e:
                return None, f"Error processing chunk: {e}"

        if not audio_segments:
            return None, "No audio generated."

        if len(audio_segments) > 1:
            combined_audio = np.concatenate(audio_segments)
        else:
            combined_audio = audio_segments[0]

        # Build the path only once there is audio to write.
        output_path = os.path.join(
            tempfile.gettempdir(), f"kitten_tts_{uuid.uuid4()}.wav"
        )
        sf.write(output_path, combined_audio, SAMPLE_RATE)
        return output_path, None
    except Exception as e:
        return None, f"Error generating speech: {e}"


def get_available_voices():
    """
    Retrieve list of available voices from the model.

    Returns:
        list: ``model.available_voices`` when it is readable and non-empty,
        otherwise a one-element list holding the default voice
    """
    try:
        voices = model.available_voices
        return voices if voices else [DEFAULT_VOICE]
    except Exception:
        return [DEFAULT_VOICE]


# Get available voices
available_voices = get_available_voices()


def _voice_or_first(index):
    """Return the voice at ``index``, or the first available voice if the list is shorter."""
    # Falling back to a voice that is in the dropdown choices keeps the
    # example rows valid whatever voices the model reports.
    if index < len(available_voices):
        return available_voices[index]
    return available_voices[0]


# Create Gradio interface
with gr.Blocks(title="KittenTTS - Text to Speech", theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🐱 KittenTTS - Text to Speech Generator")
    gr.Markdown(
        "Convert your text to high-quality speech using KittenTTS nano model! "
        "Optimized for long text inputs."
    )

    with gr.Row():
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text to Convert",
                placeholder="Enter the text you want to convert to speech (supports long text)...",
                lines=10,
                max_lines=50,
            )
            with gr.Row():
                voice_dropdown = gr.Dropdown(
                    choices=available_voices,
                    value=_voice_or_first(0),
                    label="Voice Selection",
                    info="Choose the voice for speech generation",
                )
                speed_slider = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    step=0.01,
                    value=1.25,
                    label="Speech Speed",
                    info="Adjust the speed of speech (0.5x to 2.0x)",
                )
            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
                interactive=False,
                autoplay=True,
            )
            # Must be visible: this is the only place the error message
            # returned by generate_speech is displayed.
            error_output = gr.Textbox(
                label="Status",
                visible=True,
                interactive=False,
            )

    # Example inputs
    gr.Markdown("## 📝 Example Texts")
    examples = gr.Examples(
        examples=[
            [
                "Hello! This is a test of the KittenTTS model with a short sentence.",
                _voice_or_first(2),
                1.25,
            ],
            [
                "This is a longer text example to demonstrate how KittenTTS handles extended content. "
                "It includes multiple sentences to test the chunking mechanism and ensure smooth audio generation. "
                "The quick brown fox jumps over the lazy dog, and the story continues with more details.",
                _voice_or_first(1),
                1.5,
            ],
            [
                "Welcome to the world of high-quality text-to-speech synthesis! This example showcases "
                "the ability to process varied text lengths efficiently.",
                _voice_or_first(5),
                1,
            ],
        ],
        inputs=[text_input, voice_dropdown, speed_slider],
        outputs=[audio_output, error_output],
        fn=generate_speech,
        label="Click an example to try it out",
        cache_examples="lazy",
    )

    # Model information
    with gr.Accordion("ℹ️ Model Information", open=False):
        gr.Markdown(MODEL_INFO_MARKDOWN)

    # Event handlers
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, voice_dropdown, speed_slider],
        outputs=[audio_output, error_output],
    )
    text_input.submit(
        fn=generate_speech,
        inputs=[text_input, voice_dropdown, speed_slider],
        outputs=[audio_output, error_output],
    )

# Launch the app
if __name__ == "__main__":
    app.queue(default_concurrency_limit=50).launch()