# Copied from https://huggingface.co/spaces/KingNish/Kitten-TTS and modified
# to handle large text input.
import os
import re
import tempfile
import uuid

import gradio as gr
import numpy as np
import soundfile as sf
from kittentts import KittenTTS
from tqdm.auto import tqdm

MODEL_NAME = "KittenML/kitten-tts-nano-0.1"
SAMPLE_RATE = 24000  # Hz; the rate the output WAV file is written with
DEFAULT_VOICE = "expr-voice-5-m"  # fallback when the model reports no voices
OUTPUT_DIR = "./saved_audio"  # update this path to your shared disk
MAX_CHUNK_CHARS = 400

# A sentence ends at '.', '!' or '?' followed by whitespace (spaces OR newlines).
_SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?])\s+")

# Initialize the TTS model once at import time; it is shared by all requests.
model = KittenTTS(MODEL_NAME)


def _split_long_sentence(sentence, chunk_size):
    """Break one over-long sentence into pieces of at most `chunk_size` chars.

    Pieces are cut on whitespace where possible; a single word longer than
    `chunk_size` is cut mid-word as a last resort.
    """
    pieces = []
    current = ""
    for word in sentence.split():
        # A word that cannot fit in any chunk has to be sliced directly.
        while len(word) > chunk_size:
            if current:
                pieces.append(current)
                current = ""
            pieces.append(word[:chunk_size])
            word = word[chunk_size:]
        if not word:
            continue
        candidate = f"{current} {word}" if current else word
        if len(candidate) > chunk_size:
            pieces.append(current)
            current = word
        else:
            current = candidate
    if current:
        pieces.append(current)
    return pieces


def split_text_into_chunks(text, chunk_size=400):
    """Split long text into chunks of at most `chunk_size` characters.

    Sentences (terminated by '.', '!' or '?' plus whitespace) are kept whole
    and packed greedily into chunks, joined by a single space. A sentence
    that is itself longer than `chunk_size` is broken on word boundaries so
    that no returned chunk exceeds the limit.

    Args:
        text: The input text.
        chunk_size: Maximum number of characters per chunk (must be >= 1).

    Returns:
        A list of non-empty, stripped chunk strings. Empty or
        whitespace-only input yields an empty list.

    Raises:
        ValueError: If `chunk_size` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    chunks = []
    current = ""
    for sentence in _SENTENCE_BOUNDARY.split(text.strip()):
        sentence = sentence.strip()
        if not sentence:
            continue

        if len(sentence) > chunk_size:
            # Flush what we have, then emit the over-long sentence in pieces.
            if current:
                chunks.append(current)
                current = ""
            chunks.extend(_split_long_sentence(sentence, chunk_size))
            continue

        candidate = f"{current} {sentence}" if current else sentence
        if len(candidate) > chunk_size:
            chunks.append(current)
            current = sentence
        else:
            current = candidate

    if current:
        chunks.append(current)
    return chunks


def generate_speech(text, voice, speed):
    """Generate speech from (possibly long) text and return the WAV path.

    The text is split into chunks and each chunk's audio is written straight
    to a single WAV file on disk, so the full waveform is never accumulated
    in a Python list.

    Args:
        text: Text to synthesize.
        voice: Voice identifier passed through to `model.generate`.
        speed: Speed value passed through to `model.generate`.

    Returns:
        Path of the generated WAV file (a single value, matching the single
        `gr.Audio` output this function is bound to).

    Raises:
        gr.Error: If `text` is empty, or if anything fails during
            generation. Gradio shows the message to the user. Raising is
            used instead of returning `(None, message)` because a 2-tuple
            returned to a `gr.Audio` output is interpreted as
            `(sample_rate, data)`.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to generate speech.")

    output_path = os.path.join(OUTPUT_DIR, f"kitten_tts_{uuid.uuid4()}.wav")
    try:
        # Break text into manageable chunks.
        chunks = split_text_into_chunks(text, chunk_size=MAX_CHUNK_CHARS)
        os.makedirs(OUTPUT_DIR, exist_ok=True)

        with sf.SoundFile(
            output_path,
            mode="w",
            samplerate=SAMPLE_RATE,
            channels=1,
            subtype="PCM_16",
        ) as wav_file:
            for chunk in tqdm(chunks, desc="Streaming audio to disk", unit="chunk"):
                # NOTE(review): the " ...." suffix presumably makes the model
                # end each chunk with a short pause -- confirm with the author.
                audio = model.generate(chunk + " ....", voice=voice, speed=speed)
                wav_file.write(audio)  # Write audio directly to disk
    except Exception as e:
        # Do not leave a truncated WAV file behind on failure.
        if os.path.exists(output_path):
            os.remove(output_path)
        raise gr.Error(f"Error during TTS generation: {str(e)}") from e

    return output_path


def get_available_voices():
    """Return the model's voice list, or `[DEFAULT_VOICE]` as a fallback.

    The fallback is used when the model reports no voices or when reading
    `model.available_voices` fails for any reason (best effort).
    """
    try:
        voices = model.available_voices
        return voices if voices else [DEFAULT_VOICE]
    except Exception:
        # Best effort: any failure here should not prevent the UI from loading.
        return [DEFAULT_VOICE]


# Get voices once on load
available_voices = get_available_voices()

# Create Gradio UI
with gr.Blocks(title="KittenTTS - Text to Speech", theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🐱 KittenTTS - Text to Speech Generator")
    gr.Markdown("Convert your text to high-quality speech using the KittenTTS nano model!")

    with gr.Row():
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text to Convert",
                placeholder="Enter the text you want to convert to speech...",
                lines=4,
                max_lines=10,
            )

            with gr.Row():
                voice_dropdown = gr.Dropdown(
                    choices=available_voices,
                    value=available_voices[0],
                    label="Voice Selection",
                    info="Choose the voice for speech generation",
                )
                speed_slider = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    step=0.01,
                    value=1,
                    label="Speech Speed",
                    info="Adjust the speed of speech (0.5x to 2.0x)",
                )

            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
                interactive=False,
                autoplay=True,
            )

    gr.Markdown("## 📝 Example Texts")
    gr.Examples(
        examples=[
            ["Hello! This is a test of the KittenTTS model.", available_voices[0], 1],
            ["The quick brown fox jumps over the lazy dog.", available_voices[0], 1.25],
            ["Welcome to the world of high-quality text-to-speech synthesis!", available_voices[0], 1.5],
        ],
        inputs=[text_input, voice_dropdown, speed_slider],
        outputs=[audio_output],
        fn=generate_speech,
        label="Click on an example to try it out",
        # cache_examples="lazy"
    )

    with gr.Accordion("ℹ️ Model Information", open=False):
        gr.Markdown("""
        **Model:** `KittenML/kitten-tts-nano-0.1`

        **Features:**
        - High-quality text-to-speech synthesis
        - Works without GPU acceleration
        - Multiple voice options
        - Adjustable speech speed
        - 24kHz audio output

        **Usage Instructions:**
        1. Enter your text
        2. Select a voice
        3. Adjust the speech speed if needed
        4. Click "Generate Speech"
        """)

    # Event Bindings: the button click and pressing Enter in the textbox
    # both run the same generation function.
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, voice_dropdown, speed_slider],
        outputs=[audio_output],
    )
    text_input.submit(
        fn=generate_speech,
        inputs=[text_input, voice_dropdown, speed_slider],
        outputs=[audio_output],
    )

# Run the app
if __name__ == "__main__":
    app.queue().launch()