kittenTTS / app.py
humair025's picture
Create app.py
fada17e verified
import os
import re
import tempfile
import uuid

import gradio as gr
import numpy as np
import soundfile as sf
from kittentts import KittenTTS
# Initialize the TTS model once at import time. This module-level instance is
# reused by generate_speech() and get_available_voices().
model = KittenTTS("KittenML/kitten-tts-nano-0.1")
def _split_long_sentence(sentence, max_length):
    """Break one over-long sentence into pieces of at most ``max_length`` chars.

    Pieces are cut on word boundaries where possible; a single word that is
    itself longer than ``max_length`` is hard-split into fixed-size slices.
    """
    pieces, words, length = [], [], 0
    for word in sentence.split():
        # A word that can never fit is sliced so no piece exceeds the limit.
        while len(word) > max_length:
            if words:
                pieces.append(' '.join(words))
                words, length = [], 0
            pieces.append(word[:max_length])
            word = word[max_length:]
        # The joining space only costs a character when something precedes it.
        added = len(word) + (1 if words else 0)
        if length + added > max_length:
            pieces.append(' '.join(words))
            words, length = [], 0
            added = len(word)
        words.append(word)
        length += added
    if words:
        pieces.append(' '.join(words))
    return pieces


def chunk_text(text, max_length=500):
    """
    Split long text into smaller chunks to optimize processing.

    Whitespace runs (including newlines) are collapsed to single spaces, the
    text is split into sentences after '.', '!' or '?', and consecutive
    sentences are packed into chunks no longer than ``max_length`` characters
    (joining spaces included). A sentence that is longer than ``max_length``
    is split on word boundaries instead.

    Existing end-of-sentence punctuation is preserved; a '.' is appended only
    to a sentence that has no terminal '.', '!' or '?'.

    Args:
        text (str): Input text to be split
        max_length (int): Maximum length of each chunk in characters

    Returns:
        list: List of non-empty text chunks (empty list for blank input)

    Raises:
        ValueError: If ``max_length`` is smaller than 1
    """
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")

    # str.split() with no argument drops leading/trailing whitespace and
    # collapses internal runs, so no empty sentences can appear below.
    normalized = ' '.join(text.split())
    if not normalized:
        return []

    chunks = []
    current_chunk, current_length = [], 0
    # The lookbehind keeps the punctuation attached to its sentence.
    for sentence in re.split(r'(?<=[.!?])\s+', normalized):
        if sentence[-1] not in '.!?':
            sentence += '.'

        if len(sentence) > max_length:
            # Flush what has been collected so chunk order follows the text.
            if current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk, current_length = [], 0
            chunks.extend(_split_long_sentence(sentence, max_length))
            continue

        # Count the space that ' '.join() will insert before this sentence.
        added = len(sentence) + (1 if current_chunk else 0)
        if current_length + added > max_length:
            chunks.append(' '.join(current_chunk))
            current_chunk, current_length = [], 0
            added = len(sentence)
        current_chunk.append(sentence)
        current_length += added

    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks
def generate_speech(text, voice, speed):
    """
    Generate speech from text using KittenTTS, optimized for long text.

    The text is split with chunk_text(), each chunk is synthesized with the
    module-level model, the resulting segments are concatenated, and the
    audio is written as a 24 kHz WAV file with a UUID-based name in the
    system temporary directory.

    Args:
        text (str): Text to convert to speech
        voice (str): Voice to use for generation
        speed (float): Speed of speech generation

    Returns:
        tuple: ``(output_path, None)`` on success, or ``(None, message)`` when
        the input is empty or generation fails. Errors are reported through
        the message rather than raised.
    """
    # Guard against None as well as blank strings so a missing value yields
    # the friendly message instead of an AttributeError.
    if not text or not text.strip():
        return None, "Please enter some text to generate speech."
    try:
        # Chunk text for long inputs
        audio_segments = []
        for chunk in chunk_text(text, max_length=500):
            try:
                audio_segments.append(
                    model.generate(chunk, voice=voice, speed=speed)
                )
            except Exception as e:
                # Abort on the first failing chunk; partial audio is not returned.
                return None, f"Error processing chunk: {str(e)}"

        # Concatenate audio segments
        if len(audio_segments) > 1:
            combined_audio = np.concatenate(audio_segments)
        else:
            combined_audio = audio_segments[0] if audio_segments else None
        if combined_audio is None:
            return None, "No audio generated."

        # Build the output path only once there is audio to save.
        output_path = os.path.join(
            tempfile.gettempdir(), f"kitten_tts_{uuid.uuid4()}.wav"
        )
        # Save audio file
        sf.write(output_path, combined_audio, 24000)
        return output_path, None
    except Exception as e:
        return None, f"Error generating speech: {str(e)}"
def get_available_voices():
    """
    Look up the voice names exposed by the module-level model.

    Returns:
        list: The model's ``available_voices`` when that attribute is truthy;
        otherwise (or if reading it raises) a single-entry fallback list.
    """
    fallback = ["expr-voice-5-m"]
    try:
        return model.available_voices or fallback
    except Exception:
        return fallback
# Get available voices once at import time; the voice dropdown and the example
# rows in the interface below are built from this list.
available_voices = get_available_voices()
# Create Gradio interface
with gr.Blocks(title="KittenTTS - Text to Speech", theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🐱 KittenTTS - Text to Speech Generator")
    gr.Markdown("Convert your text to high-quality speech using KittenTTS nano model! Optimized for long text inputs.")
    with gr.Row():
        # Left column: text entry, voice/speed controls and the trigger button.
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text to Convert",
                placeholder="Enter the text you want to convert to speech (supports long text)...",
                lines=10,
                max_lines=50
            )
            with gr.Row():
                voice_dropdown = gr.Dropdown(
                    choices=available_voices,
                    value=available_voices[0] if available_voices else "expr-voice-5-m",
                    label="Voice Selection",
                    info="Choose the voice for speech generation"
                )
                speed_slider = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    step=0.01,
                    value=1.25,
                    label="Speech Speed",
                    info="Adjust the speed of speech (0.5x to 2.0x)"
                )
            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
        # Right column: generated audio plus the status/error message.
        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
                interactive=False,
                autoplay=True
            )
            # Kept visible: generate_speech reports every failure through this
            # box, so hiding it would silently discard the error messages.
            error_output = gr.Textbox(
                label="Status",
                visible=True,
                interactive=False
            )
    # Example inputs
    gr.Markdown("## 📝 Example Texts")
    # Each example row is [text, voice, speed]; the voice falls back to the
    # default name when the model exposes fewer voices than the index needs.
    examples = gr.Examples(
        examples=[
            ["Hello! This is a test of the KittenTTS model with a short sentence.",
             available_voices[2] if len(available_voices) > 2 else "expr-voice-5-m", 1.25],
            ["This is a longer text example to demonstrate how KittenTTS handles extended content. "
             "It includes multiple sentences to test the chunking mechanism and ensure smooth audio generation. "
             "The quick brown fox jumps over the lazy dog, and the story continues with more details.",
             available_voices[1] if len(available_voices) > 1 else "expr-voice-5-m", 1.5],
            ["Welcome to the world of high-quality text-to-speech synthesis! This example showcases "
             "the ability to process varied text lengths efficiently.",
             available_voices[5] if len(available_voices) > 5 else "expr-voice-5-m", 1],
        ],
        inputs=[text_input, voice_dropdown, speed_slider],
        outputs=[audio_output, error_output],
        fn=generate_speech,
        label="Click an example to try it out",
        cache_examples="lazy"
    )
    # Model information
    with gr.Accordion("ℹ️ Model Information", open=False):
        gr.Markdown("""
**Model**: KittenML/kitten-tts-nano-0.1
**Features**:
- High-quality text-to-speech synthesis
- Optimized for long text inputs through chunking
- Works without GPU acceleration
- Multiple voice options
- Adjustable speech speed (0.5x to 2.0x)
- 24kHz audio output
**Usage**:
1. Enter or paste your text in the text box (long texts supported)
2. Select a voice from the dropdown
3. Adjust the speech speed if needed
4. Click "Generate Speech" to create audio
Generated files are saved in a temporary directory with unique UUID filenames.
Long texts are automatically split into manageable chunks for efficient processing.
""")
    # Event handlers: the button click and the textbox submit event both run
    # the same generation function with identical inputs and outputs.
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, voice_dropdown, speed_slider],
        outputs=[audio_output, error_output]
    )
    text_input.submit(
        fn=generate_speech,
        inputs=[text_input, voice_dropdown, speed_slider],
        outputs=[audio_output, error_output]
    )
# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    # Configure the queue with a default concurrency limit of 50, then launch.
    app.queue(default_concurrency_limit=50).launch()