Spaces:
Sleeping
Sleeping
import gradio as gr | |
import tempfile | |
import uuid | |
import os | |
import soundfile as sf | |
import numpy as np | |
from kittentts import KittenTTS | |
# Initialize the TTS model | |
model = KittenTTS("KittenML/kitten-tts-nano-0.1") | |
def chunk_text(text, max_length=500):
    """
    Split long text into smaller chunks to optimize processing.

    The text is whitespace-normalized, split into sentences on ". ", and the
    sentences are packed greedily into chunks. A sentence longer than
    ``max_length`` is split on word boundaries instead. The original
    punctuation is preserved: no extra period is appended to text that
    already ends a sentence (or ends with "?" / "!").

    Args:
        text (str): Input text to be split
        max_length (int): Maximum length of each chunk in characters,
            including the spaces that join sentences or words. A single
            word longer than this is kept whole rather than cut mid-word.

    Returns:
        list: List of non-empty text chunks (empty for blank input)
    """
    # Collapse newlines, tabs and repeated spaces so that ". " reliably marks
    # a sentence boundary and joined chunks contain single spaces only.
    normalized = " ".join(text.split())
    pieces = normalized.split(". ")

    # split() consumed the ". " separators, so every piece except the last
    # lost its period; the last piece still carries whatever it ended with.
    sentences = [piece.strip() + "." for piece in pieces[:-1] if piece.strip()]
    last_piece = pieces[-1].strip()
    if last_piece:
        sentences.append(last_piece)

    chunks, pending = [], []
    for sentence in sentences:
        if len(sentence) > max_length:
            # Flush what has been collected so far, then break the oversized
            # sentence on word boundaries as a standalone run of chunks.
            chunks.extend(_pack_units(pending, max_length))
            pending = []
            chunks.extend(_pack_units(sentence.split(), max_length))
        else:
            pending.append(sentence)
    chunks.extend(_pack_units(pending, max_length))
    return chunks


def _pack_units(units, max_length):
    """Greedily join consecutive strings with spaces, keeping each result within max_length."""
    packed, current, current_length = [], [], 0
    for unit in units:
        # The joining space counts toward the limit for every unit but the first.
        needed = len(unit) + (1 if current else 0)
        if current and current_length + needed > max_length:
            packed.append(" ".join(current))
            current, current_length = [], 0
            needed = len(unit)
        current.append(unit)
        current_length += needed
    if current:
        packed.append(" ".join(current))
    return packed
def generate_speech(text, voice, speed):
    """
    Generate speech from text using KittenTTS, optimized for long text.

    The text is split with ``chunk_text``, each chunk is synthesized
    separately, and the resulting audio segments are concatenated and written
    to a uniquely named WAV file in the system temporary directory.

    Args:
        text (str): Text to convert to speech
        voice (str): Voice to use for generation
        speed (float): Speed of speech generation

    Returns:
        tuple: ``(path, error)`` where exactly one element is set:
            path (str): Path to generated audio file, or None if error
            error (str): Error message if applicable, otherwise None
    """
    # Guard against None as well as blank strings; calling .strip() on None
    # would raise before the error handling below is ever reached.
    if not text or not text.strip():
        return None, "Please enter some text to generate speech."

    try:
        # Chunk text for long inputs and synthesize each chunk on its own
        audio_segments = []
        for chunk in chunk_text(text, max_length=500):
            try:
                audio_segments.append(model.generate(chunk, voice=voice, speed=speed))
            except Exception as e:
                # Abort on the first failing chunk rather than returning
                # audio with a silent gap.
                return None, f"Error processing chunk: {str(e)}"

        if not audio_segments:
            return None, "No audio generated."

        # Concatenate audio segments (a single segment is used as-is)
        if len(audio_segments) == 1:
            combined_audio = audio_segments[0]
        else:
            combined_audio = np.concatenate(audio_segments)

        # Save audio file under a UUID-based name so that concurrent requests
        # never write to the same path.
        output_path = os.path.join(
            tempfile.gettempdir(), f"kitten_tts_{uuid.uuid4()}.wav"
        )
        sf.write(output_path, combined_audio, 24000)
        return output_path, None
    except Exception as e:
        # UI boundary: report the failure as a message instead of raising.
        return None, f"Error generating speech: {str(e)}"
def get_available_voices():
    """
    Look up the voice names exposed by the loaded model.

    Falls back to a single built-in voice name when the model reports no
    voices or when reading them fails for any reason.

    Returns:
        list: Voice names reported by the model, or the one-element fallback
    """
    fallback_voices = ["expr-voice-5-m"]
    try:
        # An empty/falsy result is treated the same as a failed lookup.
        return model.available_voices or fallback_voices
    except Exception:
        return fallback_voices
# Get available voices once at import time; the dropdown and the example rows
# below are both built from this list.
available_voices = get_available_voices()

# Voice preselected in the dropdown, and the fallback for example rows that ask
# for an index the list does not have. Falling back to a voice that is actually
# among the dropdown's choices keeps every example row selectable.
default_voice = available_voices[0] if available_voices else "expr-voice-5-m"


def _example_voice(index):
    """Return the voice at ``index``, or the default voice if the list is shorter."""
    return available_voices[index] if len(available_voices) > index else default_voice


# Create Gradio interface
with gr.Blocks(title="KittenTTS - Text to Speech", theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🐱 KittenTTS - Text to Speech Generator")
    gr.Markdown(
        "Convert your text to high-quality speech using KittenTTS nano model! "
        "Optimized for long text inputs."
    )

    with gr.Row():
        # Left column: text entry and generation controls
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text to Convert",
                placeholder="Enter the text you want to convert to speech (supports long text)...",
                lines=10,
                max_lines=50,
            )
            with gr.Row():
                voice_dropdown = gr.Dropdown(
                    choices=available_voices,
                    value=default_voice,
                    label="Voice Selection",
                    info="Choose the voice for speech generation",
                )
                speed_slider = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    step=0.01,
                    value=1.25,
                    label="Speech Speed",
                    info="Adjust the speed of speech (0.5x to 2.0x)",
                )
            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

        # Right column: generated audio and status message
        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
                interactive=False,
                autoplay=True,
            )
            # Shown (not hidden) so that the message generate_speech returns
            # as its second value is actually visible to the user.
            error_output = gr.Textbox(
                label="Status",
                visible=True,
                interactive=False,
            )

    # Example inputs
    gr.Markdown("## 📝 Example Texts")
    examples = gr.Examples(
        examples=[
            [
                "Hello! This is a test of the KittenTTS model with a short sentence.",
                _example_voice(2),
                1.25,
            ],
            [
                "This is a longer text example to demonstrate how KittenTTS handles extended content. "
                "It includes multiple sentences to test the chunking mechanism and ensure smooth audio generation. "
                "The quick brown fox jumps over the lazy dog, and the story continues with more details.",
                _example_voice(1),
                1.5,
            ],
            [
                "Welcome to the world of high-quality text-to-speech synthesis! This example showcases "
                "the ability to process varied text lengths efficiently.",
                _example_voice(5),
                1,
            ],
        ],
        inputs=[text_input, voice_dropdown, speed_slider],
        outputs=[audio_output, error_output],
        fn=generate_speech,
        label="Click an example to try it out",
        cache_examples="lazy",
    )

    # Model information
    with gr.Accordion("ℹ️ Model Information", open=False):
        gr.Markdown(
            "\n"
            "**Model**: KittenML/kitten-tts-nano-0.1\n"
            "**Features**:\n"
            "- High-quality text-to-speech synthesis\n"
            "- Optimized for long text inputs through chunking\n"
            "- Works without GPU acceleration\n"
            "- Multiple voice options\n"
            "- Adjustable speech speed (0.5x to 2.0x)\n"
            "- 24kHz audio output\n"
            "**Usage**:\n"
            "1. Enter or paste your text in the text box (long texts supported)\n"
            "2. Select a voice from the dropdown\n"
            "3. Adjust the speech speed if needed\n"
            "4. Click \"Generate Speech\" to create audio\n"
            "Generated files are saved in a temporary directory with unique UUID filenames.\n"
            "Long texts are automatically split into manageable chunks for efficient processing.\n"
        )

    # Event handlers: the button click and pressing Enter in the text box
    # both run the same generation call with the same inputs and outputs.
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, voice_dropdown, speed_slider],
        outputs=[audio_output, error_output],
    )
    text_input.submit(
        fn=generate_speech,
        inputs=[text_input, voice_dropdown, speed_slider],
        outputs=[audio_output, error_output],
    )

# Launch the app
if __name__ == "__main__":
    app.queue(default_concurrency_limit=50).launch()