# Source: Hugging Face Space (status when captured: Running).
# Copied from https://huggingface.co/spaces/KingNish/Kitten-TTS and modified to handle large text input.
import os
import re
import tempfile
import uuid

import gradio as gr
import numpy as np
import soundfile as sf
from kittentts import KittenTTS
from tqdm.auto import tqdm
# Initialize the TTS model once at import time; generate_speech() and
# get_available_voices() both read this module-level instance.
model = KittenTTS("KittenML/kitten-tts-nano-0.1")
def _split_oversized_sentence(sentence, limit):
    """Break a sentence longer than `limit` into parts of at most `limit` chars.

    Parts are built from whitespace-separated words; a single word that is
    itself longer than `limit` is cut into `limit`-sized slices.
    """
    parts = []
    current = ""
    for word in sentence.split():
        # Hard-cut tokens that cannot fit in a chunk on their own.
        while len(word) > limit:
            if current:
                parts.append(current)
                current = ""
            parts.append(word[:limit])
            word = word[limit:]
        if not word:
            continue
        candidate = f"{current} {word}" if current else word
        if len(candidate) > limit:
            parts.append(current)
            current = word
        else:
            current = candidate
    if current:
        parts.append(current)
    return parts


def split_text_into_chunks(text, chunk_size=400):
    """
    Split long text into smaller chunks of max length `chunk_size`.

    Sentences (split after ``.``, ``!`` or ``?`` followed by spaces) are packed
    greedily into chunks. A sentence that is longer than `chunk_size` on its
    own is further split on whitespace, and as a last resort cut mid-word, so
    that no returned chunk exceeds `chunk_size` characters.

    Args:
        text: The text to split.
        chunk_size: Maximum number of characters per chunk; must be >= 1.

    Returns:
        A list of non-empty, stripped chunks in their original order. Empty or
        whitespace-only input yields an empty list.

    Raises:
        ValueError: If `chunk_size` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # Split by punctuation followed by space (preserves sentence boundaries),
    # then make sure every piece can fit inside a single chunk.
    pieces = []
    for sentence in re.split(r'(?<=[.!?]) +', text):
        if len(sentence) > chunk_size:
            pieces.extend(_split_oversized_sentence(sentence, chunk_size))
        else:
            pieces.append(sentence)

    chunks = []
    current_chunk = ""
    for piece in pieces:
        # current_chunk always ends with a separator space, so the stripped
        # result of appending `piece` stays within chunk_size when this holds.
        if len(current_chunk) + len(piece) > chunk_size:
            flushed = current_chunk.strip()
            if flushed:
                chunks.append(flushed)
            current_chunk = ""
        current_chunk += piece + " "

    tail = current_chunk.strip()
    if tail:
        chunks.append(tail)
    return chunks
def generate_speech(text, voice, speed):
    """
    Generate speech from long text in a memory-efficient way.

    The text is split with ``split_text_into_chunks`` and the audio of each
    chunk is written to a single WAV file as soon as it is generated, instead
    of keeping all chunks in memory.

    Args:
        text: The text to synthesize.
        voice: Voice identifier, passed through to ``model.generate``.
        speed: Speed multiplier, passed through to ``model.generate``.

    Returns:
        Path of the generated WAV file (mono, 24 kHz, 16-bit PCM).

    Raises:
        gr.Error: If `text` is empty or whitespace-only, or if anything fails
            during generation. Every event binding in this file has a single
            audio output, so a ``(None, message)`` tuple cannot carry the
            message to the user; raising ``gr.Error`` surfaces it in the UI.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to generate speech.")

    output_path = None
    try:
        # Break text into manageable chunks
        chunks = split_text_into_chunks(text, chunk_size=400)

        # Shared output directory (update this path to your shared disk)
        shared_dir = "./saved_audio"
        os.makedirs(shared_dir, exist_ok=True)
        unique_filename = f"kitten_tts_{uuid.uuid4()}.wav"
        output_path = os.path.join(shared_dir, unique_filename)

        # Open the WAV file for writing
        with sf.SoundFile(output_path, mode='w', samplerate=24000, channels=1, subtype='PCM_16') as f:
            for chunk in tqdm(chunks, desc="Streaming audio to disk", unit="chunk"):
                # NOTE(review): the " ...." suffix presumably pads the end of
                # each chunk so its last word is not clipped; confirm.
                audio = model.generate(chunk + " ....", voice=voice, speed=speed)
                f.write(audio)  # Write audio directly to disk
        return output_path
    except Exception as e:
        # Do not leave a truncated WAV file behind when generation fails.
        if output_path is not None and os.path.exists(output_path):
            try:
                os.remove(output_path)
            except OSError:
                pass  # Best-effort cleanup; the original error matters more.
        raise gr.Error(f"Error during TTS generation: {str(e)}") from e
def get_available_voices():
    """Get list of available voices from the model.

    Returns:
        ``model.available_voices`` when it can be read and is non-empty;
        otherwise the single-entry fallback ``["expr-voice-5-m"]``.
    """
    fallback = ["expr-voice-5-m"]
    try:
        voices = model.available_voices
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit. Any ordinary failure still falls back to the default.
        return fallback
    return voices if voices else fallback
# Get voices once on load
available_voices = get_available_voices()

# Create Gradio UI
# NOTE(review): the source paste lost all indentation, so the layout nesting
# below is reconstructed; confirm it against the running Space. Emoji in the
# labels were mojibake and have been restored.
with gr.Blocks(title="KittenTTS - Text to Speech", theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🐱 KittenTTS - Text to Speech Generator")
    gr.Markdown("Convert your text to high-quality speech using the KittenTTS nano model!")

    with gr.Row():
        # Left column: text, voice/speed controls and the trigger button.
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text to Convert",
                placeholder="Enter the text you want to convert to speech...",
                lines=4,
                max_lines=10
            )
            with gr.Row():
                voice_dropdown = gr.Dropdown(
                    choices=available_voices,
                    value=available_voices[0],
                    label="Voice Selection",
                    info="Choose the voice for speech generation"
                )
                speed_slider = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    step=0.01,
                    value=1,
                    label="Speech Speed",
                    info="Adjust the speed of speech (0.5x to 2.0x)"
                )
            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

        # Right column: player for the generated WAV file.
        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
                interactive=False,
                autoplay=True
            )

    # NOTE(review): the original emoji of this heading could not be recovered
    # from the garbled bytes; 📝 is a best guess.
    gr.Markdown("## 📝 Example Texts")
    gr.Examples(
        examples=[
            ["Hello! This is a test of the KittenTTS model.", available_voices[0], 1],
            ["The quick brown fox jumps over the lazy dog.", available_voices[0], 1.25],
            ["Welcome to the world of high-quality text-to-speech synthesis!", available_voices[0], 1.5],
        ],
        inputs=[text_input, voice_dropdown, speed_slider],
        outputs=[audio_output],
        fn=generate_speech,
        label="Click on an example to try it out",
        # cache_examples="lazy"
    )

    with gr.Accordion("ℹ️ Model Information", open=False):
        gr.Markdown("""
        **Model:** `KittenML/kitten-tts-nano-0.1`

        **Features:**
        - High-quality text-to-speech synthesis
        - Works without GPU acceleration
        - Multiple voice options
        - Adjustable speech speed
        - 24kHz audio output

        **Usage Instructions:**
        1. Enter your text
        2. Select a voice
        3. Adjust the speech speed if needed
        4. Click "Generate Speech"
        """)

    # Event Bindings: the button click and pressing Enter in the textbox both
    # run the same handler with the same inputs and output.
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, voice_dropdown, speed_slider],
        outputs=[audio_output]
    )
    text_input.submit(
        fn=generate_speech,
        inputs=[text_input, voice_dropdown, speed_slider],
        outputs=[audio_output]
    )

# Run the app
if __name__ == "__main__":
    app.queue().launch()