Spaces:
Running
Running
File size: 8,049 Bytes
fada17e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 |
import gradio as gr
import tempfile
import uuid
import os
import soundfile as sf
import numpy as np
from kittentts import KittenTTS
# Instantiate the KittenTTS nano model once at import time; this module-level
# `model` is the object used by generate_speech() and get_available_voices().
model = KittenTTS("KittenML/kitten-tts-nano-0.1")
def _split_by_words(sentence, max_length):
    """
    Break one over-long sentence into word-aligned pieces.

    Args:
        sentence (str): Sentence whose length exceeds max_length
        max_length (int): Maximum length of each piece in characters

    Returns:
        list: Non-empty pieces; a single word longer than max_length is
            kept whole as its own piece rather than being cut mid-word
    """
    pieces = []
    words, length = [], 0
    for word in sentence.split():
        # A joining space is only needed when the piece already holds a word.
        needed = len(word) + (1 if words else 0)
        if words and length + needed > max_length:
            pieces.append(' '.join(words))
            words, length = [], 0
            needed = len(word)
        words.append(word)
        length += needed
    if words:
        pieces.append(' '.join(words))
    return pieces


def chunk_text(text, max_length=500):
    """
    Split long text into smaller chunks to optimize processing.

    Newlines are treated as spaces and the text is split on '. '. Whole
    sentences are packed greedily into chunks of at most max_length
    characters (joining spaces included); a sentence that is itself longer
    than max_length is split on word boundaries.

    Args:
        text (str): Input text to be split
        max_length (int): Maximum length of each chunk in characters

    Returns:
        list: List of non-empty text chunks; empty or whitespace-only input
            yields an empty list
    """
    terminators = ('.', '!', '?')
    sentences = text.replace('\n', ' ').split('. ')
    last_index = len(sentences) - 1

    chunks = []
    current, current_length = [], 0

    for index, raw_sentence in enumerate(sentences):
        sentence = raw_sentence.strip()
        if not sentence:
            # Produced by empty input or a trailing '. '; nothing to speak.
            continue

        # split('. ') removed the period from every sentence except the last,
        # so restore it there. The last sentence keeps whatever punctuation it
        # already has and only gets a period when it has none.
        if index < last_index or not sentence.endswith(terminators):
            sentence += '.'

        if len(sentence) > max_length:
            # Flush what has been collected so chunk order follows the text.
            if current:
                chunks.append(' '.join(current))
                current, current_length = [], 0
            chunks.extend(_split_by_words(sentence, max_length))
            continue

        separator = 1 if current else 0
        if current and current_length + separator + len(sentence) > max_length:
            chunks.append(' '.join(current))
            current, current_length = [], 0
            separator = 0

        current.append(sentence)
        current_length += separator + len(sentence)

    if current:
        chunks.append(' '.join(current))
    return chunks
def generate_speech(text, voice, speed):
    """
    Generate speech from text using KittenTTS, optimized for long text.

    The text is split with chunk_text(), each chunk is synthesized with the
    module-level model, and the resulting segments are concatenated and
    written to a uniquely named WAV file in the system temp directory.

    Args:
        text (str): Text to convert to speech
        voice (str): Voice to use for generation
        speed (float): Speed of speech generation

    Returns:
        tuple: (path, None) on success, where path is the generated WAV
            file; (None, message) when the input is empty or any step fails
    """
    # Guard against None as well as blank strings so a missing value gets the
    # same friendly message instead of an AttributeError on .strip().
    if not text or not text.strip():
        return None, "Please enter some text to generate speech."

    try:
        # Chunk text for long inputs
        audio_segments = []
        for chunk in chunk_text(text, max_length=500):
            try:
                audio_segments.append(model.generate(chunk, voice=voice, speed=speed))
            except Exception as e:
                # Abort on the first failing chunk rather than emit partial audio.
                return None, f"Error processing chunk: {e}"

        # Concatenate audio segments
        if len(audio_segments) > 1:
            combined_audio = np.concatenate(audio_segments)
        else:
            combined_audio = audio_segments[0] if audio_segments else None
        if combined_audio is None:
            return None, "No audio generated."

        # Save audio file under a UUID-based name so repeated calls do not
        # overwrite each other's output.
        output_path = os.path.join(
            tempfile.gettempdir(), f"kitten_tts_{uuid.uuid4()}.wav"
        )
        sf.write(output_path, combined_audio, 24000)
        return output_path, None
    except Exception as e:
        return None, f"Error generating speech: {e}"
def get_available_voices():
    """
    Look up the voice names exposed by the module-level model.

    Returns:
        list: The model's available voices, or the single default voice
            "expr-voice-5-m" when the model reports none or the lookup fails
    """
    fallback = ["expr-voice-5-m"]
    try:
        discovered = model.available_voices
        if not discovered:
            return fallback
        return discovered
    except Exception:
        return fallback
# Resolve the voice list once at import time; it supplies the dropdown choices
# and the per-example voice picks in the interface definition.
available_voices = get_available_voices()
# Create Gradio interface.
# Layout: a wide left column with the text box, voice/speed controls and the
# generate button; a narrow right column with the audio player and a status
# box. Examples, a collapsible info panel and the event wiring follow.
with gr.Blocks(title="KittenTTS - Text to Speech", theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🐱 KittenTTS - Text to Speech Generator")
    gr.Markdown("Convert your text to high-quality speech using KittenTTS nano model! Optimized for long text inputs.")

    with gr.Row():
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text to Convert",
                placeholder="Enter the text you want to convert to speech (supports long text)...",
                lines=10,
                max_lines=50,
            )
            with gr.Row():
                voice_dropdown = gr.Dropdown(
                    choices=available_voices,
                    value=available_voices[0] if available_voices else "expr-voice-5-m",
                    label="Voice Selection",
                    info="Choose the voice for speech generation",
                )
                speed_slider = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    step=0.01,
                    value=1.25,
                    label="Speech Speed",
                    info="Adjust the speed of speech (0.5x to 2.0x)",
                )
            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
                interactive=False,
                autoplay=True,
            )
            # Must be visible: generate_speech reports every failure through
            # its second return value, which is routed to this box. Hidden,
            # those messages would never reach the user.
            error_output = gr.Textbox(
                label="Status",
                visible=True,
                interactive=False,
            )

    # Example inputs. Each row is [text, voice, speed]; the voice falls back to
    # the default name when the model exposes fewer voices than the index used.
    gr.Markdown("## 📝 Example Texts")
    examples = gr.Examples(
        examples=[
            ["Hello! This is a test of the KittenTTS model with a short sentence.",
             available_voices[2] if len(available_voices) > 2 else "expr-voice-5-m", 1.25],
            ["This is a longer text example to demonstrate how KittenTTS handles extended content. "
             "It includes multiple sentences to test the chunking mechanism and ensure smooth audio generation. "
             "The quick brown fox jumps over the lazy dog, and the story continues with more details.",
             available_voices[1] if len(available_voices) > 1 else "expr-voice-5-m", 1.5],
            ["Welcome to the world of high-quality text-to-speech synthesis! This example showcases "
             "the ability to process varied text lengths efficiently.",
             available_voices[5] if len(available_voices) > 5 else "expr-voice-5-m", 1],
        ],
        inputs=[text_input, voice_dropdown, speed_slider],
        outputs=[audio_output, error_output],
        fn=generate_speech,
        label="Click an example to try it out",
        cache_examples="lazy",
    )

    # Model information. Built from explicit lines so the markdown does not
    # depend on source indentation, with blank lines separating the sections.
    with gr.Accordion("ℹ️ Model Information", open=False):
        gr.Markdown(
            "**Model**: KittenML/kitten-tts-nano-0.1\n"
            "\n"
            "**Features**:\n"
            "- High-quality text-to-speech synthesis\n"
            "- Optimized for long text inputs through chunking\n"
            "- Works without GPU acceleration\n"
            "- Multiple voice options\n"
            "- Adjustable speech speed (0.5x to 2.0x)\n"
            "- 24kHz audio output\n"
            "\n"
            "**Usage**:\n"
            "1. Enter or paste your text in the text box (long texts supported)\n"
            "2. Select a voice from the dropdown\n"
            "3. Adjust the speech speed if needed\n"
            "4. Click \"Generate Speech\" to create audio\n"
            "\n"
            "Generated files are saved in a temporary directory with unique UUID filenames.\n"
            "Long texts are automatically split into manageable chunks for efficient processing.\n"
        )

    # Event handlers: the button and the text box's submit event run the same
    # generation call with the same inputs and outputs.
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, voice_dropdown, speed_slider],
        outputs=[audio_output, error_output],
    )
    text_input.submit(
        fn=generate_speech,
        inputs=[text_input, voice_dropdown, speed_slider],
        outputs=[audio_output, error_output],
    )
# Launch the app only when executed as a script. The request queue is enabled
# with a default concurrency limit of 50 before the server starts.
if __name__ == "__main__":
    app.queue(default_concurrency_limit=50).launch()