Spaces:

NeuralFalcon
/

Kitten-TTS

Running

App Files Files Community

Kitten-TTS / app.py

NeuralFalcon

Update app.py

c732fbe verified 8 days ago

raw

history blame contribute delete

5.49 kB

	# Copied from https://huggingface.co/spaces/KingNish/Kitten-TTS and modified to handle large text input.
	import gradio as gr
	import tempfile
	import uuid
	import os
	import re
	import numpy as np
	import soundfile as sf
	from kittentts import KittenTTS
	from tqdm.auto import tqdm
	# Initialize the TTS model once at import time. This single module-level
	# instance is shared by generate_speech() and get_available_voices().
	model = KittenTTS("KittenML/kitten-tts-nano-0.1")

	def _split_long_sentence(sentence, chunk_size):
	    """Break one over-long sentence into pieces of at most `chunk_size` chars."""
	    pieces = []
	    current = ""
	    for word in sentence.split():
	        # A single token longer than the limit has no natural break point,
	        # so it is cut at exactly `chunk_size` characters.
	        while len(word) > chunk_size:
	            if current:
	                pieces.append(current)
	                current = ""
	            pieces.append(word[:chunk_size])
	            word = word[chunk_size:]
	        if not word:
	            continue
	        if not current:
	            current = word
	        elif len(current) + 1 + len(word) <= chunk_size:
	            current = f"{current} {word}"
	        else:
	            pieces.append(current)
	            current = word
	    if current:
	        pieces.append(current)
	    return pieces


	def split_text_into_chunks(text, chunk_size=400):
	    """Split long text into chunks of at most `chunk_size` characters.

	    Text is first split on sentence-ending punctuation (., !, ?) followed by
	    spaces, and consecutive sentences are packed into a chunk, joined by a
	    single space, for as long as they fit. A sentence that is itself longer
	    than `chunk_size` is split on whitespace, and a single word longer than
	    `chunk_size` is hard-cut, so no returned chunk ever exceeds the limit.

	    Args:
	        text: The input text to split.
	        chunk_size: Maximum length of each returned chunk, in characters.

	    Returns:
	        A list of non-empty, stripped chunks. Empty or whitespace-only input
	        yields an empty list.

	    Raises:
	        ValueError: If `chunk_size` is smaller than 1.
	    """
	    if chunk_size < 1:
	        raise ValueError("chunk_size must be at least 1")

	    # Split by punctuation followed by space (preserves sentence boundaries),
	    # then make sure every piece individually fits into a chunk.
	    pieces = []
	    for sentence in re.split(r'(?<=[.!?]) +', text):
	        sentence = sentence.strip()
	        if not sentence:
	            continue
	        if len(sentence) <= chunk_size:
	            pieces.append(sentence)
	        else:
	            pieces.extend(_split_long_sentence(sentence, chunk_size))

	    # Greedily pack pieces; the "+ 1" accounts for the joining space.
	    chunks = []
	    current_chunk = ""
	    for piece in pieces:
	        if not current_chunk:
	            current_chunk = piece
	        elif len(current_chunk) + 1 + len(piece) <= chunk_size:
	            current_chunk = f"{current_chunk} {piece}"
	        else:
	            chunks.append(current_chunk)
	            current_chunk = piece

	    if current_chunk:
	        chunks.append(current_chunk)

	    return chunks

	def generate_speech(text, voice, speed):
	    """Generate speech from long text and return the path of the WAV file.

	    The text is split into chunks, each chunk is synthesized separately, and
	    the audio is appended to a single WAV file on disk as it is produced
	    instead of being accumulated in memory.

	    Args:
	        text: The text to convert to speech.
	        voice: Voice identifier passed through to `model.generate`.
	        speed: Speech speed passed through to `model.generate`.

	    Returns:
	        The path of the generated WAV file (24 kHz, mono, 16-bit PCM).

	    Raises:
	        gr.Error: If `text` is empty or synthesis fails. Every event binding
	            maps this function to a single Audio output, so failures are
	            raised for Gradio to display rather than returned as an extra
	            value that the output component cannot render.
	    """
	    if not text or not text.strip():
	        raise gr.Error("Please enter some text to generate speech.")

	    output_path = None
	    try:
	        # Break text into manageable chunks
	        chunks = split_text_into_chunks(text, chunk_size=400)

	        # Shared output directory (update this path to your shared disk)
	        shared_dir = "./saved_audio"
	        os.makedirs(shared_dir, exist_ok=True)

	        # A random UUID keeps concurrent requests from overwriting each other.
	        unique_filename = f"kitten_tts_{uuid.uuid4()}.wav"
	        output_path = os.path.join(shared_dir, unique_filename)

	        # Open the WAV file for writing
	        with sf.SoundFile(output_path, mode='w', samplerate=24000, channels=1, subtype='PCM_16') as f:
	            for chunk in tqdm(chunks, desc="Streaming audio to disk", unit="chunk"):
	                # NOTE(review): the " ...." suffix presumably pads the end of
	                # each chunk so the audio is not cut off - confirm.
	                audio = model.generate(chunk + " ....", voice=voice, speed=speed)
	                f.write(audio)  # Write audio directly to disk
	    except Exception as e:
	        # Do not leave a truncated WAV file behind when generation fails.
	        if output_path is not None:
	            try:
	                os.remove(output_path)
	            except OSError:
	                pass  # Best-effort cleanup; the file may never have been created.
	        raise gr.Error(f"Error during TTS generation: {str(e)}") from e

	    return output_path

	def get_available_voices():
	    """Get the list of available voices from the model.

	    Falls back to a single default voice when the model reports no voices or
	    when reading them fails, so the result is never empty.

	    Returns:
	        A non-empty list of voice identifiers.
	    """
	    fallback_voices = ["expr-voice-5-m"]
	    try:
	        # Normalize to a list so the emptiness check below is unambiguous
	        # regardless of the container type the model exposes.
	        voices = list(model.available_voices)
	    except Exception:
	        # Deliberate best-effort lookup: any failure falls back to the default
	        # voice. Unlike a bare `except:`, this lets KeyboardInterrupt and
	        # SystemExit propagate.
	        return fallback_voices
	    return voices or fallback_voices

	# Get voices once on load. get_available_voices() falls back to a default
	# voice, so indexing available_voices[0] below is safe.
	available_voices = get_available_voices()

	# Create Gradio UI
	# NOTE(review): the nesting of the layout containers below was reconstructed
	# from whitespace-flattened source - confirm against the deployed app.
	with gr.Blocks(title="KittenTTS - Text to Speech", theme=gr.themes.Soft()) as app:
	    gr.Markdown("# 🐱 KittenTTS - Text to Speech Generator")
	    gr.Markdown("Convert your text to high-quality speech using the KittenTTS nano model!")

	    with gr.Row():
	        # Left column: text input, voice/speed controls and the generate button.
	        with gr.Column(scale=2):
	            text_input = gr.Textbox(
	                label="Text to Convert",
	                placeholder="Enter the text you want to convert to speech...",
	                lines=4,
	                max_lines=10
	            )

	            with gr.Row():
	                voice_dropdown = gr.Dropdown(
	                    choices=available_voices,
	                    value=available_voices[0],
	                    label="Voice Selection",
	                    info="Choose the voice for speech generation"
	                )

	                speed_slider = gr.Slider(
	                    minimum=0.5,
	                    maximum=2.0,
	                    step=0.01,
	                    value=1,
	                    label="Speech Speed",
	                    info="Adjust the speed of speech (0.5x to 2.0x)"
	                )

	            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

	        # Right column: player for the generated file; generate_speech
	        # returns a file path, hence type="filepath".
	        with gr.Column(scale=1):
	            audio_output = gr.Audio(
	                label="Generated Speech",
	                type="filepath",
	                interactive=False,
	                autoplay=True
	            )

	    gr.Markdown("## 📝 Example Texts")
	    # Each example row supplies [text, voice, speed], matching `inputs`.
	    gr.Examples(
	        examples=[
	            ["Hello! This is a test of the KittenTTS model.", available_voices[0], 1],
	            ["The quick brown fox jumps over the lazy dog.", available_voices[0], 1.25],
	            ["Welcome to the world of high-quality text-to-speech synthesis!", available_voices[0], 1.5],
	        ],
	        inputs=[text_input, voice_dropdown, speed_slider],
	        outputs=[audio_output],
	        fn=generate_speech,
	        label="Click on an example to try it out",
	        # Example caching is intentionally left disabled:
	        # cache_examples="lazy"
	    )

	    with gr.Accordion("ℹ️ Model Information", open=False):
	        gr.Markdown("""
	Model: `KittenML/kitten-tts-nano-0.1`
	Features:
	- High-quality text-to-speech synthesis
	- Works without GPU acceleration
	- Multiple voice options
	- Adjustable speech speed
	- 24kHz audio output

	Usage Instructions:
	1. Enter your text
	2. Select a voice
	3. Adjust the speech speed if needed
	4. Click "Generate Speech"
	""")

	    # Event Bindings
	    # The button click and the textbox submit event both run generate_speech
	    # with the same inputs and the same single Audio output.
	    generate_btn.click(
	        fn=generate_speech,
	        inputs=[text_input, voice_dropdown, speed_slider],
	        outputs=[audio_output]
	    )

	    text_input.submit(
	        fn=generate_speech,
	        inputs=[text_input, voice_dropdown, speed_slider],
	        outputs=[audio_output]
	    )

	# Script entry point: enable the request queue, then launch the app.
	if __name__ == "__main__":
	    queued_app = app.queue()
	    queued_app.launch()