NeuralFalcon committed on
Commit
618f849
·
verified ·
1 Parent(s): ee9e698

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +162 -0
app.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import tempfile
3
+ import uuid
4
+ import os
5
+ import re
6
+ import numpy as np
7
+ import soundfile as sf
8
+ from kittentts import KittenTTS
9
+ from tqdm.auto import tqdm
10
# Initialize the TTS model once at import time; the functions in this file
# use this single shared module-level instance.
model = KittenTTS("KittenML/kitten-tts-nano-0.1")
12
+
13
def _sentence_pieces(sentence, limit):
    """Return `sentence` as pieces that are each at most `limit` characters.

    A sentence that already fits is returned unchanged. An oversized sentence
    is broken into its whitespace-separated words, and any single word longer
    than `limit` is sliced into `limit`-sized fragments.
    """
    if len(sentence) <= limit:
        return [sentence]
    pieces = []
    for word in sentence.split():
        pieces.extend(word[i:i + limit] for i in range(0, len(word), limit))
    return pieces


def split_text_into_chunks(text, chunk_size=400):
    """
    Split long text into chunks that are each at most `chunk_size` characters.

    Text is first split at sentence boundaries (`.`, `!` or `?` followed by
    whitespace, including newlines). Sentences are then packed greedily,
    joined by single spaces, into chunks. A sentence longer than `chunk_size`
    is broken at word boundaries (and, as a last resort, inside a word) so the
    size limit always holds.

    Args:
        text: The text to split.
        chunk_size: Maximum length of each returned chunk; must be >= 1.

    Returns:
        A list of non-empty chunk strings; empty or whitespace-only text
        yields an empty list.

    Raises:
        ValueError: If `chunk_size` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # The lookbehind keeps the punctuation attached to its sentence.
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())

    chunks = []
    current = ""
    for sentence in sentences:
        for piece in _sentence_pieces(sentence, chunk_size):
            if not piece:
                continue
            candidate = f"{current} {piece}" if current else piece
            if current and len(candidate) > chunk_size:
                # Adding this piece would overflow: close the current chunk.
                chunks.append(current)
                current = piece
            else:
                current = candidate

    if current:
        chunks.append(current)

    return chunks
34
+
35
def generate_speech(text, voice, speed):
    """
    Generate speech from long text and return the path of the WAV file.

    The text is split with `split_text_into_chunks`; each chunk is synthesized
    with the module-level `model` and appended to one WAV file (24 kHz, mono,
    16-bit PCM), so the audio of only one chunk is held at a time instead of
    the whole recording.

    Args:
        text: Text to convert to speech.
        voice: Voice identifier forwarded to `model.generate`.
        speed: Speed value forwarded to `model.generate`.

    Returns:
        Path of the generated WAV file inside `./saved_audio`.

    Raises:
        gr.Error: If `text` is empty or whitespace-only, or if anything fails
            during generation. The event handlers bind a single output, so
            failures are reported through Gradio's error mechanism rather
            than as an extra return value.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to generate speech.")

    output_path = None
    try:
        # Break text into manageable chunks
        chunks = split_text_into_chunks(text, chunk_size=400)

        # Shared output directory (update this path to your shared disk)
        shared_dir = "./saved_audio"
        os.makedirs(shared_dir, exist_ok=True)

        unique_filename = f"kitten_tts_{uuid.uuid4()}.wav"
        output_path = os.path.join(shared_dir, unique_filename)

        # Stream every chunk's audio straight into the open WAV file
        with sf.SoundFile(output_path, mode='w', samplerate=24000, channels=1, subtype='PCM_16') as f:
            for chunk in tqdm(chunks, desc="Streaming audio to disk", unit="chunk"):
                audio = model.generate(chunk, voice=voice, speed=speed)
                f.write(audio)  # Write audio directly to disk
    except Exception as e:
        # Do not leave a truncated recording behind after a failure.
        if output_path is not None and os.path.exists(output_path):
            try:
                os.remove(output_path)
            except OSError:
                pass
        raise gr.Error(f"Error during TTS generation: {str(e)}") from e

    return output_path
63
+
64
def get_available_voices():
    """Get list of available voices from the model.

    Returns:
        The value of `model.available_voices` when it is non-empty; otherwise
        (or when reading it raises) a one-element list containing the default
        voice `"expr-voice-5-m"`.
    """
    fallback = ["expr-voice-5-m"]
    try:
        voices = model.available_voices
    except Exception:
        # Best-effort lookup: any ordinary failure falls back to the default
        # voice, while KeyboardInterrupt/SystemExit still propagate.
        return fallback
    return voices if voices else fallback
71
+
72
# Look up the voice list a single time at import and reuse it below.
available_voices = get_available_voices()
default_voice = available_voices[0]

# Assemble the Gradio interface
with gr.Blocks(title="KittenTTS - Text to Speech", theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🐱 KittenTTS - Text to Speech Generator")
    gr.Markdown("Convert your text to high-quality speech using the KittenTTS nano model!")

    with gr.Row():
        # Wide column: text entry, voice/speed controls and the trigger button.
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text to Convert",
                placeholder="Enter the text you want to convert to speech...",
                lines=4,
                max_lines=10,
            )

            with gr.Row():
                voice_dropdown = gr.Dropdown(
                    choices=available_voices,
                    value=default_voice,
                    label="Voice Selection",
                    info="Choose the voice for speech generation",
                )

                speed_slider = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    step=0.01,
                    value=1.25,
                    label="Speech Speed",
                    info="Adjust the speed of speech (0.5x to 2.0x)",
                )

            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

        # Narrow column: player for the generated file.
        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
                interactive=False,
                autoplay=True,
            )

    # The same components feed the examples and both event listeners.
    tts_inputs = [text_input, voice_dropdown, speed_slider]
    tts_outputs = [audio_output]

    gr.Markdown("## 📝 Example Texts")
    example_rows = [
        [sample_text, default_voice, sample_speed]
        for sample_text, sample_speed in (
            ("Hello! This is a test of the KittenTTS model.", 1.25),
            ("The quick brown fox jumps over the lazy dog.", 1.5),
            ("Welcome to the world of high-quality text-to-speech synthesis!", 1.0),
        )
    ]
    gr.Examples(
        examples=example_rows,
        inputs=tts_inputs,
        outputs=tts_outputs,
        fn=generate_speech,
        label="Click on an example to try it out",
        # cache_examples="lazy"
    )

    with gr.Accordion("ℹ️ Model Information", open=False):
        gr.Markdown("""
        **Model:** `KittenML/kitten-tts-nano-0.1`
        **Features:**
        - High-quality text-to-speech synthesis
        - Works without GPU acceleration
        - Multiple voice options
        - Adjustable speech speed
        - 24kHz audio output

        **Usage Instructions:**
        1. Enter your text
        2. Select a voice
        3. Adjust the speech speed if needed
        4. Click "Generate Speech"
        """)

    # Event bindings: the button click and Enter in the textbox both run the
    # same handler with the same inputs and outputs.
    for bind_event in (generate_btn.click, text_input.submit):
        bind_event(
            fn=generate_speech,
            inputs=tts_inputs,
            outputs=tts_outputs,
        )
159
+
160
# Run the app: start the Gradio server only when this file is executed as a
# script, with the request queue enabled before launching.
if __name__ == "__main__":
    app.queue().launch()