Hassan-16 committed on
Commit
21977f5
·
verified ·
1 Parent(s): 5dda1a8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +122 -50
app.py CHANGED
@@ -57,19 +57,19 @@ def load_voice_choices():
57
  for voice_file in voice_files:
58
  prefix = voice_file[:2]
59
  if prefix == 'af':
60
- label = f"🇺🇸 🚺 {voice_file[3:-3].capitalize()}"
61
  elif prefix == 'am':
62
- label = f"🇺🇸 🚹 {voice_file[3:-3].capitalize()}"
63
  elif prefix == 'bf':
64
- label = f"🇬🇧 🚺 {voice_file[3:-3].capitalize()}"
65
  elif prefix == 'bm':
66
- label = f"🇬🇧 🚹 {voice_file[3:-3].capitalize()}"
67
  else:
68
- label = f"Unknown {voice_file[:-3]}"
69
  choices[label] = voice_file
70
  if not choices:
71
  logger.warning("No voice files found in VOICE_DIR. Adding a placeholder.")
72
- choices = {"🇺🇸 🚺 Bella 🔥": "af_bella.pt"}
73
  VOICE_CHOICES = choices
74
  return choices
75
 
@@ -87,10 +87,10 @@ def generate_first(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
87
  start_time = time.time()
88
  if len(text) > 510:
89
  text = text[:510]
90
- gr.Warning("Text truncated to 510 characters for performance.")
91
  voice_path = os.path.join(VOICE_DIR, voice)
92
  if not os.path.exists(voice_path):
93
- raise FileNotFoundError(f"Voice file not found: {voice_path}")
94
 
95
  pipeline = pipelines[voice[0]]
96
  use_gpu = use_gpu and CUDA_AVAILABLE
@@ -117,10 +117,10 @@ def generate_first(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
117
  def tokenize_first(text, voice="af_bella.pt"):
118
  if len(text) > 510:
119
  text = text[:510]
120
- gr.Warning("Text truncated to 510 characters for performance.")
121
  voice_path = os.path.join(VOICE_DIR, voice)
122
  if not os.path.exists(voice_path):
123
- raise FileNotFoundError(f"Voice file not found: {voice_path}")
124
 
125
  pipeline = pipelines[voice[0]]
126
  generator = pipeline(text, voice=voice_path)
@@ -132,10 +132,10 @@ def generate_all(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
132
  start_time = time.time()
133
  if len(text) > 510:
134
  text = text[:510]
135
- gr.Warning("Text truncated to 510 characters for performance.")
136
  voice_path = os.path.join(VOICE_DIR, voice)
137
  if not os.path.exists(voice_path):
138
- raise FileNotFoundError(f"Voice file not found: {voice_path}")
139
 
140
  pipeline = pipelines[voice[0]]
141
  use_gpu = use_gpu and CUDA_AVAILABLE
@@ -151,52 +151,124 @@ def generate_all(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
151
  logger.info(f"Streaming generation time: {time.time() - start_time} seconds")
152
 
153
  TOKEN_NOTE = '''
154
- 💡 Customize pronunciation with Markdown link syntax and /slashes/ like [Kokoro](/kˈOkəɹO/)
155
-
156
- 💬 To adjust intonation, try punctuation ;:,.!?—…"()“” or stress ˈ and ˌ
157
-
158
- ⬇️ Lower stress [1 level](-1) or [2 levels](-2)
159
-
160
- ⬆️ Raise stress 1 level [or](+2) 2 levels (only works on less stressed, usually short words)
161
  '''
162
 
163
- with gr.Blocks(theme="soft") as app:
164
- with gr.Row():
165
- with gr.Column():
166
- text = gr.Textbox(label="Input Text", value=TEXT, info="Arbitrarily many characters supported (max 510)")
167
- with gr.Row():
168
- voice = gr.Dropdown(list(CHOICES.items()), value="af_bella.pt" if "af_bella.pt" in CHOICES.values() else list(CHOICES.values())[0], label="Voice",
169
- info="Quality and availability vary by language")
170
- use_gpu = gr.Dropdown(
171
- [("GPU 🚀", True), ("CPU 🐌", False)],
172
- value=CUDA_AVAILABLE,
173
- label="Hardware",
174
- info="GPU is faster but requires CUDA support",
175
- interactive=CUDA_AVAILABLE
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  )
177
- speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label="Speed")
178
- with gr.Column():
179
- with gr.Tab(label="Generate"):
180
- out_audio = gr.Audio(label="Output Audio", interactive=False, streaming=False, autoplay=True)
181
- generate_btn = gr.Button("Generate", variant="primary")
182
- with gr.Accordion("Output Tokens", open=True):
183
- out_ps = gr.Textbox(interactive=False, show_label=False,
184
- info="Tokens used to generate the audio, up to 510 context length.")
185
- tokenize_btn = gr.Button("Tokenize", variant="secondary")
186
  gr.Markdown(TOKEN_NOTE)
187
- with gr.Tab(label="Stream"):
188
- out_stream = gr.Audio(label="Output Audio Stream", interactive=False, streaming=True, autoplay=True)
 
 
 
 
 
 
 
 
 
 
 
189
  with gr.Row():
190
- stream_btn = gr.Button("Stream", variant="primary")
191
- stop_btn = gr.Button("Stop", variant="stop")
192
- gr.Markdown("⚠️ Streaming may have initial delays due to processing.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
 
194
- generate_btn.click(fn=generate_first, inputs=[text, voice, speed, use_gpu], outputs=[out_audio, out_ps])
195
- tokenize_btn.click(fn=tokenize_first, inputs=[text, voice], outputs=[out_ps])
196
- stream_event = stream_btn.click(fn=generate_all, inputs=[text, voice, speed, use_gpu], outputs=[out_stream])
 
 
 
 
 
 
 
 
 
 
 
 
197
  stop_btn.click(fn=None, cancels=[stream_event])
198
 
199
  if __name__ == "__main__":
200
  logger.info("Starting Gradio app...")
201
- app.launch()
202
  logger.info("Gradio app started.")
 
57
  for voice_file in voice_files:
58
  prefix = voice_file[:2]
59
  if prefix == 'af':
60
+ label = f"🇺🇸 Female: {voice_file[3:-3].capitalize()}"
61
  elif prefix == 'am':
62
+ label = f"🇺🇸 Male: {voice_file[3:-3].capitalize()}"
63
  elif prefix == 'bf':
64
+ label = f"🇬🇧 Female: {voice_file[3:-3].capitalize()}"
65
  elif prefix == 'bm':
66
+ label = f"🇬🇧 Male: {voice_file[3:-3].capitalize()}"
67
  else:
68
+ label = f"Unknown: {voice_file[:-3]}"
69
  choices[label] = voice_file
70
  if not choices:
71
  logger.warning("No voice files found in VOICE_DIR. Adding a placeholder.")
72
+ choices = {"🇺🇸 Female: Bella": "af_bella.pt"}
73
  VOICE_CHOICES = choices
74
  return choices
75
 
 
87
  start_time = time.time()
88
  if len(text) > 510:
89
  text = text[:510]
90
+ gr.Warning("Text truncated to 510 characters for faster processing.")
91
  voice_path = os.path.join(VOICE_DIR, voice)
92
  if not os.path.exists(voice_path):
93
+ raise gr.Error(f"Voice file not found: {voice_path}")
94
 
95
  pipeline = pipelines[voice[0]]
96
  use_gpu = use_gpu and CUDA_AVAILABLE
 
117
  def tokenize_first(text, voice="af_bella.pt"):
118
  if len(text) > 510:
119
  text = text[:510]
120
+ gr.Warning("Text truncated to 510 characters for faster processing.")
121
  voice_path = os.path.join(VOICE_DIR, voice)
122
  if not os.path.exists(voice_path):
123
+ raise gr.Error(f"Voice file not found: {voice_path}")
124
 
125
  pipeline = pipelines[voice[0]]
126
  generator = pipeline(text, voice=voice_path)
 
132
  start_time = time.time()
133
  if len(text) > 510:
134
  text = text[:510]
135
+ gr.Warning("Text truncated to 510 characters for faster processing.")
136
  voice_path = os.path.join(VOICE_DIR, voice)
137
  if not os.path.exists(voice_path):
138
+ raise gr.Error(f"Voice file not found: {voice_path}")
139
 
140
  pipeline = pipelines[voice[0]]
141
  use_gpu = use_gpu and CUDA_AVAILABLE
 
151
  logger.info(f"Streaming generation time: {time.time() - start_time} seconds")
152
 
153
  TOKEN_NOTE = '''
154
+ **How to Customize Pronunciation**
155
+ - Use Markdown link syntax, e.g., `[Kokoro](/kˈOkəɹO/)` for custom pronunciation.
156
+ - Adjust intonation with punctuation: `;:,.!?—…"()“”`.
157
+ - Control stress: `[word](-1)` or `[word](-2)` to lower, `[word](+1)` or `[word](+2)` to raise stress.
 
 
 
158
  '''
159
 
160
+ with gr.Blocks(theme="huggingface", css=".gr-button-primary {background-color: #1e88e5 !important; color: white !important;}") as app:
161
+ gr.Markdown("# Kokoro TTS: Text-to-Speech Generator")
162
+ gr.Markdown("Enter text and select a voice to generate high-quality audio. Adjust speed for faster or slower speech.")
163
+
164
+ with gr.Column():
165
+ text = gr.Textbox(
166
+ label="Input Text",
167
+ value=TEXT,
168
+ placeholder="Type your text here (max 510 characters)",
169
+ lines=3,
170
+ max_lines=5,
171
+ info="Enter text to convert to speech."
172
+ )
173
+ with gr.Row():
174
+ voice = gr.Dropdown(
175
+ choices=list(CHOICES.items()),
176
+ value="af_bella.pt" if "af_bella.pt" in CHOICES.values() else list(CHOICES.values())[0],
177
+ label="Voice",
178
+ info="Choose a voice for the audio output."
179
+ )
180
+ use_gpu = gr.Dropdown(
181
+ choices=[("GPU 🚀 (Faster)", True), ("CPU 🐌 (Slower)", False)],
182
+ value=CUDA_AVAILABLE,
183
+ label="Hardware",
184
+ info="GPU is faster but requires CUDA support.",
185
+ interactive=CUDA_AVAILABLE
186
+ )
187
+ speed = gr.Slider(
188
+ minimum=0.5,
189
+ maximum=2,
190
+ value=1,
191
+ step=0.1,
192
+ label="Speech Speed",
193
+ info="Adjust the speed of the generated audio (0.5 = slower, 2 = faster)."
194
+ )
195
+
196
+ with gr.Tabs():
197
+ with gr.Tab(label="Generate Audio"):
198
+ out_audio = gr.Audio(
199
+ label="Generated Audio",
200
+ interactive=False,
201
+ streaming=False,
202
+ autoplay=True
203
+ )
204
+ status = gr.Textbox(
205
+ value="Ready to generate audio.",
206
+ label="Status",
207
+ interactive=False
208
  )
209
+ generate_btn = gr.Button("Generate Audio", variant="primary")
210
+ with gr.Accordion("Pronunciation Tokens", open=False):
211
+ out_ps = gr.Textbox(
212
+ interactive=False,
213
+ show_label=False,
214
+ info="Tokens used to generate the audio."
215
+ )
216
+ tokenize_btn = gr.Button("Show Tokens", variant="secondary")
 
217
  gr.Markdown(TOKEN_NOTE)
218
+
219
+ with gr.Tab(label="Stream Audio"):
220
+ out_stream = gr.Audio(
221
+ label="Streaming Audio",
222
+ interactive=False,
223
+ streaming=True,
224
+ autoplay=True
225
+ )
226
+ status_stream = gr.Textbox(
227
+ value="Ready to stream audio.",
228
+ label="Status",
229
+ interactive=False
230
+ )
231
  with gr.Row():
232
+ stream_btn = gr.Button("Start Streaming", variant="primary")
233
+ stop_btn = gr.Button("Stop Streaming", variant="stop")
234
+ gr.Markdown("⚠️ Streaming may have slight delays due to processing.")
235
+
236
+ def update_status_generate(text, voice, speed, use_gpu):
237
+ status.value = "Generating audio..."
238
+ result, ps = generate_first(text, voice, speed, use_gpu)
239
+ status.value = "Audio generated successfully!" if result else "Failed to generate audio."
240
+ return result, ps
241
+
242
+ def update_status_tokenize(text, voice):
243
+ status.value = "Tokenizing text..."
244
+ result = tokenize_first(text, voice)
245
+ status.value = "Tokenization complete!" if result else "Failed to tokenize."
246
+ return result
247
+
248
+ def update_status_stream(text, voice, speed, use_gpu):
249
+ status_stream.value = "Starting audio stream..."
250
+ for audio in generate_all(text, voice, speed, use_gpu):
251
+ yield audio
252
+ status_stream.value = "Streaming complete!"
253
 
254
+ generate_btn.click(
255
+ fn=update_status_generate,
256
+ inputs=[text, voice, speed, use_gpu],
257
+ outputs=[out_audio, out_ps, status]
258
+ )
259
+ tokenize_btn.click(
260
+ fn=update_status_tokenize,
261
+ inputs=[text, voice],
262
+ outputs=[out_ps, status]
263
+ )
264
+ stream_event = stream_btn.click(
265
+ fn=update_status_stream,
266
+ inputs=[text, voice, speed, use_gpu],
267
+ outputs=[out_stream, status_stream]
268
+ )
269
  stop_btn.click(fn=None, cancels=[stream_event])
270
 
271
  if __name__ == "__main__":
272
  logger.info("Starting Gradio app...")
273
+ app.launch(queue=False)
274
  logger.info("Gradio app started.")