SWivid committed on
Commit
b4abb3c
·
1 Parent(s): ff690b7

update infer_gradio

Browse files
Files changed (1) hide show
  1. src/f5_tts/infer/infer_gradio.py +27 -84
src/f5_tts/infer/infer_gradio.py CHANGED
@@ -140,31 +140,6 @@ def generate_podcast(
140
  return podcast_path
141
 
142
 
143
- def parse_speechtypes_text(gen_text):
144
- # Pattern to find (Emotion)
145
- pattern = r"\((.*?)\)"
146
-
147
- # Split the text by the pattern
148
- tokens = re.split(pattern, gen_text)
149
-
150
- segments = []
151
-
152
- current_emotion = "Regular"
153
-
154
- for i in range(len(tokens)):
155
- if i % 2 == 0:
156
- # This is text
157
- text = tokens[i].strip()
158
- if text:
159
- segments.append({"emotion": current_emotion, "text": text})
160
- else:
161
- # This is emotion
162
- emotion = tokens[i].strip()
163
- current_emotion = emotion
164
-
165
- return segments
166
-
167
-
168
  with gr.Blocks() as app_credits:
169
  gr.Markdown("""
170
  # Credits
@@ -272,9 +247,9 @@ with gr.Blocks() as app_podcast:
272
  )
273
 
274
 
275
- def parse_emotional_text(gen_text):
276
- # Pattern to find (Emotion)
277
- pattern = r"\((.*?)\)"
278
 
279
  # Split the text by the pattern
280
  tokens = re.split(pattern, gen_text)
@@ -307,7 +282,7 @@ with gr.Blocks() as app_emotional:
307
 
308
  **Example Input:**
309
 
310
- (Regular) Hello, I'd like to order a sandwich please. (Surprised) What do you mean you're out of bread? (Sad) I really wanted a sandwich though... (Angry) You know what, darn you and your little shop, you suck! (Whisper) I'll just go back home and cry now. (Shouting) Why me?!
311
  """
312
  )
313
 
@@ -323,17 +298,19 @@ with gr.Blocks() as app_emotional:
323
 
324
  # Additional speech types (up to 99 more)
325
  max_speech_types = 100
 
326
  speech_type_names = []
327
  speech_type_audios = []
328
  speech_type_ref_texts = []
329
  speech_type_delete_btns = []
330
 
331
  for i in range(max_speech_types - 1):
332
- with gr.Row():
333
- name_input = gr.Textbox(label="Speech Type Name", visible=False)
334
- audio_input = gr.Audio(label="Reference Audio", type="filepath", visible=False)
335
- ref_text_input = gr.Textbox(label="Reference Text", lines=2, visible=False)
336
- delete_btn = gr.Button("Delete", variant="secondary", visible=False)
 
337
  speech_type_names.append(name_input)
338
  speech_type_audios.append(audio_input)
339
  speech_type_ref_texts.append(ref_text_input)
@@ -349,79 +326,44 @@ with gr.Blocks() as app_emotional:
349
  def add_speech_type_fn(speech_type_count):
350
  if speech_type_count < max_speech_types - 1:
351
  speech_type_count += 1
352
- # Prepare updates for the components
353
- name_updates = []
354
- audio_updates = []
355
- ref_text_updates = []
356
- delete_btn_updates = []
357
  for i in range(max_speech_types - 1):
358
  if i < speech_type_count:
359
- name_updates.append(gr.update(visible=True))
360
- audio_updates.append(gr.update(visible=True))
361
- ref_text_updates.append(gr.update(visible=True))
362
- delete_btn_updates.append(gr.update(visible=True))
363
  else:
364
- name_updates.append(gr.update())
365
- audio_updates.append(gr.update())
366
- ref_text_updates.append(gr.update())
367
- delete_btn_updates.append(gr.update())
368
  else:
369
  # Optionally, show a warning
370
- # gr.Warning("Maximum number of speech types reached.")
371
- name_updates = [gr.update() for _ in range(max_speech_types - 1)]
372
- audio_updates = [gr.update() for _ in range(max_speech_types - 1)]
373
- ref_text_updates = [gr.update() for _ in range(max_speech_types - 1)]
374
- delete_btn_updates = [gr.update() for _ in range(max_speech_types - 1)]
375
- return [speech_type_count] + name_updates + audio_updates + ref_text_updates + delete_btn_updates
376
 
377
  add_speech_type_btn.click(
378
- add_speech_type_fn,
379
- inputs=speech_type_count,
380
- outputs=[speech_type_count]
381
- + speech_type_names
382
- + speech_type_audios
383
- + speech_type_ref_texts
384
- + speech_type_delete_btns,
385
  )
386
 
387
  # Function to delete a speech type
388
  def make_delete_speech_type_fn(index):
389
  def delete_speech_type_fn(speech_type_count):
390
  # Prepare updates
391
- name_updates = []
392
- audio_updates = []
393
- ref_text_updates = []
394
- delete_btn_updates = []
395
 
396
  for i in range(max_speech_types - 1):
397
  if i == index:
398
- name_updates.append(gr.update(visible=False, value=""))
399
- audio_updates.append(gr.update(visible=False, value=None))
400
- ref_text_updates.append(gr.update(visible=False, value=""))
401
- delete_btn_updates.append(gr.update(visible=False))
402
  else:
403
- name_updates.append(gr.update())
404
- audio_updates.append(gr.update())
405
- ref_text_updates.append(gr.update())
406
- delete_btn_updates.append(gr.update())
407
 
408
  speech_type_count = max(0, speech_type_count - 1)
409
 
410
- return [speech_type_count] + name_updates + audio_updates + ref_text_updates + delete_btn_updates
411
 
412
  return delete_speech_type_fn
413
 
 
414
  for i, delete_btn in enumerate(speech_type_delete_btns):
415
  delete_fn = make_delete_speech_type_fn(i)
416
- delete_btn.click(
417
- delete_fn,
418
- inputs=speech_type_count,
419
- outputs=[speech_type_count]
420
- + speech_type_names
421
- + speech_type_audios
422
- + speech_type_ref_texts
423
- + speech_type_delete_btns,
424
- )
425
 
426
  # Text input for the prompt
427
  gen_text_input_emotional = gr.Textbox(label="Text to Generate", lines=10)
@@ -432,7 +374,7 @@ with gr.Blocks() as app_emotional:
432
  with gr.Accordion("Advanced Settings", open=False):
433
  remove_silence_emotional = gr.Checkbox(
434
  label="Remove Silences",
435
- value=True,
436
  )
437
 
438
  # Generate button
@@ -529,7 +471,7 @@ with gr.Blocks() as app_emotional:
529
  speech_types_available.add(name_input)
530
 
531
  # Parse the gen_text to get the speech types used
532
- segments = parse_emotional_text(gen_text)
533
  speech_types_in_text = set(segment["emotion"] for segment in segments)
534
 
535
  # Check if all speech types in text are available
@@ -547,6 +489,7 @@ with gr.Blocks() as app_emotional:
547
  inputs=[gen_text_input_emotional, regular_name] + speech_type_names,
548
  outputs=generate_emotional_btn,
549
  )
 
550
  with gr.Blocks() as app:
551
  gr.Markdown(
552
  """
 
140
  return podcast_path
141
 
142
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  with gr.Blocks() as app_credits:
144
  gr.Markdown("""
145
  # Credits
 
247
  )
248
 
249
 
250
+ def parse_speechtypes_text(gen_text):
251
+ # Pattern to find {speechtype}
252
+ pattern = r"\{(.*?)\}"
253
 
254
  # Split the text by the pattern
255
  tokens = re.split(pattern, gen_text)
 
282
 
283
  **Example Input:**
284
 
285
+ {Regular} Hello, I'd like to order a sandwich please. {Surprised} What do you mean you're out of bread? {Sad} I really wanted a sandwich though... {Angry} You know what, darn you and your little shop, you suck! {Whisper} I'll just go back home and cry now. {Shouting} Why me?!
286
  """
287
  )
288
 
 
298
 
299
  # Additional speech types (up to 99 more)
300
  max_speech_types = 100
301
+ speech_type_rows = []
302
  speech_type_names = []
303
  speech_type_audios = []
304
  speech_type_ref_texts = []
305
  speech_type_delete_btns = []
306
 
307
  for i in range(max_speech_types - 1):
308
+ with gr.Row(visible=False) as row:
309
+ name_input = gr.Textbox(label="Speech Type Name")
310
+ audio_input = gr.Audio(label="Reference Audio", type="filepath")
311
+ ref_text_input = gr.Textbox(label="Reference Text", lines=2)
312
+ delete_btn = gr.Button("Delete", variant="secondary")
313
+ speech_type_rows.append(row)
314
  speech_type_names.append(name_input)
315
  speech_type_audios.append(audio_input)
316
  speech_type_ref_texts.append(ref_text_input)
 
326
  def add_speech_type_fn(speech_type_count):
327
  if speech_type_count < max_speech_types - 1:
328
  speech_type_count += 1
329
+ # Prepare updates for the rows
330
+ row_updates = []
 
 
 
331
  for i in range(max_speech_types - 1):
332
  if i < speech_type_count:
333
+ row_updates.append(gr.update(visible=True))
 
 
 
334
  else:
335
+ row_updates.append(gr.update())
 
 
 
336
  else:
337
  # Optionally, show a warning
338
+ row_updates = [gr.update() for _ in range(max_speech_types - 1)]
339
+ return [speech_type_count] + row_updates
 
 
 
 
340
 
341
  add_speech_type_btn.click(
342
+ add_speech_type_fn, inputs=speech_type_count, outputs=[speech_type_count] + speech_type_rows
 
 
 
 
 
 
343
  )
344
 
345
  # Function to delete a speech type
346
  def make_delete_speech_type_fn(index):
347
  def delete_speech_type_fn(speech_type_count):
348
  # Prepare updates
349
+ row_updates = []
 
 
 
350
 
351
  for i in range(max_speech_types - 1):
352
  if i == index:
353
+ row_updates.append(gr.update(visible=False))
 
 
 
354
  else:
355
+ row_updates.append(gr.update())
 
 
 
356
 
357
  speech_type_count = max(0, speech_type_count - 1)
358
 
359
+ return [speech_type_count] + row_updates
360
 
361
  return delete_speech_type_fn
362
 
363
+ # Update delete button clicks
364
  for i, delete_btn in enumerate(speech_type_delete_btns):
365
  delete_fn = make_delete_speech_type_fn(i)
366
+ delete_btn.click(delete_fn, inputs=speech_type_count, outputs=[speech_type_count] + speech_type_rows)
 
 
 
 
 
 
 
 
367
 
368
  # Text input for the prompt
369
  gen_text_input_emotional = gr.Textbox(label="Text to Generate", lines=10)
 
374
  with gr.Accordion("Advanced Settings", open=False):
375
  remove_silence_emotional = gr.Checkbox(
376
  label="Remove Silences",
377
+ value=False,
378
  )
379
 
380
  # Generate button
 
471
  speech_types_available.add(name_input)
472
 
473
  # Parse the gen_text to get the speech types used
474
+ segments = parse_speechtypes_text(gen_text)
475
  speech_types_in_text = set(segment["emotion"] for segment in segments)
476
 
477
  # Check if all speech types in text are available
 
489
  inputs=[gen_text_input_emotional, regular_name] + speech_type_names,
490
  outputs=generate_emotional_btn,
491
  )
492
+
493
  with gr.Blocks() as app:
494
  gr.Markdown(
495
  """