Yushen CHEN committed on
Commit
3287ba5
·
unverified ·
2 Parent(s): 53c7725 f6b1de2

Merge pull request #79 from fakerybakery/patch-1

Browse files
Files changed (1) hide show
  1. gradio_app.py +283 -266
gradio_app.py CHANGED
@@ -21,6 +21,20 @@ import librosa
21
  import click
22
  import soundfile as sf
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  SPLIT_WORDS = [
25
  "but", "however", "nevertheless", "yet", "still",
26
  "therefore", "thus", "hence", "consequently",
@@ -406,24 +420,15 @@ def update_speed(new_speed):
406
  speed = new_speed
407
  return f"Speed set to: {speed}"
408
 
409
- with gr.Blocks() as app:
410
- gr.Markdown(
411
- """
412
- # E2/F5 TTS with Advanced Batch Processing
413
-
414
- This is a local web UI for F5 TTS with advanced batch processing support, based on the unofficial [online demo](https://huggingface.co/spaces/mrfakename/E2-F5-TTS) supported by [mrfakename](https://github.com/fakerybakery). This app supports the following TTS models:
415
-
416
- * [F5-TTS](https://arxiv.org/abs/2410.06885) (A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching)
417
- * [E2 TTS](https://arxiv.org/abs/2406.18009) (Embarrassingly Easy Fully Non-Autoregressive Zero-Shot TTS)
418
-
419
- The checkpoints support English and Chinese.
420
-
421
- If you're having issues, try converting your reference audio to WAV or MP3, clipping it to 15s, and shortening your prompt.
422
-
423
- **NOTE: Reference text will be automatically transcribed with Whisper if not provided. For best results, keep your reference clips short (<15s). Ensure the audio is fully uploaded before generating.**
424
- """
425
- )
426
 
 
 
 
 
 
427
  ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
428
  gen_text_input = gr.Textbox(label="Text to Generate", lines=10)
429
  model_choice = gr.Radio(
@@ -472,38 +477,32 @@ If you're having issues, try converting your reference audio to WAV or MP3, clip
472
  outputs=[audio_output, spectrogram_output],
473
  )
474
 
475
- gr.Markdown(
476
- """
477
- # Podcast Generation
478
-
479
- Supported by [RootingInLoad](https://github.com/RootingInLoad)
480
- """
481
- )
482
- with gr.Tab("Podcast Generation"):
483
- speaker1_name = gr.Textbox(label="Speaker 1 Name")
484
- ref_audio_input1 = gr.Audio(label="Reference Audio (Speaker 1)", type="filepath")
485
- ref_text_input1 = gr.Textbox(label="Reference Text (Speaker 1)", lines=2)
486
-
487
- speaker2_name = gr.Textbox(label="Speaker 2 Name")
488
- ref_audio_input2 = gr.Audio(label="Reference Audio (Speaker 2)", type="filepath")
489
- ref_text_input2 = gr.Textbox(label="Reference Text (Speaker 2)", lines=2)
490
-
491
- script_input = gr.Textbox(label="Podcast Script", lines=10,
492
- placeholder="Enter the script with speaker names at the start of each block, e.g.:\nSean: How did you start studying...\n\nMeghan: I came to my interest in technology...\nIt was a long journey...\n\nSean: That's fascinating. Can you elaborate...")
493
-
494
- podcast_model_choice = gr.Radio(
495
- choices=["F5-TTS", "E2-TTS"], label="Choose TTS Model", value="F5-TTS"
496
- )
497
- podcast_remove_silence = gr.Checkbox(
498
- label="Remove Silences",
499
- value=True,
500
- )
501
- generate_podcast_btn = gr.Button("Generate Podcast", variant="primary")
502
- podcast_output = gr.Audio(label="Generated Podcast")
503
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
504
  def podcast_generation(script, speaker1, ref_audio1, ref_text1, speaker2, ref_audio2, ref_text2, model, remove_silence):
505
  return generate_podcast(script, speaker1, ref_audio1, ref_text1, speaker2, ref_audio2, ref_text2, model, remove_silence)
506
-
507
  generate_podcast_btn.click(
508
  podcast_generation,
509
  inputs=[
@@ -520,30 +519,31 @@ Supported by [RootingInLoad](https://github.com/RootingInLoad)
520
  outputs=podcast_output,
521
  )
522
 
523
- def parse_emotional_text(gen_text):
524
- # Pattern to find (Emotion)
525
- pattern = r'\((.*?)\)'
526
 
527
- # Split the text by the pattern
528
- tokens = re.split(pattern, gen_text)
529
 
530
- segments = []
531
 
532
- current_emotion = 'Regular'
533
 
534
- for i in range(len(tokens)):
535
- if i % 2 == 0:
536
- # This is text
537
- text = tokens[i].strip()
538
- if text:
539
- segments.append({'emotion': current_emotion, 'text': text})
540
- else:
541
- # This is emotion
542
- emotion = tokens[i].strip()
543
- current_emotion = emotion
544
 
545
- return segments
546
 
 
547
  # New section for emotional generation
548
  gr.Markdown(
549
  """
@@ -557,233 +557,250 @@ Supported by [RootingInLoad](https://github.com/RootingInLoad)
557
  """
558
  )
559
 
560
- with gr.Tab("Multiple Speech-Type Generation"):
561
- gr.Markdown("Upload different audio clips for each speech type. 'Regular' emotion is mandatory. You can add additional speech types by clicking the 'Add Speech Type' button.")
 
 
 
 
 
 
 
 
 
 
 
 
562
 
563
- # Regular speech type (mandatory)
564
  with gr.Row():
565
- regular_name = gr.Textbox(value='Regular', label='Speech Type Name', interactive=False)
566
- regular_audio = gr.Audio(label='Regular Reference Audio', type='filepath')
567
- regular_ref_text = gr.Textbox(label='Reference Text (Regular)', lines=2)
568
-
569
- # Additional speech types (up to 9 more)
570
- max_speech_types = 10
571
- speech_type_names = []
572
- speech_type_audios = []
573
- speech_type_ref_texts = []
574
- speech_type_delete_btns = []
575
-
576
- for i in range(max_speech_types - 1):
577
- with gr.Row():
578
- name_input = gr.Textbox(label='Speech Type Name', visible=False)
579
- audio_input = gr.Audio(label='Reference Audio', type='filepath', visible=False)
580
- ref_text_input = gr.Textbox(label='Reference Text', lines=2, visible=False)
581
- delete_btn = gr.Button("Delete", variant="secondary", visible=False)
582
- speech_type_names.append(name_input)
583
- speech_type_audios.append(audio_input)
584
- speech_type_ref_texts.append(ref_text_input)
585
- speech_type_delete_btns.append(delete_btn)
586
-
587
- # Button to add speech type
588
- add_speech_type_btn = gr.Button("Add Speech Type")
589
-
590
- # Keep track of current number of speech types
591
- speech_type_count = gr.State(value=0)
592
-
593
- # Function to add a speech type
594
- def add_speech_type_fn(speech_type_count):
595
- if speech_type_count < max_speech_types - 1:
596
- speech_type_count += 1
597
- # Prepare updates for the components
598
- name_updates = []
599
- audio_updates = []
600
- ref_text_updates = []
601
- delete_btn_updates = []
602
- for i in range(max_speech_types - 1):
603
- if i < speech_type_count:
604
- name_updates.append(gr.update(visible=True))
605
- audio_updates.append(gr.update(visible=True))
606
- ref_text_updates.append(gr.update(visible=True))
607
- delete_btn_updates.append(gr.update(visible=True))
608
- else:
609
- name_updates.append(gr.update())
610
- audio_updates.append(gr.update())
611
- ref_text_updates.append(gr.update())
612
- delete_btn_updates.append(gr.update())
613
- else:
614
- # Optionally, show a warning
615
- # gr.Warning("Maximum number of speech types reached.")
616
- name_updates = [gr.update() for _ in range(max_speech_types - 1)]
617
- audio_updates = [gr.update() for _ in range(max_speech_types - 1)]
618
- ref_text_updates = [gr.update() for _ in range(max_speech_types - 1)]
619
- delete_btn_updates = [gr.update() for _ in range(max_speech_types - 1)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
620
  return [speech_type_count] + name_updates + audio_updates + ref_text_updates + delete_btn_updates
621
 
622
- add_speech_type_btn.click(
623
- add_speech_type_fn,
 
 
 
 
624
  inputs=speech_type_count,
625
  outputs=[speech_type_count] + speech_type_names + speech_type_audios + speech_type_ref_texts + speech_type_delete_btns
626
  )
627
 
628
- # Function to delete a speech type
629
- def make_delete_speech_type_fn(index):
630
- def delete_speech_type_fn(speech_type_count):
631
- # Prepare updates
632
- name_updates = []
633
- audio_updates = []
634
- ref_text_updates = []
635
- delete_btn_updates = []
636
-
637
- for i in range(max_speech_types - 1):
638
- if i == index:
639
- name_updates.append(gr.update(visible=False, value=''))
640
- audio_updates.append(gr.update(visible=False, value=None))
641
- ref_text_updates.append(gr.update(visible=False, value=''))
642
- delete_btn_updates.append(gr.update(visible=False))
643
- else:
644
- name_updates.append(gr.update())
645
- audio_updates.append(gr.update())
646
- ref_text_updates.append(gr.update())
647
- delete_btn_updates.append(gr.update())
648
 
649
- speech_type_count = max(0, speech_type_count - 1)
 
 
 
650
 
651
- return [speech_type_count] + name_updates + audio_updates + ref_text_updates + delete_btn_updates
 
 
 
 
652
 
653
- return delete_speech_type_fn
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
654
 
655
- for i, delete_btn in enumerate(speech_type_delete_btns):
656
- delete_fn = make_delete_speech_type_fn(i)
657
- delete_btn.click(
658
- delete_fn,
659
- inputs=speech_type_count,
660
- outputs=[speech_type_count] + speech_type_names + speech_type_audios + speech_type_ref_texts + speech_type_delete_btns
661
- )
662
 
663
- # Text input for the prompt
664
- gen_text_input_emotional = gr.Textbox(label="Text to Generate", lines=10)
 
 
 
665
 
666
- # Model choice
667
- model_choice_emotional = gr.Radio(
668
- choices=["F5-TTS", "E2-TTS"], label="Choose TTS Model", value="F5-TTS"
669
- )
670
 
671
- with gr.Accordion("Advanced Settings", open=False):
672
- remove_silence_emotional = gr.Checkbox(
673
- label="Remove Silences",
674
- value=True,
675
- )
676
 
677
- # Generate button
678
- generate_emotional_btn = gr.Button("Generate Emotional Speech", variant="primary")
679
 
680
- # Output audio
681
- audio_output_emotional = gr.Audio(label="Synthesized Audio")
 
 
 
 
 
682
 
683
- def generate_emotional_speech(
 
 
684
  regular_audio,
685
  regular_ref_text,
686
- gen_text,
687
- *args,
688
- ):
689
- num_additional_speech_types = max_speech_types - 1
690
- speech_type_names_list = args[:num_additional_speech_types]
691
- speech_type_audios_list = args[num_additional_speech_types:2 * num_additional_speech_types]
692
- speech_type_ref_texts_list = args[2 * num_additional_speech_types:3 * num_additional_speech_types]
693
- model_choice = args[3 * num_additional_speech_types]
694
- remove_silence = args[3 * num_additional_speech_types + 1]
695
-
696
- # Collect the speech types and their audios into a dict
697
- speech_types = {'Regular': {'audio': regular_audio, 'ref_text': regular_ref_text}}
698
-
699
- for name_input, audio_input, ref_text_input in zip(speech_type_names_list, speech_type_audios_list, speech_type_ref_texts_list):
700
- if name_input and audio_input:
701
- speech_types[name_input] = {'audio': audio_input, 'ref_text': ref_text_input}
702
-
703
- # Parse the gen_text into segments
704
- segments = parse_speechtypes_text(gen_text)
705
-
706
- # For each segment, generate speech
707
- generated_audio_segments = []
708
- current_emotion = 'Regular'
709
-
710
- for segment in segments:
711
- emotion = segment['emotion']
712
- text = segment['text']
713
-
714
- if emotion in speech_types:
715
- current_emotion = emotion
716
- else:
717
- # If emotion not available, default to Regular
718
- current_emotion = 'Regular'
719
 
720
- ref_audio = speech_types[current_emotion]['audio']
721
- ref_text = speech_types[current_emotion].get('ref_text', '')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
722
 
723
- # Generate speech for this segment
724
- audio, _ = infer(ref_audio, ref_text, text, model_choice, remove_silence, "")
725
- sr, audio_data = audio
 
 
 
 
 
 
726
 
727
- generated_audio_segments.append(audio_data)
728
 
729
- # Concatenate all audio segments
730
- if generated_audio_segments:
731
- final_audio_data = np.concatenate(generated_audio_segments)
732
- return (sr, final_audio_data)
733
- else:
734
- gr.Warning("No audio generated.")
735
- return None
736
-
737
- generate_emotional_btn.click(
738
- generate_emotional_speech,
739
- inputs=[
740
- regular_audio,
741
- regular_ref_text,
742
- gen_text_input_emotional,
743
- ] + speech_type_names + speech_type_audios + speech_type_ref_texts + [
744
- model_choice_emotional,
745
- remove_silence_emotional,
746
- ],
747
- outputs=audio_output_emotional,
748
- )
749
 
750
- # Validation function to disable Generate button if speech types are missing
751
- def validate_speech_types(
752
- gen_text,
753
- regular_name,
754
- *args
755
- ):
756
- num_additional_speech_types = max_speech_types - 1
757
- speech_type_names_list = args[:num_additional_speech_types]
758
-
759
- # Collect the speech types names
760
- speech_types_available = set()
761
- if regular_name:
762
- speech_types_available.add(regular_name)
763
- for name_input in speech_type_names_list:
764
- if name_input:
765
- speech_types_available.add(name_input)
766
-
767
- # Parse the gen_text to get the speech types used
768
- segments = parse_emotional_text(gen_text)
769
- speech_types_in_text = set(segment['emotion'] for segment in segments)
770
-
771
- # Check if all speech types in text are available
772
- missing_speech_types = speech_types_in_text - speech_types_available
773
-
774
- if missing_speech_types:
775
- # Disable the generate button
776
- return gr.update(interactive=False)
777
- else:
778
- # Enable the generate button
779
- return gr.update(interactive=True)
780
 
781
- gen_text_input_emotional.change(
782
- validate_speech_types,
783
- inputs=[gen_text_input_emotional, regular_name] + speech_type_names,
784
- outputs=generate_emotional_btn
785
- )
786
-
787
  @click.command()
788
  @click.option("--port", "-p", default=None, type=int, help="Port to run the app on")
789
  @click.option("--host", "-H", default=None, help="Host to run the app on")
@@ -804,4 +821,4 @@ def main(port, host, share, api):
804
 
805
 
806
  if __name__ == "__main__":
807
- main()
 
21
  import click
22
  import soundfile as sf
23
 
24
+ try:
25
+ import spaces
26
+ USING_SPACES = True
27
+ except ImportError:
28
+ USING_SPACES = False
29
+
30
+ def gpu_decorator(func):
31
+ if USING_SPACES:
32
+ return spaces.GPU(func)
33
+ else:
34
+ return func
35
+
36
+
37
+
38
  SPLIT_WORDS = [
39
  "but", "however", "nevertheless", "yet", "still",
40
  "therefore", "thus", "hence", "consequently",
 
420
  speed = new_speed
421
  return f"Speed set to: {speed}"
422
 
423
+ with gr.Blocks() as app_credits:
424
+ gr.Markdown("""
425
+ # Credits
 
 
 
 
 
 
 
 
 
 
 
 
 
 
426
 
427
+ * [mrfakename](https://github.com/fakerybakery) for the original [online demo](https://huggingface.co/spaces/mrfakename/E2-F5-TTS)
428
+ * [RootingInLoad](https://github.com/RootingInLoad) for the podcast generation
429
+ """)
430
+ with gr.Blocks() as app_tts:
431
+ gr.Markdown("# Batched TTS")
432
  ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
433
  gen_text_input = gr.Textbox(label="Text to Generate", lines=10)
434
  model_choice = gr.Radio(
 
477
  outputs=[audio_output, spectrogram_output],
478
  )
479
 
480
+ with gr.Blocks() as app_podcast:
481
+ gr.Markdown("# Podcast Generation")
482
+ speaker1_name = gr.Textbox(label="Speaker 1 Name")
483
+ ref_audio_input1 = gr.Audio(label="Reference Audio (Speaker 1)", type="filepath")
484
+ ref_text_input1 = gr.Textbox(label="Reference Text (Speaker 1)", lines=2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
485
 
486
+ speaker2_name = gr.Textbox(label="Speaker 2 Name")
487
+ ref_audio_input2 = gr.Audio(label="Reference Audio (Speaker 2)", type="filepath")
488
+ ref_text_input2 = gr.Textbox(label="Reference Text (Speaker 2)", lines=2)
489
+
490
+ script_input = gr.Textbox(label="Podcast Script", lines=10,
491
+ placeholder="Enter the script with speaker names at the start of each block, e.g.:\nSean: How did you start studying...\n\nMeghan: I came to my interest in technology...\nIt was a long journey...\n\nSean: That's fascinating. Can you elaborate...")
492
+
493
+ podcast_model_choice = gr.Radio(
494
+ choices=["F5-TTS", "E2-TTS"], label="Choose TTS Model", value="F5-TTS"
495
+ )
496
+ podcast_remove_silence = gr.Checkbox(
497
+ label="Remove Silences",
498
+ value=True,
499
+ )
500
+ generate_podcast_btn = gr.Button("Generate Podcast", variant="primary")
501
+ podcast_output = gr.Audio(label="Generated Podcast")
502
+
503
  def podcast_generation(script, speaker1, ref_audio1, ref_text1, speaker2, ref_audio2, ref_text2, model, remove_silence):
504
  return generate_podcast(script, speaker1, ref_audio1, ref_text1, speaker2, ref_audio2, ref_text2, model, remove_silence)
505
+
506
  generate_podcast_btn.click(
507
  podcast_generation,
508
  inputs=[
 
519
  outputs=podcast_output,
520
  )
521
 
522
+ def parse_emotional_text(gen_text):
523
+ # Pattern to find (Emotion)
524
+ pattern = r'\((.*?)\)'
525
 
526
+ # Split the text by the pattern
527
+ tokens = re.split(pattern, gen_text)
528
 
529
+ segments = []
530
 
531
+ current_emotion = 'Regular'
532
 
533
+ for i in range(len(tokens)):
534
+ if i % 2 == 0:
535
+ # This is text
536
+ text = tokens[i].strip()
537
+ if text:
538
+ segments.append({'emotion': current_emotion, 'text': text})
539
+ else:
540
+ # This is emotion
541
+ emotion = tokens[i].strip()
542
+ current_emotion = emotion
543
 
544
+ return segments
545
 
546
+ with gr.Blocks() as app_emotional:
547
  # New section for emotional generation
548
  gr.Markdown(
549
  """
 
557
  """
558
  )
559
 
560
+ gr.Markdown("Upload different audio clips for each speech type. 'Regular' emotion is mandatory. You can add additional speech types by clicking the 'Add Speech Type' button.")
561
+
562
+ # Regular speech type (mandatory)
563
+ with gr.Row():
564
+ regular_name = gr.Textbox(value='Regular', label='Speech Type Name', interactive=False)
565
+ regular_audio = gr.Audio(label='Regular Reference Audio', type='filepath')
566
+ regular_ref_text = gr.Textbox(label='Reference Text (Regular)', lines=2)
567
+
568
+ # Additional speech types (up to 9 more)
569
+ max_speech_types = 10
570
+ speech_type_names = []
571
+ speech_type_audios = []
572
+ speech_type_ref_texts = []
573
+ speech_type_delete_btns = []
574
 
575
+ for i in range(max_speech_types - 1):
576
  with gr.Row():
577
+ name_input = gr.Textbox(label='Speech Type Name', visible=False)
578
+ audio_input = gr.Audio(label='Reference Audio', type='filepath', visible=False)
579
+ ref_text_input = gr.Textbox(label='Reference Text', lines=2, visible=False)
580
+ delete_btn = gr.Button("Delete", variant="secondary", visible=False)
581
+ speech_type_names.append(name_input)
582
+ speech_type_audios.append(audio_input)
583
+ speech_type_ref_texts.append(ref_text_input)
584
+ speech_type_delete_btns.append(delete_btn)
585
+
586
+ # Button to add speech type
587
+ add_speech_type_btn = gr.Button("Add Speech Type")
588
+
589
+ # Keep track of current number of speech types
590
+ speech_type_count = gr.State(value=0)
591
+
592
+ # Function to add a speech type
593
+ def add_speech_type_fn(speech_type_count):
594
+ if speech_type_count < max_speech_types - 1:
595
+ speech_type_count += 1
596
+ # Prepare updates for the components
597
+ name_updates = []
598
+ audio_updates = []
599
+ ref_text_updates = []
600
+ delete_btn_updates = []
601
+ for i in range(max_speech_types - 1):
602
+ if i < speech_type_count:
603
+ name_updates.append(gr.update(visible=True))
604
+ audio_updates.append(gr.update(visible=True))
605
+ ref_text_updates.append(gr.update(visible=True))
606
+ delete_btn_updates.append(gr.update(visible=True))
607
+ else:
608
+ name_updates.append(gr.update())
609
+ audio_updates.append(gr.update())
610
+ ref_text_updates.append(gr.update())
611
+ delete_btn_updates.append(gr.update())
612
+ else:
613
+ # Optionally, show a warning
614
+ # gr.Warning("Maximum number of speech types reached.")
615
+ name_updates = [gr.update() for _ in range(max_speech_types - 1)]
616
+ audio_updates = [gr.update() for _ in range(max_speech_types - 1)]
617
+ ref_text_updates = [gr.update() for _ in range(max_speech_types - 1)]
618
+ delete_btn_updates = [gr.update() for _ in range(max_speech_types - 1)]
619
+ return [speech_type_count] + name_updates + audio_updates + ref_text_updates + delete_btn_updates
620
+
621
+ add_speech_type_btn.click(
622
+ add_speech_type_fn,
623
+ inputs=speech_type_count,
624
+ outputs=[speech_type_count] + speech_type_names + speech_type_audios + speech_type_ref_texts + speech_type_delete_btns
625
+ )
626
+
627
+ # Function to delete a speech type
628
+ def make_delete_speech_type_fn(index):
629
+ def delete_speech_type_fn(speech_type_count):
630
+ # Prepare updates
631
+ name_updates = []
632
+ audio_updates = []
633
+ ref_text_updates = []
634
+ delete_btn_updates = []
635
+
636
+ for i in range(max_speech_types - 1):
637
+ if i == index:
638
+ name_updates.append(gr.update(visible=False, value=''))
639
+ audio_updates.append(gr.update(visible=False, value=None))
640
+ ref_text_updates.append(gr.update(visible=False, value=''))
641
+ delete_btn_updates.append(gr.update(visible=False))
642
+ else:
643
+ name_updates.append(gr.update())
644
+ audio_updates.append(gr.update())
645
+ ref_text_updates.append(gr.update())
646
+ delete_btn_updates.append(gr.update())
647
+
648
+ speech_type_count = max(0, speech_type_count - 1)
649
+
650
  return [speech_type_count] + name_updates + audio_updates + ref_text_updates + delete_btn_updates
651
 
652
+ return delete_speech_type_fn
653
+
654
+ for i, delete_btn in enumerate(speech_type_delete_btns):
655
+ delete_fn = make_delete_speech_type_fn(i)
656
+ delete_btn.click(
657
+ delete_fn,
658
  inputs=speech_type_count,
659
  outputs=[speech_type_count] + speech_type_names + speech_type_audios + speech_type_ref_texts + speech_type_delete_btns
660
  )
661
 
662
+ # Text input for the prompt
663
+ gen_text_input_emotional = gr.Textbox(label="Text to Generate", lines=10)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
664
 
665
+ # Model choice
666
+ model_choice_emotional = gr.Radio(
667
+ choices=["F5-TTS", "E2-TTS"], label="Choose TTS Model", value="F5-TTS"
668
+ )
669
 
670
+ with gr.Accordion("Advanced Settings", open=False):
671
+ remove_silence_emotional = gr.Checkbox(
672
+ label="Remove Silences",
673
+ value=True,
674
+ )
675
 
676
+ # Generate button
677
+ generate_emotional_btn = gr.Button("Generate Emotional Speech", variant="primary")
678
+
679
+ # Output audio
680
+ audio_output_emotional = gr.Audio(label="Synthesized Audio")
681
+
682
+ def generate_emotional_speech(
683
+ regular_audio,
684
+ regular_ref_text,
685
+ gen_text,
686
+ *args,
687
+ ):
688
+ num_additional_speech_types = max_speech_types - 1
689
+ speech_type_names_list = args[:num_additional_speech_types]
690
+ speech_type_audios_list = args[num_additional_speech_types:2 * num_additional_speech_types]
691
+ speech_type_ref_texts_list = args[2 * num_additional_speech_types:3 * num_additional_speech_types]
692
+ model_choice = args[3 * num_additional_speech_types]
693
+ remove_silence = args[3 * num_additional_speech_types + 1]
694
+
695
+ # Collect the speech types and their audios into a dict
696
+ speech_types = {'Regular': {'audio': regular_audio, 'ref_text': regular_ref_text}}
697
+
698
+ for name_input, audio_input, ref_text_input in zip(speech_type_names_list, speech_type_audios_list, speech_type_ref_texts_list):
699
+ if name_input and audio_input:
700
+ speech_types[name_input] = {'audio': audio_input, 'ref_text': ref_text_input}
701
+
702
+ # Parse the gen_text into segments
703
+ segments = parse_speechtypes_text(gen_text)
704
+
705
+ # For each segment, generate speech
706
+ generated_audio_segments = []
707
+ current_emotion = 'Regular'
708
 
709
+ for segment in segments:
710
+ emotion = segment['emotion']
711
+ text = segment['text']
 
 
 
 
712
 
713
+ if emotion in speech_types:
714
+ current_emotion = emotion
715
+ else:
716
+ # If emotion not available, default to Regular
717
+ current_emotion = 'Regular'
718
 
719
+ ref_audio = speech_types[current_emotion]['audio']
720
+ ref_text = speech_types[current_emotion].get('ref_text', '')
 
 
721
 
722
+ # Generate speech for this segment
723
+ audio, _ = infer(ref_audio, ref_text, text, model_choice, remove_silence, "")
724
+ sr, audio_data = audio
 
 
725
 
726
+ generated_audio_segments.append(audio_data)
 
727
 
728
+ # Concatenate all audio segments
729
+ if generated_audio_segments:
730
+ final_audio_data = np.concatenate(generated_audio_segments)
731
+ return (sr, final_audio_data)
732
+ else:
733
+ gr.Warning("No audio generated.")
734
+ return None
735
 
736
+ generate_emotional_btn.click(
737
+ generate_emotional_speech,
738
+ inputs=[
739
  regular_audio,
740
  regular_ref_text,
741
+ gen_text_input_emotional,
742
+ ] + speech_type_names + speech_type_audios + speech_type_ref_texts + [
743
+ model_choice_emotional,
744
+ remove_silence_emotional,
745
+ ],
746
+ outputs=audio_output_emotional,
747
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
748
 
749
+ # Validation function to disable Generate button if speech types are missing
750
+ def validate_speech_types(
751
+ gen_text,
752
+ regular_name,
753
+ *args
754
+ ):
755
+ num_additional_speech_types = max_speech_types - 1
756
+ speech_type_names_list = args[:num_additional_speech_types]
757
+
758
+ # Collect the speech types names
759
+ speech_types_available = set()
760
+ if regular_name:
761
+ speech_types_available.add(regular_name)
762
+ for name_input in speech_type_names_list:
763
+ if name_input:
764
+ speech_types_available.add(name_input)
765
+
766
+ # Parse the gen_text to get the speech types used
767
+ segments = parse_emotional_text(gen_text)
768
+ speech_types_in_text = set(segment['emotion'] for segment in segments)
769
+
770
+ # Check if all speech types in text are available
771
+ missing_speech_types = speech_types_in_text - speech_types_available
772
+
773
+ if missing_speech_types:
774
+ # Disable the generate button
775
+ return gr.update(interactive=False)
776
+ else:
777
+ # Enable the generate button
778
+ return gr.update(interactive=True)
779
 
780
+ gen_text_input_emotional.change(
781
+ validate_speech_types,
782
+ inputs=[gen_text_input_emotional, regular_name] + speech_type_names,
783
+ outputs=generate_emotional_btn
784
+ )
785
+ with gr.Blocks() as app:
786
+ gr.Markdown(
787
+ """
788
+ # E2/F5 TTS
789
 
790
+ This is a local web UI for F5 TTS with advanced batch processing support. This app supports the following TTS models:
791
 
792
+ * [F5-TTS](https://arxiv.org/abs/2410.06885) (A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching)
793
+ * [E2 TTS](https://arxiv.org/abs/2406.18009) (Embarrassingly Easy Fully Non-Autoregressive Zero-Shot TTS)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
794
 
795
+ The checkpoints support English and Chinese.
796
+
797
+ If you're having issues, try converting your reference audio to WAV or MP3, clipping it to 15s, and shortening your prompt.
798
+
799
+ **NOTE: Reference text will be automatically transcribed with Whisper if not provided. For best results, keep your reference clips short (<15s). Ensure the audio is fully uploaded before generating.**
800
+ """
801
+ )
802
+ gr.TabbedInterface([app_tts, app_podcast, app_emotional, app_credits], ["TTS", "Podcast", "Multi-Style", "Credits"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
803
 
 
 
 
 
 
 
804
  @click.command()
805
  @click.option("--port", "-p", default=None, type=int, help="Port to run the app on")
806
  @click.option("--host", "-H", default=None, help="Host to run the app on")
 
821
 
822
 
823
  if __name__ == "__main__":
824
+ main()