Merge pull request #79 from fakerybakery/patch-1
gradio_app.py CHANGED (+283 -266)
@@ -21,6 +21,20 @@ import librosa
 import click
 import soundfile as sf
 
+try:
+    import spaces
+    USING_SPACES = True
+except ImportError:
+    USING_SPACES = False
+
+def gpu_decorator(func):
+    if USING_SPACES:
+        return spaces.GPU(func)
+    else:
+        return func
+
+
 SPLIT_WORDS = [
     "but", "however", "nevertheless", "yet", "still",
     "therefore", "thus", "hence", "consequently",
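The `try: import spaces` block added above is the usual optional-dependency shim for Hugging Face Spaces: on ZeroGPU hardware the `spaces` package exists and `spaces.GPU` schedules each wrapped call onto a GPU; anywhere else the decorator degrades to a no-op. A minimal sketch of how such a decorator would be applied (the `infer_tts` name below is illustrative, not from this PR):

```python
# Hypothetical usage of the gpu_decorator defined in the hunk above.
@gpu_decorator
def infer_tts(ref_audio_path, gen_text):
    # heavy model forward pass; on Spaces this call gets a GPU allocated
    ...
```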
@@ -406,24 +420,15 @@ def update_speed(new_speed):
     speed = new_speed
     return f"Speed set to: {speed}"
 
-with gr.Blocks() as app:
-    gr.Markdown(
-        """
-# E2/F5 TTS with Advanced Batch Processing
-
-This is a local web UI for F5 TTS with advanced batch processing support, based on the unofficial [online demo](https://huggingface.co/spaces/mrfakename/E2-F5-TTS) supported by [mrfakename](https://github.com/fakerybakery). This app supports the following TTS models:
-
-* [F5-TTS](https://arxiv.org/abs/2410.06885) (A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching)
-* [E2 TTS](https://arxiv.org/abs/2406.18009) (Embarrassingly Easy Fully Non-Autoregressive Zero-Shot TTS)
-
-The checkpoints support English and Chinese.
-
-If you're having issues, try converting your reference audio to WAV or MP3, clipping it to 15s, and shortening your prompt.
-
-**NOTE: Reference text will be automatically transcribed with Whisper if not provided. For best results, keep your reference clips short (<15s). Ensure the audio is fully uploaded before generating.**
-"""
-    )
+with gr.Blocks() as app_credits:
+    gr.Markdown("""
+# Credits
 
+* [mrfakename](https://github.com/fakerybakery) for the original [online demo](https://huggingface.co/spaces/mrfakename/E2-F5-TTS)
+* [RootingInLoad](https://github.com/RootingInLoad) for the podcast generation
+""")
+with gr.Blocks() as app_tts:
+    gr.Markdown("# Batched TTS")
     ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
     gen_text_input = gr.Textbox(label="Text to Generate", lines=10)
     model_choice = gr.Radio(
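This hunk begins the PR's main structural change: instead of one monolithic `gr.Blocks` app with `gr.Tab` sections, each tab is now built as its own `gr.Blocks` object (`app_credits`, `app_tts`, and so on), and the last hunk stitches them together with `gr.TabbedInterface`. The pattern, reduced to a sketch (component contents elided):

```python
import gradio as gr

with gr.Blocks() as app_tts:
    gr.Markdown("# Batched TTS")
    # ... TTS components ...

with gr.Blocks() as app_credits:
    gr.Markdown("# Credits")
    # ... credits text ...

with gr.Blocks() as app:
    gr.TabbedInterface([app_tts, app_credits], ["TTS", "Credits"])

app.launch()
```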
@@ -472,38 +477,32 @@ If you're having issues, try converting your reference audio to WAV or MP3, clip
         outputs=[audio_output, spectrogram_output],
     )
 
-
-
-
-
-
-"""
-    )
-    with gr.Tab("Podcast Generation"):
-        speaker1_name = gr.Textbox(label="Speaker 1 Name")
-        ref_audio_input1 = gr.Audio(label="Reference Audio (Speaker 1)", type="filepath")
-        ref_text_input1 = gr.Textbox(label="Reference Text (Speaker 1)", lines=2)
+with gr.Blocks() as app_podcast:
+    gr.Markdown("# Podcast Generation")
+    speaker1_name = gr.Textbox(label="Speaker 1 Name")
+    ref_audio_input1 = gr.Audio(label="Reference Audio (Speaker 1)", type="filepath")
+    ref_text_input1 = gr.Textbox(label="Reference Text (Speaker 1)", lines=2)
 
-        speaker2_name = gr.Textbox(label="Speaker 2 Name")
-        ref_audio_input2 = gr.Audio(label="Reference Audio (Speaker 2)", type="filepath")
-        ref_text_input2 = gr.Textbox(label="Reference Text (Speaker 2)", lines=2)
-
-        script_input = gr.Textbox(label="Podcast Script", lines=10,
-                                  placeholder="Enter the script with speaker names at the start of each block, e.g.:\nSean: How did you start studying...\n\nMeghan: I came to my interest in technology...\nIt was a long journey...\n\nSean: That's fascinating. Can you elaborate...")
-
-        podcast_model_choice = gr.Radio(
-            choices=["F5-TTS", "E2-TTS"], label="Choose TTS Model", value="F5-TTS"
-        )
-        podcast_remove_silence = gr.Checkbox(
-            label="Remove Silences",
-            value=True,
-        )
-        generate_podcast_btn = gr.Button("Generate Podcast", variant="primary")
-        podcast_output = gr.Audio(label="Generated Podcast")
+    speaker2_name = gr.Textbox(label="Speaker 2 Name")
+    ref_audio_input2 = gr.Audio(label="Reference Audio (Speaker 2)", type="filepath")
+    ref_text_input2 = gr.Textbox(label="Reference Text (Speaker 2)", lines=2)
+
+    script_input = gr.Textbox(label="Podcast Script", lines=10,
+                              placeholder="Enter the script with speaker names at the start of each block, e.g.:\nSean: How did you start studying...\n\nMeghan: I came to my interest in technology...\nIt was a long journey...\n\nSean: That's fascinating. Can you elaborate...")
+
+    podcast_model_choice = gr.Radio(
+        choices=["F5-TTS", "E2-TTS"], label="Choose TTS Model", value="F5-TTS"
+    )
+    podcast_remove_silence = gr.Checkbox(
+        label="Remove Silences",
+        value=True,
+    )
+    generate_podcast_btn = gr.Button("Generate Podcast", variant="primary")
+    podcast_output = gr.Audio(label="Generated Podcast")
+
     def podcast_generation(script, speaker1, ref_audio1, ref_text1, speaker2, ref_audio2, ref_text2, model, remove_silence):
         return generate_podcast(script, speaker1, ref_audio1, ref_text1, speaker2, ref_audio2, ref_text2, model, remove_silence)
 
     generate_podcast_btn.click(
         podcast_generation,
         inputs=[
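The `script_input` placeholder documents the expected script format: each block starts with a speaker name and a colon, and blocks are separated by blank lines. `generate_podcast` itself is outside this diff, so the parsing sketch below is illustrative only (the `split_script` helper is hypothetical):

```python
import re

def split_script(script: str):
    # Hypothetical helper: split "Name: text" blocks into (speaker, text) pairs.
    pairs = []
    for block in re.split(r"\n{2,}", script.strip()):
        speaker, _, text = block.partition(":")
        if text:
            pairs.append((speaker.strip(), text.strip()))
    return pairs

print(split_script("Sean: How did you start?\n\nMeghan: It was a long journey."))
# [('Sean', 'How did you start?'), ('Meghan', 'It was a long journey.')]
```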
@@ -520,30 +519,31 @@ Supported by [RootingInLoad](https://github.com/RootingInLoad)
         outputs=podcast_output,
     )
 
-
-
-
+def parse_emotional_text(gen_text):
+    # Pattern to find (Emotion)
+    pattern = r'\((.*?)\)'
 
-
-
+    # Split the text by the pattern
+    tokens = re.split(pattern, gen_text)
 
-
+    segments = []
 
-
+    current_emotion = 'Regular'
 
-
-
-
-
-
-
-
-
-
-
+    for i in range(len(tokens)):
+        if i % 2 == 0:
+            # This is text
+            text = tokens[i].strip()
+            if text:
+                segments.append({'emotion': current_emotion, 'text': text})
+        else:
+            # This is emotion
+            emotion = tokens[i].strip()
+            current_emotion = emotion
 
-
+    return segments
 
+with gr.Blocks() as app_emotional:
     # New section for emotional generation
     gr.Markdown(
         """
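`parse_emotional_text` leans on a `re.split` detail: because the pattern contains a capturing group, the captured `(Emotion)` labels are kept in the result at odd indices, with plain text at even indices. A quick worked example of the hunk's logic:

```python
import re

tokens = re.split(r'\((.*?)\)', "Hello! (Sad) It rained all day. (Regular) Then it cleared up.")
# tokens == ['Hello! ', 'Sad', ' It rained all day. ', 'Regular', ' Then it cleared up.']

# Even indices are text, odd indices set the current emotion, so the
# function yields:
# [{'emotion': 'Regular', 'text': 'Hello!'},
#  {'emotion': 'Sad', 'text': 'It rained all day.'},
#  {'emotion': 'Regular', 'text': 'Then it cleared up.'}]
```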
@@ -557,233 +557,250 @@ Supported by [RootingInLoad](https://github.com/RootingInLoad)
     """
     )
 
-
-
+    gr.Markdown("Upload different audio clips for each speech type. 'Regular' emotion is mandatory. You can add additional speech types by clicking the 'Add Speech Type' button.")
+
+    # Regular speech type (mandatory)
+    with gr.Row():
+        regular_name = gr.Textbox(value='Regular', label='Speech Type Name', interactive=False)
+        regular_audio = gr.Audio(label='Regular Reference Audio', type='filepath')
+        regular_ref_text = gr.Textbox(label='Reference Text (Regular)', lines=2)
+
+    # Additional speech types (up to 9 more)
+    max_speech_types = 10
+    speech_type_names = []
+    speech_type_audios = []
+    speech_type_ref_texts = []
+    speech_type_delete_btns = []
 
-
+    for i in range(max_speech_types - 1):
         with gr.Row():
-[… 55 deleted lines (old 565-619) not rendered in the source view …]
+            name_input = gr.Textbox(label='Speech Type Name', visible=False)
+            audio_input = gr.Audio(label='Reference Audio', type='filepath', visible=False)
+            ref_text_input = gr.Textbox(label='Reference Text', lines=2, visible=False)
+            delete_btn = gr.Button("Delete", variant="secondary", visible=False)
+            speech_type_names.append(name_input)
+            speech_type_audios.append(audio_input)
+            speech_type_ref_texts.append(ref_text_input)
+            speech_type_delete_btns.append(delete_btn)
+
+    # Button to add speech type
+    add_speech_type_btn = gr.Button("Add Speech Type")
+
+    # Keep track of current number of speech types
+    speech_type_count = gr.State(value=0)
+
+    # Function to add a speech type
+    def add_speech_type_fn(speech_type_count):
+        if speech_type_count < max_speech_types - 1:
+            speech_type_count += 1
+            # Prepare updates for the components
+            name_updates = []
+            audio_updates = []
+            ref_text_updates = []
+            delete_btn_updates = []
+            for i in range(max_speech_types - 1):
+                if i < speech_type_count:
+                    name_updates.append(gr.update(visible=True))
+                    audio_updates.append(gr.update(visible=True))
+                    ref_text_updates.append(gr.update(visible=True))
+                    delete_btn_updates.append(gr.update(visible=True))
+                else:
+                    name_updates.append(gr.update())
+                    audio_updates.append(gr.update())
+                    ref_text_updates.append(gr.update())
+                    delete_btn_updates.append(gr.update())
+        else:
+            # Optionally, show a warning
+            # gr.Warning("Maximum number of speech types reached.")
+            name_updates = [gr.update() for _ in range(max_speech_types - 1)]
+            audio_updates = [gr.update() for _ in range(max_speech_types - 1)]
+            ref_text_updates = [gr.update() for _ in range(max_speech_types - 1)]
+            delete_btn_updates = [gr.update() for _ in range(max_speech_types - 1)]
+        return [speech_type_count] + name_updates + audio_updates + ref_text_updates + delete_btn_updates
+
+    add_speech_type_btn.click(
+        add_speech_type_fn,
+        inputs=speech_type_count,
+        outputs=[speech_type_count] + speech_type_names + speech_type_audios + speech_type_ref_texts + speech_type_delete_btns
+    )
+
+    # Function to delete a speech type
+    def make_delete_speech_type_fn(index):
+        def delete_speech_type_fn(speech_type_count):
+            # Prepare updates
+            name_updates = []
+            audio_updates = []
+            ref_text_updates = []
+            delete_btn_updates = []
+
+            for i in range(max_speech_types - 1):
+                if i == index:
+                    name_updates.append(gr.update(visible=False, value=''))
+                    audio_updates.append(gr.update(visible=False, value=None))
+                    ref_text_updates.append(gr.update(visible=False, value=''))
+                    delete_btn_updates.append(gr.update(visible=False))
+                else:
+                    name_updates.append(gr.update())
+                    audio_updates.append(gr.update())
+                    ref_text_updates.append(gr.update())
+                    delete_btn_updates.append(gr.update())
+
+            speech_type_count = max(0, speech_type_count - 1)
+
             return [speech_type_count] + name_updates + audio_updates + ref_text_updates + delete_btn_updates
 
-
-
+        return delete_speech_type_fn
+
+    for i, delete_btn in enumerate(speech_type_delete_btns):
+        delete_fn = make_delete_speech_type_fn(i)
+        delete_btn.click(
+            delete_fn,
             inputs=speech_type_count,
             outputs=[speech_type_count] + speech_type_names + speech_type_audios + speech_type_ref_texts + speech_type_delete_btns
         )
 
-
-
-        def delete_speech_type_fn(speech_type_count):
-            # Prepare updates
-            name_updates = []
-            audio_updates = []
-            ref_text_updates = []
-            delete_btn_updates = []
-
-            for i in range(max_speech_types - 1):
-                if i == index:
-                    name_updates.append(gr.update(visible=False, value=''))
-                    audio_updates.append(gr.update(visible=False, value=None))
-                    ref_text_updates.append(gr.update(visible=False, value=''))
-                    delete_btn_updates.append(gr.update(visible=False))
-                else:
-                    name_updates.append(gr.update())
-                    audio_updates.append(gr.update())
-                    ref_text_updates.append(gr.update())
-                    delete_btn_updates.append(gr.update())
-
-
-        for
-
-
-            delete_fn,
-            inputs=speech_type_count,
-            outputs=[speech_type_count] + speech_type_names + speech_type_audios + speech_type_ref_texts + speech_type_delete_btns
-        )
-
-
-
-            choices=["F5-TTS", "E2-TTS"], label="Choose TTS Model", value="F5-TTS"
-        )
-
-
-
-
-            value=True,
-        )
-
-
-        generate_emotional_btn = gr.Button("Generate Emotional Speech", variant="primary")
-
-        #
-
-
+    # Text input for the prompt
+    gen_text_input_emotional = gr.Textbox(label="Text to Generate", lines=10)
+
+    # Model choice
+    model_choice_emotional = gr.Radio(
+        choices=["F5-TTS", "E2-TTS"], label="Choose TTS Model", value="F5-TTS"
+    )
+
+    with gr.Accordion("Advanced Settings", open=False):
+        remove_silence_emotional = gr.Checkbox(
+            label="Remove Silences",
+            value=True,
+        )
+
+    # Generate button
+    generate_emotional_btn = gr.Button("Generate Emotional Speech", variant="primary")
+
+    # Output audio
+    audio_output_emotional = gr.Audio(label="Synthesized Audio")
+
+    def generate_emotional_speech(
+        regular_audio,
+        regular_ref_text,
+        gen_text,
+        *args,
+    ):
+        num_additional_speech_types = max_speech_types - 1
+        speech_type_names_list = args[:num_additional_speech_types]
+        speech_type_audios_list = args[num_additional_speech_types:2 * num_additional_speech_types]
+        speech_type_ref_texts_list = args[2 * num_additional_speech_types:3 * num_additional_speech_types]
+        model_choice = args[3 * num_additional_speech_types]
+        remove_silence = args[3 * num_additional_speech_types + 1]
+
+        # Collect the speech types and their audios into a dict
+        speech_types = {'Regular': {'audio': regular_audio, 'ref_text': regular_ref_text}}
+
+        for name_input, audio_input, ref_text_input in zip(speech_type_names_list, speech_type_audios_list, speech_type_ref_texts_list):
+            if name_input and audio_input:
+                speech_types[name_input] = {'audio': audio_input, 'ref_text': ref_text_input}
+
+        # Parse the gen_text into segments
+        segments = parse_speechtypes_text(gen_text)
+
+        # For each segment, generate speech
+        generated_audio_segments = []
+        current_emotion = 'Regular'
+
+        for segment in segments:
+            emotion = segment['emotion']
+            text = segment['text']
+
+            if emotion in speech_types:
+                current_emotion = emotion
+            else:
+                # If emotion not available, default to Regular
+                current_emotion = 'Regular'
+
+            ref_audio = speech_types[current_emotion]['audio']
+            ref_text = speech_types[current_emotion].get('ref_text', '')
+
+            # Generate speech for this segment
+            audio, _ = infer(ref_audio, ref_text, text, model_choice, remove_silence, "")
+            sr, audio_data = audio
+
+            generated_audio_segments.append(audio_data)
+
+        # Concatenate all audio segments
+        if generated_audio_segments:
+            final_audio_data = np.concatenate(generated_audio_segments)
+            return (sr, final_audio_data)
+        else:
+            gr.Warning("No audio generated.")
+            return None
+
+    generate_emotional_btn.click(
+        generate_emotional_speech,
+        inputs=[
             regular_audio,
             regular_ref_text,
-
-
-
-
-
-
-
-            model_choice = args[3 * num_additional_speech_types]
-            remove_silence = args[3 * num_additional_speech_types + 1]
-
-            # Collect the speech types and their audios into a dict
-            speech_types = {'Regular': {'audio': regular_audio, 'ref_text': regular_ref_text}}
-
-            for name_input, audio_input, ref_text_input in zip(speech_type_names_list, speech_type_audios_list, speech_type_ref_texts_list):
-                if name_input and audio_input:
-                    speech_types[name_input] = {'audio': audio_input, 'ref_text': ref_text_input}
-
-            # Parse the gen_text into segments
-            segments = parse_speechtypes_text(gen_text)
-
-            # For each segment, generate speech
-            generated_audio_segments = []
-            current_emotion = 'Regular'
-
-            for segment in segments:
-                emotion = segment['emotion']
-                text = segment['text']
-
-                if emotion in speech_types:
-                    current_emotion = emotion
-                else:
-                    # If emotion not available, default to Regular
-                    current_emotion = 'Regular'
-
-
-
-
-
-
-            final_audio_data = np.concatenate(generated_audio_segments)
-            return (sr, final_audio_data)
-        else:
-            gr.Warning("No audio generated.")
-            return None
-
-        generate_emotional_btn.click(
-            generate_emotional_speech,
-            inputs=[
-                regular_audio,
-                regular_ref_text,
-                gen_text_input_emotional,
-            ] + speech_type_names + speech_type_audios + speech_type_ref_texts + [
-                model_choice_emotional,
-                remove_silence_emotional,
-            ],
-            outputs=audio_output_emotional,
-        )
-
-[… 9 deleted lines (old 750-758) not rendered in the source view …]
-            # Collect the speech types names
-            speech_types_available = set()
-            if regular_name:
-                speech_types_available.add(regular_name)
-            for name_input in speech_type_names_list:
-                if name_input:
-                    speech_types_available.add(name_input)
-
-            # Parse the gen_text to get the speech types used
-            segments = parse_emotional_text(gen_text)
-            speech_types_in_text = set(segment['emotion'] for segment in segments)
-
-            # Check if all speech types in text are available
-            missing_speech_types = speech_types_in_text - speech_types_available
-
-            if missing_speech_types:
-                # Disable the generate button
-                return gr.update(interactive=False)
-            else:
-                # Enable the generate button
-                return gr.update(interactive=True)
-
-        gen_text_input_emotional.change(
-            validate_speech_types,
-            inputs=[gen_text_input_emotional, regular_name] + speech_type_names,
-            outputs=generate_emotional_btn
-        )
-
+            gen_text_input_emotional,
+        ] + speech_type_names + speech_type_audios + speech_type_ref_texts + [
+            model_choice_emotional,
+            remove_silence_emotional,
+        ],
+        outputs=audio_output_emotional,
+    )
+
+    # Validation function to disable Generate button if speech types are missing
+    def validate_speech_types(
+        gen_text,
+        regular_name,
+        *args
+    ):
+        num_additional_speech_types = max_speech_types - 1
+        speech_type_names_list = args[:num_additional_speech_types]
+
+        # Collect the speech types names
+        speech_types_available = set()
+        if regular_name:
+            speech_types_available.add(regular_name)
+        for name_input in speech_type_names_list:
+            if name_input:
+                speech_types_available.add(name_input)
+
+        # Parse the gen_text to get the speech types used
+        segments = parse_emotional_text(gen_text)
+        speech_types_in_text = set(segment['emotion'] for segment in segments)
+
+        # Check if all speech types in text are available
+        missing_speech_types = speech_types_in_text - speech_types_available
+
+        if missing_speech_types:
+            # Disable the generate button
+            return gr.update(interactive=False)
+        else:
+            # Enable the generate button
+            return gr.update(interactive=True)
+
+    gen_text_input_emotional.change(
+        validate_speech_types,
+        inputs=[gen_text_input_emotional, regular_name] + speech_type_names,
+        outputs=generate_emotional_btn
+    )
+
+with gr.Blocks() as app:
+    gr.Markdown(
+        """
+# E2/F5 TTS
+
+This is a local web UI for F5 TTS with advanced batch processing support. This app supports the following TTS models:
+
+* [F5-TTS](https://arxiv.org/abs/2410.06885) (A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching)
+* [E2 TTS](https://arxiv.org/abs/2406.18009) (Embarrassingly Easy Fully Non-Autoregressive Zero-Shot TTS)
+
+The checkpoints support English and Chinese.
+
+If you're having issues, try converting your reference audio to WAV or MP3, clipping it to 15s, and shortening your prompt.
+
+**NOTE: Reference text will be automatically transcribed with Whisper if not provided. For best results, keep your reference clips short (<15s). Ensure the audio is fully uploaded before generating.**
+"""
+    )
+    gr.TabbedInterface([app_tts, app_podcast, app_emotional, app_credits], ["TTS", "Podcast", "Multi-Style", "Credits"])
 
 @click.command()
 @click.option("--port", "-p", default=None, type=int, help="Port to run the app on")
 @click.option("--host", "-H", default=None, help="Host to run the app on")
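One real fix buried in this hunk: the old `delete_speech_type_fn` compared `i == index` although no `index` is visible in its scope in this diff, so the delete buttons could not have worked per row; the new `make_delete_speech_type_fn(index)` factory gives each button a closure over its own row index. The underlying Python pitfall, in isolation:

```python
# Closures capture variables, not values: a factory pins the value per handler.
def make_handler(index):
    def handler():
        return f"delete row {index}"
    return handler

handlers = [make_handler(i) for i in range(3)]
print([h() for h in handlers])  # ['delete row 0', 'delete row 1', 'delete row 2']

# The naive version late-binds i, so every handler sees the final value:
naive = [lambda: f"delete row {i}" for i in range(3)]
print([h() for h in naive])     # ['delete row 2', 'delete row 2', 'delete row 2']
```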
@@ -804,4 +821,4 @@ def main(port, host, share, api):
 
 
 if __name__ == "__main__":
-    main()
+    main()
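A closing note on the emotional-speech wiring above: Gradio flattens the dynamic rows into a single positional stream, so `generate_emotional_speech(regular_audio, regular_ref_text, gen_text, *args)` re-slices `args` into three parallel lists of nine values (names, audios, reference texts) followed by two trailing scalars. Schematically (standalone sketch with `max_speech_types = 10` as in the diff):

```python
max_speech_types = 10
n = max_speech_types - 1  # 9 additional speech types

def unpack(*args):
    names = args[:n]                  # 9 speech-type names
    audios = args[n:2 * n]            # 9 reference-audio paths
    ref_texts = args[2 * n:3 * n]     # 9 reference texts
    model_choice = args[3 * n]        # trailing scalars come last,
    remove_silence = args[3 * n + 1]  # mirroring the inputs=[...] order
    return names, audios, ref_texts, model_choice, remove_silence

# 9 + 9 + 9 + 2 = 29 positional values in total:
unpack(*([None] * 27 + ["F5-TTS", True]))
```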