Sync from GitHub repo
This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions there.
app.py
CHANGED
@@ -79,8 +79,10 @@ def generate_response(messages, model, tokenizer):
 
 
 @gpu_decorator
-def infer(
-
+def infer(
+    ref_audio_orig, ref_text, gen_text, model, remove_silence, cross_fade_duration=0.15, speed=1, show_info=gr.Info
+):
+    ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
 
     if model == "F5-TTS":
         ema_model = F5TTS_ema_model
@@ -94,7 +96,7 @@ def infer(ref_audio_orig, ref_text, gen_text, model, remove_silence, cross_fade_
         ema_model,
         cross_fade_duration=cross_fade_duration,
         speed=speed,
-        show_info=
+        show_info=show_info,
         progress=gr.Progress(),
     )
 
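The two hunks above add a `show_info` callback to `infer` and pass it through to both `preprocess_ref_audio_text` and the synthesis call, so the caller decides where status messages surface (the default `gr.Info` shows them as Gradio toasts). A minimal sketch of the pattern with a hypothetical `synthesize` helper, not the app's own function:

```python
# Minimal sketch of the show_info pattern (hypothetical helper, not the app's code):
# whatever callable is passed decides where progress/status messages go.
def synthesize(text, show_info=print):
    show_info(f"Synthesizing {len(text)} characters...")
    return b"fake-audio-bytes"  # stand-in for real model output

synthesize("Hello there.")                             # messages go to the console
synthesize("Hello there.", show_info=lambda *_: None)  # messages suppressed, e.g. for nested calls
```

Later hunks pass `show_info=None` for the per-segment and chat calls, presumably to keep those nested calls from raising extra notifications.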
@@ -183,24 +185,24 @@ def parse_speechtypes_text(gen_text):
 
     segments = []
 
-
+    current_style = "Regular"
 
     for i in range(len(tokens)):
         if i % 2 == 0:
             # This is text
             text = tokens[i].strip()
             if text:
-                segments.append({"
+                segments.append({"style": current_style, "text": text})
         else:
-            # This is 
-
-
+            # This is style
+            style = tokens[i].strip()
+            current_style = style
 
     return segments
 
 
 with gr.Blocks() as app_multistyle:
-    # New section for 
+    # New section for multistyle generation
     gr.Markdown(
         """
     # Multiple Speech-Type Generation
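The hunk above walks tokens that alternate between text (even indices) and style tags (odd indices), carrying a `current_style` forward so untagged text inherits the previous style. A self-contained sketch of that logic, assuming the tokens come from splitting the script on `{Style}` markers (the splitting step is outside this diff):

```python
import re

def parse_speechtypes_text(gen_text):
    # Assumption: tokens alternate text/style because the script is split on {Style} markers.
    tokens = re.split(r"\{(.*?)\}", gen_text)
    segments = []
    current_style = "Regular"
    for i in range(len(tokens)):
        if i % 2 == 0:
            text = tokens[i].strip()
            if text:
                segments.append({"style": current_style, "text": text})
        else:
            current_style = tokens[i].strip()
    return segments

print(parse_speechtypes_text("{Regular} Hello. {Whisper} Don't tell anyone."))
# [{'style': 'Regular', 'text': 'Hello.'}, {'style': 'Whisper', 'text': "Don't tell anyone."}]
```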
@@ -313,29 +315,29 @@ with gr.Blocks() as app_multistyle:
     delete_btn.click(delete_fn, inputs=speech_type_count, outputs=[speech_type_count] + speech_type_rows)
 
     # Text input for the prompt
-
+    gen_text_input_multistyle = gr.Textbox(
         label="Text to Generate",
         lines=10,
         placeholder="Enter the script with speaker names (or emotion types) at the start of each block, e.g.:\n\n{Regular} Hello, I'd like to order a sandwich please.\n{Surprised} What do you mean you're out of bread?\n{Sad} I really wanted a sandwich though...\n{Angry} You know what, darn you and your little shop!\n{Whisper} I'll just go back home and cry now.\n{Shouting} Why me?!",
     )
 
     # Model choice
-
+    model_choice_multistyle = gr.Radio(choices=["F5-TTS", "E2-TTS"], label="Choose TTS Model", value="F5-TTS")
 
     with gr.Accordion("Advanced Settings", open=False):
-
+        remove_silence_multistyle = gr.Checkbox(
             label="Remove Silences",
             value=False,
         )
 
     # Generate button
-
+    generate_multistyle_btn = gr.Button("Generate Multi-Style Speech", variant="primary")
 
     # Output audio
-
+    audio_output_multistyle = gr.Audio(label="Synthesized Audio")
 
     @gpu_decorator
-    def 
+    def generate_multistyle_speech(
         regular_audio,
         regular_ref_text,
         gen_text,
@@ -362,23 +364,23 @@ with gr.Blocks() as app_multistyle:
 
         # For each segment, generate speech
         generated_audio_segments = []
-
+        current_style = "Regular"
 
         for segment in segments:
-
+            style = segment["style"]
             text = segment["text"]
 
-            if 
-
+            if style in speech_types:
+                current_style = style
             else:
-                # If 
-
+                # If style not available, default to Regular
+                current_style = "Regular"
 
-            ref_audio = speech_types[
-            ref_text = speech_types[
+            ref_audio = speech_types[current_style]["audio"]
+            ref_text = speech_types[current_style].get("ref_text", "")
 
             # Generate speech for this segment
-            audio, _ = infer(ref_audio, ref_text, text, model_choice, remove_silence, 0)
+            audio, _ = infer(ref_audio, ref_text, text, model_choice, remove_silence, 0, show_info=None)
             sr, audio_data = audio
 
             generated_audio_segments.append(audio_data)
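In the loop above, each segment's style is resolved against the configured speech types, unknown styles fall back to "Regular", and a missing reference text degrades to an empty string via `.get("ref_text", "")`. A tiny sketch of that lookup with a stand-in `speech_types` dict (illustrative values only, not the app's state):

```python
# Stand-in data for illustration; in the app this comes from the UI components.
speech_types = {
    "Regular": {"audio": "regular.wav", "ref_text": "A neutral reference line."},
    "Whisper": {"audio": "whisper.wav"},  # no reference text recorded
}

def pick_reference(style):
    chosen = style if style in speech_types else "Regular"  # unknown styles fall back to Regular
    entry = speech_types[chosen]
    return entry["audio"], entry.get("ref_text", "")

print(pick_reference("Whisper"))   # ('whisper.wav', '')
print(pick_reference("Shouting"))  # ('regular.wav', 'A neutral reference line.')
```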
@@ -391,21 +393,21 @@ with gr.Blocks() as app_multistyle:
             gr.Warning("No audio generated.")
             return None
 
-
-
+    generate_multistyle_btn.click(
+        generate_multistyle_speech,
         inputs=[
             regular_audio,
             regular_ref_text,
-
+            gen_text_input_multistyle,
         ]
         + speech_type_names
         + speech_type_audios
         + speech_type_ref_texts
         + [
-
-
+            model_choice_multistyle,
+            remove_silence_multistyle,
         ],
-        outputs=
+        outputs=audio_output_multistyle,
     )
 
     # Validation function to disable Generate button if speech types are missing
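The `.click` wiring above builds one flat `inputs` list by concatenating the fixed components with the dynamically created per-speech-type lists, and the handler receives the extras as positional arguments. A minimal Gradio sketch of the same pattern (component names here are illustrative, not the app's):

```python
import gradio as gr

with gr.Blocks() as demo:
    base_text = gr.Textbox(label="Text")
    extra_names = [gr.Textbox(label=f"Speech type {i} name") for i in range(2)]
    extra_audios = [gr.Audio(label=f"Speech type {i} audio", type="filepath") for i in range(2)]
    out = gr.Textbox(label="Summary")
    btn = gr.Button("Run")

    def handler(text, *args):
        # The fixed input arrives first; the concatenated lists arrive in order after it.
        names, audios = args[:2], args[2:]
        return f"text={text!r}, names={list(names)}, audios={list(audios)}"

    btn.click(handler, inputs=[base_text] + extra_names + extra_audios, outputs=out)

# demo.launch()  # uncomment to try locally
```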
@@ -423,7 +425,7 @@ with gr.Blocks() as app_multistyle:
 
         # Parse the gen_text to get the speech types used
         segments = parse_speechtypes_text(gen_text)
-        speech_types_in_text = set(segment["
+        speech_types_in_text = set(segment["style"] for segment in segments)
 
         # Check if all speech types in text are available
         missing_speech_types = speech_types_in_text - speech_types_available
@@ -435,10 +437,10 @@ with gr.Blocks() as app_multistyle:
             # Enable the generate button
             return gr.update(interactive=True)
 
-
+    gen_text_input_multistyle.change(
         validate_speech_types,
-        inputs=[
-        outputs=
+        inputs=[gen_text_input_multistyle, regular_name] + speech_type_names,
+        outputs=generate_multistyle_btn,
     )
 
 
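The hunk above re-validates the script on every edit of the multistyle textbox and returns `gr.update(interactive=...)` with the generate button as the output, enabling or disabling it. A small self-contained sketch of that enable/disable pattern (the validation rule is made up for illustration):

```python
import gradio as gr

with gr.Blocks() as demo:
    script = gr.Textbox(label="Script")
    generate_btn = gr.Button("Generate", interactive=True)

    def validate(text):
        # Illustrative rule only: require at least one {Style} marker before enabling.
        ok = "{" in text and "}" in text
        return gr.update(interactive=ok)

    script.change(validate, inputs=script, outputs=generate_btn)

# demo.launch()  # uncomment to try locally
```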
@@ -576,6 +578,7 @@ Have a conversation with an AI using your reference voice!
             remove_silence,
             cross_fade_duration=0.15,
             speed=1.0,
+            show_info=None,
         )
         return audio_result
 