Merge branch 'main' of github.com:SWivid/F5-TTS into main
gradio_app.py CHANGED (+94 -130)
@@ -31,19 +31,6 @@ def gpu_decorator(func):
     else:
         return func
 
-
-
-SPLIT_WORDS = [
-    "but", "however", "nevertheless", "yet", "still",
-    "therefore", "thus", "hence", "consequently",
-    "moreover", "furthermore", "additionally",
-    "meanwhile", "alternatively", "otherwise",
-    "namely", "specifically", "for example", "such as",
-    "in fact", "indeed", "notably",
-    "in contrast", "on the other hand", "conversely",
-    "in conclusion", "to summarize", "finally"
-]
-
 device = (
     "cuda"
     if torch.cuda.is_available()
@@ -71,7 +58,6 @@ cfg_strength = 2.0
 ode_method = "euler"
 sway_sampling_coef = -1.0
 speed = 1.0
-# fix_duration = 27  # None or float (duration in seconds)
 fix_duration = None
 
 
@@ -112,104 +98,37 @@ E2TTS_ema_model = load_model(
     "E2-TTS", "E2TTS_Base", UNetT, E2TTS_model_cfg, 1200000
 )
 
-def ...
-    ...
-        word_batches = []
-        for word in words:
-            if len(current_word_part.encode('utf-8')) + len(word.encode('utf-8')) + 1 <= max_chars:
-                current_word_part += word + ' '
-            else:
-                if current_word_part:
-                    # Try to find a suitable split word
-                    for split_word in split_words:
-                        split_index = current_word_part.rfind(' ' + split_word + ' ')
-                        if split_index != -1:
-                            word_batches.append(current_word_part[:split_index].strip())
-                            current_word_part = current_word_part[split_index:].strip() + ' '
-                            break
-                    else:
-                        # If no suitable split word found, just append the current part
-                        word_batches.append(current_word_part.strip())
-                        current_word_part = ""
-                current_word_part += word + ' '
-        if current_word_part:
-            word_batches.append(current_word_part.strip())
-        return word_batches
+def chunk_text(text, max_chars=135):
+    """
+    Splits the input text into chunks, each with a maximum number of characters.
+
+    Args:
+        text (str): The text to be split.
+        max_chars (int): The maximum number of characters per chunk.
+
+    Returns:
+        List[str]: A list of text chunks.
+    """
+    chunks = []
+    current_chunk = ""
+    # Split the text into sentences based on punctuation followed by whitespace
+    sentences = re.split(r'(?<=[;:,.!?])\s+', text)
 
     for sentence in sentences:
-        if len(...
-            ...
+        if len(current_chunk) + len(sentence) <= max_chars:
+            current_chunk += sentence + " "
         else:
-            ...
-            colon_parts = sentence.split(':')
-            if len(colon_parts) > 1:
-                for part in colon_parts:
-                    if len(part.encode('utf-8')) <= max_chars:
-                        batches.append(part)
-                    else:
-                        # If colon part is still too long, split by comma
-                        comma_parts = re.split('[,，]', part)
-                        if len(comma_parts) > 1:
-                            current_comma_part = ""
-                            for comma_part in comma_parts:
-                                if len(current_comma_part.encode('utf-8')) + len(comma_part.encode('utf-8')) <= max_chars:
-                                    current_comma_part += comma_part + ','
-                                else:
-                                    if current_comma_part:
-                                        batches.append(current_comma_part.rstrip(','))
-                                    current_comma_part = comma_part + ','
-                            if current_comma_part:
-                                batches.append(current_comma_part.rstrip(','))
-                        else:
-                            # If no comma, split by words
-                            batches.extend(split_by_words(part))
-            else:
-                # If no colon, split by comma
-                comma_parts = re.split('[,，]', sentence)
-                if len(comma_parts) > 1:
-                    current_comma_part = ""
-                    for comma_part in comma_parts:
-                        if len(current_comma_part.encode('utf-8')) + len(comma_part.encode('utf-8')) <= max_chars:
-                            current_comma_part += comma_part + ','
-                        else:
-                            if current_comma_part:
-                                batches.append(current_comma_part.rstrip(','))
-                            current_comma_part = comma_part + ','
-                    if current_comma_part:
-                        batches.append(current_comma_part.rstrip(','))
-                else:
-                    # If no comma, split by words
-                    batches.extend(split_by_words(sentence))
-        else:
-            current_batch = sentence
-
-    if current_batch:
-        batches.append(current_batch)
-
-    return batches
+            if current_chunk:
+                chunks.append(current_chunk.strip())
+            current_chunk = sentence + " "
+
+    if current_chunk:
+        chunks.append(current_chunk.strip())
+
+    return chunks
 
 @gpu_decorator
-def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence, progress=gr.Progress()):
+def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence, cross_fade_duration=0.15, progress=gr.Progress()):
     if exp_name == "F5-TTS":
         ema_model = F5TTS_ema_model
     elif exp_name == "E2-TTS":
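For reference, a minimal standalone sketch of how the newly added chunk_text behaves. The sample string and the max_chars=30 value are illustrative only and are not part of this commit; the splitting logic is copied from the function above so the demo runs on its own.

import re

def chunk_text(text, max_chars=135):
    # Same splitting logic as the function added above.
    chunks = []
    current_chunk = ""
    sentences = re.split(r'(?<=[;:,.!?])\s+', text)
    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= max_chars:
            current_chunk += sentence + " "
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence + " "
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks

print(chunk_text("First sentence. Second sentence follows. A third one ends here.", max_chars=30))
# -> ['First sentence.', 'Second sentence follows.', 'A third one ends here.']

Sentences are kept whole, so a single sentence longer than max_chars still ends up as one oversized chunk rather than being split mid-sentence.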
@@ -267,8 +186,44 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
         generated_waves.append(generated_wave)
         spectrograms.append(generated_mel_spec[0].cpu().numpy())
 
-    # Combine all generated waves
-    ...
+    # Combine all generated waves with cross-fading
+    if cross_fade_duration <= 0:
+        # Simply concatenate
+        final_wave = np.concatenate(generated_waves)
+    else:
+        final_wave = generated_waves[0]
+        for i in range(1, len(generated_waves)):
+            prev_wave = final_wave
+            next_wave = generated_waves[i]
+
+            # Calculate cross-fade samples, ensuring it does not exceed wave lengths
+            cross_fade_samples = int(cross_fade_duration * target_sample_rate)
+            cross_fade_samples = min(cross_fade_samples, len(prev_wave), len(next_wave))
+
+            if cross_fade_samples <= 0:
+                # No overlap possible, concatenate
+                final_wave = np.concatenate([prev_wave, next_wave])
+                continue
+
+            # Overlapping parts
+            prev_overlap = prev_wave[-cross_fade_samples:]
+            next_overlap = next_wave[:cross_fade_samples]
+
+            # Fade out and fade in
+            fade_out = np.linspace(1, 0, cross_fade_samples)
+            fade_in = np.linspace(0, 1, cross_fade_samples)
+
+            # Cross-faded overlap
+            cross_faded_overlap = prev_overlap * fade_out + next_overlap * fade_in
+
+            # Combine
+            new_wave = np.concatenate([
+                prev_wave[:-cross_fade_samples],
+                cross_faded_overlap,
+                next_wave[cross_fade_samples:]
+            ])
+
+            final_wave = new_wave
 
     # Remove silence
     if remove_silence:
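A toy numpy example of the linear cross-fade introduced above; the tiny sample rate and constant waveforms are made up purely to keep the numbers readable.

import numpy as np

target_sample_rate = 4      # pretend sample rate so the overlap is 2 samples
cross_fade_duration = 0.5   # seconds -> 2 samples of overlap

prev_wave = np.array([1.0, 1.0, 1.0, 1.0])
next_wave = np.array([0.0, 0.0, 0.0, 0.0])

n = int(cross_fade_duration * target_sample_rate)
n = min(n, len(prev_wave), len(next_wave))

fade_out = np.linspace(1, 0, n)
fade_in = np.linspace(0, 1, n)
overlap = prev_wave[-n:] * fade_out + next_wave[:n] * fade_in

joined = np.concatenate([prev_wave[:-n], overlap, next_wave[n:]])
print(joined)  # [1. 1. 1. 0. 0. 0.] -- the two overlapping samples blend from 1 toward 0

The joined clip is len(prev_wave) + len(next_wave) - n samples long. If the app's target_sample_rate is 24 kHz, the default 0.15 s fade corresponds to 3600 overlapping samples per batch boundary.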
@@ -294,11 +249,7 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
     return (target_sample_rate, final_wave), spectrogram_path
 
 @gpu_decorator
-def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, ...
-    if not custom_split_words.strip():
-        custom_words = [word.strip() for word in custom_split_words.split(',')]
-        global SPLIT_WORDS
-        SPLIT_WORDS = custom_words
+def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, cross_fade_duration=0.15):
 
     print(gen_text)
 
@@ -306,7 +257,9 @@ def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, custom_s
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
         aseg = AudioSegment.from_file(ref_audio_orig)
 
-        non_silent_segs = silence.split_on_silence(...
+        non_silent_segs = silence.split_on_silence(
+            aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=500
+        )
         non_silent_wave = AudioSegment.silent(duration=0)
         for non_silent_seg in non_silent_segs:
             non_silent_wave += non_silent_seg
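As a self-contained illustration of what the retuned silence-trimming call does, the snippet below drops pauses of one second or more that fall below -50 dBFS while keeping 500 ms of padding around each retained segment. The file name ref.wav is hypothetical.

from pydub import AudioSegment, silence

aseg = AudioSegment.from_file("ref.wav")  # hypothetical reference clip
non_silent_segs = silence.split_on_silence(
    aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=500
)
trimmed = AudioSegment.silent(duration=0)
for seg in non_silent_segs:
    trimmed += seg
print(len(aseg), len(trimmed))  # lengths in milliseconds; the trimmed clip is usually shorter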
@@ -332,16 +285,24 @@ def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, custom_s
     else:
         gr.Info("Using custom reference text...")
 
-    # ...
+    # Add the functionality to ensure it ends with ". "
+    if not ref_text.endswith(". "):
+        if ref_text.endswith("."):
+            ref_text += " "
+        else:
+            ref_text += ". "
+
     audio, sr = torchaudio.load(ref_audio)
-    ...
-    ...
+
+    # Use the new chunk_text function to split gen_text
+    gen_text_batches = chunk_text(gen_text, max_chars=135)
     print('ref_text', ref_text)
-    for i, ...
-        print(f'gen_text {i}', ...
+    for i, batch_text in enumerate(gen_text_batches):
+        print(f'gen_text {i}', batch_text)
 
     gr.Info(f"Generating audio using {exp_name} in {len(gen_text_batches)} batches")
-    return infer_batch((audio, sr), ref_text, gen_text_batches, exp_name, remove_silence)
+    return infer_batch((audio, sr), ref_text, gen_text_batches, exp_name, remove_silence, cross_fade_duration)
+
 
 @gpu_decorator
 def generate_podcast(script, speaker1_name, ref_audio1, ref_text1, speaker2_name, ref_audio2, ref_text2, exp_name, remove_silence):
@@ -446,12 +407,7 @@ with gr.Blocks() as app_tts:
     remove_silence = gr.Checkbox(
         label="Remove Silences",
         info="The model tends to produce silences, especially on longer audio. We can manually remove silences if needed. Note that this is an experimental feature and may produce strange results. This will also increase generation time.",
-        value=...
-    )
-    split_words_input = gr.Textbox(
-        label="Custom Split Words",
-        info="Enter custom words to split on, separated by commas. Leave blank to use default list.",
-        lines=2,
+        value=False,
     )
     speed_slider = gr.Slider(
         label="Speed",
@@ -461,6 +417,14 @@ with gr.Blocks() as app_tts:
         step=0.1,
         info="Adjust the speed of the audio.",
     )
+    cross_fade_duration_slider = gr.Slider(
+        label="Cross-Fade Duration (s)",
+        minimum=0.0,
+        maximum=1.0,
+        value=0.15,
+        step=0.01,
+        info="Set the duration of the cross-fade between audio clips.",
+    )
     speed_slider.change(update_speed, inputs=speed_slider)
 
     audio_output = gr.Audio(label="Synthesized Audio")
@@ -474,7 +438,7 @@ with gr.Blocks() as app_tts:
             gen_text_input,
             model_choice,
             remove_silence,
-            ...
+            cross_fade_duration_slider,
         ],
         outputs=[audio_output, spectrogram_output],
     )
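A minimal sketch, not the app's real layout, of how a slider value such as the new cross-fade duration is threaded into a click handler as an extra positional input; fake_infer is a hypothetical stand-in for infer.

import gradio as gr

def fake_infer(text, cross_fade_duration):
    # Stand-in for the real infer(); just echoes what it received.
    return f"{len(text)} chars, cross-fade {cross_fade_duration:.2f} s"

with gr.Blocks() as demo:
    text_in = gr.Textbox(label="Text to Generate")
    cross_fade = gr.Slider(
        label="Cross-Fade Duration (s)", minimum=0.0, maximum=1.0, value=0.15, step=0.01
    )
    result = gr.Textbox(label="Result")
    gr.Button("Synthesize").click(fake_infer, inputs=[text_in, cross_fade], outputs=result)

# demo.launch()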
@@ -722,7 +686,7 @@ with gr.Blocks() as app_emotional:
         ref_text = speech_types[current_emotion].get('ref_text', '')
 
         # Generate speech for this segment
-        audio, _ = infer(ref_audio, ref_text, text, model_choice, remove_silence, ...
+        audio, _ = infer(ref_audio, ref_text, text, model_choice, remove_silence, 0)
         sr, audio_data = audio
 
         generated_audio_segments.append(audio_data)
@@ -823,4 +787,4 @@ def main(port, host, share, api):
 
 
 if __name__ == "__main__":
-    main()
+    main()