Commit 2a521e9 (unverified) · committed by Jarod Mica
Parents (2): 31e5051 bdc76f5

Merge branch 'SWivid:main' into main

Files changed (7):
  1. README.md +18 -3
  2. gradio_app.py +95 -132
  3. inference-cli.py +111 -136
  4. inference-cli.toml +1 -1
  5. model/utils.py +6 -7
  6. requirements.txt +2 -8
  7. requirements_eval.txt +5 -0
README.md CHANGED
@@ -1,16 +1,25 @@
 # F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching
 
+<div style="position: absolute; width: 100%;">
+<div style="position: absolute; top: 0; right: 100px;">
+<img src="https://avatars.githubusercontent.com/u/35554183?s=200&v=4" alt="Watermark" style="width: 140px; height: auto;">
+</div>
+</div>
+
 [![python](https://img.shields.io/badge/Python-3.10-brightgreen)](https://github.com/SWivid/F5-TTS)
 [![arXiv](https://img.shields.io/badge/arXiv-2410.06885-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2410.06885)
 [![demo](https://img.shields.io/badge/GitHub-Demo%20page-blue.svg)](https://swivid.github.io/F5-TTS/)
 [![space](https://img.shields.io/badge/🤗-Space%20demo-yellow)](https://huggingface.co/spaces/mrfakename/E2-F5-TTS)
+[![lab](https://img.shields.io/badge/X--LANCE-Lab-grey?labelColor=lightgrey)](https://x-lance.sjtu.edu.cn/)
 
 **F5-TTS**: Diffusion Transformer with ConvNeXt V2, faster trained and inference.
 
-**E2 TTS**: Flat-UNet Transformer, closest reproduction.
+**E2 TTS**: Flat-UNet Transformer, closest reproduction from [paper](https://arxiv.org/abs/2406.18009).
 
 **Sway Sampling**: Inference-time flow step sampling strategy, greatly improves performance
 
+### Thanks to all the contributors !
+
 ## Installation
 
 Clone the repository:
@@ -62,7 +71,7 @@ An initial guidance on Finetuning [#57](https://github.com/SWivid/F5-TTS/discuss
 
 ## Inference
 
-To run inference with pretrained models, download the checkpoints from [🤗 Hugging Face](https://huggingface.co/SWivid/F5-TTS), or automatically downloaded with `inference-cli` and `gradio_app`.
+The pretrained model checkpoints can be reached at [🤗 Hugging Face](https://huggingface.co/SWivid/F5-TTS) and [⭐ Model Scope](https://www.modelscope.cn/models/SWivid/F5-TTS_Emilia-ZH-EN), or automatically downloaded with `inference-cli` and `gradio_app`.
 
 Currently support 30s for a single generation, which is the **TOTAL** length of prompt audio and the generated. Batch inference with chunks is supported by `inference-cli` and `gradio_app`.
 - To avoid possible inference failures, make sure you have seen through the following instructions.
@@ -148,6 +157,12 @@ bash scripts/eval_infer_batch.sh
 
 ### Objective Evaluation
 
+Install packages for evaluation:
+
+```bash
+pip install -r requirements_eval.txt
+```
+
 **Some Notes**
 
 For faster-whisper with CUDA 11:
@@ -193,4 +208,4 @@ python scripts/eval_librispeech_test_clean.py
 ```
 ## License
 
-Our code is released under MIT License.
+Our code is released under MIT License. The pre-trained models are licensed under the CC-BY-NC license due to the training data Emilia, which is an in-the-wild dataset. Sorry for any inconvenience this may cause.
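The chunked batch inference the README refers to sizes each text chunk from the reference clip. A self-contained sketch (not part of the commit) of that character-budget arithmetic, using the reference text from `inference-cli.toml`; the 5-second clip duration and the 25-second window are illustrative assumptions taken from the updated `infer()` further down:

```python
# Illustrative sketch only: how many generated-text bytes fit in one chunk.
# Assumptions: a 5 s reference clip and the 25 s window used by the new infer().
ref_text = "Some call me nature, others call me mother nature. "
ref_audio_sec = 5.0   # duration of the reference clip (made-up value)
window_sec = 25.0     # generation window assumed by the updated code

bytes_per_sec = len(ref_text.encode("utf-8")) / ref_audio_sec
max_chars = int(bytes_per_sec * (window_sec - ref_audio_sec))
print(max_chars)      # rough per-chunk budget that gets passed to chunk_text()
```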
gradio_app.py CHANGED
@@ -1,4 +1,3 @@
-import os
 import re
 import torch
 import torchaudio
@@ -17,7 +16,6 @@ from model.utils import (
     save_spectrogram,
 )
 from transformers import pipeline
-import librosa
 import click
 import soundfile as sf
 
@@ -33,19 +31,6 @@ def gpu_decorator(func):
     else:
         return func
 
-
-
-SPLIT_WORDS = [
-    "but", "however", "nevertheless", "yet", "still",
-    "therefore", "thus", "hence", "consequently",
-    "moreover", "furthermore", "additionally",
-    "meanwhile", "alternatively", "otherwise",
-    "namely", "specifically", "for example", "such as",
-    "in fact", "indeed", "notably",
-    "in contrast", "on the other hand", "conversely",
-    "in conclusion", "to summarize", "finally"
-]
-
 device = (
     "cuda"
     if torch.cuda.is_available()
@@ -73,7 +58,6 @@ cfg_strength = 2.0
 ode_method = "euler"
 sway_sampling_coef = -1.0
 speed = 1.0
-# fix_duration = 27 # None or float (duration in seconds)
 fix_duration = None
 
 
@@ -114,104 +98,37 @@ E2TTS_ema_model = load_model(
     "E2-TTS", "E2TTS_Base", UNetT, E2TTS_model_cfg, 1200000
 )
 
-def split_text_into_batches(text, max_chars=200, split_words=SPLIT_WORDS):
-    if len(text.encode('utf-8')) <= max_chars:
-        return [text]
-    if text[-1] not in ['。', '.', '!', '!', '?', '?']:
-        text += '.'
-
-    sentences = re.split('([。.!?!?])', text)
-    sentences = [''.join(i) for i in zip(sentences[0::2], sentences[1::2])]
-
-    batches = []
-    current_batch = ""
-
-    def split_by_words(text):
-        words = text.split()
-        current_word_part = ""
-        word_batches = []
-        for word in words:
-            if len(current_word_part.encode('utf-8')) + len(word.encode('utf-8')) + 1 <= max_chars:
-                current_word_part += word + ' '
-            else:
-                if current_word_part:
-                    # Try to find a suitable split word
-                    for split_word in split_words:
-                        split_index = current_word_part.rfind(' ' + split_word + ' ')
-                        if split_index != -1:
-                            word_batches.append(current_word_part[:split_index].strip())
-                            current_word_part = current_word_part[split_index:].strip() + ' '
-                            break
-                    else:
-                        # If no suitable split word found, just append the current part
-                        word_batches.append(current_word_part.strip())
-                        current_word_part = ""
-                current_word_part += word + ' '
-        if current_word_part:
-            word_batches.append(current_word_part.strip())
-        return word_batches
+def chunk_text(text, max_chars=135):
+    """
+    Splits the input text into chunks, each with a maximum number of characters.
+
+    Args:
+        text (str): The text to be split.
+        max_chars (int): The maximum number of characters per chunk.
+
+    Returns:
+        List[str]: A list of text chunks.
+    """
+    chunks = []
+    current_chunk = ""
+    # Split the text into sentences based on punctuation followed by whitespace
+    sentences = re.split(r'(?<=[;:,.!?])\s+|(?<=[;:,。!?])', text)
 
     for sentence in sentences:
-        if len(current_batch.encode('utf-8')) + len(sentence.encode('utf-8')) <= max_chars:
-            current_batch += sentence
+        if len(current_chunk.encode('utf-8')) + len(sentence.encode('utf-8')) <= max_chars:
+            current_chunk += sentence + " " if sentence and len(sentence[-1].encode('utf-8')) == 1 else sentence
         else:
-            # If adding this sentence would exceed the limit
-            if current_batch:
-                batches.append(current_batch)
-                current_batch = ""
-
-            # If the sentence itself is longer than max_chars, split it
-            if len(sentence.encode('utf-8')) > max_chars:
-                # First, try to split by colon
-                colon_parts = sentence.split(':')
-                if len(colon_parts) > 1:
-                    for part in colon_parts:
-                        if len(part.encode('utf-8')) <= max_chars:
-                            batches.append(part)
-                        else:
-                            # If colon part is still too long, split by comma
-                            comma_parts = re.split('[,,]', part)
-                            if len(comma_parts) > 1:
-                                current_comma_part = ""
-                                for comma_part in comma_parts:
-                                    if len(current_comma_part.encode('utf-8')) + len(comma_part.encode('utf-8')) <= max_chars:
-                                        current_comma_part += comma_part + ','
-                                    else:
-                                        if current_comma_part:
-                                            batches.append(current_comma_part.rstrip(','))
-                                        current_comma_part = comma_part + ','
-                                if current_comma_part:
-                                    batches.append(current_comma_part.rstrip(','))
-                            else:
-                                # If no comma, split by words
-                                batches.extend(split_by_words(part))
-                else:
-                    # If no colon, split by comma
-                    comma_parts = re.split('[,,]', sentence)
-                    if len(comma_parts) > 1:
-                        current_comma_part = ""
-                        for comma_part in comma_parts:
-                            if len(current_comma_part.encode('utf-8')) + len(comma_part.encode('utf-8')) <= max_chars:
-                                current_comma_part += comma_part + ','
-                            else:
-                                if current_comma_part:
-                                    batches.append(current_comma_part.rstrip(','))
-                                current_comma_part = comma_part + ','
-                        if current_comma_part:
-                            batches.append(current_comma_part.rstrip(','))
-                    else:
-                        # If no comma, split by words
-                        batches.extend(split_by_words(sentence))
-            else:
-                current_batch = sentence
-
-    if current_batch:
-        batches.append(current_batch)
-
-    return batches
+            if current_chunk:
+                chunks.append(current_chunk.strip())
+            current_chunk = sentence + " " if sentence and len(sentence[-1].encode('utf-8')) == 1 else sentence
+
+    if current_chunk:
+        chunks.append(current_chunk.strip())
+
+    return chunks
 
 @gpu_decorator
-def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence, progress=gr.Progress()):
+def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence, cross_fade_duration=0.15, progress=gr.Progress()):
     if exp_name == "F5-TTS":
         ema_model = F5TTS_ema_model
     elif exp_name == "E2-TTS":
@@ -269,8 +186,44 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
         generated_waves.append(generated_wave)
         spectrograms.append(generated_mel_spec[0].cpu().numpy())
 
-    # Combine all generated waves
-    final_wave = np.concatenate(generated_waves)
+    # Combine all generated waves with cross-fading
+    if cross_fade_duration <= 0:
+        # Simply concatenate
+        final_wave = np.concatenate(generated_waves)
+    else:
+        final_wave = generated_waves[0]
+        for i in range(1, len(generated_waves)):
+            prev_wave = final_wave
+            next_wave = generated_waves[i]
+
+            # Calculate cross-fade samples, ensuring it does not exceed wave lengths
+            cross_fade_samples = int(cross_fade_duration * target_sample_rate)
+            cross_fade_samples = min(cross_fade_samples, len(prev_wave), len(next_wave))
+
+            if cross_fade_samples <= 0:
+                # No overlap possible, concatenate
+                final_wave = np.concatenate([prev_wave, next_wave])
+                continue
+
+            # Overlapping parts
+            prev_overlap = prev_wave[-cross_fade_samples:]
+            next_overlap = next_wave[:cross_fade_samples]
+
+            # Fade out and fade in
+            fade_out = np.linspace(1, 0, cross_fade_samples)
+            fade_in = np.linspace(0, 1, cross_fade_samples)
+
+            # Cross-faded overlap
+            cross_faded_overlap = prev_overlap * fade_out + next_overlap * fade_in
+
+            # Combine
+            new_wave = np.concatenate([
+                prev_wave[:-cross_fade_samples],
+                cross_faded_overlap,
+                next_wave[cross_fade_samples:]
+            ])
+
+            final_wave = new_wave
 
     # Remove silence
     if remove_silence:
@@ -296,11 +249,7 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
     return (target_sample_rate, final_wave), spectrogram_path
 
 @gpu_decorator
-def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, custom_split_words=''):
-    if not custom_split_words.strip():
-        custom_words = [word.strip() for word in custom_split_words.split(',')]
-        global SPLIT_WORDS
-        SPLIT_WORDS = custom_words
+def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, cross_fade_duration=0.15):
 
     print(gen_text)
 
@@ -308,7 +257,9 @@ def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, custom_s
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
         aseg = AudioSegment.from_file(ref_audio_orig)
 
-        non_silent_segs = silence.split_on_silence(aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=500)
+        non_silent_segs = silence.split_on_silence(
+            aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=1000
+        )
         non_silent_wave = AudioSegment.silent(duration=0)
         for non_silent_seg in non_silent_segs:
            non_silent_wave += non_silent_seg
@@ -334,16 +285,25 @@ def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, custom_s
     else:
         gr.Info("Using custom reference text...")
 
-    # Split the input text into batches
+    # Add the functionality to ensure it ends with ". "
+    if not ref_text.endswith(". "):
+        if ref_text.endswith("."):
+            ref_text += " "
+        else:
+            ref_text += ". "
+
     audio, sr = torchaudio.load(ref_audio)
-    max_chars = int(len(ref_text.encode('utf-8')) / (audio.shape[-1] / sr) * (30 - audio.shape[-1] / sr))
-    gen_text_batches = split_text_into_batches(gen_text, max_chars=max_chars)
+
+    # Use the new chunk_text function to split gen_text
+    max_chars = int(len(ref_text.encode('utf-8')) / (audio.shape[-1] / sr) * (25 - audio.shape[-1] / sr))
+    gen_text_batches = chunk_text(gen_text, max_chars=max_chars)
     print('ref_text', ref_text)
-    for i, gen_text in enumerate(gen_text_batches):
-        print(f'gen_text {i}', gen_text)
+    for i, batch_text in enumerate(gen_text_batches):
+        print(f'gen_text {i}', batch_text)
 
     gr.Info(f"Generating audio using {exp_name} in {len(gen_text_batches)} batches")
-    return infer_batch((audio, sr), ref_text, gen_text_batches, exp_name, remove_silence)
+    return infer_batch((audio, sr), ref_text, gen_text_batches, exp_name, remove_silence, cross_fade_duration)
+
 
 @gpu_decorator
 def generate_podcast(script, speaker1_name, ref_audio1, ref_text1, speaker2_name, ref_audio2, ref_text2, exp_name, remove_silence):
@@ -448,12 +408,7 @@ with gr.Blocks() as app_tts:
     remove_silence = gr.Checkbox(
         label="Remove Silences",
        info="The model tends to produce silences, especially on longer audio. We can manually remove silences if needed. Note that this is an experimental feature and may produce strange results. This will also increase generation time.",
-        value=True,
-    )
-    split_words_input = gr.Textbox(
-        label="Custom Split Words",
-        info="Enter custom words to split on, separated by commas. Leave blank to use default list.",
-        lines=2,
+        value=False,
     )
     speed_slider = gr.Slider(
         label="Speed",
@@ -463,6 +418,14 @@ with gr.Blocks() as app_tts:
         step=0.1,
         info="Adjust the speed of the audio.",
     )
+    cross_fade_duration_slider = gr.Slider(
+        label="Cross-Fade Duration (s)",
+        minimum=0.0,
+        maximum=1.0,
+        value=0.15,
+        step=0.01,
+        info="Set the duration of the cross-fade between audio clips.",
+    )
     speed_slider.change(update_speed, inputs=speed_slider)
 
     audio_output = gr.Audio(label="Synthesized Audio")
@@ -476,7 +439,7 @@ with gr.Blocks() as app_tts:
             gen_text_input,
             model_choice,
            remove_silence,
-            split_words_input,
+            cross_fade_duration_slider,
         ],
         outputs=[audio_output, spectrogram_output],
     )
@@ -724,7 +687,7 @@ with gr.Blocks() as app_emotional:
         ref_text = speech_types[current_emotion].get('ref_text', '')
 
         # Generate speech for this segment
-        audio, _ = infer(ref_audio, ref_text, text, model_choice, remove_silence, "")
+        audio, _ = infer(ref_audio, ref_text, text, model_choice, remove_silence, 0)
         sr, audio_data = audio
 
         generated_audio_segments.append(audio_data)
@@ -825,4 +788,4 @@ def main(port, host, share, api):
 
 
 if __name__ == "__main__":
-    main()
+    main()
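Isolated from the app code, the cross-fade stitching added to `infer_batch()` above amounts to the following NumPy operation. This is a standalone sketch, not the commit's code: the chunk contents are random stand-ins, and the 24 kHz rate is assumed to match the `vocos-mel-24khz` vocoder used by the repo.

```python
import numpy as np

# Standalone illustration of the cross-fade stitching added to infer_batch().
target_sample_rate = 24000          # assumed vocoder rate (vocos-mel-24khz)
cross_fade_duration = 0.15          # default of the new slider / keyword argument

prev_wave = np.random.randn(2 * target_sample_rate).astype(np.float32)  # fake chunk 1
next_wave = np.random.randn(3 * target_sample_rate).astype(np.float32)  # fake chunk 2

n = min(int(cross_fade_duration * target_sample_rate), len(prev_wave), len(next_wave))
fade_out = np.linspace(1.0, 0.0, n)  # ramp the end of the previous chunk down
fade_in = np.linspace(0.0, 1.0, n)   # ramp the start of the next chunk up
overlap = prev_wave[-n:] * fade_out + next_wave[:n] * fade_in
stitched = np.concatenate([prev_wave[:-n], overlap, next_wave[n:]])

print(stitched.shape)  # 5 s of audio minus the 0.15 s shared overlap
```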
inference-cli.py CHANGED
@@ -1,26 +1,24 @@
+import argparse
+import codecs
 import re
+import tempfile
+from pathlib import Path
+
+import numpy as np
+import soundfile as sf
+import tomli
 import torch
 import torchaudio
-import numpy as np
-import tempfile
+import tqdm
+from cached_path import cached_path
 from einops import rearrange
-from vocos import Vocos
 from pydub import AudioSegment, silence
-from model import CFM, UNetT, DiT, MMDiT
-from cached_path import cached_path
-from model.utils import (
-    load_checkpoint,
-    get_tokenizer,
-    convert_char_to_pinyin,
-    save_spectrogram,
-)
 from transformers import pipeline
-import soundfile as sf
-import tomli
-import argparse
-import tqdm
-from pathlib import Path
-import codecs
+from vocos import Vocos
+
+from model import CFM, DiT, MMDiT, UNetT
+from model.utils import (convert_char_to_pinyin, get_tokenizer,
+                         load_checkpoint, save_spectrogram)
 
 parser = argparse.ArgumentParser(
     prog="python3 inference-cli.py",
@@ -73,6 +71,11 @@ parser.add_argument(
     "--remove_silence",
     help="Remove silence.",
 )
+parser.add_argument(
+    "--load_vocoder_from_local",
+    action="store_true",
+    help="load vocoder from local. Default: ../checkpoints/charactr/vocos-mel-24khz",
+)
 args = parser.parse_args()
 
 config = tomli.load(open(args.config, "rb"))
@@ -88,24 +91,23 @@ model = args.model if args.model else config["model"]
 remove_silence = args.remove_silence if args.remove_silence else config["remove_silence"]
 wave_path = Path(output_dir)/"out.wav"
 spectrogram_path = Path(output_dir)/"out.png"
-
-SPLIT_WORDS = [
-    "but", "however", "nevertheless", "yet", "still",
-    "therefore", "thus", "hence", "consequently",
-    "moreover", "furthermore", "additionally",
-    "meanwhile", "alternatively", "otherwise",
-    "namely", "specifically", "for example", "such as",
-    "in fact", "indeed", "notably",
-    "in contrast", "on the other hand", "conversely",
-    "in conclusion", "to summarize", "finally"
-]
+vocos_local_path = "../checkpoints/charactr/vocos-mel-24khz"
 
 device = (
     "cuda"
     if torch.cuda.is_available()
     else "mps" if torch.backends.mps.is_available() else "cpu"
 )
-vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")
+
+if args.load_vocoder_from_local:
+    print(f"Load vocos from local path {vocos_local_path}")
+    vocos = Vocos.from_hparams(f"{vocos_local_path}/config.yaml")
+    state_dict = torch.load(f"{vocos_local_path}/pytorch_model.bin", map_location=device)
+    vocos.load_state_dict(state_dict)
+    vocos.eval()
+else:
+    print("Donwload Vocos from huggingface charactr/vocos-mel-24khz")
+    vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")
 
 print(f"Using {device} device")
 
@@ -124,8 +126,9 @@ speed = 1.0
 fix_duration = None
 
 def load_model(repo_name, exp_name, model_cls, model_cfg, ckpt_step):
-    ckpt_path = str(cached_path(f"hf://SWivid/{repo_name}/{exp_name}/model_{ckpt_step}.safetensors"))
-    # ckpt_path = f"ckpts/{exp_name}/model_{ckpt_step}.pt" # .pt | .safetensors
+    ckpt_path = f"ckpts/{exp_name}/model_{ckpt_step}.pt" # .pt | .safetensors
+    if not Path(ckpt_path).exists():
+        ckpt_path = str(cached_path(f"hf://SWivid/{repo_name}/{exp_name}/model_{ckpt_step}.safetensors"))
     vocab_char_map, vocab_size = get_tokenizer("Emilia_ZH_EN", "pinyin")
     model = CFM(
         transformer=model_cls(
@@ -153,103 +156,36 @@ F5TTS_model_cfg = dict(
 )
 E2TTS_model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4)
 
-def split_text_into_batches(text, max_chars=200, split_words=SPLIT_WORDS):
-    if len(text.encode('utf-8')) <= max_chars:
-        return [text]
-    if text[-1] not in ['。', '.', '!', '!', '?', '?']:
-        text += '.'
-
-    sentences = re.split('([。.!?!?])', text)
-    sentences = [''.join(i) for i in zip(sentences[0::2], sentences[1::2])]
-
-    batches = []
-    current_batch = ""
-
-    def split_by_words(text):
-        words = text.split()
-        current_word_part = ""
-        word_batches = []
-        for word in words:
-            if len(current_word_part.encode('utf-8')) + len(word.encode('utf-8')) + 1 <= max_chars:
-                current_word_part += word + ' '
-            else:
-                if current_word_part:
-                    # Try to find a suitable split word
-                    for split_word in split_words:
-                        split_index = current_word_part.rfind(' ' + split_word + ' ')
-                        if split_index != -1:
-                            word_batches.append(current_word_part[:split_index].strip())
-                            current_word_part = current_word_part[split_index:].strip() + ' '
-                            break
-                    else:
-                        # If no suitable split word found, just append the current part
-                        word_batches.append(current_word_part.strip())
-                        current_word_part = ""
-                current_word_part += word + ' '
-        if current_word_part:
-            word_batches.append(current_word_part.strip())
-        return word_batches
+
+def chunk_text(text, max_chars=135):
+    """
+    Splits the input text into chunks, each with a maximum number of characters.
+    Args:
+        text (str): The text to be split.
+        max_chars (int): The maximum number of characters per chunk.
+    Returns:
+        List[str]: A list of text chunks.
+    """
+    chunks = []
+    current_chunk = ""
+    # Split the text into sentences based on punctuation followed by whitespace
+    sentences = re.split(r'(?<=[;:,.!?])\s+|(?<=[;:,。!?])', text)
 
     for sentence in sentences:
-        if len(current_batch.encode('utf-8')) + len(sentence.encode('utf-8')) <= max_chars:
-            current_batch += sentence
+        if len(current_chunk.encode('utf-8')) + len(sentence.encode('utf-8')) <= max_chars:
+            current_chunk += sentence + " " if sentence and len(sentence[-1].encode('utf-8')) == 1 else sentence
         else:
-            # If adding this sentence would exceed the limit
-            if current_batch:
-                batches.append(current_batch)
-                current_batch = ""
-
-            # If the sentence itself is longer than max_chars, split it
-            if len(sentence.encode('utf-8')) > max_chars:
-                # First, try to split by colon
-                colon_parts = sentence.split(':')
-                if len(colon_parts) > 1:
-                    for part in colon_parts:
-                        if len(part.encode('utf-8')) <= max_chars:
-                            batches.append(part)
-                        else:
-                            # If colon part is still too long, split by comma
-                            comma_parts = re.split('[,,]', part)
-                            if len(comma_parts) > 1:
-                                current_comma_part = ""
-                                for comma_part in comma_parts:
-                                    if len(current_comma_part.encode('utf-8')) + len(comma_part.encode('utf-8')) <= max_chars:
-                                        current_comma_part += comma_part + ','
-                                    else:
-                                        if current_comma_part:
-                                            batches.append(current_comma_part.rstrip(','))
-                                        current_comma_part = comma_part + ','
-                                if current_comma_part:
-                                    batches.append(current_comma_part.rstrip(','))
-                            else:
-                                # If no comma, split by words
-                                batches.extend(split_by_words(part))
-                else:
-                    # If no colon, split by comma
-                    comma_parts = re.split('[,,]', sentence)
-                    if len(comma_parts) > 1:
-                        current_comma_part = ""
-                        for comma_part in comma_parts:
-                            if len(current_comma_part.encode('utf-8')) + len(comma_part.encode('utf-8')) <= max_chars:
-                                current_comma_part += comma_part + ','
-                            else:
-                                if current_comma_part:
-                                    batches.append(current_comma_part.rstrip(','))
-                                current_comma_part = comma_part + ','
-                        if current_comma_part:
-                            batches.append(current_comma_part.rstrip(','))
-                    else:
-                        # If no comma, split by words
-                        batches.extend(split_by_words(sentence))
-            else:
-                current_batch = sentence
-
-    if current_batch:
-        batches.append(current_batch)
-
-    return batches
+            if current_chunk:
+                chunks.append(current_chunk.strip())
+            current_chunk = sentence + " " if sentence and len(sentence[-1].encode('utf-8')) == 1 else sentence
+
+    if current_chunk:
+        chunks.append(current_chunk.strip())
+
+    return chunks
 
-def infer_batch(ref_audio, ref_text, gen_text_batches, model, remove_silence):
+
+def infer_batch(ref_audio, ref_text, gen_text_batches, model, remove_silence, cross_fade_duration=0.15):
     if model == "F5-TTS":
         ema_model = load_model(model, "F5TTS_Base", DiT, F5TTS_model_cfg, 1200000)
     elif model == "E2-TTS":
@@ -307,8 +243,44 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, model, remove_silence):
         generated_waves.append(generated_wave)
         spectrograms.append(generated_mel_spec[0].cpu().numpy())
 
-    # Combine all generated waves
-    final_wave = np.concatenate(generated_waves)
+    # Combine all generated waves with cross-fading
+    if cross_fade_duration <= 0:
+        # Simply concatenate
+        final_wave = np.concatenate(generated_waves)
+    else:
+        final_wave = generated_waves[0]
+        for i in range(1, len(generated_waves)):
+            prev_wave = final_wave
+            next_wave = generated_waves[i]
+
+            # Calculate cross-fade samples, ensuring it does not exceed wave lengths
+            cross_fade_samples = int(cross_fade_duration * target_sample_rate)
+            cross_fade_samples = min(cross_fade_samples, len(prev_wave), len(next_wave))
+
+            if cross_fade_samples <= 0:
+                # No overlap possible, concatenate
+                final_wave = np.concatenate([prev_wave, next_wave])
+                continue
+
+            # Overlapping parts
+            prev_overlap = prev_wave[-cross_fade_samples:]
+            next_overlap = next_wave[:cross_fade_samples]
+
+            # Fade out and fade in
+            fade_out = np.linspace(1, 0, cross_fade_samples)
+            fade_in = np.linspace(0, 1, cross_fade_samples)
+
+            # Cross-faded overlap
+            cross_faded_overlap = prev_overlap * fade_out + next_overlap * fade_in
+
+            # Combine
+            new_wave = np.concatenate([
+                prev_wave[:-cross_fade_samples],
+                cross_faded_overlap,
+                next_wave[cross_fade_samples:]
+            ])
+
+            final_wave = new_wave
 
     with open(wave_path, "wb") as f:
         sf.write(f.name, final_wave, target_sample_rate)
@@ -329,11 +301,7 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, model, remove_silence):
     print(spectrogram_path)
 
 
-def infer(ref_audio_orig, ref_text, gen_text, model, remove_silence, custom_split_words):
-    if not custom_split_words.strip():
-        custom_words = [word.strip() for word in custom_split_words.split(',')]
-        global SPLIT_WORDS
-        SPLIT_WORDS = custom_words
+def infer(ref_audio_orig, ref_text, gen_text, model, remove_silence, cross_fade_duration=0.15):
 
     print(gen_text)
 
@@ -341,7 +309,7 @@ def infer(ref_audio_orig, ref_text, gen_text, model, remove_silence, custom_spli
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
         aseg = AudioSegment.from_file(ref_audio_orig)
 
-        non_silent_segs = silence.split_on_silence(aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=500)
+        non_silent_segs = silence.split_on_silence(aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=1000)
         non_silent_wave = AudioSegment.silent(duration=0)
         for non_silent_seg in non_silent_segs:
             non_silent_wave += non_silent_seg
@@ -373,16 +341,23 @@ def infer(ref_audio_orig, ref_text, gen_text, model, remove_silence, custom_spli
     else:
         print("Using custom reference text...")
 
+    # Add the functionality to ensure it ends with ". "
+    if not ref_text.endswith(". ") and not ref_text.endswith("。"):
+        if ref_text.endswith("."):
+            ref_text += " "
+        else:
+            ref_text += ". "
+
     # Split the input text into batches
     audio, sr = torchaudio.load(ref_audio)
-    max_chars = int(len(ref_text.encode('utf-8')) / (audio.shape[-1] / sr) * (30 - audio.shape[-1] / sr))
-    gen_text_batches = split_text_into_batches(gen_text, max_chars=max_chars)
+    max_chars = int(len(ref_text.encode('utf-8')) / (audio.shape[-1] / sr) * (25 - audio.shape[-1] / sr))
+    gen_text_batches = chunk_text(gen_text, max_chars=max_chars)
     print('ref_text', ref_text)
     for i, gen_text in enumerate(gen_text_batches):
         print(f'gen_text {i}', gen_text)
 
     print(f"Generating audio using {model} in {len(gen_text_batches)} batches, loading models...")
-    return infer_batch((audio, sr), ref_text, gen_text_batches, model, remove_silence)
+    return infer_batch((audio, sr), ref_text, gen_text_batches, model, remove_silence, cross_fade_duration)
 
 
-infer(ref_audio, ref_text, gen_text, model, remove_silence, ",".join(SPLIT_WORDS))
+infer(ref_audio, ref_text, gen_text, model, remove_silence)
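Pulled out for clarity, the checkpoint resolution that `load_model()` now performs boils down to "prefer a local `ckpts/` file, otherwise fetch the safetensors from the Hugging Face Hub". A minimal sketch; `resolve_ckpt` is a hypothetical helper name, while the paths and the `cached_path("hf://...")` call are the ones appearing in the diff:

```python
from pathlib import Path

from cached_path import cached_path


def resolve_ckpt(repo_name: str, exp_name: str, ckpt_step: int) -> str:
    """Sketch of the fallback now inside load_model(): local .pt first, then the HF Hub."""
    local = Path(f"ckpts/{exp_name}/model_{ckpt_step}.pt")
    if local.exists():
        return str(local)
    # cached_path downloads and caches the safetensors checkpoint from the Hub.
    return str(cached_path(f"hf://SWivid/{repo_name}/{exp_name}/model_{ckpt_step}.safetensors"))


# Hypothetical call matching the CLI's F5-TTS branch:
# resolve_ckpt("F5-TTS", "F5TTS_Base", 1200000)
```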
inference-cli.toml CHANGED
@@ -6,5 +6,5 @@ ref_text = "Some call me nature, others call me mother nature."
 gen_text = "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring. Respect me and I'll nurture you; ignore me and you shall face the consequences."
 # File with text to generate. Ignores the text above.
 gen_file = ""
-remove_silence = true
+remove_silence = false
 output_dir = "tests"
model/utils.py CHANGED
@@ -22,12 +22,6 @@ from einops import rearrange, reduce
 
 import jieba
 from pypinyin import lazy_pinyin, Style
-import zhconv
-from zhon.hanzi import punctuation
-from jiwer import compute_measures
-
-from funasr import AutoModel
-from faster_whisper import WhisperModel
 
 from model.ecapa_tdnn import ECAPA_TDNN_SMALL
 from model.modules import MelSpec
@@ -432,6 +426,7 @@ def get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path
 
 def load_asr_model(lang, ckpt_dir = ""):
     if lang == "zh":
+        from funasr import AutoModel
         model = AutoModel(
             model = os.path.join(ckpt_dir, "paraformer-zh"),
             # vad_model = os.path.join(ckpt_dir, "fsmn-vad"),
@@ -440,6 +435,7 @@ def load_asr_model(lang, ckpt_dir = ""):
             disable_update=True,
         ) # following seed-tts setting
     elif lang == "en":
+        from faster_whisper import WhisperModel
         model_size = "large-v3" if ckpt_dir == "" else ckpt_dir
         model = WhisperModel(model_size, device="cuda", compute_type="float16")
     return model
@@ -451,6 +447,7 @@ def run_asr_wer(args):
     rank, lang, test_set, ckpt_dir = args
 
     if lang == "zh":
+        import zhconv
         torch.cuda.set_device(rank)
     elif lang == "en":
         os.environ["CUDA_VISIBLE_DEVICES"] = str(rank)
@@ -458,10 +455,12 @@ def run_asr_wer(args):
         raise NotImplementedError("lang support only 'zh' (funasr paraformer-zh), 'en' (faster-whisper-large-v3), for now.")
 
     asr_model = load_asr_model(lang, ckpt_dir = ckpt_dir)
-
+
+    from zhon.hanzi import punctuation
     punctuation_all = punctuation + string.punctuation
     wers = []
 
+    from jiwer import compute_measures
     for gen_wav, prompt_wav, truth in tqdm(test_set):
         if lang == "zh":
             res = asr_model.generate(input=gen_wav, batch_size_s=300, disable_pbar=True)
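The change above moves the evaluation-only dependencies (now listed in `requirements_eval.txt`) from module scope into the branches that actually use them, so the base install no longer needs them. A generic sketch of that lazy-import pattern, with illustrative names and arguments rather than the repo's own API:

```python
def transcribe(lang: str, audio_path: str):
    """Illustrative lazy-import pattern: heavy eval deps load only on first use."""
    if lang == "zh":
        from funasr import AutoModel  # pulled in only when the zh branch runs
        model = AutoModel(model="paraformer-zh", disable_update=True)
        return model.generate(input=audio_path, batch_size_s=300, disable_pbar=True)
    elif lang == "en":
        from faster_whisper import WhisperModel  # pulled in only for the en branch
        model = WhisperModel("large-v3", device="cuda", compute_type="float16")
        segments, _ = model.transcribe(audio_path)
        return list(segments)
    raise NotImplementedError("lang must be 'zh' or 'en'")
```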
requirements.txt CHANGED
@@ -5,25 +5,19 @@ datasets
 einops>=0.8.0
 einx>=0.3.0
 ema_pytorch>=0.5.2
-faster_whisper
-funasr
 gradio
 jieba
-jiwer
 librosa
 matplotlib
-numpy==1.23.5
+numpy<=1.26.4
 pydub
 pypinyin
 safetensors
 soundfile
-# torch>=2.0
-# torchaudio>=2.3.0
+tomli
 torchdiffeq
 tqdm>=4.65.0
 transformers
 vocos
 wandb
 x_transformers>=1.31.14
-zhconv
-zhon
requirements_eval.txt ADDED
@@ -0,0 +1,5 @@
+faster_whisper
+funasr
+jiwer
+zhconv
+zhon