Spaces:

zhouzhou363
/

f5-tts

Configuration error

App Files Files Community

SWivid committed on Oct 24, 2024

Commit

ba4b04b

1 Parent(s): 254e5e6

finish eval dependencies; update infer_gradio with chat feature

Browse files

Files changed (10) hide show

README.md +5 -48
pyproject.toml +1 -0
src/f5_tts/eval/README.md +45 -0
src/f5_tts/eval/eval_infer_batch.py +9 -9
src/f5_tts/eval/eval_infer_batch.sh +6 -6
src/f5_tts/eval/eval_librispeech_test_clean.py +6 -2
src/f5_tts/eval/eval_seedtts_testset.py +7 -3
src/f5_tts/infer/infer_gradio.py +137 -119
src/f5_tts/infer/utils_infer.py +30 -15
src/f5_tts/model/utils.py +3 -2

README.md CHANGED Viewed

@@ -81,6 +81,9 @@ python scripts/prepare_emilia.py
 # Prepare the Wenetspeech4TTS dataset
 python scripts/prepare_wenetspeech4tts.py
 ```
 ## Training & Finetuning
@@ -175,6 +178,7 @@ python inference-cli.py \
 --gen_text "突然，身边一阵笑声。我看着他们，意气风发地挺直了胸膛，甩了甩那稍显肉感的双臂，轻笑道，我身上的肉，是为了掩饰我爆棚的魅力，否则，岂不吓坏了你们呢？"
 # Multi voice
 python inference-cli.py -c samples/story.toml
 ```
@@ -211,54 +215,7 @@ To test speech editing capabilities, use the following command.
 python f5_tts/speech_edit.py
 ```
-## Evaluation
-### Prepare Test Datasets
-1. Seed-TTS test set: Download from [seed-tts-eval](https://github.com/BytedanceSpeech/seed-tts-eval).
-2. LibriSpeech test-clean: Download from [OpenSLR](http://www.openslr.org/12/).
-3. Unzip the downloaded datasets and place them in the data/ directory.
-4. Update the path for the test-clean data in `scripts/eval_infer_batch.py`
-5. Our filtered LibriSpeech-PC 4-10s subset is already under data/ in this repo
-### Batch Inference for Test Set
-To run batch inference for evaluations, execute the following commands:
-```bash
-# switch to the main directory
-cd f5_tts
-# batch inference for evaluations
-accelerate config  # if not set before
-bash scripts/eval_infer_batch.sh
-```
-### Download Evaluation Model Checkpoints
-1. Chinese ASR Model: [Paraformer-zh](https://huggingface.co/funasr/paraformer-zh)
-2. English ASR Model: [Faster-Whisper](https://huggingface.co/Systran/faster-whisper-large-v3)
-3. WavLM Model: Download from [Google Drive](https://drive.google.com/file/d/1-aE1NfzpRCLxA4GUxX9ITI3F9LlbtEGP/view).
-### Objective Evaluation
-Install packages for evaluation:
-```bash
-pip install -e .[eval]
-```
-Update the path with your batch-inferenced results, and carry out WER / SIM evaluations:
-```bash
-# switch to the main directory
-cd f5_tts
-# Evaluation for Seed-TTS test set
-python scripts/eval_seedtts_testset.py
-# Evaluation for LibriSpeech-PC test-clean (cross-sentence)
-python scripts/eval_librispeech_test_clean.py
-```
 ## Acknowledgements

 # Prepare the Wenetspeech4TTS dataset
 python scripts/prepare_wenetspeech4tts.py
+# https://github.com/SWivid/F5-TTS/discussions/57#discussioncomment-10959029
+python scripts/prepare_csv_wavs.py
 ```
 ## Training & Finetuning
 --gen_text "突然，身边一阵笑声。我看着他们，意气风发地挺直了胸膛，甩了甩那稍显肉感的双臂，轻笑道，我身上的肉，是为了掩饰我爆棚的魅力，否则，岂不吓坏了你们呢？"
 # Multi voice
+# https://github.com/SWivid/F5-TTS/pull/146#issue-2595207852
 python inference-cli.py -c samples/story.toml
 ```
 python f5_tts/speech_edit.py
 ```
+## [Evaluation](src/f5_tts/eval/README.md)
 ## Acknowledgements

pyproject.toml CHANGED Viewed

@@ -46,6 +46,7 @@ eval = [
     "faster_whisper==0.10.1",
     "funasr",
     "jiwer",
     "zhconv",
     "zhon",
 ]

     "faster_whisper==0.10.1",
     "funasr",
     "jiwer",
+    "modelscope",
     "zhconv",
     "zhon",
 ]

src/f5_tts/eval/README.md ADDED Viewed

	@@ -0,0 +1,45 @@

+## Evaluation
+Install packages for evaluation:
+```bash
+pip install -e .[eval]
+```
+### Prepare Test Datasets
+1. *Seed-TTS testset*: Download from [seed-tts-eval](https://github.com/BytedanceSpeech/seed-tts-eval).
+2. *LibriSpeech test-clean*: Download from [OpenSLR](http://www.openslr.org/12/).
+3. Unzip the downloaded datasets and place them in the `data/` directory.
+4. Update the path for *LibriSpeech test-clean* data in `src/f5_tts/eval/eval_infer_batch.py`
+5. Our filtered LibriSpeech-PC 4-10s subset: `data/librispeech_pc_test_clean_cross_sentence.lst`
+### Batch Inference for Test Set
+To run batch inference for evaluations, execute the following commands:
+```bash
+# batch inference for evaluations
+accelerate config  # if not set before
+bash src/f5_tts/eval/eval_infer_batch.sh
+```
+### Download Evaluation Model Checkpoints
+1. Chinese ASR Model: [Paraformer-zh](https://huggingface.co/funasr/paraformer-zh)
+2. English ASR Model: [Faster-Whisper](https://huggingface.co/Systran/faster-whisper-large-v3)
+3. WavLM Model: Download from [Google Drive](https://drive.google.com/file/d/1-aE1NfzpRCLxA4GUxX9ITI3F9LlbtEGP/view).
+Then update the following scripts with the paths where you placed the evaluation model checkpoints.
+### Objective Evaluation
+Update the paths to point to your batch inference results, then carry out the WER / SIM evaluations:
+```bash
+# Evaluation for Seed-TTS test set
+python src/f5_tts/eval/eval_seedtts_testset.py
+# Evaluation for LibriSpeech-PC test-clean (cross-sentence)
+python src/f5_tts/eval/eval_librispeech_test_clean.py
+```

src/f5_tts/eval/eval_infer_batch.py CHANGED Viewed

@@ -14,9 +14,9 @@ from accelerate import Accelerator
 from vocos import Vocos
 from f5_tts.model import CFM, UNetT, DiT
-from f5_tts.model.utils import (
-    load_checkpoint,
-    get_tokenizer,
     get_seedtts_testset_metainfo,
     get_librispeech_test_clean_metainfo,
     get_inference_prompt,
@@ -34,6 +34,7 @@ hop_length = 256
 target_rms = 0.1
 tokenizer = "pinyin"
 def main():
@@ -58,7 +59,7 @@ def main():
     dataset_name = args.dataset
     exp_name = args.expname
     ckpt_step = args.ckptstep
-    ckpt_path = f"ckpts/{exp_name}/model_{ckpt_step}.pt"
     nfe_step = args.nfestep
     ode_method = args.odemethod
@@ -80,23 +81,22 @@ def main():
         model_cls = UNetT
         model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4)
-    datapath = files("f5_tts").joinpath("data")
     if testset == "ls_pc_test_clean":
-        metalst = os.path.join(datapath, "librispeech_pc_test_clean_cross_sentence.lst")
         librispeech_test_clean_path = "<SOME_PATH>/LibriSpeech/test-clean"  # test-clean path
         metainfo = get_librispeech_test_clean_metainfo(metalst, librispeech_test_clean_path)
     elif testset == "seedtts_test_zh":
-        metalst = os.path.join(datapath, "seedtts_testset/zh/meta.lst")
         metainfo = get_seedtts_testset_metainfo(metalst)
     elif testset == "seedtts_test_en":
-        metalst = os.path.join(datapath, "seedtts_testset/en/meta.lst")
         metainfo = get_seedtts_testset_metainfo(metalst)
     # path to save generated wavs
     output_dir = (
         f"results/{exp_name}_{ckpt_step}/{testset}/"
         f"seed{seed}_{ode_method}_nfe{nfe_step}"
         f"{f'_ss{sway_sampling_coef}' if sway_sampling_coef else ''}"

 from vocos import Vocos
 from f5_tts.model import CFM, UNetT, DiT
+from f5_tts.model.utils import get_tokenizer
+from f5_tts.infer.utils_infer import load_checkpoint
+from f5_tts.eval.utils_eval import (
     get_seedtts_testset_metainfo,
     get_librispeech_test_clean_metainfo,
     get_inference_prompt,
 target_rms = 0.1
 tokenizer = "pinyin"
+rel_path = str(files("f5_tts").joinpath("../../"))
 def main():
     dataset_name = args.dataset
     exp_name = args.expname
     ckpt_step = args.ckptstep
+    ckpt_path = rel_path + f"/ckpts/{exp_name}/model_{ckpt_step}.pt"
     nfe_step = args.nfestep
     ode_method = args.odemethod
         model_cls = UNetT
         model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4)
     if testset == "ls_pc_test_clean":
+        metalst = rel_path + "/data/librispeech_pc_test_clean_cross_sentence.lst"
         librispeech_test_clean_path = "<SOME_PATH>/LibriSpeech/test-clean"  # test-clean path
         metainfo = get_librispeech_test_clean_metainfo(metalst, librispeech_test_clean_path)
     elif testset == "seedtts_test_zh":
+        metalst = rel_path + "/data/seedtts_testset/zh/meta.lst"
         metainfo = get_seedtts_testset_metainfo(metalst)
     elif testset == "seedtts_test_en":
+        metalst = rel_path + "/data/seedtts_testset/en/meta.lst"
         metainfo = get_seedtts_testset_metainfo(metalst)
     # path to save generated wavs
     output_dir = (
+        f"{rel_path}/"
         f"results/{exp_name}_{ckpt_step}/{testset}/"
         f"seed{seed}_{ode_method}_nfe{nfe_step}"
         f"{f'_ss{sway_sampling_coef}' if sway_sampling_coef else ''}"

src/f5_tts/eval/eval_infer_batch.sh CHANGED Viewed

@@ -1,13 +1,13 @@
 #!/bin/bash
 # e.g. F5-TTS, 16 NFE
-accelerate launch scripts/eval_infer_batch.py -s 0 -n "F5TTS_Base" -t "seedtts_test_zh" -nfe 16
-accelerate launch scripts/eval_infer_batch.py -s 0 -n "F5TTS_Base" -t "seedtts_test_en" -nfe 16
-accelerate launch scripts/eval_infer_batch.py -s 0 -n "F5TTS_Base" -t "ls_pc_test_clean" -nfe 16
 # e.g. Vanilla E2 TTS, 32 NFE
-accelerate launch scripts/eval_infer_batch.py -s 0 -n "E2TTS_Base" -t "seedtts_test_zh" -o "midpoint" -ss 0
-accelerate launch scripts/eval_infer_batch.py -s 0 -n "E2TTS_Base" -t "seedtts_test_en" -o "midpoint" -ss 0
-accelerate launch scripts/eval_infer_batch.py -s 0 -n "E2TTS_Base" -t "ls_pc_test_clean" -o "midpoint" -ss 0
 # etc.

 #!/bin/bash
 # e.g. F5-TTS, 16 NFE
+accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "F5TTS_Base" -t "seedtts_test_zh" -nfe 16
+accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "F5TTS_Base" -t "seedtts_test_en" -nfe 16
+accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "F5TTS_Base" -t "ls_pc_test_clean" -nfe 16
 # e.g. Vanilla E2 TTS, 32 NFE
+accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "E2TTS_Base" -t "seedtts_test_zh" -o "midpoint" -ss 0
+accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "E2TTS_Base" -t "seedtts_test_en" -o "midpoint" -ss 0
+accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "E2TTS_Base" -t "ls_pc_test_clean" -o "midpoint" -ss 0
 # etc.

src/f5_tts/eval/eval_librispeech_test_clean.py CHANGED Viewed

@@ -6,18 +6,22 @@ import os
 sys.path.append(os.getcwd())
 import multiprocessing as mp
 import numpy as np
-from f5_tts.model.utils import (
     get_librispeech_test,
     run_asr_wer,
     run_sim,
 )
 eval_task = "wer"  # sim | wer
 lang = "en"
-metalst = "data/librispeech_pc_test_clean_cross_sentence.lst"
 librispeech_test_clean_path = "<SOME_PATH>/LibriSpeech/test-clean"  # test-clean path
 gen_wav_dir = "PATH_TO_GENERATED"  # generated wavs

 sys.path.append(os.getcwd())
 import multiprocessing as mp
+from importlib.resources import files
 import numpy as np
+from f5_tts.eval.utils_eval import (
     get_librispeech_test,
     run_asr_wer,
     run_sim,
 )
+rel_path = str(files("f5_tts").joinpath("../../"))
 eval_task = "wer"  # sim | wer
 lang = "en"
+metalst = rel_path + "/data/librispeech_pc_test_clean_cross_sentence.lst"
 librispeech_test_clean_path = "<SOME_PATH>/LibriSpeech/test-clean"  # test-clean path
 gen_wav_dir = "PATH_TO_GENERATED"  # generated wavs

src/f5_tts/eval/eval_seedtts_testset.py CHANGED Viewed

@@ -6,19 +6,23 @@ import os
 sys.path.append(os.getcwd())
 import multiprocessing as mp
 import numpy as np
-from f5_tts.model.utils import (
     get_seed_tts_test,
     run_asr_wer,
     run_sim,
 )
 eval_task = "wer"  # sim | wer
 lang = "zh"  # zh | en
-metalst = f"data/seedtts_testset/{lang}/meta.lst"  # seed-tts testset
-# gen_wav_dir = f"data/seedtts_testset/{lang}/wavs"  # ground truth wavs
 gen_wav_dir = "PATH_TO_GENERATED"  # generated wavs

 sys.path.append(os.getcwd())
 import multiprocessing as mp
+from importlib.resources import files
 import numpy as np
+from f5_tts.eval.utils_eval import (
     get_seed_tts_test,
     run_asr_wer,
     run_sim,
 )
+rel_path = str(files("f5_tts").joinpath("../../"))
 eval_task = "wer"  # sim | wer
 lang = "zh"  # zh | en
+metalst = rel_path + f"/data/seedtts_testset/{lang}/meta.lst"  # seed-tts testset
+# gen_wav_dir = rel_path + f"/data/seedtts_testset/{lang}/wavs"  # ground truth wavs
 gen_wav_dir = "PATH_TO_GENERATED"  # generated wavs

src/f5_tts/infer/infer_gradio.py CHANGED Viewed

@@ -52,13 +52,11 @@ E2TTS_ema_model = load_model(
     UNetT, E2TTS_model_cfg, str(cached_path("hf://SWivid/E2-TTS/E2TTS_Base/model_1200000.safetensors"))
 )
-# Initialize Qwen model and tokenizer
-model_name = "Qwen/Qwen2.5-3B-Instruct"
-model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-def generate_response(messages):
     """Generate response using Qwen"""
     text = tokenizer.apply_chat_template(
         messages,
@@ -525,137 +523,157 @@ with gr.Blocks() as app_chat:
 # Voice Chat
 Have a conversation with an AI using your reference voice!
 1. Upload a reference audio clip and optionally its transcript.
-2. Record your message through your microphone.
-3. The AI will respond using the reference voice.
 """
     )
-    with gr.Row():
-        with gr.Column():
-            ref_audio_chat = gr.Audio(label="Reference Audio", type="filepath")
-        with gr.Column():
-            with gr.Accordion("Advanced Settings", open=False):
-                model_choice_chat = gr.Radio(
-                    choices=["F5-TTS", "E2-TTS"],
-                    label="TTS Model",
-                    value="F5-TTS",
-                )
-                remove_silence_chat = gr.Checkbox(
-                    label="Remove Silences",
-                    value=True,
-                )
-                ref_text_chat = gr.Textbox(
-                    label="Reference Text",
-                    info="Optional: Leave blank to auto-transcribe",
-                    lines=2,
-                )
-                system_prompt_chat = gr.Textbox(
-                    label="System Prompt",
-                    value="You are not an AI assistant, you are whoever the user says you are. You must stay in character. Keep your responses concise since they will be spoken out loud.",
-                    lines=2,
                 )
-    chatbot_interface = gr.Chatbot(label="Conversation")
-    with gr.Row():
-        with gr.Column():
-            audio_output_chat = gr.Audio(autoplay=True)
-        with gr.Column():
-            audio_input_chat = gr.Microphone(
-                label="Or speak your message",
-                type="filepath",
-            )
-    clear_btn_chat = gr.Button("Clear Conversation")
-    conversation_state = gr.State(
-        value=[
-            {
-                "role": "system",
-                "content": "You are not an AI assistant, you are whoever the user says you are. You must stay in character. Keep your responses concise since they will be spoken out loud.",
-            }
-        ]
-    )
-    def process_audio_input(audio_path, history, conv_state):
-        """Handle audio input from user"""
-        if not audio_path:
-            return history, conv_state, ""
-        text = ""
-        text = preprocess_ref_audio_text(audio_path, text)[1]
-        if not text.strip():
-            return history, conv_state, ""
-        conv_state.append({"role": "user", "content": text})
-        history.append((text, None))
-        response = generate_response(conv_state)
-        conv_state.append({"role": "assistant", "content": response})
-        history[-1] = (text, response)
-        return history, conv_state, ""
-    def generate_audio_response(history, ref_audio, ref_text, model, remove_silence):
-        """Generate TTS audio for AI response"""
-        if not history or not ref_audio:
-            return None
-        last_user_message, last_ai_response = history[-1]
-        if not last_ai_response:
-            return None
-        audio_result, _ = infer(
-            ref_audio,
-            ref_text,
-            last_ai_response,
-            model,
-            remove_silence,
-            cross_fade_duration=0.15,
-            speed=1.0,
         )
-        return audio_result
-    def clear_conversation():
-        """Reset the conversation"""
-        return [], [
-            {
-                "role": "system",
-                "content": "You are a friendly person, and may impersonate whoever they address you as. Stay in character. Keep your responses concise since they will be spoken out loud.",
-            }
-        ]
-    def update_system_prompt(new_prompt):
-        """Update the system prompt and reset the conversation"""
-        new_conv_state = [{"role": "system", "content": new_prompt}]
-        return [], new_conv_state
-    # Handle audio input
-    audio_input_chat.stop_recording(
-        process_audio_input,
-        inputs=[audio_input_chat, chatbot_interface, conversation_state],
-        outputs=[chatbot_interface, conversation_state],
-    ).then(
-        generate_audio_response,
-        inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, model_choice_chat, remove_silence_chat],
-        outputs=audio_output_chat,
-    )
-    # Handle clear button
-    clear_btn_chat.click(
-        clear_conversation,
-        outputs=[chatbot_interface, conversation_state],
-    )
-    # Handle system prompt change and reset conversation
-    system_prompt_chat.change(
-        update_system_prompt,
-        inputs=system_prompt_chat,
-        outputs=[chatbot_interface, conversation_state],
-    )
 with gr.Blocks() as app:

     UNetT, E2TTS_model_cfg, str(cached_path("hf://SWivid/E2-TTS/E2TTS_Base/model_1200000.safetensors"))
 )
+chat_model_state = None
+chat_tokenizer_state = None
+def generate_response(messages, model, tokenizer):
     """Generate response using Qwen"""
     text = tokenizer.apply_chat_template(
         messages,
 # Voice Chat
 Have a conversation with an AI using your reference voice!
 1. Upload a reference audio clip and optionally its transcript.
+2. Load the chat model.
+3. Record your message through your microphone.
+4. The AI will respond using the reference voice.
 """
     )
+    load_chat_model_btn = gr.Button("Load Chat Model", variant="primary")
+    chat_interface_container = gr.Column(visible=False)
+    def load_chat_model():
+        global chat_model_state, chat_tokenizer_state
+        if chat_model_state is None:
+            show_info = gr.Info
+            show_info("Loading chat model...")
+            model_name = "Qwen/Qwen2.5-3B-Instruct"
+            chat_model_state = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
+            chat_tokenizer_state = AutoTokenizer.from_pretrained(model_name)
+            show_info("Chat model loaded.")
+        return gr.update(visible=False), gr.update(visible=True)
+    load_chat_model_btn.click(load_chat_model, outputs=[load_chat_model_btn, chat_interface_container])
+    with chat_interface_container:
+        with gr.Row():
+            with gr.Column():
+                ref_audio_chat = gr.Audio(label="Reference Audio", type="filepath")
+            with gr.Column():
+                with gr.Accordion("Advanced Settings", open=False):
+                    model_choice_chat = gr.Radio(
+                        choices=["F5-TTS", "E2-TTS"],
+                        label="TTS Model",
+                        value="F5-TTS",
+                    )
+                    remove_silence_chat = gr.Checkbox(
+                        label="Remove Silences",
+                        value=True,
+                    )
+                    ref_text_chat = gr.Textbox(
+                        label="Reference Text",
+                        info="Optional: Leave blank to auto-transcribe",
+                        lines=2,
+                    )
+                    system_prompt_chat = gr.Textbox(
+                        label="System Prompt",
+                        value="You are not an AI assistant, you are whoever the user says you are. You must stay in character. Keep your responses concise since they will be spoken out loud.",
+                        lines=2,
+                    )
+        chatbot_interface = gr.Chatbot(label="Conversation")
+        with gr.Row():
+            with gr.Column():
+                audio_output_chat = gr.Audio(autoplay=True)
+            with gr.Column():
+                audio_input_chat = gr.Microphone(
+                    label="Speak your message",
+                    type="filepath",
                 )
+        clear_btn_chat = gr.Button("Clear Conversation")
+        conversation_state = gr.State(
+            value=[
+                {
+                    "role": "system",
+                    "content": "You are not an AI assistant, you are whoever the user says you are. You must stay in character. Keep your responses concise since they will be spoken out loud.",
+                }
+            ]
+        )
+        # process_audio_input passes the module-level chat model and tokenizer to generate_response
+        def process_audio_input(audio_path, history, conv_state):
+            """Handle audio input from user"""
+            if not audio_path:
+                return history, conv_state, ""
+            text = ""
+            text = preprocess_ref_audio_text(audio_path, text)[1]
+            if not text.strip():
+                return history, conv_state, ""
+            conv_state.append({"role": "user", "content": text})
+            history.append((text, None))
+            response = generate_response(conv_state, chat_model_state, chat_tokenizer_state)
+            conv_state.append({"role": "assistant", "content": response})
+            history[-1] = (text, response)
+            return history, conv_state, ""
+        def generate_audio_response(history, ref_audio, ref_text, model, remove_silence):
+            """Generate TTS audio for AI response"""
+            if not history or not ref_audio:
+                return None
+            last_user_message, last_ai_response = history[-1]
+            if not last_ai_response:
+                return None
+            audio_result, _ = infer(
+                ref_audio,
+                ref_text,
+                last_ai_response,
+                model,
+                remove_silence,
+                cross_fade_duration=0.15,
+                speed=1.0,
+            )
+            return audio_result
+        def clear_conversation():
+            """Reset the conversation"""
+            return [], [
+                {
+                    "role": "system",
+                    "content": "You are not an AI assistant, you are whoever the user says you are. You must stay in character. Keep your responses concise since they will be spoken out loud.",
+                }
+            ]
+        def update_system_prompt(new_prompt):
+            """Update the system prompt and reset the conversation"""
+            new_conv_state = [{"role": "system", "content": new_prompt}]
+            return [], new_conv_state
+        # Handle audio input
+        audio_input_chat.stop_recording(
+            process_audio_input,
+            inputs=[audio_input_chat, chatbot_interface, conversation_state],
+            outputs=[chatbot_interface, conversation_state],
+        ).then(
+            generate_audio_response,
+            inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, model_choice_chat, remove_silence_chat],
+            outputs=audio_output_chat,
         )
+        # Handle clear button
+        clear_btn_chat.click(
+            clear_conversation,
+            outputs=[chatbot_interface, conversation_state],
+        )
+        # Handle system prompt change and reset conversation
+        system_prompt_chat.change(
+            update_system_prompt,
+            inputs=system_prompt_chat,
+            outputs=[chatbot_interface, conversation_state],
+        )
 with gr.Blocks() as app:

src/f5_tts/infer/utils_infer.py CHANGED Viewed

@@ -1,6 +1,7 @@
 # A unified script for inference process
 # Make adjustments inside functions, and consider both the gradio and cli scripts if you need to change a function's output format
 import re
 import tempfile
@@ -23,6 +24,7 @@ from f5_tts.model.utils import (
     convert_char_to_pinyin,
 )
 device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
@@ -194,23 +196,36 @@ def preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=print, device=
         aseg.export(f.name, format="wav")
         ref_audio = f.name
-    if not ref_text.strip():
-        global asr_pipe
-        if asr_pipe is None:
-            initialize_asr_pipeline(device=device)
-        show_info("No reference text provided, transcribing reference audio...")
-        ref_text = asr_pipe(
-            ref_audio,
-            chunk_length_s=30,
-            batch_size=128,
-            generate_kwargs={"task": "transcribe"},
-            return_timestamps=False,
-        )["text"].strip()
-        show_info("Finished transcription")
     else:
-        show_info("Using custom reference text...")
-    # Add the functionality to ensure it ends with ". "
     if not ref_text.endswith(". ") and not ref_text.endswith("。"):
         if ref_text.endswith("."):
             ref_text += " "

 # A unified script for inference process
 # Make adjustments inside functions, and consider both the gradio and cli scripts if you need to change a function's output format
+import hashlib
 import re
 import tempfile
     convert_char_to_pinyin,
 )
+_ref_audio_cache = {}
 device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
         aseg.export(f.name, format="wav")
         ref_audio = f.name
+    # Compute a hash of the reference audio file
+    with open(ref_audio, "rb") as audio_file:
+        audio_data = audio_file.read()
+        audio_hash = hashlib.md5(audio_data).hexdigest()
+    global _ref_audio_cache
+    if audio_hash in _ref_audio_cache:
+        # Use cached reference text
+        show_info("Using cached reference text...")
+        ref_text = _ref_audio_cache[audio_hash]
     else:
+        if not ref_text.strip():
+            global asr_pipe
+            if asr_pipe is None:
+                initialize_asr_pipeline(device=device)
+            show_info("No reference text provided, transcribing reference audio...")
+            ref_text = asr_pipe(
+                ref_audio,
+                chunk_length_s=30,
+                batch_size=128,
+                generate_kwargs={"task": "transcribe"},
+                return_timestamps=False,
+            )["text"].strip()
+            show_info("Finished transcription")
+        else:
+            show_info("Using custom reference text...")
+        # Cache the transcribed text
+        _ref_audio_cache[audio_hash] = ref_text
+    # Ensure ref_text ends with proper sentence-ending punctuation
     if not ref_text.endswith(". ") and not ref_text.endswith("。"):
         if ref_text.endswith("."):
             ref_text += " "

src/f5_tts/model/utils.py CHANGED Viewed

@@ -2,8 +2,8 @@ from __future__ import annotations
 import os
 import random
-from importlib.resources import files
 from collections import defaultdict
 import torch
 from torch.nn.utils.rnn import pad_sequence
@@ -109,7 +109,7 @@ def get_tokenizer(dataset_name, tokenizer: str = "pinyin"):
                 - if use "byte", set to 256 (unicode byte range)
     """
     if tokenizer in ["pinyin", "char"]:
-        tokenizer_path = os.path.join(files("f5_tts").joinpath("data"), f"{dataset_name}_{tokenizer}/vocab.txt")
         with open(tokenizer_path, "r", encoding="utf-8") as f:
             vocab_char_map = {}
             for i, char in enumerate(f):
@@ -120,6 +120,7 @@ def get_tokenizer(dataset_name, tokenizer: str = "pinyin"):
     elif tokenizer == "byte":
         vocab_char_map = None
         vocab_size = 256
     elif tokenizer == "custom":
         with open(dataset_name, "r", encoding="utf-8") as f:
             vocab_char_map = {}

 import os
 import random
 from collections import defaultdict
+from importlib.resources import files
 import torch
 from torch.nn.utils.rnn import pad_sequence
                 - if use "byte", set to 256 (unicode byte range)
     """
     if tokenizer in ["pinyin", "char"]:
+        tokenizer_path = os.path.join(files("f5_tts").joinpath("../../data"), f"{dataset_name}_{tokenizer}/vocab.txt")
         with open(tokenizer_path, "r", encoding="utf-8") as f:
             vocab_char_map = {}
             for i, char in enumerate(f):
     elif tokenizer == "byte":
         vocab_char_map = None
         vocab_size = 256
     elif tokenizer == "custom":
         with open(dataset_name, "r", encoding="utf-8") as f:
             vocab_char_map = {}