E2-F5-TTS

Runtime error

App Files Files Community

mrfakename committed on Nov 15, 2024

Commit

e2287e3

verified ·

1 Parent(s): cf0b618

Sync from GitHub repo

Browse files

This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the Space there.

Files changed (3) hide show

src/f5_tts/api.py +2 -2
src/f5_tts/infer/SHARED.md +3 -1
src/f5_tts/infer/utils_infer.py +23 -18

src/f5_tts/api.py CHANGED Viewed

@@ -49,10 +49,10 @@ class F5TTS:
         self.load_vocoder_model(vocoder_name, local_path=local_path)
         self.load_ema_model(model_type, ckpt_file, vocoder_name, vocab_file, ode_method, use_ema, local_path=local_path)
-    def load_vocoder_model(self, vocoder_name, local_path):
         self.vocoder = load_vocoder(vocoder_name, local_path is not None, local_path, self.device)
-    def load_ema_model(self, model_type, ckpt_file, mel_spec_type, vocab_file, ode_method, use_ema, local_path):
         if model_type == "F5-TTS":
             if not ckpt_file:
                 if mel_spec_type == "vocos":

         self.load_vocoder_model(vocoder_name, local_path=local_path)
         self.load_ema_model(model_type, ckpt_file, vocoder_name, vocab_file, ode_method, use_ema, local_path=local_path)
+    def load_vocoder_model(self, vocoder_name, local_path=None):
         self.vocoder = load_vocoder(vocoder_name, local_path is not None, local_path, self.device)
+    def load_ema_model(self, model_type, ckpt_file, mel_spec_type, vocab_file, ode_method, use_ema, local_path=None):
         if model_type == "F5-TTS":
             if not ckpt_file:
                 if mel_spec_type == "vocos":

src/f5_tts/infer/SHARED.md CHANGED Viewed

@@ -18,6 +18,8 @@
 - [Multilingual](#multilingual)
     - [F5-TTS Base @ pretrain @ zh \& en](#f5-tts-base--pretrain--zh--en)
 - [Mandarin](#mandarin)
 - [English](#english)
 - [French](#french)
     - [French LibriVox @ finetune @ fr](#french-librivox--finetune--fr)
@@ -67,6 +69,6 @@ MODEL_CKPT: hf://RASPIAUDIO/F5-French-MixedSpeakers-reduced/model_last_reduced.p
 VOCAB_FILE: hf://RASPIAUDIO/F5-French-MixedSpeakers-reduced/vocab.txt
 ```
-- Online Inference with [Hugging Face Space](https://huggingface.co/spaces/RASPIAUDIO/f5-tts_french).
 - [Tutorial video to train a new language model](https://www.youtube.com/watch?v=UO4usaOojys).
 - [Discussion about this training can be found here](https://github.com/SWivid/F5-TTS/issues/434).

 - [Multilingual](#multilingual)
     - [F5-TTS Base @ pretrain @ zh \& en](#f5-tts-base--pretrain--zh--en)
 - [Mandarin](#mandarin)
+- [Japanese](#japanese)
+    - [F5-TTS Base @ pretrain/finetune @ ja](#f5-tts-base--pretrainfinetune--ja)
 - [English](#english)
 - [French](#french)
     - [French LibriVox @ finetune @ fr](#french-librivox--finetune--fr)
 VOCAB_FILE: hf://RASPIAUDIO/F5-French-MixedSpeakers-reduced/vocab.txt
 ```
+- [Online Inference with Hugging Face Space](https://huggingface.co/spaces/RASPIAUDIO/f5-tts_french).
 - [Tutorial video to train a new language model](https://www.youtube.com/watch?v=UO4usaOojys).
 - [Discussion about this training can be found here](https://github.com/SWivid/F5-TTS/issues/434).

src/f5_tts/infer/utils_infer.py CHANGED Viewed

@@ -90,36 +90,41 @@ def chunk_text(text, max_chars=135):
 # load vocoder
-def load_vocoder(vocoder_name="vocos", is_local=False, local_path="", device=device):
     if vocoder_name == "vocos":
-        if is_local:
             print(f"Load vocos from local path {local_path}")
-            repo_id = "charactr/vocos-mel-24khz"
-            revision = None
-            config_path = hf_hub_download(
-                repo_id=repo_id, cache_dir=local_path, filename="config.yaml", revision=revision
-            )
-            model_path = hf_hub_download(
-                repo_id=repo_id, cache_dir=local_path, filename="pytorch_model.bin", revision=revision
-            )
-            vocoder = Vocos.from_hparams(config_path=config_path)
-            state_dict = torch.load(model_path, map_location="cpu")
-            vocoder.load_state_dict(state_dict)
-            vocoder = vocoder.eval().to(device)
         else:
             print("Download Vocos from huggingface charactr/vocos-mel-24khz")
-            vocoder = Vocos.from_pretrained("charactr/vocos-mel-24khz").to(device)
     elif vocoder_name == "bigvgan":
         try:
             from third_party.BigVGAN import bigvgan
         except ImportError:
             print("You need to follow the README to init submodule and change the BigVGAN source code.")
-        if is_local:
             """download from https://huggingface.co/nvidia/bigvgan_v2_24khz_100band_256x/tree/main"""
-            local_path = snapshot_download(repo_id="nvidia/bigvgan_v2_24khz_100band_256x", cache_dir=local_path)
             vocoder = bigvgan.BigVGAN.from_pretrained(local_path, use_cuda_kernel=False)
         else:
-            vocoder = bigvgan.BigVGAN.from_pretrained("nvidia/bigvgan_v2_24khz_100band_256x", use_cuda_kernel=False)
         vocoder.remove_weight_norm()
         vocoder = vocoder.eval().to(device)

 # load vocoder
+def load_vocoder(vocoder_name="vocos", is_local=False, local_path=None, device=device):
     if vocoder_name == "vocos":
+        # vocoder = Vocos.from_pretrained("charactr/vocos-mel-24khz").to(device)
+        if is_local and local_path is not None:
             print(f"Load vocos from local path {local_path}")
+            config_path = f"{local_path}/config.yaml"
+            model_path = f"{local_path}/pytorch_model.bin"
         else:
             print("Download Vocos from huggingface charactr/vocos-mel-24khz")
+            repo_id = "charactr/vocos-mel-24khz"
+            config_path = hf_hub_download(repo_id=repo_id, cache_dir=local_path, filename="config.yaml")
+            model_path = hf_hub_download(repo_id=repo_id, cache_dir=local_path, filename="pytorch_model.bin")
+        vocoder = Vocos.from_hparams(config_path)
+        state_dict = torch.load(model_path, map_location="cpu", weights_only=True)
+        from vocos.feature_extractors import EncodecFeatures
+        if isinstance(vocoder.feature_extractor, EncodecFeatures):
+            encodec_parameters = {
+                "feature_extractor.encodec." + key: value
+                for key, value in vocoder.feature_extractor.encodec.state_dict().items()
+            }
+            state_dict.update(encodec_parameters)
+        vocoder.load_state_dict(state_dict)
+        vocoder = vocoder.eval().to(device)
     elif vocoder_name == "bigvgan":
         try:
             from third_party.BigVGAN import bigvgan
         except ImportError:
             print("You need to follow the README to init submodule and change the BigVGAN source code.")
+        if is_local and local_path is not None:
             """download from https://huggingface.co/nvidia/bigvgan_v2_24khz_100band_256x/tree/main"""
             vocoder = bigvgan.BigVGAN.from_pretrained(local_path, use_cuda_kernel=False)
         else:
+            local_path = snapshot_download(repo_id="nvidia/bigvgan_v2_24khz_100band_256x", cache_dir=local_path)
+            vocoder = bigvgan.BigVGAN.from_pretrained(local_path, use_cuda_kernel=False)
         vocoder.remove_weight_norm()
         vocoder = vocoder.eval().to(device)