SWivid committed
Commit 83fbd34 · Parent(s): 68b4ce0

convert all input audio to mono
gradio_app.py CHANGED

@@ -119,6 +119,8 @@ def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence):
     else:
         gr.Info("Using custom reference text...")
     audio, sr = torchaudio.load(ref_audio)
+    if audio.shape[0] > 1:
+        audio = torch.mean(audio, dim=0, keepdim=True)
 
     rms = torch.sqrt(torch.mean(torch.square(audio)))
     if rms < target_rms:
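
For context: torchaudio.load returns a tensor shaped (channels, num_samples), so a stereo or multi-channel reference clip would otherwise flow into downstream code that assumes a single channel. A minimal standalone sketch of the downmix this commit applies (the file path is a placeholder):

import torch
import torchaudio

# torchaudio.load returns (waveform, sample_rate);
# waveform is shaped (channels, num_samples).
audio, sr = torchaudio.load("ref.wav")  # placeholder path

# Average the channels to mono, keeping the channel dim
# so the result stays (1, num_samples).
if audio.shape[0] > 1:
    audio = torch.mean(audio, dim=0, keepdim=True)

print(audio.shape)  # torch.Size([1, num_samples])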
model/utils.py CHANGED

@@ -134,7 +134,7 @@ def get_tokenizer(dataset_name, tokenizer: str = "pinyin"):
     - if use "byte", set to 256 (unicode byte range)
     '''
     if tokenizer in ["pinyin", "char"]:
-        with open (f"data/{dataset_name}_{tokenizer}/vocab.txt", "r") as f:
+        with open (f"data/{dataset_name}_{tokenizer}/vocab.txt", "r", encoding="utf-8") as f:
             vocab_char_map = {}
             for i, char in enumerate(f):
                 vocab_char_map[char[:-1]] = i
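
Without an explicit encoding, Python's open() falls back to the platform locale encoding (e.g. cp1252 on Windows), which can raise UnicodeDecodeError or silently mis-read a vocab file containing non-ASCII pinyin/char tokens. A sketch of the fixed read, with example values for dataset_name and tokenizer (the real values come from the caller):

dataset_name, tokenizer = "Emilia_ZH_EN", "pinyin"  # example values

# Build a token -> index map from a one-token-per-line vocab file.
# encoding="utf-8" makes the read deterministic across platforms.
vocab_char_map = {}
with open(f"data/{dataset_name}_{tokenizer}/vocab.txt", "r", encoding="utf-8") as f:
    for i, char in enumerate(f):
        vocab_char_map[char[:-1]] = i  # char[:-1] strips the trailing "\n"

vocab_size = len(vocab_char_map)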
test_infer_single.py CHANGED

@@ -105,6 +105,8 @@ model = load_checkpoint(model, ckpt_path, device, use_ema = use_ema)
 
 # Audio
 audio, sr = torchaudio.load(ref_audio)
+if audio.shape[0] > 1:
+    audio = torch.mean(audio, dim=0, keepdim=True)
 rms = torch.sqrt(torch.mean(torch.square(audio)))
 if rms < target_rms:
     audio = audio * target_rms / rms
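
The context lines around the change normalize quiet references up to a target RMS before inference; the downmix must happen first so the RMS is computed on the mono signal. A self-contained sketch of that normalization step, assuming target_rms = 0.1 as a stand-in value (the actual constant is defined elsewhere in the script):

import torch

target_rms = 0.1  # assumed value; defined elsewhere in the script

# Stand-in for a quiet mono clip, shaped (1, num_samples)
# after the downmix above.
audio = torch.randn(1, 24000) * 0.01

rms = torch.sqrt(torch.mean(torch.square(audio)))
if rms < target_rms:
    # Scale quiet audio up to the target loudness;
    # clips already at or above target_rms are left untouched.
    audio = audio * target_rms / rms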
test_infer_single_edit.py CHANGED

@@ -116,6 +116,8 @@ model = load_checkpoint(model, ckpt_path, device, use_ema = use_ema)
 
 # Audio
 audio, sr = torchaudio.load(audio_to_edit)
+if audio.shape[0] > 1:
+    audio = torch.mean(audio, dim=0, keepdim=True)
 rms = torch.sqrt(torch.mean(torch.square(audio)))
 if rms < target_rms:
     audio = audio * target_rms / rms