convert all input audio to mono
Files changed:
- gradio_app.py (+2 -0)
- model/utils.py (+1 -1)
- test_infer_single.py (+2 -0)
- test_infer_single_edit.py (+2 -0)
gradio_app.py

```diff
@@ -119,6 +119,8 @@ def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence):
     else:
         gr.Info("Using custom reference text...")
     audio, sr = torchaudio.load(ref_audio)
+    if audio.shape[0] > 1:
+        audio = torch.mean(audio, dim=0, keepdim=True)

     rms = torch.sqrt(torch.mean(torch.square(audio)))
     if rms < target_rms:
```
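For context, here is a minimal standalone sketch of the pre-processing these two added lines feed into. The audio path is hypothetical and the `target_rms` value is illustrative; the repo defines its own threshold:

```python
import torch
import torchaudio

target_rms = 0.1  # illustrative threshold; the repo sets its own value

# Hypothetical reference clip; torchaudio.load returns (channels, samples)
audio, sr = torchaudio.load("some_ref.wav")

# Down-mix to mono: average across the channel dimension, keeping
# shape (1, samples) so downstream code still sees a channel axis.
if audio.shape[0] > 1:
    audio = torch.mean(audio, dim=0, keepdim=True)

# RMS-normalize quiet clips up to the target level.
rms = torch.sqrt(torch.mean(torch.square(audio)))
if rms < target_rms:
    audio = audio * target_rms / rms
```

Without the down-mix, a stereo input would carry two channels into the RMS normalization and the model's mel extraction, which expect mono.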
model/utils.py

```diff
@@ -134,7 +134,7 @@ def get_tokenizer(dataset_name, tokenizer: str = "pinyin"):
                 - if use "byte", set to 256 (unicode byte range)
     '''
     if tokenizer in ["pinyin", "char"]:
-        with open (f"data/{dataset_name}_{tokenizer}/vocab.txt", "r") as f:
+        with open (f"data/{dataset_name}_{tokenizer}/vocab.txt", "r", encoding="utf-8") as f:
             vocab_char_map = {}
             for i, char in enumerate(f):
                 vocab_char_map[char[:-1]] = i
```
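The encoding fix matters because `open()` without an explicit `encoding` falls back to the platform's locale default (e.g. cp1252 on Windows), so a vocab file containing non-ASCII pinyin or CJK entries can raise `UnicodeDecodeError` or be read incorrectly. A minimal sketch of the loading pattern with the fix applied; `"vocab.txt"` is a stand-in path, the real one is built from `dataset_name` and `tokenizer`:

```python
# Build a char -> index map from a newline-separated vocab file.
vocab_char_map = {}
with open("vocab.txt", "r", encoding="utf-8") as f:  # explicit UTF-8, not locale default
    for i, char in enumerate(f):
        vocab_char_map[char[:-1]] = i  # strip the trailing newline, keep the line index

vocab_size = len(vocab_char_map)
```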
test_infer_single.py

```diff
@@ -105,6 +105,8 @@ model = load_checkpoint(model, ckpt_path, device, use_ema = use_ema)

 # Audio
 audio, sr = torchaudio.load(ref_audio)
+if audio.shape[0] > 1:
+    audio = torch.mean(audio, dim=0, keepdim=True)
 rms = torch.sqrt(torch.mean(torch.square(audio)))
 if rms < target_rms:
     audio = audio * target_rms / rms
```
test_infer_single_edit.py

```diff
@@ -116,6 +116,8 @@ model = load_checkpoint(model, ckpt_path, device, use_ema = use_ema)

 # Audio
 audio, sr = torchaudio.load(audio_to_edit)
+if audio.shape[0] > 1:
+    audio = torch.mean(audio, dim=0, keepdim=True)
 rms = torch.sqrt(torch.mean(torch.square(audio)))
 if rms < target_rms:
     audio = audio * target_rms / rms
```
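Both test scripts receive the same two-line down-mix as gradio_app.py. A quick standalone sanity check of the behavior (assumes only `torch`; the shapes are illustrative):

```python
import torch

stereo = torch.randn(2, 16000)  # fake 1-second stereo clip at 16 kHz
mono = torch.mean(stereo, dim=0, keepdim=True)

assert mono.shape == (1, 16000)  # channel dim collapsed but kept
assert torch.allclose(mono[0], (stereo[0] + stereo[1]) / 2)  # true channel average
```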