SWivid committed
Commit 83fbd34 · Parent(s): 68b4ce0

convert all input audio to mono
gradio_app.py CHANGED

@@ -119,6 +119,8 @@ def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence):
     else:
         gr.Info("Using custom reference text...")
     audio, sr = torchaudio.load(ref_audio)
+    if audio.shape[0] > 1:
+        audio = torch.mean(audio, dim=0, keepdim=True)
 
     rms = torch.sqrt(torch.mean(torch.square(audio)))
     if rms < target_rms:
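
For context: torchaudio.load returns a tensor shaped (channels, num_samples), so a stereo or multi-channel reference clip would otherwise flow into downstream code that assumes a single channel. A minimal standalone sketch of the downmix this commit applies (the file path is a placeholder):

import torch
import torchaudio

# torchaudio.load returns (waveform, sample_rate);
# waveform is shaped (channels, num_samples).
audio, sr = torchaudio.load("ref.wav")  # placeholder path

# Average the channels to mono, keeping the channel dim
# so the result stays (1, num_samples).
if audio.shape[0] > 1:
    audio = torch.mean(audio, dim=0, keepdim=True)

print(audio.shape)  # torch.Size([1, num_samples])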
model/utils.py CHANGED

@@ -134,7 +134,7 @@ def get_tokenizer(dataset_name, tokenizer: str = "pinyin"):
     - if use "byte", set to 256 (unicode byte range)
     '''
     if tokenizer in ["pinyin", "char"]:
-        with open (f"data/{dataset_name}_{tokenizer}/vocab.txt", "r") as f:
+        with open (f"data/{dataset_name}_{tokenizer}/vocab.txt", "r", encoding="utf-8") as f:
             vocab_char_map = {}
             for i, char in enumerate(f):
                 vocab_char_map[char[:-1]] = i
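
Without an explicit encoding, Python's open() falls back to the platform locale encoding (e.g. cp1252 on Windows), which can raise UnicodeDecodeError or silently mis-read a vocab file containing non-ASCII pinyin/char tokens. A sketch of the fixed read, with example values for dataset_name and tokenizer (the real values come from the caller):

dataset_name, tokenizer = "Emilia_ZH_EN", "pinyin"  # example values

# Build a token -> index map from a one-token-per-line vocab file.
# encoding="utf-8" makes the read deterministic across platforms.
vocab_char_map = {}
with open(f"data/{dataset_name}_{tokenizer}/vocab.txt", "r", encoding="utf-8") as f:
    for i, char in enumerate(f):
        vocab_char_map[char[:-1]] = i  # char[:-1] strips the trailing "\n"

vocab_size = len(vocab_char_map)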
test_infer_single.py CHANGED

@@ -105,6 +105,8 @@ model = load_checkpoint(model, ckpt_path, device, use_ema = use_ema)
 
 # Audio
 audio, sr = torchaudio.load(ref_audio)
+if audio.shape[0] > 1:
+    audio = torch.mean(audio, dim=0, keepdim=True)
 rms = torch.sqrt(torch.mean(torch.square(audio)))
 if rms < target_rms:
     audio = audio * target_rms / rms
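
The context lines around the change normalize quiet references up to a target RMS before inference; the downmix must happen first so the RMS is computed on the mono signal. A self-contained sketch of that normalization step, assuming target_rms = 0.1 as a stand-in value (the actual constant is defined elsewhere in the script):

import torch

target_rms = 0.1  # assumed value; defined elsewhere in the script

# Stand-in for a quiet mono clip, shaped (1, num_samples)
# after the downmix above.
audio = torch.randn(1, 24000) * 0.01

rms = torch.sqrt(torch.mean(torch.square(audio)))
if rms < target_rms:
    # Scale quiet audio up to the target loudness;
    # clips already at or above target_rms are left untouched.
    audio = audio * target_rms / rms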
test_infer_single_edit.py CHANGED

@@ -116,6 +116,8 @@ model = load_checkpoint(model, ckpt_path, device, use_ema = use_ema)
 
 # Audio
 audio, sr = torchaudio.load(audio_to_edit)
+if audio.shape[0] > 1:
+    audio = torch.mean(audio, dim=0, keepdim=True)
 rms = torch.sqrt(torch.mean(torch.square(audio)))
 if rms < target_rms:
     audio = audio * target_rms / rms