Zvo

Runtime error

hynt committed on Jul 19

Commit

0ef75b8

verified ·

1 Parent(s): e11fe04

Update utils.py

Files changed (1) hide show

utils.py CHANGED Viewed

@@ -8,10 +8,41 @@ import re
 import torch
 import numpy as np
 import os
 _ref_audio_cache = {}
 asr_pipe = None
 def chunk_text(text, max_chars=135):
     # print(text)
@@ -129,9 +160,7 @@ def preprocess_ref_audio_text(ref_audio_orig, ref_text, clip_short=True, show_in
     show_info("Converting audio...")
-    ref_audio_orig_converted = ref_audio_orig.replace(".wav", "_24k.wav").replace(".mp3", "_24k.mp3").replace(".m4a", "_24k.m4a").replace(".flac", "_24k.flac")
-    os.system("sox " + ref_audio_orig + " -r 24000 -c 1 " + ref_audio_orig_converted)
     ref_audio_orig = ref_audio_orig_converted

 import torch
 import numpy as np
 import os
+from scipy.io import wavfile
+from scipy.signal import resample_poly
 _ref_audio_cache = {}
 asr_pipe = None
def resample_to_24khz(input_path: str, output_path: str):
    """
    Resample a WAV audio file to 24,000 Hz mono 16-bit PCM using scipy.

    The input is decoded with ``scipy.io.wavfile.read``, so it must be a
    WAV file; other containers (mp3, m4a, flac, ...) are not decoded here.

    Parameters:
    - input_path (str): Path to the input WAV file.
    - output_path (str): Path to save the output WAV file.
    """
    target_sr = 24000

    # Load WAV file
    orig_sr, audio = wavfile.read(input_path)

    # Normalise to float32 in roughly [-1, 1] based on the *stored* sample
    # format. This must happen before the mono downmix: averaging integer
    # channels yields float64, for which np.iinfo() would raise ValueError.
    sample_kind = audio.dtype.kind
    if sample_kind == "u":
        # Unsigned PCM (8-bit WAV) is centred on mid-scale, not on zero.
        midpoint = (np.iinfo(audio.dtype).max + 1) / 2
        audio = (audio.astype(np.float32) - midpoint) / midpoint
    elif sample_kind == "i":
        audio = audio.astype(np.float32) / np.iinfo(audio.dtype).max
    else:
        # Floating-point WAV data (float32 or float64) is already normalised.
        audio = audio.astype(np.float32)

    # Convert to mono if multi-channel (samples x channels layout)
    if audio.ndim == 2:
        audio = audio.mean(axis=1)

    # Resample
    resampled = resample_poly(audio, target_sr, orig_sr)

    # Convert back to int16 for saving. Round to nearest and clip first so
    # that filter overshoot or out-of-range float input saturates instead of
    # wrapping around in the integer cast.
    scaled = np.rint(resampled * 32767)
    resampled_int16 = np.clip(scaled, -32768, 32767).astype(np.int16)

    # Save output
    wavfile.write(output_path, target_sr, resampled_int16)
 def chunk_text(text, max_chars=135):
     # print(text)
     show_info("Converting audio...")
+    resample_to_24khz(ref_audio_orig, ref_audio_orig_converted)
     ref_audio_orig = ref_audio_orig_converted