Spaces:

junseok520
/

VoxSIM

Running

junseok committed on Mar 4

Commit

f96e2ca

1 Parent(s): f8273e8

new commit

Files changed (2) hide show

app.py CHANGED Viewed

@@ -7,7 +7,7 @@ import gradio as gr
 model = load_model("wavlm_ecapa.model")
 model.eval()
-def calc_spksim(inp_path, ref_path):
     inp_wavs, inp_wav = loadWav(inp_path)
     ref_wavs, ref_wav = loadWav(ref_path)
@@ -35,7 +35,7 @@ Paper is available [here](https://arxiv.org/abs/2407.18505)
 """
 iface = gr.Interface(
-    fn=calc_spksim,
     inputs=(
         gr.Audio(label="Input Audio"),
         gr.Audio(label="Reference Audio")

 model = load_model("wavlm_ecapa.model")
 model.eval()
+def calc_voxsim(inp_path, ref_path):
     inp_wavs, inp_wav = loadWav(inp_path)
     ref_wavs, ref_wav = loadWav(ref_path)
 """
 iface = gr.Interface(
+    fn=calc_voxsim,
     inputs=(
         gr.Audio(label="Input Audio"),
         gr.Audio(label="Reference Audio")

predict.py CHANGED Viewed

@@ -31,8 +31,11 @@ def loadWav(filename, max_frames: int = 400):
     max_audio = max_frames * 160 + 240
     # Read wav file and convert to torch tensor
-    print(type(filename))
-    audio, sr = librosa.load(filename, sr=16000)
     audio_org = audio.copy()
     audiosize = audio.shape[0]

     max_audio = max_frames * 160 + 240
     # Read wav file and convert to torch tensor
+    if type(filename) == tuple:
+        sr, audio = filename
+        audio = librosa.util.normalize(audio)
+    else:
+        audio, sr = librosa.load(filename, sr=16000)
     audio_org = audio.copy()
     audiosize = audio.shape[0]