Spaces:

junseok520
/

VoxSIM

Running

File size: 1,892 Bytes

2216a22
 
 
 
 
ce904ba
2216a22
 
 
 
f96e2ca
ce904ba
2216a22
 
ce904ba
2216a22
 
 
 
 
ce904ba
2216a22
 
92b8a7c
ce904ba
92b8a7c
ce904ba
92b8a7c
ce904ba
92b8a7c
ce904ba
2216a22
 
 
 
ce904ba
2216a22
 
 
 
 
 
 
 
 
 
f96e2ca
2216a22
738a30d
 
2216a22
 
 
 
b5cf8a0

from score import load_model
from predict import loadWav
import torch
import torch.nn.functional as F
import gradio as gr
import time

# Load the model once at import time; calc_voxsim() reads this module-level
# object on every call instead of reloading it per request.
# NOTE(review): "wavlm_ecapa.model" is interpreted by load_model — presumably
# a checkpoint path relative to the working directory; confirm.
model = load_model("wavlm_ecapa.model")
# Switch to evaluation mode before serving requests.
# NOTE(review): presumably a torch.nn.Module (so this affects layers such as
# dropout / batch norm) — confirm against score.load_model.
model.eval()

def _embed(wav):
    """Return the model embedding of *wav*, L2-normalised along dim 1."""
    # Call the module itself rather than ``.forward`` so that any registered
    # hooks run (assumes ``model`` is a torch.nn.Module).
    return F.normalize(model(wav), p=2, dim=1)


def calc_voxsim(inp_path, ref_path):
    """Compute the similarity score between an input and a reference audio.

    Each audio is handed to ``loadWav``, which yields two array-likes per
    audio (presumably a batch of segments and the full utterance — confirm
    against ``predict.loadWav``).  Every array is embedded with the global
    ``model`` and L2-normalised, so the matrix product of two embedding sets
    holds pairwise cosine similarities.  The score is the average of the mean
    similarity of the first pair of arrays and that of the second pair.

    Cumulative elapsed time since the start of the call is printed after
    each stage for profiling.

    Args:
        inp_path: The input audio as delivered by the Gradio ``Audio``
            component; forwarded unchanged to ``loadWav``.
        ref_path: The reference audio, same form as ``inp_path``.

    Returns:
        A zero-dimensional NumPy array holding the similarity score.

    Raises:
        gr.Error: If either audio is missing (Gradio passes ``None`` for an
            empty audio slot).
    """
    # Fail with a readable UI message instead of an opaque error in loadWav.
    if inp_path is None or ref_path is None:
        raise gr.Error("Please provide both an input audio and a reference audio.")

    # perf_counter is monotonic, unlike time.time(), so intervals stay valid
    # even if the system clock is adjusted mid-request.
    start = time.perf_counter()
    inp_wavs, inp_wav = loadWav(inp_path)
    ref_wavs, ref_wav = loadWav(ref_path)
    print("loadWav time: ", time.perf_counter() - start)

    inp_wavs = torch.FloatTensor(inp_wavs)
    inp_wav = torch.FloatTensor(inp_wav)
    ref_wavs = torch.FloatTensor(ref_wavs)
    ref_wav = torch.FloatTensor(ref_wav)
    print("torch.FloatTensor time: ", time.perf_counter() - start)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        input_emb_1 = _embed(inp_wavs)
        print("input_emb_1 time: ", time.perf_counter() - start)
        input_emb_2 = _embed(inp_wav)
        print("input_emb_2 time: ", time.perf_counter() - start)
        ref_emb_1 = _embed(ref_wavs)
        print("ref_emb_1 time: ", time.perf_counter() - start)
        ref_emb_2 = _embed(ref_wav)
        print("ref_emb_2 time: ", time.perf_counter() - start)

        # Rows are unit-length, so each matmul entry is a cosine similarity;
        # the mean collapses all row pairs into one number.
        score_1 = torch.mean(torch.matmul(input_emb_1, ref_emb_1.T))
        score_2 = torch.mean(torch.matmul(input_emb_2, ref_emb_2.T))
        score = (score_1 + score_2) / 2
        print("score time: ", time.perf_counter() - start)
        return score.detach().cpu().numpy()

# Description text passed to gr.Interface below (contains a Markdown link).
description = """
Voice similarity demo using wavlm-ecapa model, which is trained on Voxsim dataset.
This demo only accepts .wav format. Best at 16 kHz sampling rate.

Paper is available [here](https://arxiv.org/abs/2407.18505)
"""

# Build the interface first so that ``iface`` names the Interface object
# itself; chaining ``.launch()`` onto the constructor would bind ``iface`` to
# launch()'s return value instead.
iface = gr.Interface(
    fn=calc_voxsim,
    # ``inputs`` is documented as a component or a list of components, so use
    # a list rather than a tuple.
    inputs=[
        gr.Audio(label="Input Audio"),
        gr.Audio(label="Reference Audio"),
    ],
    outputs="text",
    title="voice similarity with VoxSim",
    description=description,
)
# Start the web server (still executed at module load, as before).
iface.launch()