# Spaces: Running
import time

import gradio as gr
import torch
import torch.nn.functional as F

from predict import loadWav
from score import load_model
# Load the model once at import time so every request reuses the same object.
# "wavlm_ecapa.model" is presumably a checkpoint file resolved relative to the
# working directory -- confirm against score.load_model.
model = load_model("wavlm_ecapa.model")
# Presumably a torch.nn.Module: eval() would then put it in inference mode.
model.eval()
def _unit_embedding(wav):
    """Return ``model.forward(wav)`` L2-normalised along dimension 1."""
    # NOTE(review): forward() is called directly, exactly as before; going
    # through model(...) would additionally run any registered hooks --
    # confirm before switching.
    return F.normalize(model.forward(wav), p=2, dim=1)


def calc_voxsim(inp_path, ref_path):
    """Score the voice similarity between an input and a reference recording.

    Each argument is handed to ``loadWav`` unchanged.  ``loadWav`` yields two
    array-likes per recording (presumably a batch of fixed-length segments
    and the full utterance -- confirm against ``predict.loadWav``).  Every
    array is embedded with the module-level ``model``, the embeddings are
    L2-normalised, and the mean of the pairwise dot products is taken
    separately for the first and the second outputs.  The final score is the
    average of those two means.

    Cumulative elapsed times (seconds since the start of the call) are
    printed after every stage for coarse profiling.

    Args:
        inp_path: Input audio, in whatever form ``loadWav`` accepts.
        ref_path: Reference audio, in the same form.

    Returns:
        The averaged similarity, as produced by
        ``score.detach().cpu().numpy()``.
    """
    # perf_counter() is monotonic, so the printed intervals cannot be skewed
    # by wall-clock adjustments the way time.time() differences can.
    start = time.perf_counter()

    def elapsed():
        return time.perf_counter() - start

    inp_wavs, inp_wav = loadWav(inp_path)
    ref_wavs, ref_wav = loadWav(ref_path)
    print("loadWav time: ", elapsed())

    inp_wavs = torch.FloatTensor(inp_wavs)
    inp_wav = torch.FloatTensor(inp_wav)
    ref_wavs = torch.FloatTensor(ref_wavs)
    ref_wav = torch.FloatTensor(ref_wav)
    print("torch.FloatTensor time: ", elapsed())

    # Inference only: no autograd graph is needed for any of this.
    with torch.no_grad():
        input_emb_1 = _unit_embedding(inp_wavs)
        print("input_emb_1 time: ", elapsed())
        input_emb_2 = _unit_embedding(inp_wav)
        print("input_emb_2 time: ", elapsed())
        ref_emb_1 = _unit_embedding(ref_wavs)
        print("ref_emb_1 time: ", elapsed())
        ref_emb_2 = _unit_embedding(ref_wav)
        print("ref_emb_2 time: ", elapsed())

        # Rows are unit-length, so (assuming 2-D embeddings) each entry of
        # the product is a cosine similarity between one input row and one
        # reference row.
        score_1 = torch.mean(torch.matmul(input_emb_1, ref_emb_1.T))
        score_2 = torch.mean(torch.matmul(input_emb_2, ref_emb_2.T))
        score = (score_1 + score_2) / 2
    print("score time: ", elapsed())

    return score.detach().cpu().numpy()
# Markdown blurb passed to gr.Interface as the demo's description.
description = """
Voice similarity demo using wavlm-ecapa model, which is trained on Voxsim dataset.
This demo only accepts .wav format. Best at 16 kHz sampling rate.
Paper is available [here](https://arxiv.org/abs/2407.18505)
"""
# Build the two-audio-input demo and start serving it immediately.
# Note: ``iface`` ends up bound to whatever ``launch()`` returns, not to the
# ``gr.Interface`` object itself.
# NOTE(review): neither gr.Audio sets ``type=``, so calc_voxsim receives
# Gradio's default audio representation -- confirm that loadWav accepts it.
iface = gr.Interface(
    fn=calc_voxsim,
    inputs=(
        gr.Audio(label="Input Audio"),
        gr.Audio(label="Reference Audio"),
    ),
    outputs="text",
    title="voice similarity with VoxSim",
    description=description,
).launch()