VoxSIM / app.py
junseok
new commit
ce904ba
raw
history blame
1.89 kB
from score import load_model
from predict import loadWav
import torch
import torch.nn.functional as F
import gradio as gr
import time
# Checkpoint file handed to load_model.
MODEL_CHECKPOINT = "wavlm_ecapa.model"

# Load the model once at import time; calc_voxsim reads this module-level
# object on every call instead of reloading the checkpoint.
model = load_model(MODEL_CHECKPOINT)
# Presumably a torch.nn.Module, in which case eval() puts it in inference
# mode -- confirm against score.load_model.
model.eval()
def calc_voxsim(inp_path, ref_path):
    """Score how similar an input recording is to a reference recording.

    Each argument is handed unchanged to ``loadWav``, which returns two
    arrays per recording (presumably a batch of segments and the whole
    utterance -- confirm against ``predict.loadWav``). All four arrays are
    converted to float tensors and embedded with the module-level ``model``;
    every embedding is L2-normalised along ``dim=1``. The mean of the dot
    products between the first input/reference embeddings and the mean for
    the second pair are then averaged into one score.

    The elapsed wall-clock time since entry is printed after every stage.

    Args:
        inp_path: Input audio, in whatever form ``loadWav`` accepts.
        ref_path: Reference audio, in the same form as ``inp_path``.

    Returns:
        The averaged score, converted from a detached CPU tensor to NumPy.
    """
    started = time.time()

    inp_wavs, inp_wav = loadWav(inp_path)
    ref_wavs, ref_wav = loadWav(ref_path)
    print("loadWav time: ", time.time() - started)

    # Same conversion order as before: input pair first, then reference pair.
    inp_wavs, inp_wav, ref_wavs, ref_wav = (
        torch.FloatTensor(samples)
        for samples in (inp_wavs, inp_wav, ref_wavs, ref_wav)
    )
    print("torch.FloatTensor time: ", time.time() - started)

    # (progress message, tensor to embed), in the order they are processed.
    stages = (
        ("input_emb_1 time: ", inp_wavs),
        ("input_emb_2 time: ", inp_wav),
        ("ref_emb_1 time: ", ref_wavs),
        ("ref_emb_2 time: ", ref_wav),
    )

    with torch.no_grad():
        embeddings = []
        for message, batch in stages:
            embeddings.append(F.normalize(model.forward(batch), p=2, dim=1))
            print(message, time.time() - started)
        inp_emb_1, inp_emb_2, ref_emb_1, ref_emb_2 = embeddings

        # Mean over every input-row / reference-row dot product of the
        # normalised embeddings, once per pair.
        sim_1 = (inp_emb_1 @ ref_emb_1.T).mean()
        sim_2 = (inp_emb_2 @ ref_emb_2.T).mean()
        score = (sim_1 + sim_2) / 2

    print("score time: ", time.time() - started)
    return score.detach().cpu().numpy()
# Markdown shown under the title of the demo page.
description = """
Voice similarity demo using wavlm-ecapa model, which is trained on Voxsim dataset.
This demo only accepts .wav format. Best at 16 kHz sampling rate.
Paper is available [here](https://arxiv.org/abs/2407.18505)
"""

# Build the UI first so that ``iface`` names the Interface itself; chaining
# ``.launch()`` onto the constructor would bind ``iface`` to launch()'s return
# value instead.
# NOTE(review): gr.Audio defaults to type="numpy", which hands the callback a
# (sample_rate, data) tuple, while calc_voxsim's parameters are named *_path.
# Confirm that loadWav accepts that tuple; if it expects a file path, pass
# type="filepath" to both components.
iface = gr.Interface(
    fn=calc_voxsim,
    # A list is the documented form for passing several input components.
    inputs=[
        gr.Audio(label="Input Audio"),
        gr.Audio(label="Reference Audio"),
    ],
    outputs="text",
    title="voice similarity with VoxSim",
    description=description,
)
iface.launch()