Spaces:
Running
Running
File size: 1,892 Bytes
2216a22 ce904ba 2216a22 f96e2ca ce904ba 2216a22 ce904ba 2216a22 ce904ba 2216a22 92b8a7c ce904ba 92b8a7c ce904ba 92b8a7c ce904ba 92b8a7c ce904ba 2216a22 ce904ba 2216a22 f96e2ca 2216a22 738a30d 2216a22 b5cf8a0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
from score import load_model
from predict import loadWav
import torch
import torch.nn.functional as F
import gradio as gr
import time
# Load the model once at import time so every call to calc_voxsim reuses it.
# "wavlm_ecapa.model" is presumably a checkpoint file next to this script --
# confirm against score.load_model.
model = load_model("wavlm_ecapa.model")
# Presumably switches a torch.nn.Module to evaluation mode (disabling
# dropout / batch-norm updates) -- confirm what load_model returns.
model.eval()
def calc_voxsim(inp_path, ref_path):
    """Score how similar the voice in one recording is to a reference.

    Each recording is handed to ``loadWav``, which yields two arrays. Every
    array is converted to a float tensor, embedded with the module-level
    ``model`` and L2-normalised along dim 1. Each input embedding is compared
    with its reference counterpart by taking the mean of all pairwise dot
    products, and the two means are averaged into the final score.
    Cumulative elapsed time since entry is printed after every stage.

    Args:
        inp_path: Input recording, in whatever form ``loadWav`` accepts
            (presumably a .wav file path -- confirm against ``predict``).
        ref_path: Reference recording, same form as ``inp_path``.

    Returns:
        The similarity score as a 0-d NumPy array on the CPU.
    """
    began = time.time()

    def report(label):
        # Timings are cumulative from function entry, not per stage.
        print(label, time.time() - began)

    def embed(batch):
        # Unit-norm rows turn the dot products below into cosine similarities.
        return F.normalize(model.forward(batch), p=2, dim=1)

    inp_a, inp_b = loadWav(inp_path)
    ref_a, ref_b = loadWav(ref_path)
    report("loadWav time: ")

    # Convert in the same order as the arrays were produced.
    inp_a_t, inp_b_t, ref_a_t, ref_b_t = [
        torch.FloatTensor(wave) for wave in (inp_a, inp_b, ref_a, ref_b)
    ]
    report("torch.FloatTensor time: ")

    # Inference only: no autograd bookkeeping is needed.
    with torch.no_grad():
        inp_emb_a = embed(inp_a_t)
        report("input_emb_1 time: ")
        inp_emb_b = embed(inp_b_t)
        report("input_emb_2 time: ")
        ref_emb_a = embed(ref_a_t)
        report("ref_emb_1 time: ")
        ref_emb_b = embed(ref_b_t)
        report("ref_emb_2 time: ")

        # Mean over every (input row, reference row) pair of each kind.
        sim_a = (inp_emb_a @ ref_emb_a.T).mean()
        sim_b = (inp_emb_b @ ref_emb_b.T).mean()
        combined = (sim_a + sim_b) / 2
        report("score time: ")

    return combined.detach().cpu().numpy()
# Blurb passed to gr.Interface as the page description; it contains a
# Markdown link to the paper.
description = """
Voice similarity demo using wavlm-ecapa model, which is trained on Voxsim dataset.
This demo only accepts .wav format. Best at 16 kHz sampling rate.
Paper is available [here](https://arxiv.org/abs/2407.18505)
"""

# The two audio components map positionally onto
# calc_voxsim(inp_path, ref_path); the returned score is shown as text.
# `inputs` is given as a list, the form documented for gr.Interface.
# NOTE(review): gr.Audio defaults to type="numpy", which passes a
# (sample_rate, ndarray) tuple to the callback, while calc_voxsim's
# parameters are named *_path. Confirm what predict.loadWav expects and add
# type="filepath" here if it needs a file path.
iface = gr.Interface(
    fn=calc_voxsim,
    inputs=[
        gr.Audio(label="Input Audio"),
        gr.Audio(label="Reference Audio"),
    ],
    outputs="text",
    title="voice similarity with VoxSim",
    description=description,
)

# Launch as a separate statement so `iface` stays bound to the Interface
# object rather than to whatever launch() returns.
iface.launch()