File size: 1,892 Bytes
2216a22
 
 
 
 
ce904ba
2216a22
 
 
 
f96e2ca
ce904ba
2216a22
 
ce904ba
2216a22
 
 
 
 
ce904ba
2216a22
 
92b8a7c
ce904ba
92b8a7c
ce904ba
92b8a7c
ce904ba
92b8a7c
ce904ba
2216a22
 
 
 
ce904ba
2216a22
 
 
 
 
 
 
 
 
 
f96e2ca
2216a22
738a30d
 
2216a22
 
 
 
b5cf8a0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
from score import load_model
from predict import loadWav
import torch
import torch.nn.functional as F
import gradio as gr
import time

# Load the pretrained checkpoint once at import time; calc_voxsim below reads
# this module-level instance on every call.
MODEL_CHECKPOINT = "wavlm_ecapa.model"
model = load_model(MODEL_CHECKPOINT)
# Switch to evaluation mode for inference.
# NOTE(review): assumes load_model returns a torch.nn.Module — confirm in score.py.
model.eval()

def calc_voxsim(inp_path, ref_path):
    """Score the voice similarity between an input and a reference recording.

    Both recordings are loaded with ``loadWav``, which yields two arrays per
    recording. Each array is embedded with the module-level ``model``, the
    embeddings are L2-normalised, and the mean pairwise dot product (cosine
    similarity) is computed separately for the first and the second array of
    each recording. The final score is the average of those two means.

    Cumulative timings for every stage are printed to stdout.

    Args:
        inp_path: Input audio as delivered by the Gradio ``Audio`` component;
            forwarded unchanged to ``loadWav``.
        ref_path: Reference audio, handled the same way as ``inp_path``.

    Returns:
        A zero-dimensional NumPy array holding the similarity score.

    Raises:
        gr.Error: If either audio input is missing (``None``).
    """
    # Gradio hands over None when an audio slot is left empty; fail with a
    # readable message instead of an opaque error inside loadWav.
    if inp_path is None or ref_path is None:
        raise gr.Error("Please provide both an input audio and a reference audio.")

    # perf_counter is monotonic, so the elapsed values cannot be distorted by
    # system clock adjustments. All printed times are cumulative since here.
    start = time.perf_counter()

    # NOTE(review): presumably the first value is a batch of fixed-length
    # segments and the second the full utterance — confirm in predict.py.
    inp_wavs, inp_wav = loadWav(inp_path)
    ref_wavs, ref_wav = loadWav(ref_path)
    print("loadWav time: ", time.perf_counter() - start)

    inp_wavs = torch.FloatTensor(inp_wavs)
    inp_wav = torch.FloatTensor(inp_wav)
    ref_wavs = torch.FloatTensor(ref_wavs)
    ref_wav = torch.FloatTensor(ref_wav)
    print("torch.FloatTensor time: ", time.perf_counter() - start)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        # Unit-length rows turn the matrix products below into cosine
        # similarities.
        input_emb_1 = F.normalize(model.forward(inp_wavs), p=2, dim=1)
        print("input_emb_1 time: ", time.perf_counter() - start)
        input_emb_2 = F.normalize(model.forward(inp_wav), p=2, dim=1)
        print("input_emb_2 time: ", time.perf_counter() - start)
        ref_emb_1 = F.normalize(model.forward(ref_wavs), p=2, dim=1)
        print("ref_emb_1 time: ", time.perf_counter() - start)
        ref_emb_2 = F.normalize(model.forward(ref_wav), p=2, dim=1)
        print("ref_emb_2 time: ", time.perf_counter() - start)

        # Mean over every (input row, reference row) pair, computed once per
        # array kind and then averaged.
        score_1 = torch.mean(torch.matmul(input_emb_1, ref_emb_1.T))
        score_2 = torch.mean(torch.matmul(input_emb_2, ref_emb_2.T))
        score = (score_1 + score_2) / 2
        print("score time: ", time.perf_counter() - start)
        return score.detach().cpu().numpy()

# Markdown blurb rendered under the demo title. Written as one string per
# output line so the leading/trailing newlines and the blank separator line
# are explicit.
description = (
    "\n"
    "Voice similarity demo using wavlm-ecapa model, which is trained on Voxsim dataset.\n"
    "This demo only accepts .wav format. Best at 16 kHz sampling rate.\n"
    "\n"
    "Paper is available [here](https://arxiv.org/abs/2407.18505)\n"
)

# Build the web UI: two audio inputs feed calc_voxsim and the resulting score
# is shown as text.
# NOTE(review): gr.Audio defaults to type="numpy", which passes a
# (sample_rate, ndarray) tuple to the handler, while calc_voxsim's parameters
# are named *_path. If loadWav expects a file path, add type="filepath" to
# both components — confirm against predict.py.
iface = gr.Interface(
    fn=calc_voxsim,
    # A list is the documented container for multiple input components.
    inputs=[
        gr.Audio(label="Input Audio"),
        gr.Audio(label="Reference Audio"),
    ],
    outputs="text",
    title="voice similarity with VoxSim",
    description=description,
)

# Launch separately so `iface` stays bound to the Interface object rather than
# to launch()'s return value.
iface.launch()