Spaces:

junseok520
/

VoxSIM

Running

VoxSIM / app.py

junseok

new commit

ce904ba 6 months ago

1.89 kB

	from score import load_model
	from predict import loadWav
	import torch
	import torch.nn.functional as F
	import gradio as gr
	import time

	# Checkpoint file handed to load_model(); resolved relative to the working
	# directory the app is started from.
	_CHECKPOINT_PATH = "wavlm_ecapa.model"

	# Load the model once at start-up so every request reuses the same instance.
	model = load_model(_CHECKPOINT_PATH)
	# Switch to evaluation mode (presumably a torch.nn.Module, so this would
	# disable dropout / batch-norm updates -- confirm in score.load_model).
	model.eval()

	def _timed_embedding(wav, label, start):
	    """Embed ``wav`` with the global model and L2-normalise the result.

	    Prints the time elapsed since ``start`` (a ``time.perf_counter()``
	    reading) tagged with ``label``, then returns the embedding normalised
	    along dim 1.
	    """
	    emb = F.normalize(model.forward(wav), p=2, dim=1)
	    print(f"{label} time: ", time.perf_counter() - start)
	    return emb


	def calc_voxsim(inp_path, ref_path):
	    """Score the similarity between an input and a reference recording.

	    Both arguments are forwarded unchanged to ``loadWav``, which returns two
	    waveform variants per recording (presumably a stack of segments and the
	    full utterance -- confirm in predict.loadWav). Each variant is embedded
	    with the global ``model`` and L2-normalised; the score is the mean dot
	    product between input and reference embeddings, averaged over the two
	    variants.

	    Args:
	        inp_path: The input audio as delivered by the Gradio "Input Audio"
	            component.
	        ref_path: The reference audio as delivered by the Gradio
	            "Reference Audio" component.

	    Returns:
	        The similarity score as a NumPy value (``Tensor.numpy()`` of a
	        scalar tensor).

	    NOTE(review): ``gr.Audio`` defaults to ``type="numpy"``, i.e. it passes a
	    ``(sample_rate, data)`` tuple rather than a file path -- confirm that
	    ``loadWav`` expects that form despite the ``*_path`` parameter names.
	    """
	    # perf_counter() is monotonic, so the cumulative timings printed below
	    # cannot be skewed by wall-clock adjustments.
	    start = time.perf_counter()
	    inp_wavs, inp_wav = loadWav(inp_path)
	    ref_wavs, ref_wav = loadWav(ref_path)
	    print("loadWav time: ", time.perf_counter() - start)

	    inp_wavs = torch.FloatTensor(inp_wavs)
	    inp_wav = torch.FloatTensor(inp_wav)
	    ref_wavs = torch.FloatTensor(ref_wavs)
	    ref_wav = torch.FloatTensor(ref_wav)
	    print("torch.FloatTensor time: ", time.perf_counter() - start)

	    # Inference only: no autograd graph is needed.
	    with torch.no_grad():
	        input_emb_1 = _timed_embedding(inp_wavs, "input_emb_1", start)
	        input_emb_2 = _timed_embedding(inp_wav, "input_emb_2", start)
	        ref_emb_1 = _timed_embedding(ref_wavs, "ref_emb_1", start)
	        ref_emb_2 = _timed_embedding(ref_wav, "ref_emb_2", start)

	        # Rows are unit-length, so each matmul entry is a cosine similarity;
	        # the mean collapses all input/reference pairs into one number.
	        score_1 = torch.mean(torch.matmul(input_emb_1, ref_emb_1.T))
	        score_2 = torch.mean(torch.matmul(input_emb_2, ref_emb_2.T))
	        score = (score_1 + score_2) / 2

	    print("score time: ", time.perf_counter() - start)
	    return score.detach().cpu().numpy()

	# Blurb passed as `description` to gr.Interface below; it contains a
	# Markdown-style link to the paper. The text is user-facing, so it is kept
	# exactly as written.
	description = """
	Voice similarity demo using wavlm-ecapa model, which is trained on Voxsim dataset.
	This demo only accepts .wav format. Best at 16 kHz sampling rate.

	Paper is available [here](https://arxiv.org/abs/2407.18505)
	"""

	# Web UI: two audio inputs (input and reference) are passed positionally to
	# calc_voxsim, and its return value is rendered as text.
	iface = gr.Interface(
	    fn=calc_voxsim,
	    # A list is the documented container for multiple input components.
	    inputs=[
	        gr.Audio(label="Input Audio"),
	        gr.Audio(label="Reference Audio"),
	    ],
	    outputs="text",
	    title="voice similarity with VoxSim",
	    description=description,
	)

	# Launch separately so `iface` stays bound to the Interface object rather
	# than to whatever launch() returns. The app still starts when this module
	# is executed, as before.
	iface.launch()