Spaces:
Runtime error
Runtime error
File size: 1,600 Bytes
7a9eea4 09b7c54 7a9eea4 66b4abd 7a9eea4 66b4abd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
import gradio as gr
import torch
import torchaudio
from transformers import AutoFeatureExtractor, AutoModelForAudioXVector
# Run inference on the GPU when one is present; otherwise stay on CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Cache for the (expensive) model + feature-extractor pair so they are
# built once per process instead of on every button press.
_MODEL_CACHE = {}


def _load_model():
    """Load and cache the WavLM x-vector model and its feature extractor.

    Returns:
        (model, feature_extractor) tuple; the model already lives on `device`.
    """
    if not _MODEL_CACHE:
        _MODEL_CACHE["fe"] = AutoFeatureExtractor.from_pretrained(
            "microsoft/wavlm-base-plus-sv"
        )
        _MODEL_CACHE["model"] = AutoModelForAudioXVector.from_pretrained(
            "microsoft/wavlm-base-plus-sv"
        ).to(device)
    return _MODEL_CACHE["model"], _MODEL_CACHE["fe"]


def similarity_fn(speaker1, speaker2, threshold=0.8):
    """Decide whether two recordings come from the same speaker.

    Args:
        speaker1: Filepath of the first recording (gr.Audio, type="filepath").
        speaker2: Filepath of the second recording.
        threshold: Cosine-similarity cutoff above which the speakers are
            reported as "similar" (default 0.8, the original hard-coded value).

    Returns:
        A gr.Textbox component whose value is a small HTML verdict string
        (green = similar, red = different or missing input).
    """
    if not (speaker1 and speaker2):
        return gr.Textbox(value='<b style="color:red">ERROR: Please record audio for *both* speakers!</b>')

    model, feature_extractor = _load_model()

    def _embed(path):
        """Return an L2-normalized x-vector embedding for one audio file."""
        wav, sample_rate = torchaudio.load(path)
        # The checkpoint expects 16 kHz mono input; the original code ignored
        # the file's real sample rate, corrupting embeddings for anything else.
        if sample_rate != 16000:
            wav = torchaudio.functional.resample(wav, sample_rate, 16000)
        # squeeze(0) drops the channel dim; assumes a mono recording — a
        # stereo file would keep shape (2, T). TODO confirm mic input is mono.
        inputs = feature_extractor(
            wav.squeeze(0), return_tensors="pt", sampling_rate=16000
        ).input_values.to(device)
        with torch.no_grad():
            emb = model(inputs).embeddings
        return torch.nn.functional.normalize(emb, dim=-1).cpu()

    emb1 = _embed(speaker1)
    emb2 = _embed(speaker2)
    similarity = torch.nn.CosineSimilarity(dim=-1)(emb1, emb2).numpy()[0]

    if similarity >= threshold:
        label = "The speakers are similar"
        color = "green"
    else:
        label = "The speakers are different"
        color = "red"
    return gr.Textbox(value=f"<span style='color:{color}'>{label}</span>")
# gr.Interface accepts `fn`, `inputs`, and `outputs` — the original passed
# invented keyword arguments (speaker1=/speaker2=/output=), which raises a
# TypeError at startup and crashes the Space before it ever serves a page.
demo = gr.Interface(
    fn=similarity_fn,
    inputs=[
        gr.Audio(source="microphone", type="filepath", label="Speaker #1"),
        gr.Audio(source="microphone", type="filepath", label="Speaker #2"),
    ],
    outputs=gr.Textbox(label="Verdict"),
)
demo.launch()
|