# Demo / app.py
# Hugging Face page header (not part of the program): "Update app.py",
# author Emma123453, commit 1dede48 (verified), file size 2.67 kB.
from TTS.api import TTS
from speechbrain.pretrained import SpeakerRecognition
from transformers import pipeline
import gradio as gr
import numpy as np
import soundfile as sf
from scipy.signal import resample
from scipy.io.wavfile import write as write_wav
from tempfile import NamedTemporaryFile
import os
# Voice-cloning model (XTTS v2), created with the progress bar off and gpu=False.
tts = TTS(
    model_name="tts_models/multilingual/multi-dataset/xtts_v2",
    progress_bar=False,
    gpu=False,
)

# Spoof-detection models: one SpeechBrain speaker-verification model plus two
# Hugging Face audio-classification pipelines.
sb = SpeakerRecognition.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="tmp_model",
)
ast_pipe = pipeline(
    "audio-classification",
    model="MattyB95/AST-VoxCelebSpoof-Synthetic-Voice-Detection",
)
df_pipe = pipeline(
    "audio-classification",
    model="MelodyMachine/Deepfake-audio-detection-V2",
)
def _unpack_audio(voice_sample):
    """Split an audio tuple into ``(samples, sample_rate)``, whatever its order."""
    if voice_sample is None:
        raise ValueError("No audio received: record a voice sample first.")
    first, second = voice_sample
    # Gradio's type="numpy" delivers (sample_rate, samples); the legacy
    # (samples, sample_rate) order is still accepted for direct callers.
    if np.ndim(first) == 0:
        sample_rate, samples = first, second
    else:
        samples, sample_rate = first, second
    samples = np.asarray(samples)
    sample_rate = int(sample_rate)
    if samples.ndim == 0 or samples.shape[0] == 0:
        raise ValueError("The recorded audio is empty.")
    if sample_rate <= 0:
        raise ValueError(f"Invalid sample rate: {sample_rate}")
    return samples, sample_rate


def _to_float32(samples):
    """Return *samples* as float32, scaling integer PCM into [-1.0, 1.0]."""
    dtype = samples.dtype
    if np.issubdtype(dtype, np.signedinteger):
        # e.g. int16 -> divide by 32768 so a float WAV is not out of range.
        return samples.astype(np.float32) / float(-np.iinfo(dtype).min)
    if np.issubdtype(dtype, np.unsignedinteger):
        # Unsigned PCM is centred on the midpoint of its range.
        midpoint = (np.iinfo(dtype).max + 1) / 2
        return (samples.astype(np.float32) - midpoint) / midpoint
    return samples.astype(np.float32)


def spoof_and_detect(voice_sample, desired_sr=16000):
    """Clone the recorded voice and run the spoof detectors on both recordings.

    Args:
        voice_sample: Audio tuple from the Gradio microphone input. Both
            ``(sample_rate, samples)`` and ``(samples, sample_rate)`` are
            accepted; ``samples`` is a NumPy array with time on axis 0.
        desired_sr: Sample rate (Hz) the reference audio is resampled to
            before it is written to disk.

    Returns:
        A tuple ``(ref_path, clone_path, results)``: the path of the saved
        reference WAV, the path of the synthesized clone WAV, and a dict of
        human-readable detector outputs keyed by detector name.

    Raises:
        ValueError: If no audio was supplied, the audio is empty, or the
            sample rate is not positive.

    Note:
        Both WAV files are created with ``delete=False`` semantics and are
        not removed here because their paths are returned to the caller.
    """
    ref_audio_array, ref_sr = _unpack_audio(voice_sample)
    ref_audio_array = _to_float32(ref_audio_array)

    # Resample to the target rate (FFT-based, along the time axis).
    if ref_sr != desired_sr:
        num_samples = max(
            1, round(ref_audio_array.shape[0] * desired_sr / ref_sr)
        )
        ref_audio_array = resample(ref_audio_array, num_samples)
        ref_sr = desired_sr

    # Reserve a unique file name, then close the handle before writing so
    # the path can be reopened on every platform.
    with NamedTemporaryFile(suffix=".wav", mode="wb", delete=False) as ref_wav:
        ref_temp_path = ref_wav.name
    write_wav(ref_temp_path, ref_sr, ref_audio_array.astype(np.float32))

    # Derive the clone path from the extension only, so a ".wav" elsewhere
    # in the directory name is never rewritten.
    clone_path = os.path.splitext(ref_temp_path)[0] + "_clone.wav"
    tts.tts_to_file(
        text="My voice is my password.",
        speaker_wav=ref_temp_path,
        file_path=clone_path,
        language="en",
    )

    # Spoof detection: speaker verification between reference and clone,
    # then the top prediction of each classifier on each file.
    _, sb_label = sb.verify_files(ref_temp_path, clone_path)
    ast_ref = ast_pipe(ref_temp_path)[0]
    ast_clone = ast_pipe(clone_path)[0]
    df_ref = df_pipe(ref_temp_path)[0]
    df_clone = df_pipe(clone_path)[0]

    results = {
        "SpeechBrain": str(sb_label.item()),
        "AST REF": f"{ast_ref['label']} ({ast_ref['score']:.2f})",
        "AST CLONE": f"{ast_clone['label']} ({ast_clone['score']:.2f})",
        "Deepfake REF": f"{df_ref['label']} ({df_ref['score']:.2f})",
        "Deepfake CLONE": f"{df_clone['label']} ({df_clone['score']:.2f})",
    }
    return ref_temp_path, clone_path, results
# Gradio UI: one microphone input, wired to spoof_and_detect, with the
# reference audio, the cloned audio, and the detector results as outputs.
# NOTE(review): newer Gradio releases replace `source=` with `sources=[...]`;
# confirm against the Gradio version this app is pinned to.
demo = gr.Interface(
    fn=spoof_and_detect,
    inputs=gr.Audio(source="microphone", type="numpy", label="🎤 Record your voice"),
    outputs=[
        gr.Audio(label="🎧 Original"),
        gr.Audio(label="🎧 Cloned"),
        gr.JSON(label="🧪 Spoof Detection Results"),
    ],
    title="Voice Cloning + Spoof Detection",
    description="Clone a speaker's voice and evaluate with 3 spoof detection models.",
)

# Start the web server for the interface.
demo.launch()