Spaces:
Runtime error
Runtime error
import os

# Must be set before the TTS package is imported below.
# NOTE(review): presumably this accepts the Coqui model licence non-interactively
# so the XTTS download does not block on a prompt — confirm against Coqui docs.
os.environ["COQUI_TOS_AGREED"] = "1"

import torch
from torch.serialization import add_safe_globals
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import XttsAudioConfig

# Allow-list the XTTS config classes for torch's restricted unpickler. This is
# done before TTS.api is imported and before any checkpoint is loaded.
# NOTE(review): needed when torch.load runs with weights_only=True — confirm the
# installed torch version makes that the default.
add_safe_globals([XttsConfig, XttsAudioConfig])

from TTS.api import TTS
from speechbrain.inference import SpeakerRecognition
from transformers import pipeline
import gradio as gr
import numpy as np
import soundfile as sf
from scipy.signal import resample
from scipy.io.wavfile import write as write_wav
from tempfile import NamedTemporaryFile
# Voice-cloning model (XTTS v2), loaded once at import time with gpu=False.
tts = TTS(
    model_name="tts_models/multilingual/multi-dataset/xtts_v2",
    progress_bar=False,
    gpu=False,
)

# SpeechBrain speaker-verification model; its files are stored under
# "tmp_model".
sb = SpeakerRecognition.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="tmp_model",
)

# Two audio-classification pipelines used as synthetic-speech detectors.
ast_pipe = pipeline(
    "audio-classification",
    model="MattyB95/AST-VoxCelebSpoof-Synthetic-Voice-Detection",
)
df_pipe = pipeline(
    "audio-classification",
    model="MelodyMachine/Deepfake-audio-detection-V2",
)
def _split_voice_sample(voice_sample):
    """Return ``(sample_rate, samples)`` from a two-element audio tuple.

    Gradio's ``type="numpy"`` audio input yields ``(sample_rate, data)``. The
    reversed ``(data, sample_rate)`` order is accepted as well so that direct
    callers using the older argument order keep working.
    """
    first, second = voice_sample
    if np.ndim(first) == 0:
        return int(first), np.asarray(second)
    return int(second), np.asarray(first)


def _to_unit_float(samples):
    """Convert samples to float32, scaling integer PCM into [-1.0, 1.0)."""
    if not np.issubdtype(samples.dtype, np.integer):
        return samples.astype(np.float32)
    limits = np.iinfo(samples.dtype)
    scaled = samples.astype(np.float64)
    if limits.min == 0:
        # Unsigned PCM is offset-binary: silence sits at the midpoint.
        half_range = (limits.max + 1) / 2.0
        scaled = (scaled - half_range) / half_range
    else:
        scaled = scaled / float(-limits.min)
    return scaled.astype(np.float32)


def _format_prediction(prediction):
    """Render one classifier result as ``"<label> (<score to 2 dp>)"``."""
    return f"{prediction['label']} ({prediction['score']:.2f})"


def spoof_and_detect(voice_sample, desired_sr=16000):
    """Clone the recorded voice and run the detectors on both recordings.

    Args:
        voice_sample: Two-element audio tuple, normally ``(sample_rate, data)``
            as produced by a Gradio ``type="numpy"`` audio input;
            ``(data, sample_rate)`` is also accepted. ``None`` means nothing
            was recorded.
        desired_sr: Sample rate in Hz that the reference audio is resampled to
            before it is written to disk.

    Returns:
        A ``(reference_wav_path, cloned_wav_path, results)`` tuple, where
        ``results`` maps each detector name to a display string.

    Raises:
        gr.Error: If no audio was supplied, the recording is empty, or the
            sample rate is not positive.
    """
    if voice_sample is None:
        raise gr.Error("No audio received. Please record a voice sample first.")

    ref_sr, ref_audio = _split_voice_sample(voice_sample)
    if ref_audio.size == 0 or ref_sr <= 0:
        raise gr.Error("The recording is empty. Please record a voice sample again.")

    # Integer PCM must be scaled before it is stored as a float WAV; a plain
    # cast would leave sample values far outside the [-1, 1] float range.
    ref_audio = _to_unit_float(ref_audio)

    # Resample along the time axis (axis 0) to the requested rate.
    if ref_sr != desired_sr:
        duration = ref_audio.shape[0] / ref_sr
        num_samples = max(1, int(duration * desired_sr))
        ref_audio = resample(ref_audio, num_samples)
        ref_sr = desired_sr

    # Reserve a unique path, then write after the handle is closed so the file
    # is never opened twice at once. Neither WAV is deleted here because both
    # paths are returned to the caller.
    with NamedTemporaryFile(suffix=".wav", mode="wb", delete=False) as ref_wav:
        ref_temp_path = ref_wav.name
    write_wav(ref_temp_path, ref_sr, ref_audio.astype("float32"))

    # Derive the clone path from the extension only, so a ".wav" elsewhere in
    # the temp directory path is left alone.
    stem, extension = os.path.splitext(ref_temp_path)
    clone_path = f"{stem}_clone{extension}"

    # Clone voice
    tts.tts_to_file(
        text="My voice is my password.",
        speaker_wav=ref_temp_path,
        file_path=clone_path,
        language="en",
    )

    # Spoof detection: only the verification decision is reported; each
    # classifier contributes its first returned prediction.
    _sb_score, sb_label = sb.verify_files(ref_temp_path, clone_path)
    ast_ref = ast_pipe(ref_temp_path)[0]
    ast_clone = ast_pipe(clone_path)[0]
    df_ref = df_pipe(ref_temp_path)[0]
    df_clone = df_pipe(clone_path)[0]

    results = {
        "SpeechBrain": str(sb_label.item()),
        "AST REF": _format_prediction(ast_ref),
        "AST CLONE": _format_prediction(ast_clone),
        "Deepfake REF": _format_prediction(df_ref),
        "Deepfake CLONE": _format_prediction(df_clone),
    }
    return ref_temp_path, clone_path, results
def _microphone_audio(label):
    """Build a microphone audio input that returns NumPy data.

    The ``source="microphone"`` keyword is tried first. If the installed
    Gradio rejects it with ``TypeError``, the list-valued ``sources`` keyword
    is used instead.
    NOTE(review): Gradio 4 renamed ``source`` to ``sources`` — confirm against
    the version pinned for this app.
    """
    try:
        return gr.Audio(source="microphone", type="numpy", label=label)
    except TypeError:
        return gr.Audio(sources=["microphone"], type="numpy", label=label)


# Web UI: one microphone input; outputs are the reference audio, the cloned
# audio and the detector results as JSON.
demo = gr.Interface(
    fn=spoof_and_detect,
    inputs=_microphone_audio("🎤 Record your voice"),
    outputs=[
        gr.Audio(label="🎧 Original"),
        gr.Audio(label="🎧 Cloned"),
        gr.JSON(label="🧪 Spoof Detection Results"),
    ],
    title="Voice Cloning + Spoof Detection",
    description="Clone a speaker's voice and evaluate with 3 spoof detection models.",
)
demo.launch()