"""Gradio demo: clone a recorded voice with XTTS and run spoof detectors on it.

The app records a short voice sample, synthesises a fixed passphrase in the
cloned voice, and then reports what a speaker-verification model and two
synthetic-speech classifiers say about the original and the clone.
"""

import os

# Set before any Coqui TTS code runs.
# NOTE(review): presumably this pre-accepts the Coqui licence prompt so the
# model download does not block on stdin -- confirm against the TTS docs.
os.environ["COQUI_TOS_AGREED"] = "1"

import torch
from torch.serialization import add_safe_globals
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import XttsAudioConfig

# Allow-list the XTTS config classes for torch's safe (weights-only)
# unpickling before the checkpoint is loaded below.
add_safe_globals([XttsConfig, XttsAudioConfig])

from TTS.api import TTS
from speechbrain.inference import SpeakerRecognition
from transformers import pipeline
import gradio as gr
import numpy as np
import soundfile as sf
from scipy.signal import resample
from scipy.io.wavfile import write as write_wav
from tempfile import NamedTemporaryFile

# Load voice cloning model (XTTS)
tts = TTS(
    model_name="tts_models/multilingual/multi-dataset/xtts_v2",
    progress_bar=False,
    gpu=False,
)

# Load spoof detection models
sb = SpeakerRecognition.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="tmp_model",
)
ast_pipe = pipeline(
    "audio-classification",
    model="MattyB95/AST-VoxCelebSpoof-Synthetic-Voice-Detection",
)
df_pipe = pipeline(
    "audio-classification",
    model="MelodyMachine/Deepfake-audio-detection-V2",
)


def _prepare_reference_audio(voice_sample, desired_sr):
    """Return ``(samples, sample_rate)`` as float32 audio at ``desired_sr``.

    Accepts the tuple in either order: Gradio's ``type="numpy"`` audio
    component supplies ``(sample_rate, data)``, while the previous version of
    this script assumed ``(data, sample_rate)``.

    Raises:
        gr.Error: If no recording was supplied or the recording is empty.
    """
    if voice_sample is None:
        raise gr.Error("Please record a voice sample first.")

    first, second = voice_sample
    if isinstance(first, np.ndarray):
        samples, sample_rate = first, second
    else:
        sample_rate, samples = first, second

    samples = np.asarray(samples)
    sample_rate = int(sample_rate)
    if samples.size == 0:
        raise gr.Error("The recording is empty. Please record again.")

    # Integer PCM must be scaled into [-1, 1) before it is written as a
    # float32 WAV; a bare dtype cast would leave values in the +/-32768 range.
    if np.issubdtype(samples.dtype, np.signedinteger):
        full_scale = float(np.iinfo(samples.dtype).max) + 1.0
        samples = samples.astype(np.float32) / full_scale

    # Resample to the target rate (16 kHz by default).
    if sample_rate != desired_sr:
        duration = samples.shape[0] / sample_rate
        num_samples = int(duration * desired_sr)
        samples = resample(samples, num_samples)
        sample_rate = desired_sr

    return samples.astype("float32"), sample_rate


def spoof_and_detect(voice_sample, desired_sr=16000):
    """Clone the recorded voice and score the original and the clone.

    Args:
        voice_sample: Audio from ``gr.Audio(type="numpy")``, i.e. a
            ``(sample_rate, data)`` tuple. ``(data, sample_rate)`` is also
            accepted.
        desired_sr: Sample rate, in Hz, that the reference audio is resampled
            to before it is written to disk.

    Returns:
        A ``(reference_wav_path, clone_wav_path, results)`` tuple, where
        ``results`` maps a detector name to a display string.

    Raises:
        gr.Error: If no recording was supplied or the recording is empty.
    """
    ref_audio_array, ref_sr = _prepare_reference_audio(voice_sample, desired_sr)

    # Save reference audio. The temp file is only used to reserve a unique
    # path; it is closed before scipy reopens that path for writing. The file
    # is deliberately kept (delete=False) because its path is returned.
    with NamedTemporaryFile(suffix=".wav", mode="wb", delete=False) as ref_wav:
        ref_temp_path = ref_wav.name
    write_wav(ref_temp_path, ref_sr, ref_audio_array)

    # Clone voice. Only the extension is touched, so a ".wav" occurring
    # elsewhere in the temp directory path is left alone.
    clone_path = os.path.splitext(ref_temp_path)[0] + "_clone.wav"
    tts.tts_to_file(
        text="My voice is my password.",
        speaker_wav=ref_temp_path,
        file_path=clone_path,
        language="en",
    )

    # Spoof detection. Each classifier pipeline returns a list of candidates;
    # only the first entry is reported.
    _sb_score, sb_label = sb.verify_files(ref_temp_path, clone_path)
    ast_ref = ast_pipe(ref_temp_path)[0]
    ast_clone = ast_pipe(clone_path)[0]
    df_ref = df_pipe(ref_temp_path)[0]
    df_clone = df_pipe(clone_path)[0]

    results = {
        "SpeechBrain": str(sb_label.item()),
        "AST REF": f"{ast_ref['label']} ({ast_ref['score']:.2f})",
        "AST CLONE": f"{ast_clone['label']} ({ast_clone['score']:.2f})",
        "Deepfake REF": f"{df_ref['label']} ({df_ref['score']:.2f})",
        "Deepfake CLONE": f"{df_clone['label']} ({df_clone['score']:.2f})",
    }
    return ref_temp_path, clone_path, results


# NOTE(review): `source="microphone"` is the Gradio 3.x spelling; Gradio 4+
# expects `sources=["microphone"]`. Left unchanged because the installed
# Gradio version is not visible from this file -- confirm before upgrading.
demo = gr.Interface(
    fn=spoof_and_detect,
    inputs=gr.Audio(source="microphone", type="numpy", label="🎤 Record your voice"),
    outputs=[
        gr.Audio(label="🎧 Original"),
        gr.Audio(label="🎧 Cloned"),
        gr.JSON(label="🧪 Spoof Detection Results"),
    ],
    title="Voice Cloning + Spoof Detection",
    description="Clone a speaker's voice and evaluate with 3 spoof detection models.",
)

if __name__ == "__main__":
    demo.launch()