File size: 2,913 Bytes
9313c8d
 
 
da9415e
 
 
2841564
da9415e
2841564
da9415e
5df2441
9313c8d
5df2441
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1dede48
5df2441
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import os
os.environ["COQUI_TOS_AGREED"] = "1"

import torch
from torch.serialization import add_safe_globals
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import XttsAudioConfig

add_safe_globals([XttsConfig, XttsAudioConfig])

from TTS.api import TTS
from speechbrain.inference import SpeakerRecognition
from transformers import pipeline
import gradio as gr
import numpy as np
import soundfile as sf
from scipy.signal import resample
from scipy.io.wavfile import write as write_wav
from tempfile import NamedTemporaryFile

# Voice-cloning model: Coqui XTTS v2, loaded once at import time with
# gpu=False and the progress bar switched off.
tts = TTS(
    model_name="tts_models/multilingual/multi-dataset/xtts_v2",
    progress_bar=False,
    gpu=False,
)

# Detection models, also loaded once at import time:
#   sb       - SpeechBrain ECAPA speaker-verification model (files stored under tmp_model/)
#   ast_pipe - audio-classification pipeline for synthetic-voice detection
#   df_pipe  - audio-classification pipeline for deepfake-audio detection
sb = SpeakerRecognition.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="tmp_model",
)
ast_pipe = pipeline(
    "audio-classification",
    model="MattyB95/AST-VoxCelebSpoof-Synthetic-Voice-Detection",
)
df_pipe = pipeline(
    "audio-classification",
    model="MelodyMachine/Deepfake-audio-detection-V2",
)

def _prepare_reference_audio(voice_sample, desired_sr):
    """Validate a Gradio audio tuple and return float32 samples at ``desired_sr``."""
    if voice_sample is None:
        raise ValueError("No audio received; please record a voice sample first.")

    first, second = voice_sample
    # Gradio's type="numpy" audio is (sample_rate, data). The reversed
    # (data, sample_rate) order is accepted too, so existing callers keep working.
    if np.ndim(first) == 0:
        sample_rate, samples = first, second
    else:
        samples, sample_rate = first, second

    samples = np.asarray(samples)
    sample_rate = int(sample_rate)
    if sample_rate <= 0:
        raise ValueError(f"Invalid sample rate: {sample_rate}")
    if samples.size == 0:
        raise ValueError("The recorded audio is empty.")

    # Integer PCM must be scaled into [-1.0, 1.0] before being stored as a
    # float WAV; a bare cast would keep values such as +/-32768.
    if np.issubdtype(samples.dtype, np.integer):
        info = np.iinfo(samples.dtype)
        samples = samples.astype(np.float64)
        if info.min == 0:
            # Unsigned PCM is centred on the midpoint of its range.
            midpoint = (info.max + 1) / 2
            samples = (samples - midpoint) / midpoint
        else:
            samples = samples / -float(info.min)
    else:
        samples = samples.astype(np.float64)

    if sample_rate != desired_sr:
        # Integer arithmetic avoids float rounding in the target length.
        num_samples = max(1, int(samples.shape[0] * desired_sr // sample_rate))
        samples = resample(samples, num_samples)

    return samples.astype(np.float32)


def _format_prediction(prediction):
    """Render one classifier prediction dict as ``"label (score)"``."""
    return f"{prediction['label']} ({prediction['score']:.2f})"


def spoof_and_detect(voice_sample, desired_sr=16000):
    """Clone the recorded voice and run the detection models on both recordings.

    Args:
        voice_sample: Audio tuple as delivered by ``gr.Audio(type="numpy")``,
            i.e. ``(sample_rate, samples)``. The reversed
            ``(samples, sample_rate)`` order is accepted as well.
        desired_sr: Sample rate, in Hz, the reference audio is resampled to
            before it is written to disk.

    Returns:
        A tuple ``(reference_wav_path, cloned_wav_path, results)`` where
        ``results`` maps a detector name to a short, human-readable verdict.

    Raises:
        ValueError: If no audio was supplied, the audio is empty, or the
            sample rate is not positive.
    """
    ref_audio = _prepare_reference_audio(voice_sample, desired_sr)

    # Reserve a temp path and close the handle before writing, so the write
    # also works on platforms that lock open files. The file is kept
    # (delete=False) because its path is returned to the caller.
    with NamedTemporaryFile(suffix=".wav", mode="wb", delete=False) as ref_wav:
        ref_temp_path = ref_wav.name
    write_wav(ref_temp_path, desired_sr, ref_audio)

    # Derive the clone path from the suffix only; str.replace would also
    # rewrite any other ".wav" occurring earlier in the path.
    clone_path = os.path.splitext(ref_temp_path)[0] + "_clone.wav"
    tts.tts_to_file(
        text="My voice is my password.",
        speaker_wav=ref_temp_path,
        file_path=clone_path,
        language="en",
    )

    # Speaker verification between reference and clone; only the decision is reported.
    _sb_score, sb_label = sb.verify_files(ref_temp_path, clone_path)

    # Each pipeline returns a list of predictions; the first entry is reported.
    ast_ref = ast_pipe(ref_temp_path)[0]
    ast_clone = ast_pipe(clone_path)[0]
    df_ref = df_pipe(ref_temp_path)[0]
    df_clone = df_pipe(clone_path)[0]

    results = {
        "SpeechBrain": str(sb_label.item()),
        "AST REF": _format_prediction(ast_ref),
        "AST CLONE": _format_prediction(ast_clone),
        "Deepfake REF": _format_prediction(df_ref),
        "Deepfake CLONE": _format_prediction(df_clone),
    }

    return ref_temp_path, clone_path, results

def _microphone_audio_input():
    """Build the microphone ``gr.Audio`` input across Gradio major versions.

    Gradio 3.x takes ``source="microphone"``; Gradio 4 replaced that keyword
    with ``sources=[...]``. The legacy spelling is tried first so behaviour on
    3.x is unchanged, and the newer one is used only when the legacy keyword
    is rejected with a ``TypeError``.
    """
    common = dict(type="numpy", label="🎤 Record your voice")
    try:
        return gr.Audio(source="microphone", **common)
    except TypeError:
        return gr.Audio(sources=["microphone"], **common)


# Web UI: one microphone recording in; the reference audio, the cloned audio
# and a JSON summary of the detector verdicts out.
demo = gr.Interface(
    fn=spoof_and_detect,
    inputs=_microphone_audio_input(),
    outputs=[
        gr.Audio(label="🎧 Original"),
        gr.Audio(label="🎧 Cloned"),
        gr.JSON(label="🧪 Spoof Detection Results"),
    ],
    title="Voice Cloning + Spoof Detection",
    description="Clone a speaker's voice and evaluate with 3 spoof detection models.",
)

demo.launch()