Commit
·
f510c1c
1
Parent(s):
d71af44
Initial commit
Browse files- .gitignore +3 -0
- app.py +56 -0
- ui/coming_soon.py +40 -0
- ui/evaluation.py +61 -0
- ui/leaderboard.py +72 -0
- ui/metrics.py +27 -0
- ui/submission.py +29 -0
- utils.py +28 -0
.gitignore
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
__pycache__/*
|
2 |
+
*.pyc
|
3 |
+
ui/__pycache__
|
app.py
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
|
3 |
+
|
4 |
+
import pandas as pd
|
5 |
+
import gradio as gr
|
6 |
+
from ui.leaderboard import render_leader_board, render_info_html, render_citation
|
7 |
+
from ui.evaluation import render_eval_info
|
8 |
+
from ui.coming_soon import render_coming_soon
|
9 |
+
from ui.submission import render_submission_page
|
10 |
+
import os
|
11 |
+
from utils import load_leaderboard, custom_css
|
12 |
+
from huggingface_hub import snapshot_download
|
13 |
+
import gradio as gr
|
14 |
+
import os
|
15 |
+
import json
|
16 |
+
|
17 |
+
# Configuration: dataset repo comes from the environment; data lives under ./data/data.
REPO_ID = os.getenv('REPO_ID')
# Plain string literals — the original used f-strings with no placeholders.
DB_ERR_PATH = './data/data/leaderboard_err.csv'      # per-system EER table
CITATIONS_PATH = './data/data/model_citations.json'  # maps system name -> citation URL

# Download the dataset snapshot once; skip when it is already cached locally.
if not os.path.exists('./data/data'):
    snapshot_download(repo_id=REPO_ID,
                      repo_type="dataset", local_dir='./data/data')

# System-name -> citation-URL mapping used to hyperlink leaderboard entries.
with open(CITATIONS_PATH, 'r') as f:
    model_citations = json.load(f)

# Load leaderboard data
leaderboard_df_err = load_leaderboard(DB_ERR_PATH)
|
30 |
+
|
31 |
+
def create_ui():
    """Build the Gradio Blocks app: a banner image plus four tabs.

    Reads the module-level ``leaderboard_df_err`` and ``model_citations``
    globals; returns the (un-launched) ``gr.Blocks`` demo.
    """
    with gr.Blocks(theme=gr.themes.Soft(text_size=gr.themes.sizes.text_md), css=custom_css) as demo:
        # gr.Markdown("# Speech Deep Fake Arena")
        # Banner image stands in for the plain-text title above.
        gr.Image('./data/data/df_arena_2.jpg', interactive=False,
                 show_fullscreen_button=False, show_share_button=False, show_label=False)

        with gr.Tabs():
            with gr.Tab("🏆 Leaderboard"):
                with gr.Column():
                    render_info_html()
                    gr.Markdown("Table for Equal Error Rate (EER %) for different systems")
                    render_leader_board(leaderboard_df_err, model_citations)  # Adjust this to work with Gradio components
                    render_citation()

            with gr.Tab("📊 Metrics"):
                render_eval_info()

            with gr.Tab("📤 Submit your own system !"):
                render_submission_page()

            with gr.Tab("🔜 Coming Soon"):
                render_coming_soon()
    return demo
|
54 |
+
|
55 |
+
# Launch the app only when executed as a script, not when imported
# (the unguarded call would start the server on any `import app`).
if __name__ == "__main__":
    create_ui().launch()
|
ui/coming_soon.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
|
3 |
+
def render_coming_soon():
    """Render the 'Coming Soon' tab: planned metrics, datasets, models and demos."""
    upcoming = r"""

### **1. More evaluation metrics**
- Accuracy
- Precision, Recall and F1 Score
- minDCF

#### **2. More Datasets and Models:**

**Datasets:**

- MLAAD
- Latin-American-Spanish-Deepfake-Dataset
- CodecFake-Omni
- Hindi audio-video-Deepfake
- SpoofCeleb
- VoiceWukong
- Codecfake Yi Lu et.al.
- CodecFake Haibin Wu et.al.
- LRPD
- EmoFake


**Models:**
- Wav2Vec2-AASIST
- RawNet3
- AASIST2

#### **3. Top performing DF systems live demo**

Run inference using your own audio samples on top performing DF systems. Get probability scores for each system

"""
    return gr.Markdown(upcoming)
|
ui/evaluation.py
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
|
3 |
+
def render_eval_info():
    """Render the Metrics tab: what EER is and how to produce score files.

    Returns a ``gr.Markdown`` with ``$...$`` LaTeX delimiters enabled so the
    FAR/FRR formulas render.
    """
    # Fixed typo "bimoretric" -> "biometric" and broken CLI flag
    # "--num workers" -> "--num_workers" (flags cannot contain spaces).
    text = r"""

We use **Equal Error Rate (EER %)** a standard method used in biometric and anti-spoofing systems.

### **What is EER?**
Equal Error Rate (EER) is a performance metric used to evaluate biometric systems. It represents the point at which the **False Acceptance Rate (FAR)** and **False Rejection Rate (FRR)** are equal. A lower EER indicates a more accurate system.

#### **False Acceptance Rate (FAR)**
FAR is the proportion of **unauthorized** users incorrectly accepted by the system.

$FAR = \frac{\text{False Acceptances}}{\text{Total Imposter Attempts}}$

#### **False Rejection Rate (FRR)**
FRR is the proportion of **genuine** users incorrectly rejected by the system.

$FRR = \frac{\text{False Rejections}}{\text{Total Genuine Attempts}}$


- EER is the point at which FAR and FRR are equal.

### How to compute your own EER score file ?

In order to streamline the evaluation process across many models and datasets, we
have developed df_arena_toolkit which can be used to compute score files for evaluation.
The tool can be found at https://github.com/Speech-Arena/speech_df_arena.

### Usage
#### 1. Data Preparation
Create metadata.csv for your desired dataset with below format:

```
file_name,label
/path/to/audio1,spoof
/path/to/audio2,bonafide
...

```
NOTE : The labels should contain "spoof" for spoofed samples and "bonafide" for real samples.
All the file_name paths should be absolute

#### 2. Evaluation

Example usage :
```py
python evaluation.py --model_name wavlm_ecapa
                     --batch_size 32
                     --protocol_file_path /path/to/metadata.csv
                     --model_path /path/to/model.ckpt
                     --out_score_file_name scores.txt
                     --trim pad
                     --num_workers 4
```

NOTES
- Checkpoints and config files for the open source systems used in our benchmark can be found at :
- Example inference command for every model can be found at :
"""
    return gr.Markdown(text, latex_delimiters=[{ "left": "$", "right": "$", "display": True }])
|
ui/leaderboard.py
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import pandas as pd
|
3 |
+
import gradio as gr
|
4 |
+
from utils import load_leaderboard
|
5 |
+
import numpy as np
|
6 |
+
from huggingface_hub import snapshot_download
|
7 |
+
|
8 |
+
|
9 |
+
def make_clickable(url, name):
    """Wrap *name* in an HTML anchor that opens *url* in a new tab."""
    template = '<a href="{}" target="_blank">{}</a>'
    return template.format(url, name)
|
11 |
+
|
12 |
+
def render_info_html():
    """Render the introductory blurb shown at the top of the Leaderboard tab."""
    blurb = """The advent of machine generated speech calls for dedicated research to develop countermeasure systems to protect against their misuse.
The speech DF arena leaderboard provides a standardized platform to compare different speech deepfake detection approaches and ranks them on Huggingface.
By assessing models across diverse datasets and attack scenarios, the speech DF arena leaderboard aims to help researchers and developers enhance the reliability and robustness of deepfake detection systems
, ensuring safer and more trustworthy audio communications. We report the average EER (lower the better).
Models are ranked based on their Average EER, from lowest to highest. Check the Metrics tab to understand how the models are evaluated.
If you want results for a model that is not listed here, you can submit a request for it to be included through under "submit your own system" tab.
The leaderboard currently focuses on English and Chinese speech deepfake detection datasets."""
    return gr.Markdown(blurb)
|
27 |
+
|
28 |
+
def highlight_min(s, props=''):
    """Return a per-cell style array: *props* where *s* equals its (nan-safe) minimum, '' elsewhere."""
    smallest = np.nanmin(s.values)
    return np.where(s == smallest, props, '')
|
30 |
+
|
31 |
+
def render_leader_board(leaderboard_df, model_citations):
    """Render the EER leaderboard as a styled ``gr.Dataframe``.

    Args:
        leaderboard_df: DataFrame with 'System', 'Training Data' and one EER
            column per dataset (columns 3 onward are averaged). Not mutated.
        model_citations: mapping of system name -> citation URL ('#' fallback).

    Returns a ``gr.Dataframe`` (or a ``gr.HTML`` placeholder when empty).
    """
    if leaderboard_df.empty:
        return gr.HTML(value="<p>No data available in the leaderboard.</p>")

    # Work on a copy: the original mutated the caller's (module-global) frame,
    # so rendering twice would insert 'Average EER(%)' twice.
    df = leaderboard_df.copy()
    print(df.shape)

    # Mean of every per-dataset EER column (columns 3 onward).
    df.insert(2, 'Average EER(%)', df.iloc[:, 3:].mean(axis=1))
    df = df.sort_values(by="Average EER(%)", ascending=True).reset_index(drop=True)

    # Hyperlink each system name to its citation.
    df["System"] = df["System"].apply(lambda x: f"[{x}]({model_citations.get(x, '#')})")

    # Assign rank emojis 🥇🥈🥉 — guarded, so fewer than three rows no longer
    # raises a KeyError as the hard-coded .System[1]/.System[2] lookups did.
    for rank, emoji in enumerate(["🥇", "🥈", "🥉"][:len(df)]):
        df.loc[rank, "System"] = f"{emoji} {df.System[rank]}"

    # 'Training Data' is free text; highlighting its "minimum" is meaningless.
    columns_to_style = [col for col in df.columns if col != 'Training Data']

    styler = (
        df
        .style
        .format(precision=2)
        .apply(highlight_min, props='color:green', axis=0, subset=columns_to_style)
    )

    # First two columns render as markdown (linked System, Training Data);
    # the rest are numeric — derived from the frame instead of hard-coded 16.
    numeric_cols = max(len(df.columns) - 2, 0)
    return gr.Dataframe(styler, datatype=['markdown'] * 2 + ['number'] * numeric_cols)
|
59 |
+
|
60 |
+
def render_citation():
    """Render the BibTeX snippet users should cite for Speech DF Arena."""
    bibtex = r"""
If you use Speech DF Arena in your work, it can be cited as:

```bibtex
@misc{speecharena-df-leaderboard,
    title = {Speech Arena: DeepFake Leaderboard},
    author = {Speech Arena},
    year = 2025,
    publisher = {Hugging Face},
    howpublished = "\url{link}"
}
```"""
    return gr.Markdown(bibtex)
|
ui/metrics.py
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
|
3 |
+
def render_metrics():
    """Render a Markdown explainer of EER, FAR and FRR with inline LaTeX."""
    # Fixed typo "bimoretric" -> "biometric".
    text = r"""
We use **Equal Error Rate (EER %)** a standard method used in biometric and anti-spoofing systems.

### **What is EER?**
Equal Error Rate (EER) is a performance metric used to evaluate biometric systems. It represents the point at which the **False Acceptance Rate (FAR)** and **False Rejection Rate (FRR)** are equal. A lower EER indicates a more accurate system.

#### **False Acceptance Rate (FAR)**
FAR is the proportion of **unauthorized** users incorrectly accepted by the system.

$FAR = \frac{\text{False Acceptances}}{\text{Total Imposter Attempts}}$

A high FAR means the system is too lenient, allowing unauthorized access.

#### **False Rejection Rate (FRR)**
FRR is the proportion of **genuine** users incorrectly rejected by the system.

$FRR = \frac{\text{False Rejections}}{\text{Total Genuine Attempts}}$

A high FRR means the system is too strict, denying access to legitimate users.

### EER is the point at which FAR and FRR are equal.
"""

    return gr.Markdown(text, latex_delimiters=[ {"left": "$", "right": "$", "display": False }])
|
ui/submission.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
|
3 |
+
def render_submission_page():
    """Render submission instructions plus the list of supported datasets."""
    instructions = r"""Want to submit your own system to the leaderboard? Submit
all the scores files for your system across evaluation sets of all the below supported datasets at
<[email protected]> and we will handle the rest. Score files can be generated using the df_arena_toolkit.


- [ASVSpoof2019](https://www.asvspoof.org/index2019.html)
- [ASVSpoof2021LA](https://www.asvspoof.org/index2021.html)
- [ASVSpoof2021DF](https://www.asvspoof.org/index2021.html)
- [ASVSpoof2024-Dev](https://www.asvspoof.org/workshop2024)
- [ASVSpoof2024-Eval](https://www.asvspoof.org/workshop2024)
- [FakeOrReal](https://bil.eecs.yorku.ca/datasets/)
- [codecfake3](https://github.com/xieyuankun/Codecfake)
- [ADD2022 Track 1](http://addchallenge.cn/add2022)
- [ADD2022 Track 3](http://addchallenge.cn/add2022)
- [ADD 2023 R1](http://addchallenge.cn/add2023)
- [ADD2023 R2](http://addchallenge.cn/add2023)
- [DFADD](https://github.com/isjwdu/DFADD)
- [LibriVoc](https://github.com/csun22/Synthetic-Voice-Detection-Vocoder-Artifacts)
- [SONAR](https://github.com/Jessegator/SONAR)
"""
    return gr.Markdown(instructions)
|
27 |
+
|
28 |
+
|
29 |
+
|
utils.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sqlite3
|
2 |
+
import pandas as pd
|
3 |
+
|
4 |
+
def load_leaderboard(db_path):
    """Read the leaderboard CSV at *db_path* into a pandas DataFrame."""
    return pd.read_csv(db_path)
|
8 |
+
|
9 |
+
# Global CSS injected into gr.Blocks(css=custom_css).
# Fixed invalid selector "h1, {" (a trailing comma in a selector list makes
# browsers drop the entire rule, so the heading styles never applied).
custom_css = """
h1 {
    font-size: 48px !important;    /* Increase heading sizes */
    line-height: 2.0 !important;   /* Increase line spacing */
    text-align: center !important; /* Center align headings */
}

.gradio-container {
    padding: 30px !important;  /* Increase padding around the UI */
}

.markdown-body p {
    font-size: 28px !important;   /* Increase text size */
    line-height: 2.0 !important;  /* More space between lines */
}

.gradio-container .gr-block {
    margin-bottom: 20px !important;  /* Add more space between elements */
}
"""
|