Speech-Arena-2025 committed
Commit f510c1c · 1 Parent(s): d71af44

Initial commit

Files changed (8)
  1. .gitignore +3 -0
  2. app.py +56 -0
  3. ui/coming_soon.py +40 -0
  4. ui/evaluation.py +61 -0
  5. ui/leaderboard.py +72 -0
  6. ui/metrics.py +27 -0
  7. ui/submission.py +29 -0
  8. utils.py +28 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
+ __pycache__/*
+ *.pyc
+ ui/__pycache__
app.py ADDED
@@ -0,0 +1,56 @@
+ import os
+ import json
+ 
+ import gradio as gr
+ from huggingface_hub import snapshot_download
+ 
+ from ui.leaderboard import render_leader_board, render_info_html, render_citation
+ from ui.evaluation import render_eval_info
+ from ui.coming_soon import render_coming_soon
+ from ui.submission import render_submission_page
+ from utils import load_leaderboard, custom_css
+ 
+ REPO_ID = os.getenv('REPO_ID')
+ DB_ERR_PATH = './data/data/leaderboard_err.csv'
+ CITATIONS_PATH = './data/data/model_citations.json'
+ 
+ # Fetch the leaderboard data once; later runs reuse the local copy.
+ if not os.path.exists('./data/data'):
+     snapshot_download(repo_id=REPO_ID,
+                       repo_type="dataset", local_dir='./data/data')
+ 
+ with open(CITATIONS_PATH, 'r') as f:
+     model_citations = json.load(f)
+ 
+ # Load leaderboard data
+ leaderboard_df_err = load_leaderboard(DB_ERR_PATH)
+ 
+ def create_ui():
+     with gr.Blocks(theme=gr.themes.Soft(text_size=gr.themes.sizes.text_md), css=custom_css) as demo:
+         gr.Image('./data/data/df_arena_2.jpg', interactive=False,
+                  show_fullscreen_button=False, show_share_button=False, show_label=False)
+ 
+         with gr.Tabs():
+             with gr.Tab("🏆 Leaderboard"):
+                 with gr.Column():
+                     render_info_html()
+                     gr.Markdown("Table of Equal Error Rate (EER %) for different systems")
+                     render_leader_board(leaderboard_df_err, model_citations)
+                     render_citation()
+ 
+             with gr.Tab("📊 Metrics"):
+                 render_eval_info()
+ 
+             with gr.Tab("📤 Submit your own system!"):
+                 render_submission_page()
+ 
+             with gr.Tab("🔜 Coming Soon"):
+                 render_coming_soon()
+     return demo
+ 
+ # Launch the app
+ create_ui().launch()
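Note: app.py expects the REPO_ID environment variable to name the Hugging Face dataset holding the leaderboard CSV, citations JSON, and banner image. A minimal local sketch of that bootstrap step, assuming a placeholder repo id (the real dataset id is not shown in this commit):

```py
import os
from huggingface_hub import snapshot_download

# Placeholder id -- substitute the actual leaderboard dataset repo.
os.environ.setdefault("REPO_ID", "your-org/df-arena-data")

# Mirrors the bootstrap in app.py: download the snapshot once, then reuse ./data/data.
if not os.path.exists("./data/data"):
    snapshot_download(repo_id=os.environ["REPO_ID"],
                      repo_type="dataset", local_dir="./data/data")
print(sorted(os.listdir("./data/data")))  # expect leaderboard_err.csv, model_citations.json, ...
```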
ui/coming_soon.py ADDED
@@ -0,0 +1,40 @@
+ import gradio as gr
+ 
+ def render_coming_soon():
+     text = r"""
+ 
+ ### **1. More evaluation metrics**
+ - Accuracy
+ - Precision, Recall and F1 Score
+ - minDCF
+ 
+ ### **2. More datasets and models**
+ 
+ **Datasets:**
+ 
+ - MLAAD
+ - Latin-American-Spanish-Deepfake-Dataset
+ - CodecFake-Omni
+ - Hindi audio-video-Deepfake
+ - SpoofCeleb
+ - VoiceWukong
+ - Codecfake (Yi Lu et al.)
+ - CodecFake (Haibin Wu et al.)
+ - LRPD
+ - EmoFake
+ 
+ **Models:**
+ - Wav2Vec2-AASIST
+ - RawNet3
+ - AASIST2
+ 
+ ### **3. Live demo of top-performing DF systems**
+ 
+ Run inference on your own audio samples with the top-performing DF systems and get a probability score from each one.
+ """
+     return gr.Markdown(text)
ui/evaluation.py ADDED
@@ -0,0 +1,61 @@
+ import gradio as gr
+ 
+ def render_eval_info():
+     text = r"""
+ 
+ We use the **Equal Error Rate (EER %)**, a standard metric for biometric and anti-spoofing systems.
+ 
+ ### **What is EER?**
+ Equal Error Rate (EER) is a performance metric used to evaluate biometric systems. It represents the point at which the **False Acceptance Rate (FAR)** and **False Rejection Rate (FRR)** are equal. A lower EER indicates a more accurate system.
+ 
+ #### **False Acceptance Rate (FAR)**
+ FAR is the proportion of **unauthorized** users incorrectly accepted by the system.
+ 
+ $FAR = \frac{\text{False Acceptances}}{\text{Total Imposter Attempts}}$
+ 
+ #### **False Rejection Rate (FRR)**
+ FRR is the proportion of **genuine** users incorrectly rejected by the system.
+ 
+ $FRR = \frac{\text{False Rejections}}{\text{Total Genuine Attempts}}$
+ 
+ EER is the point at which FAR and FRR are equal.
+ 
+ ### How to compute your own EER score file
+ 
+ To streamline evaluation across many models and datasets, we have developed the df_arena_toolkit, which computes the score files used for evaluation. The tool can be found at https://github.com/Speech-Arena/speech_df_arena.
+ 
+ ### Usage
+ #### 1. Data preparation
+ Create a metadata.csv for your dataset in the following format:
+ 
+ ```
+ file_name,label
+ /path/to/audio1,spoof
+ /path/to/audio2,bonafide
+ ...
+ ```
+ NOTE: The labels must be "spoof" for spoofed samples and "bonafide" for real samples, and all file_name paths must be absolute.
+ 
+ #### 2. Evaluation
+ 
+ Example usage:
+ ```py
+ python evaluation.py --model_name wavlm_ecapa \
+                      --batch_size 32 \
+                      --protocol_file_path /path/to/metadata.csv \
+                      --model_path /path/to/model.ckpt \
+                      --out_score_file_name scores.txt \
+                      --trim pad \
+                      --num_workers 4
+ ```
+ 
+ NOTES
+ - Checkpoints and config files for the open-source systems used in our benchmark can be found at:
+ - An example inference command for every model can be found at:
+ """
+     return gr.Markdown(text, latex_delimiters=[{"left": "$", "right": "$", "display": True}])
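For reference, the EER reported here can be computed from scores and labels in a few lines of NumPy. This is a minimal sketch, not the df_arena_toolkit's actual implementation; it assumes higher scores mean "more likely bonafide":

```py
import numpy as np

def compute_eer(scores, labels):
    """EER in % from detection scores; labels: 1 = bonafide, 0 = spoof."""
    thresholds = np.sort(np.unique(scores))
    # FAR: spoof samples accepted; FRR: bonafide samples rejected.
    far = np.array([(scores[labels == 0] >= t).mean() for t in thresholds])
    frr = np.array([(scores[labels == 1] < t).mean() for t in thresholds])
    idx = np.argmin(np.abs(far - frr))  # threshold where the rates cross
    return 100 * (far[idx] + frr[idx]) / 2

# Toy check: perfectly separated scores give 0% EER.
scores = np.array([0.9, 0.8, 0.2, 0.1])
labels = np.array([1, 1, 0, 0])
print(compute_eer(scores, labels))  # 0.0
```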
ui/leaderboard.py ADDED
@@ -0,0 +1,72 @@
+ import numpy as np
+ import gradio as gr
+ 
+ 
+ def make_clickable(url, name):
+     return f'<a href="{url}" target="_blank">{name}</a>'
+ 
+ def render_info_html():
+     info_text = """The advent of machine-generated speech calls for dedicated research into countermeasure systems that protect against its misuse.
+ The Speech DF Arena leaderboard provides a standardized platform for comparing speech deepfake detection approaches and ranks them on Hugging Face.
+ By assessing models across diverse datasets and attack scenarios, it aims to help researchers and developers improve the reliability and robustness of deepfake detection systems, ensuring safer and more trustworthy audio communication.
+ We report the average EER (lower is better), and models are ranked by Average EER from lowest to highest.
+ Check the Metrics tab to understand how the models are evaluated.
+ If you want results for a model that is not listed here, you can request its inclusion through the "Submit your own system" tab.
+ The leaderboard currently focuses on English and Chinese speech deepfake detection datasets."""
+ 
+     return gr.Markdown(info_text)
+ 
+ def highlight_min(s, props=''):
+     return np.where(s == np.nanmin(s.values), props, '')
+ 
+ def render_leader_board(leaderboard_df, model_citations):
+     if not leaderboard_df.empty:
+         # Average EER across the per-dataset columns, then rank lowest-first.
+         leaderboard_df.insert(2, 'Average EER(%)', leaderboard_df.iloc[:, 3:].mean(axis=1))
+         leaderboard_df = leaderboard_df.sort_values(by="Average EER(%)", ascending=True).reset_index(drop=True)
+ 
+         # Link each system to its citation and decorate the top three with rank emojis.
+         leaderboard_df["System"] = leaderboard_df["System"].apply(lambda x: f"[{x}]({model_citations.get(x, '#')})")
+         for i, emoji in enumerate(["🥇", "🥈", "🥉"][:len(leaderboard_df)]):
+             leaderboard_df.loc[i, "System"] = f"{emoji} {leaderboard_df.System[i]}"
+ 
+         # Highlight the lowest (best) value in each numeric column.
+         columns_to_style = [col for col in leaderboard_df.columns if col not in ('System', 'Training Data')]
+ 
+         styler = (
+             leaderboard_df
+             .style
+             .format(precision=2)
+             .apply(highlight_min, props='color:green', axis=0, subset=columns_to_style)
+         )
+ 
+         return gr.Dataframe(styler, datatype=['markdown'] * 2 + ['number'] * 16)
+     return gr.HTML(value="<p>No data available in the leaderboard.</p>")
+ 
+ def render_citation():
+     return gr.Markdown(r"""
+ If you use Speech DF Arena in your work, please cite it as:
+ 
+ ```bibtex
+ @misc{speecharena-df-leaderboard,
+   title = {Speech Arena: DeepFake Leaderboard},
+   author = {Speech Arena},
+   year = 2025,
+   publisher = {Hugging Face},
+   howpublished = "\url{link}"
+ }
+ ```""")
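The ranking logic in render_leader_board reduces to a short pandas transform. A self-contained sketch with invented numbers (system names and EER values are made up for illustration):

```py
import pandas as pd

# Invented example: two systems evaluated on two datasets (EER %).
df = pd.DataFrame({
    "System": ["SysA", "SysB"],
    "Training Data": ["ASVSpoof2019", "ASVSpoof2019"],
    "ASVSpoof2021LA": [4.2, 6.1],
    "DFADD": [12.5, 9.8],
})

# Average EER over the dataset columns, then rank lowest-first.
dataset_cols = [c for c in df.columns if c not in ("System", "Training Data")]
df.insert(2, "Average EER(%)", df[dataset_cols].mean(axis=1))
df = df.sort_values("Average EER(%)").reset_index(drop=True)
print(df)  # SysB ranks first: (6.1 + 9.8) / 2 = 7.95 beats (4.2 + 12.5) / 2 = 8.35
```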
ui/metrics.py ADDED
@@ -0,0 +1,27 @@
+ import gradio as gr
+ 
+ def render_metrics():
+     text = r"""
+ We use the **Equal Error Rate (EER %)**, a standard metric for biometric and anti-spoofing systems.
+ 
+ ### **What is EER?**
+ Equal Error Rate (EER) is a performance metric used to evaluate biometric systems. It represents the point at which the **False Acceptance Rate (FAR)** and **False Rejection Rate (FRR)** are equal. A lower EER indicates a more accurate system.
+ 
+ #### **False Acceptance Rate (FAR)**
+ FAR is the proportion of **unauthorized** users incorrectly accepted by the system.
+ 
+ $FAR = \frac{\text{False Acceptances}}{\text{Total Imposter Attempts}}$
+ 
+ A high FAR means the system is too lenient, allowing unauthorized access.
+ 
+ #### **False Rejection Rate (FRR)**
+ FRR is the proportion of **genuine** users incorrectly rejected by the system.
+ 
+ $FRR = \frac{\text{False Rejections}}{\text{Total Genuine Attempts}}$
+ 
+ A high FRR means the system is too strict, denying access to legitimate users.
+ 
+ EER is the point at which FAR and FRR are equal.
+ """
+     return gr.Markdown(text, latex_delimiters=[{"left": "$", "right": "$", "display": False}])
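As a worked example of the two rates above, with invented counts at a single operating threshold:

```py
# Invented counts at one fixed threshold -- illustration only.
false_accepts, imposter_attempts = 3, 100  # spoofed samples wrongly accepted
false_rejects, genuine_attempts = 5, 200   # bonafide samples wrongly rejected

far = false_accepts / imposter_attempts   # 3 / 100  = 3.0% FAR
frr = false_rejects / genuine_attempts    # 5 / 200  = 2.5% FRR
print(f"FAR={far:.1%}, FRR={frr:.1%}")
# Sweeping the threshold changes both rates; EER is read off where they meet.
```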
ui/submission.py ADDED
@@ -0,0 +1,29 @@
+ import gradio as gr
+ 
+ def render_submission_page():
+     text = r"""Want to submit your own system to the leaderboard? Send the
+ score files for your system, covering the evaluation sets of all the datasets listed below, to
+ <[email protected]> and we will handle the rest. Score files can be generated with the df_arena_toolkit.
+ 
+ - [ASVSpoof2019](https://www.asvspoof.org/index2019.html)
+ - [ASVSpoof2021LA](https://www.asvspoof.org/index2021.html)
+ - [ASVSpoof2021DF](https://www.asvspoof.org/index2021.html)
+ - [ASVSpoof2024-Dev](https://www.asvspoof.org/workshop2024)
+ - [ASVSpoof2024-Eval](https://www.asvspoof.org/workshop2024)
+ - [FakeOrReal](https://bil.eecs.yorku.ca/datasets/)
+ - [codecfake3](https://github.com/xieyuankun/Codecfake)
+ - [ADD2022 Track 1](http://addchallenge.cn/add2022)
+ - [ADD2022 Track 3](http://addchallenge.cn/add2022)
+ - [ADD2023 R1](http://addchallenge.cn/add2023)
+ - [ADD2023 R2](http://addchallenge.cn/add2023)
+ - [DFADD](https://github.com/isjwdu/DFADD)
+ - [LibriVoc](https://github.com/csun22/Synthetic-Voice-Detection-Vocoder-Artifacts)
+ - [SONAR](https://github.com/Jessegator/SONAR)
+ """
+ 
+     return gr.Markdown(text)
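The commit does not spell out the score-file layout. A common convention in anti-spoofing evaluations (e.g. ASVspoof) is one utterance per line with its detection score, so a submission might look like the hypothetical example below; defer to the df_arena_toolkit's output for the authoritative format:

```
/path/to/audio1 0.9731
/path/to/audio2 -2.4105
...
```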
utils.py ADDED
@@ -0,0 +1,28 @@
+ import pandas as pd
+ 
+ def load_leaderboard(csv_path):
+     """Load the leaderboard table from a CSV file."""
+     return pd.read_csv(csv_path)
+ 
+ custom_css = """
+ h1 {
+   font-size: 48px !important;    /* Increase heading sizes */
+   line-height: 2.0 !important;   /* Increase line spacing */
+   text-align: center !important; /* Center-align headings */
+ }
+ 
+ .gradio-container {
+   padding: 30px !important; /* Increase padding around the UI */
+ }
+ 
+ .markdown-body p {
+   font-size: 28px !important;  /* Increase text size */
+   line-height: 2.0 !important; /* More space between lines */
+ }
+ 
+ .gradio-container .gr-block {
+   margin-bottom: 20px !important; /* Add more space between elements */
+ }
+ """