apsys committed · Commit d4d998a · 0 Parent(s)
.env.template ADDED
@@ -0,0 +1,6 @@
+ HF_TOKEN="your_huggingface_write_token"
+ OWNER="your_huggingface_username_or_org"
+ RESULTS_DATASET_ID="your_username/guardbench-results"
+ SUBMITTER_TOKEN="your_secret_submission_token"
+ ADMIN_USERNAME="admin"
+ ADMIN_PASSWORD="password" # Change this!
.gitignore ADDED
@@ -0,0 +1,45 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ env/
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ .venv/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Environment variables
+ .env
+
+ # Virtual Environment
+ venv/
+ ENV/
+
+ # IDE
+ .idea/
+ .vscode/
+ *.swp
+ *.swo
+
+ # OS
+ .DS_Store
+ Thumbs.db
+
+ # Hugging Face cache
+ eval-queue/
+ eval-results/
+ eval-queue-bk/
+ eval-results-bk/
README.md ADDED
@@ -0,0 +1,82 @@
+ # GuardBench Leaderboard
+
+ A HuggingFace leaderboard for the GuardBench project that allows users to submit evaluation results and view the performance of different models on safety guardrails.
+
+ ## Features
+
+ - Display model performance across multiple safety categories
+ - Accept JSONL submissions with evaluation results
+ - Store submissions in a HuggingFace dataset
+ - Secure submission process with token authentication
+ - Automatic data refresh from HuggingFace
+
+ ## Setup
+
+ 1. Clone this repository
+ 2. Install dependencies:
+ ```
+ pip install -r requirements.txt
+ ```
+ 3. Create a `.env` file based on the `.env.template`:
+ ```
+ cp .env.template .env
+ ```
+ 4. Edit the `.env` file with your HuggingFace credentials and settings
+ 5. Run the application:
+ ```
+ python app.py
+ ```
+
+ ## Submission Format
+
+ Submissions should be in JSONL format, with each line containing a JSON object with the following structure:
+
+ ```json
+ {
+   "model_name": "model-name",
+   "per_category_metrics": {
+     "Category Name": {
+       "default_prompts": {
+         "f1_binary": 0.95,
+         "recall_binary": 0.93,
+         "precision_binary": 1.0,
+         "error_ratio": 0.0,
+         "avg_runtime_ms": 3000
+       },
+       "jailbreaked_prompts": { ... },
+       "default_answers": { ... },
+       "jailbreaked_answers": { ... }
+     },
+     ...
+   },
+   "avg_metrics": {
+     "default_prompts": {
+       "f1_binary": 0.97,
+       "recall_binary": 0.95,
+       "precision_binary": 1.0,
+       "error_ratio": 0.0,
+       "avg_runtime_ms": 3000
+     },
+     "jailbreaked_prompts": { ... },
+     "default_answers": { ... },
+     "jailbreaked_answers": { ... }
+   }
+ }
+ ```
+
+ ## Environment Variables
+
+ - `HF_TOKEN`: Your HuggingFace write token
+ - `OWNER`: Your HuggingFace username or organization
+ - `RESULTS_DATASET_ID`: The ID of the dataset to store results (e.g., "username/guardbench-results")
+ - `SUBMITTER_TOKEN`: A secret token required for submissions
+ - `ADMIN_USERNAME`: Username for admin access to the leaderboard
+ - `ADMIN_PASSWORD`: Password for admin access to the leaderboard
+
+ ## Deployment
+
+ This application can be deployed as a HuggingFace Space for public access. Follow the HuggingFace Spaces documentation for deployment instructions.
+
+ ## License
+
+ MIT
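
For reference, a minimal Python sketch that emits one line in the submission format documented in README.md above. The model id, category, and metric values are placeholders, not real results:

```python
# Sketch: write one leaderboard submission entry to results.jsonl.
# Field names follow the schema documented in README.md; numbers are placeholders.
import json

block = {
    "f1_binary": 0.95,
    "recall_binary": 0.93,
    "precision_binary": 1.0,
    "error_ratio": 0.0,
    "avg_runtime_ms": 3000,
}

entry = {
    "model_name": "org/my-guard-model",  # hypothetical model id
    "per_category_metrics": {
        "Safe Prompts": {
            "default_prompts": dict(block),
            "jailbreaked_prompts": dict(block),
            "default_answers": dict(block),
            "jailbreaked_answers": dict(block),
        },
    },
    "avg_metrics": {
        "default_prompts": dict(block),
        "jailbreaked_prompts": dict(block),
        "default_answers": dict(block),
        "jailbreaked_answers": dict(block),
    },
}

with open("results.jsonl", "w") as f:
    f.write(json.dumps(entry) + "\n")  # one JSON object per line
```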
app.py ADDED
@@ -0,0 +1,281 @@
+ """
+ GuardBench Leaderboard Application
+ """
+
+ import os
+ import json
+ import tempfile
+ import logging
+ import gradio as gr
+ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
+ import pandas as pd
+ from apscheduler.schedulers.background import BackgroundScheduler
+
+ from src.about import (
+     CITATION_BUTTON_LABEL,
+     CITATION_BUTTON_TEXT,
+     EVALUATION_QUEUE_TEXT,
+     INTRODUCTION_TEXT,
+     LLM_BENCHMARKS_TEXT,
+     TITLE,
+ )
+ from src.display.css_html_js import custom_css
+ from src.display.utils import (
+     GUARDBENCH_COLUMN,
+     DISPLAY_COLS,
+     METRIC_COLS,
+     HIDDEN_COLS,
+     NEVER_HIDDEN_COLS,
+     CATEGORIES,
+     TEST_TYPES,
+     ModelType,
+     Precision,
+     WeightType
+ )
+ from src.display.formatting import styled_message, styled_error, styled_warning
+ from src.envs import (
+     ADMIN_USERNAME,
+     ADMIN_PASSWORD,
+     RESULTS_DATASET_ID,
+     SUBMITTER_TOKEN,
+     TOKEN,
+     DATA_PATH
+ )
+ from src.populate import get_leaderboard_df, download_leaderboard_data, get_category_leaderboard_df
+ from src.submission.submit import process_submission
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger(__name__)
+
+ # Ensure data directory exists
+ os.makedirs(DATA_PATH, exist_ok=True)
+
+ # Initialize leaderboard data
+ try:
+     logger.info("Initializing leaderboard data...")
+     LEADERBOARD_DF = get_leaderboard_df()
+     logger.info(f"Loaded leaderboard with {len(LEADERBOARD_DF)} entries")
+ except Exception as e:
+     logger.error(f"Error loading leaderboard data: {e}")
+     LEADERBOARD_DF = pd.DataFrame()
+
+
+ def init_leaderboard(dataframe):
+     """
+     Initialize the leaderboard component.
+     """
+     if dataframe is None or dataframe.empty:
+         # Create an empty dataframe with the right columns
+         columns = [getattr(GUARDBENCH_COLUMN, col).name for col in DISPLAY_COLS]
+         dataframe = pd.DataFrame(columns=columns)
+         logger.warning("Initializing empty leaderboard")
+
+     return Leaderboard(
+         value=dataframe,
+         datatype=[getattr(GUARDBENCH_COLUMN, col).type for col in DISPLAY_COLS],
+         select_columns=SelectColumns(
+             default_selection=[getattr(GUARDBENCH_COLUMN, col).name for col in DISPLAY_COLS],
+             cant_deselect=[getattr(GUARDBENCH_COLUMN, col).name for col in NEVER_HIDDEN_COLS],
+             label="Select Columns to Display:",
+         ),
+         search_columns=[GUARDBENCH_COLUMN.model.name],
+         hide_columns=[getattr(GUARDBENCH_COLUMN, col).name for col in HIDDEN_COLS],
+         filter_columns=[
+             ColumnFilter(GUARDBENCH_COLUMN.model_type.name, type="checkboxgroup", label="Model types"),
+         ],
+         interactive=False,
+     )
+
+
+ def submit_results(
+     model_name: str,
+     base_model: str,
+     revision: str,
+     precision: str,
+     weight_type: str,
+     model_type: str,
+     submission_file: tempfile._TemporaryFileWrapper
+ ):
+     """
+     Handle submission of results with model metadata.
+     """
+     if submission_file is None:
+         return styled_error("No submission file provided")
+
+     if not model_name:
+         return styled_error("Model name is required")
+
+     if not model_type:
+         return styled_error("Please select a model type")
+
+     file_path = submission_file.name
+     logger.info(f"Received submission for model {model_name}: {file_path}")
+
+     # Add metadata to the submission
+     metadata = {
+         "model_name": model_name,
+         "base_model": base_model,
+         "revision": revision if revision else "main",
+         "precision": precision,
+         "weight_type": weight_type,
+         "model_type": model_type
+     }
+
+     # Process the submission
+     result = process_submission(file_path, metadata)
+
+     # Refresh the leaderboard data
+     global LEADERBOARD_DF
+     try:
+         logger.info("Refreshing leaderboard data after submission...")
+         LEADERBOARD_DF = get_leaderboard_df()
+         logger.info("Refreshed leaderboard data after submission")
+     except Exception as e:
+         logger.error(f"Error refreshing leaderboard data: {e}")
+
+     return result
+
+
+ def refresh_data():
+     """
+     Refresh the leaderboard data from HuggingFace.
+     """
+     global LEADERBOARD_DF
+     try:
+         logger.info("Performing scheduled refresh of leaderboard data...")
+         LEADERBOARD_DF = get_leaderboard_df()
+         logger.info("Scheduled refresh of leaderboard data completed")
+     except Exception as e:
+         logger.error(f"Error in scheduled refresh: {e}")
+
+
+ # Create Gradio app
+ demo = gr.Blocks(css=custom_css)
+
+ with demo:
+     gr.HTML(TITLE)
+     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+     with gr.Tabs(elem_classes="tab-buttons") as tabs:
+         with gr.TabItem("🏅 Leaderboard", elem_id="guardbench-leaderboard-tab", id=0):
+             refresh_button = gr.Button("Refresh Leaderboard")
+
+             # Create tabs for each category
+             with gr.Tabs(elem_classes="category-tabs") as category_tabs:
+                 # First tab for average metrics across all categories
+                 with gr.TabItem("📊 Overall Performance", elem_id="overall-tab"):
+                     leaderboard = init_leaderboard(LEADERBOARD_DF)
+
+                 # Create a tab for each category
+                 for category in CATEGORIES:
+                     with gr.TabItem(f"{category}", elem_id=f"category-{category.lower().replace(' ', '-')}-tab"):
+                         category_df = get_category_leaderboard_df(category)
+                         category_leaderboard = init_leaderboard(category_df)
+
+             # Refresh button functionality
+             refresh_button.click(
+                 fn=lambda: [
+                     init_leaderboard(get_leaderboard_df()),
+                     *[init_leaderboard(get_category_leaderboard_df(category)) for category in CATEGORIES]
+                 ],
+                 inputs=[],
+                 outputs=[leaderboard] + [category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)]
+             )
+
+         with gr.TabItem("📝 About", elem_id="guardbench-about-tab", id=1):
+             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+         with gr.TabItem("🚀 Submit", elem_id="guardbench-submit-tab", id=2):
+             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+             with gr.Row():
+                 gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")
+
+             with gr.Row():
+                 with gr.Column():
+                     model_name_textbox = gr.Textbox(label="Model name")
+                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+                     model_type = gr.Dropdown(
+                         choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+                         label="Model type",
+                         multiselect=False,
+                         value=None,
+                         interactive=True,
+                     )
+
+                 with gr.Column():
+                     precision = gr.Dropdown(
+                         choices=[i.name for i in Precision if i != Precision.Unknown],
+                         label="Precision",
+                         multiselect=False,
+                         value="float16",
+                         interactive=True,
+                     )
+                     weight_type = gr.Dropdown(
+                         choices=[i.name for i in WeightType],
+                         label="Weights type",
+                         multiselect=False,
+                         value="Original",
+                         interactive=True,
+                     )
+                     base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+
+             with gr.Row():
+                 file_input = gr.File(
+                     label="Upload JSONL Results File",
+                     file_types=[".jsonl"]
+                 )
+
+             submit_button = gr.Button("Submit Results")
+             result_output = gr.Markdown()
+
+             submit_button.click(
+                 fn=submit_results,
+                 inputs=[
+                     model_name_textbox,
+                     base_model_name_textbox,
+                     revision_name_textbox,
+                     precision,
+                     weight_type,
+                     model_type,
+                     file_input
+                 ],
+                 outputs=result_output
+             )
+
+     with gr.Row():
+         with gr.Accordion("📙 Citation", open=False):
+             citation_button = gr.Textbox(
+                 value=CITATION_BUTTON_TEXT,
+                 label=CITATION_BUTTON_LABEL,
+                 lines=10,
+                 elem_id="citation-button",
+                 show_copy_button=True,
+             )
+
+         with gr.Accordion("ℹ️ Dataset Information", open=False):
+             dataset_info = gr.Markdown(f"""
+ ## Dataset Information
+
+ Results are stored in the HuggingFace dataset: [{RESULTS_DATASET_ID}](https://huggingface.co/datasets/{RESULTS_DATASET_ID})
+
+ Last updated: {pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S UTC")}
+ """)
+
+ # Set up scheduler to refresh data periodically
+ scheduler = BackgroundScheduler()
+ scheduler.add_job(refresh_data, 'interval', minutes=30)
+ scheduler.start()
+
+ # Launch the app
+ if __name__ == "__main__":
+     # Set up authentication if credentials are provided
+     if not ADMIN_USERNAME or not ADMIN_PASSWORD:
+         logger.warning("Admin username or password not set. Running without authentication.")
+         auth = None
+     else:
+         auth = (ADMIN_USERNAME, ADMIN_PASSWORD)
+
+     # Launch the app
+     demo.launch(server_name="0.0.0.0", server_port=7860, auth=auth)
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ gradio>=3.50.2
+ huggingface_hub>=0.19.0
+ datasets>=2.16.0
+ pandas>=2.0.0
+ python-dotenv>=1.0.0
+ apscheduler>=3.10.1
+ gradio-leaderboard
src/about.py ADDED
@@ -0,0 +1,75 @@
+ """
+ Text content for the GuardBench Leaderboard.
+ """
+
+ TITLE = """
+ <div style="text-align: center; margin-bottom: 1rem">
+ <h1>GuardBench Leaderboard</h1>
+ </div>
+ """
+
+ INTRODUCTION_TEXT = """
+ ## Introduction
+
+ GuardBench is a comprehensive benchmark for evaluating the safety guardrails of large language models (LLMs).
+ This leaderboard tracks model performance across various safety categories, including harmful content detection,
+ jailbreak resistance, and more.
+
+ Models are evaluated on their ability to properly refuse harmful requests and detect problematic content
+ across multiple categories and test scenarios.
+ """
+
+ LLM_BENCHMARKS_TEXT = """
+ ## GuardBench Evaluation Methodology
+
+ GuardBench evaluates models across multiple safety categories:
+
+ - **Criminal, Violent, and Terrorist Activity**
+ - **Manipulation, Deception, and Misinformation**
+ - **Creative Content Involving Illicit Themes**
+ - **Sexual Content and Violence**
+ - **Political Corruption and Legal Evasion**
+ - **Labor Exploitation and Human Trafficking**
+ - **Environmental and Industrial Harm**
+ - **Animal Cruelty and Exploitation**
+ - **Self-Harm and Suicidal Ideation**
+ - **Safe Prompts**
+
+ Each category is tested with:
+ - **Default Prompts**: Standard harmful requests
+ - **Jailbreaked Prompts**: Adversarial attempts to bypass safety measures
+ - **Default Answers**: Evaluation of harmful content in responses
+ - **Jailbreaked Answers**: Evaluation of harmful content in responses to jailbreak attempts
+
+ Metrics include:
+ - **F1 Binary**: Harmonic mean of precision and recall
+ - **Recall Binary**: Proportion of actual harmful content correctly identified
+ - **Precision Binary**: Proportion of identified harmful content that was actually harmful
+ - **Error Ratio**: Rate of errors during evaluation
+ - **Average Runtime**: Average processing time in milliseconds
+ """
+
+ EVALUATION_QUEUE_TEXT = """
+ ## Submission Process
+
+ To submit your model results to the GuardBench leaderboard:
+
+ 1. Evaluate your model using the [GuardBench framework](https://github.com/huggingface/guard-bench)
+ 2. Format your results as a JSONL file according to our schema
+ 3. Submit your results using the submission form with your authorized token
+
+ Results will be processed and added to the leaderboard once validated.
+ """
+
+ CITATION_BUTTON_LABEL = "Cite GuardBench"
+
+ CITATION_BUTTON_TEXT = """
+ @misc{guardbench2023,
+     author = {GuardBench Team},
+     title = {GuardBench: Comprehensive Benchmark for LLM Safety Guardrails},
+     year = {2023},
+     publisher = {GitHub},
+     journal = {GitHub repository},
+     howpublished = {\\url{https://github.com/huggingface/guard-bench}}
+ }
+ """
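
The binary metrics described in `LLM_BENCHMARKS_TEXT` above match the standard scikit-learn definitions. A small illustrative sketch follows; scikit-learn is an assumption here (it is not in requirements.txt and is not used by the leaderboard app itself, only by whatever produces the results):

```python
# Sketch of the three classification metrics the leaderboard reports.
# Convention assumed: 1 = harmful/unsafe, 0 = safe; predictions are the guard model's verdicts.
from sklearn.metrics import f1_score, precision_score, recall_score

y_true = [1, 1, 0, 1, 0, 0, 1, 0]   # toy ground-truth labels
y_pred = [1, 0, 0, 1, 0, 1, 1, 0]   # toy model verdicts

precision = precision_score(y_true, y_pred)  # flagged items that were truly harmful
recall = recall_score(y_true, y_pred)        # harmful items that were caught
f1 = f1_score(y_true, y_pred)                # harmonic mean of precision and recall
print(f"precision={precision:.2f} recall={recall:.2f} f1={f1:.2f}")
```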
src/display/css_html_js.py ADDED
@@ -0,0 +1,46 @@
+ """
+ CSS and styling for the GuardBench Leaderboard.
+ """
+
+ custom_css = """
+ .markdown-text {
+ font-size: 16px !important;
+ text-align: justify !important;
+ }
+
+ .tab-buttons button.selected {
+ border-color: #2196F3 !important;
+ background: #E3F2FD !important;
+ color: #2196F3 !important;
+ }
+
+ #citation-button textarea {
+ font-family: monospace !important;
+ }
+
+ .leaderboard-container {
+ margin-top: 20px;
+ }
+
+ .category-header {
+ font-weight: bold;
+ background-color: #f5f5f5;
+ padding: 10px;
+ margin-top: 15px;
+ border-radius: 5px;
+ }
+
+ .metric-name {
+ font-weight: bold;
+ color: #2196F3;
+ }
+
+ .model-name {
+ font-weight: bold;
+ }
+
+ .model-link:hover {
+ text-decoration: underline;
+ color: #1976D2;
+ }
+ """
src/display/formatting.py ADDED
@@ -0,0 +1,71 @@
+ """
+ Formatting utilities for the GuardBench Leaderboard.
+ """
+
+ import pandas as pd
+ import numpy as np
+
+
+ def make_clickable_model(model_name: str) -> str:
+     """
+     Create a clickable link for a model name.
+     """
+     return f'<a href="https://huggingface.co/{model_name}" target="_blank">{model_name}</a>'
+
+
+ def has_no_nan_values(df: pd.DataFrame, columns: list) -> pd.Series:
+     """
+     Check if a row has no NaN values in the specified columns.
+     """
+     return ~df[columns].isna().any(axis=1)
+
+
+ def format_percentage(value: float) -> str:
+     """
+     Format a value as a percentage.
+     """
+     if pd.isna(value):
+         return "N/A"
+     return f"{value * 100:.2f}%"
+
+
+ def format_number(value: float, precision: int = 2) -> str:
+     """
+     Format a number with specified precision.
+     """
+     if pd.isna(value):
+         return "N/A"
+     return f"{value:.{precision}f}"
+
+
+ def styled_message(message: str) -> str:
+     """
+     Format a success message with styling.
+     """
+     return f"""
+ <div style="padding: 10px; border-radius: 5px; background-color: #e6f7e6; color: #2e7d32; border: 1px solid #2e7d32;">
+ ✅ {message}
+ </div>
+ """
+
+
+ def styled_warning(message: str) -> str:
+     """
+     Format a warning message with styling.
+     """
+     return f"""
+ <div style="padding: 10px; border-radius: 5px; background-color: #fff8e1; color: #ff8f00; border: 1px solid #ff8f00;">
+ ⚠️ {message}
+ </div>
+ """
+
+
+ def styled_error(message: str) -> str:
+     """
+     Format an error message with styling.
+     """
+     return f"""
+ <div style="padding: 10px; border-radius: 5px; background-color: #ffebee; color: #c62828; border: 1px solid #c62828;">
+ ❌ {message}
+ </div>
+ """
src/display/utils.py ADDED
@@ -0,0 +1,177 @@
+ """
+ Utility classes and functions for the GuardBench Leaderboard display.
+ """
+
+ from dataclasses import dataclass, field, fields
+ from enum import Enum, auto
+ from typing import List, Optional
+
+
+ class ModelType(Enum):
+     """Model types for the leaderboard."""
+     Unknown = auto()
+     OpenSource = auto()
+     ClosedSource = auto()
+     API = auto()
+
+     def to_str(self, separator: str = " ") -> str:
+         """Convert enum to string with separator."""
+         if self == ModelType.Unknown:
+             return "Unknown"
+         elif self == ModelType.OpenSource:
+             return f"Open{separator}Source"
+         elif self == ModelType.ClosedSource:
+             return f"Closed{separator}Source"
+         elif self == ModelType.API:
+             return "API"
+         return "Unknown"
+
+
+ class Precision(Enum):
+     """Model precision types."""
+     Unknown = auto()
+     float16 = auto()
+     bfloat16 = auto()
+     float32 = auto()
+     int8 = auto()
+     int4 = auto()
+
+
+ class WeightType(Enum):
+     """Model weight types."""
+     Original = auto()
+     Delta = auto()
+     Adapter = auto()
+
+
+ @dataclass
+ class ColumnInfo:
+     """Information about a column in the leaderboard."""
+     name: str
+     display_name: str
+     type: str = "text"
+     hidden: bool = False
+     never_hidden: bool = False
+     displayed_by_default: bool = True
+
+
+ @dataclass
+ class GuardBenchColumn:
+     """Columns for the GuardBench leaderboard."""
+     model: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+         name="model_name",
+         display_name="Model",
+         never_hidden=True,
+         displayed_by_default=True
+     ))
+
+     model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+         name="model_type",
+         display_name="Type",
+         displayed_by_default=True
+     ))
+
+     # Metrics for all categories
+     default_prompts_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+         name="default_prompts_f1",
+         display_name="Default Prompts F1",
+         type="number",
+         displayed_by_default=True
+     ))
+
+     jailbreaked_prompts_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+         name="jailbreaked_prompts_f1",
+         display_name="Jailbreaked Prompts F1",
+         type="number",
+         displayed_by_default=True
+     ))
+
+     default_answers_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+         name="default_answers_f1",
+         display_name="Default Answers F1",
+         type="number",
+         displayed_by_default=True
+     ))
+
+     jailbreaked_answers_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+         name="jailbreaked_answers_f1",
+         display_name="Jailbreaked Answers F1",
+         type="number",
+         displayed_by_default=True
+     ))
+
+     # Average metrics
+     average_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+         name="average_f1",
+         display_name="Average F1",
+         type="number",
+         displayed_by_default=True,
+         never_hidden=True
+     ))
+
+     average_recall: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+         name="average_recall",
+         display_name="Average Recall",
+         type="number",
+         displayed_by_default=False
+     ))
+
+     average_precision: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+         name="average_precision",
+         display_name="Average Precision",
+         type="number",
+         displayed_by_default=False
+     ))
+
+     # Additional metadata
+     submission_date: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+         name="submission_date",
+         display_name="Submission Date",
+         displayed_by_default=False
+     ))
+
+
+ # Create instances for easy access
+ GUARDBENCH_COLUMN = GuardBenchColumn()
+
+ # Extract column lists for different views. These hold dataclass field names;
+ # app.py resolves them to display names/types via getattr on GUARDBENCH_COLUMN.
+ COLS = [f.name for f in fields(GUARDBENCH_COLUMN)]
+ DISPLAY_COLS = [f.name for f in fields(GUARDBENCH_COLUMN)
+                 if getattr(GUARDBENCH_COLUMN, f.name).displayed_by_default]
+ METRIC_COLS = [f.name for f in fields(GUARDBENCH_COLUMN)
+                if getattr(GUARDBENCH_COLUMN, f.name).type == "number"]
+ HIDDEN_COLS = [f.name for f in fields(GUARDBENCH_COLUMN)
+                if getattr(GUARDBENCH_COLUMN, f.name).hidden]
+ NEVER_HIDDEN_COLS = [f.name for f in fields(GUARDBENCH_COLUMN)
+                      if getattr(GUARDBENCH_COLUMN, f.name).never_hidden]
+
+ # Categories in GuardBench
+ CATEGORIES = [
+     "Criminal, Violent, and Terrorist Activity",
+     "Manipulation, Deception, and Misinformation",
+     "Creative Content Involving Illicit Themes",
+     "Sexual Content and Violence",
+     "Political Corruption and Legal Evasion",
+     "Labor Exploitation and Human Trafficking",
+     "Environmental and Industrial Harm",
+     "Animal Cruelty and Exploitation",
+     "Self-Harm and Suicidal Ideation",
+     "Safe Prompts"
+ ]
+
+ # Test types in GuardBench
+ TEST_TYPES = [
+     "default_prompts",
+     "jailbreaked_prompts",
+     "default_answers",
+     "jailbreaked_answers"
+ ]
+
+ # Metrics in GuardBench
+ METRICS = [
+     "f1_binary",
+     "recall_binary",
+     "precision_binary",
+     "error_ratio",
+     "avg_runtime_ms"
+ ]
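
As a quick sanity check of the column machinery above, the field-name lists resolve to the DataFrame column names via getattr, which is how app.py consumes them. A small sketch, assuming it is run from the repository root:

```python
# Sketch: resolve the field-name lists to the display column names app.py builds.
from src.display.utils import GUARDBENCH_COLUMN, DISPLAY_COLS, NEVER_HIDDEN_COLS

print([getattr(GUARDBENCH_COLUMN, col).name for col in DISPLAY_COLS])
# expected: ['model_name', 'model_type', 'default_prompts_f1', 'jailbreaked_prompts_f1',
#            'default_answers_f1', 'jailbreaked_answers_f1', 'average_f1']
print([getattr(GUARDBENCH_COLUMN, col).name for col in NEVER_HIDDEN_COLS])
# expected: ['model_name', 'average_f1']
```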
src/envs.py ADDED
@@ -0,0 +1,27 @@
+ import os
+ from huggingface_hub import HfApi
+ from dotenv import load_dotenv
+
+ # Load environment variables
+ load_dotenv()
+
+ # Hugging Face configuration
+ TOKEN = os.environ.get("HF_TOKEN")  # A read/write token for your org
+ OWNER = os.environ.get("OWNER", "guard-bench")  # Change to your org
+ SUBMITTER_TOKEN = os.environ.get("SUBMITTER_TOKEN")
+ ADMIN_USERNAME = os.environ.get("ADMIN_USERNAME")
+ ADMIN_PASSWORD = os.environ.get("ADMIN_PASSWORD")
+
+ # Repository IDs
+ REPO_ID = f"{OWNER}/leaderboard"
+ RESULTS_DATASET_ID = os.environ.get("RESULTS_DATASET_ID", f"{OWNER}/guardbench-results")
+
+ # Cache paths
+ CACHE_PATH = os.getenv("HF_HOME", ".")
+ DATA_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data")
+
+ # Local data paths
+ LEADERBOARD_FILE = os.path.join(DATA_PATH, "leaderboard.json")
+
+ # HF API instance
+ API = HfApi(token=TOKEN)
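
src/envs.py reads all of its configuration from environment variables, which on a HuggingFace Space are normally provided as Space secrets (see the README's Deployment section). One possible way to script that with huggingface_hub is sketched below; the Space id is a placeholder, and the calls used (create_repo with space_sdk, upload_folder, add_space_secret) should be checked against the installed huggingface_hub version:

```python
# Sketch: create a Gradio Space, upload this repo, and set the secrets envs.py expects.
# SPACE_ID is a placeholder; run from the repo root with a write token in HF_TOKEN.
import os
from huggingface_hub import HfApi

api = HfApi(token=os.environ["HF_TOKEN"])
SPACE_ID = "your-org/guardbench-leaderboard"  # placeholder

api.create_repo(repo_id=SPACE_ID, repo_type="space", space_sdk="gradio", exist_ok=True)
api.upload_folder(folder_path=".", repo_id=SPACE_ID, repo_type="space",
                  ignore_patterns=[".env", "data/*", "venv/*"])

for key in ["HF_TOKEN", "OWNER", "RESULTS_DATASET_ID", "SUBMITTER_TOKEN",
            "ADMIN_USERNAME", "ADMIN_PASSWORD"]:
    api.add_space_secret(repo_id=SPACE_ID, key=key, value=os.environ[key])
```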
src/leaderboard/processor.py ADDED
@@ -0,0 +1,180 @@
+ """
+ Process and transform GuardBench leaderboard data.
+ """
+
+ import json
+ import os
+ import pandas as pd
+ from datetime import datetime
+ from typing import Dict, List, Any, Tuple
+
+ from src.display.utils import CATEGORIES, TEST_TYPES, METRICS
+
+
+ def load_leaderboard_data(file_path: str) -> Dict:
+     """
+     Load the leaderboard data from a JSON file.
+     """
+     if not os.path.exists(file_path):
+         return {"entries": [], "last_updated": datetime.now().isoformat()}
+
+     with open(file_path, 'r') as f:
+         data = json.load(f)
+
+     return data
+
+
+ def save_leaderboard_data(data: Dict, file_path: str) -> None:
+     """
+     Save the leaderboard data to a JSON file.
+     """
+     # Ensure the directory exists
+     os.makedirs(os.path.dirname(file_path), exist_ok=True)
+
+     # Update the last_updated timestamp
+     data["last_updated"] = datetime.now().isoformat()
+
+     with open(file_path, 'w') as f:
+         json.dump(data, f, indent=2)
+
+
+ def process_submission(submission_data: List[Dict]) -> List[Dict]:
+     """
+     Process submission data and convert it to leaderboard entries.
+     """
+     entries = []
+
+     for item in submission_data:
+         # Create a new entry for the leaderboard
+         entry = {
+             "model_name": item.get("model_name", "Unknown Model"),
+             "per_category_metrics": {},
+             "avg_metrics": {},
+             "submission_date": datetime.now().isoformat()
+         }
+
+         # Process per-category metrics
+         if "per_category_metrics" in item:
+             entry["per_category_metrics"] = item["per_category_metrics"]
+
+         # Process average metrics
+         if "avg_metrics" in item:
+             entry["avg_metrics"] = item["avg_metrics"]
+
+         entries.append(entry)
+
+     return entries
+
+
+ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
+     """
+     Convert leaderboard data to a pandas DataFrame for display.
+     """
+     rows = []
+
+     for entry in leaderboard_data.get("entries", []):
+         model_name = entry.get("model_name", "Unknown Model")
+
+         # Extract average metrics for main display
+         row = {
+             "model_name": model_name,
+             "model_type": entry.get("model_type", "Unknown"),
+             "submission_date": entry.get("submission_date", "")
+         }
+
+         # Add average metrics
+         avg_metrics = entry.get("avg_metrics", {})
+         for test_type in TEST_TYPES:
+             if test_type in avg_metrics:
+                 for metric in METRICS:
+                     if metric in avg_metrics[test_type]:
+                         col_name = f"{test_type}_{metric}"
+                         row[col_name] = avg_metrics[test_type][metric]
+
+         # Calculate overall averages for key metrics
+         f1_values = []
+         recall_values = []
+         precision_values = []
+
+         for test_type in TEST_TYPES:
+             if test_type in avg_metrics and "f1_binary" in avg_metrics[test_type]:
+                 f1_values.append(avg_metrics[test_type]["f1_binary"])
+             if test_type in avg_metrics and "recall_binary" in avg_metrics[test_type]:
+                 recall_values.append(avg_metrics[test_type]["recall_binary"])
+             if test_type in avg_metrics and "precision_binary" in avg_metrics[test_type]:
+                 precision_values.append(avg_metrics[test_type]["precision_binary"])
+
+         # Add overall averages
+         if f1_values:
+             row["average_f1"] = sum(f1_values) / len(f1_values)
+         if recall_values:
+             row["average_recall"] = sum(recall_values) / len(recall_values)
+         if precision_values:
+             row["average_precision"] = sum(precision_values) / len(precision_values)
+
+         # Add specific test type F1 scores for display
+         if "default_prompts" in avg_metrics and "f1_binary" in avg_metrics["default_prompts"]:
+             row["default_prompts_f1"] = avg_metrics["default_prompts"]["f1_binary"]
+         if "jailbreaked_prompts" in avg_metrics and "f1_binary" in avg_metrics["jailbreaked_prompts"]:
+             row["jailbreaked_prompts_f1"] = avg_metrics["jailbreaked_prompts"]["f1_binary"]
+         if "default_answers" in avg_metrics and "f1_binary" in avg_metrics["default_answers"]:
+             row["default_answers_f1"] = avg_metrics["default_answers"]["f1_binary"]
+         if "jailbreaked_answers" in avg_metrics and "f1_binary" in avg_metrics["jailbreaked_answers"]:
+             row["jailbreaked_answers_f1"] = avg_metrics["jailbreaked_answers"]["f1_binary"]
+
+         rows.append(row)
+
+     # Create DataFrame and sort by average F1 score
+     df = pd.DataFrame(rows)
+     if not df.empty and "average_f1" in df.columns:
+         df = df.sort_values(by="average_f1", ascending=False)
+
+     return df
+
+
+ def add_entries_to_leaderboard(leaderboard_data: Dict, new_entries: List[Dict]) -> Dict:
+     """
+     Add new entries to the leaderboard, replacing any with the same model name.
+     """
+     # Create a mapping of existing entries by model name
+     existing_entries = {entry["model_name"]: i for i, entry in enumerate(leaderboard_data.get("entries", []))}
+
+     # Process each new entry
+     for new_entry in new_entries:
+         model_name = new_entry.get("model_name")
+
+         if model_name in existing_entries:
+             # Replace existing entry
+             leaderboard_data["entries"][existing_entries[model_name]] = new_entry
+         else:
+             # Add new entry
+             if "entries" not in leaderboard_data:
+                 leaderboard_data["entries"] = []
+             leaderboard_data["entries"].append(new_entry)
+
+     # Update the last_updated timestamp
+     leaderboard_data["last_updated"] = datetime.now().isoformat()
+
+     return leaderboard_data
+
+
+ def process_jsonl_submission(file_path: str) -> Tuple[List[Dict], str]:
+     """
+     Process a JSONL submission file and extract entries.
+     """
+     entries = []
+     try:
+         with open(file_path, 'r') as f:
+             for line in f:
+                 try:
+                     entry = json.loads(line)
+                     entries.append(entry)
+                 except json.JSONDecodeError as e:
+                     return [], f"Invalid JSON in submission file: {e}"
+
+         if not entries:
+             return [], "Submission file is empty"
+
+         return entries, "Successfully processed submission"
+     except Exception as e:
+         return [], f"Error processing submission file: {e}"
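
To make the data flow in src/leaderboard/processor.py concrete, here is a short usage sketch that round-trips a submission file into the display DataFrame using the module's own functions; results.jsonl is a hypothetical file in the README's submission format:

```python
# Sketch: parse a submission, merge it into an (empty) leaderboard, and build the display table.
from src.leaderboard.processor import (
    process_jsonl_submission,
    add_entries_to_leaderboard,
    leaderboard_to_dataframe,
)

entries, message = process_jsonl_submission("results.jsonl")
print(message)

leaderboard = {"entries": []}                         # start from an empty board
leaderboard = add_entries_to_leaderboard(leaderboard, entries)
df = leaderboard_to_dataframe(leaderboard)            # one row per model, sorted by average_f1
print(df[["model_name", "average_f1"]].head())        # average_f1 exists when f1_binary values are present
```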
src/populate.py ADDED
@@ -0,0 +1,211 @@
+ """
+ Populate the GuardBench leaderboard from HuggingFace datasets.
+ """
+
+ import json
+ import os
+ import pandas as pd
+ import tempfile
+ from typing import Dict, Tuple, List
+ from glob import glob
+
+ from huggingface_hub import snapshot_download, hf_hub_download, HfApi
+ from datasets import load_dataset
+
+ from src.display.utils import GUARDBENCH_COLUMN, DISPLAY_COLS, CATEGORIES
+ from src.envs import RESULTS_DATASET_ID, TOKEN, LEADERBOARD_FILE, CACHE_PATH
+ from src.leaderboard.processor import leaderboard_to_dataframe, load_leaderboard_data, save_leaderboard_data, process_jsonl_submission, add_entries_to_leaderboard
+
+
+ def download_leaderboard_data() -> bool:
+     """
+     Download the latest leaderboard data from HuggingFace.
+     """
+     try:
+         # Create a temporary directory to download the submissions
+         temp_dir = os.path.join(CACHE_PATH, "temp_submissions")
+         os.makedirs(temp_dir, exist_ok=True)
+
+         # Download the entire repository
+         try:
+             snapshot_path = snapshot_download(
+                 repo_id=RESULTS_DATASET_ID,
+                 repo_type="dataset",
+                 local_dir=temp_dir,
+                 token=TOKEN,
+                 ignore_patterns=["*.md", ".*"],
+                 etag_timeout=30
+             )
+
+             # Process all submission files
+             all_entries = []
+             submission_files = []
+
+             # Look for submission files in the submissions directory
+             submissions_dir = os.path.join(snapshot_path, "submissions")
+             if os.path.exists(submissions_dir):
+                 submission_files.extend(glob(os.path.join(submissions_dir, "*.jsonl")))
+
+             # Also look for any JSONL files in the root
+             submission_files.extend(glob(os.path.join(snapshot_path, "*.jsonl")))
+
+             # Process each submission file
+             for file_path in submission_files:
+                 entries, _ = process_jsonl_submission(file_path)
+                 all_entries.extend(entries)
+
+             # Create leaderboard data structure
+             leaderboard_data = {
+                 "entries": all_entries,
+                 "last_updated": pd.Timestamp.now().isoformat()
+             }
+
+             # Save to local file
+             save_leaderboard_data(leaderboard_data, LEADERBOARD_FILE)
+
+             return True
+         except Exception as e:
+             print(f"Error downloading repository: {e}")
+
+             # If we can't download the repository, try to download individual files
+             try:
+                 api = HfApi(token=TOKEN)
+                 files = api.list_repo_files(repo_id=RESULTS_DATASET_ID, repo_type="dataset")
+
+                 submission_files = [f for f in files if f.endswith('.jsonl')]
+                 all_entries = []
+
+                 for file_path in submission_files:
+                     try:
+                         local_path = hf_hub_download(
+                             repo_id=RESULTS_DATASET_ID,
+                             filename=file_path,
+                             repo_type="dataset",
+                             token=TOKEN
+                         )
+                         entries, _ = process_jsonl_submission(local_path)
+                         all_entries.extend(entries)
+                     except Exception as file_error:
+                         print(f"Error downloading file {file_path}: {file_error}")
+
+                 # Create leaderboard data structure
+                 leaderboard_data = {
+                     "entries": all_entries,
+                     "last_updated": pd.Timestamp.now().isoformat()
+                 }
+
+                 # Save to local file
+                 save_leaderboard_data(leaderboard_data, LEADERBOARD_FILE)
+
+                 return True
+             except Exception as list_error:
+                 print(f"Error listing repository files: {list_error}")
+
+                 # If we can't download anything, create an empty leaderboard
+                 if not os.path.exists(LEADERBOARD_FILE):
+                     empty_data = {"entries": [], "last_updated": pd.Timestamp.now().isoformat()}
+                     save_leaderboard_data(empty_data, LEADERBOARD_FILE)
+
+                 return False
+     except Exception as e:
+         print(f"Error downloading leaderboard data: {e}")
+
+         # Ensure we have at least an empty leaderboard file
+         if not os.path.exists(LEADERBOARD_FILE):
+             empty_data = {"entries": [], "last_updated": pd.Timestamp.now().isoformat()}
+             save_leaderboard_data(empty_data, LEADERBOARD_FILE)
+
+         return False
+
+
+ def get_leaderboard_df() -> pd.DataFrame:
+     """
+     Get the leaderboard data as a DataFrame.
+     """
+     # Try to download the latest data
+     download_leaderboard_data()
+
+     # Load from local file
+     leaderboard_data = load_leaderboard_data(LEADERBOARD_FILE)
+
+     # Convert to DataFrame
+     df = leaderboard_to_dataframe(leaderboard_data)
+
+     return df
+
+
+ def get_category_leaderboard_df(category: str) -> pd.DataFrame:
+     """
+     Get the leaderboard data filtered by a specific category.
+
+     Args:
+         category: The category to filter by (e.g., "Criminal, Violent, and Terrorist Activity")
+
+     Returns:
+         DataFrame with metrics for the specified category
+     """
+     # Load the leaderboard data
+     leaderboard_data = load_leaderboard_data(LEADERBOARD_FILE)
+
+     # Filter entries to only include those with data for the specified category
+     filtered_entries = []
+
+     for entry in leaderboard_data.get("entries", []):
+         # Check if the entry has data for this category
+         if "per_category_metrics" in entry and category in entry["per_category_metrics"]:
+             # Create a new entry with just the overall info and this category's metrics
+             filtered_entry = {
+                 "model_name": entry.get("model_name", "Unknown Model"),
+                 "model_type": entry.get("model_type", "Unknown"),
+                 "submission_date": entry.get("submission_date", ""),
+             }
+
+             # Extract metrics for this category
+             category_metrics = entry["per_category_metrics"][category]
+
+             # Add metrics for each test type
+             for test_type in category_metrics:
+                 if test_type and isinstance(category_metrics[test_type], dict):
+                     for metric, value in category_metrics[test_type].items():
+                         col_name = f"{test_type}_{metric}"
+                         filtered_entry[col_name] = value
+
+             # Calculate average F1 for this category
+             f1_values = []
+             for test_type in category_metrics:
+                 if test_type and isinstance(category_metrics[test_type], dict) and "f1_binary" in category_metrics[test_type]:
+                     f1_values.append(category_metrics[test_type]["f1_binary"])
+
+             if f1_values:
+                 filtered_entry["average_f1"] = sum(f1_values) / len(f1_values)
+
+             # Add specific test type F1 scores for display
+             for test_type in ["default_prompts", "jailbreaked_prompts", "default_answers", "jailbreaked_answers"]:
+                 if test_type in category_metrics and "f1_binary" in category_metrics[test_type]:
+                     filtered_entry[f"{test_type}_f1"] = category_metrics[test_type]["f1_binary"]
+
+             filtered_entries.append(filtered_entry)
+
+     # Create a new leaderboard data structure with the filtered entries
+     filtered_leaderboard = {
+         "entries": filtered_entries,
+         "last_updated": leaderboard_data.get("last_updated", pd.Timestamp.now().isoformat())
+     }
+
+     # Convert to DataFrame
+     df = leaderboard_to_dataframe(filtered_leaderboard)
+
+     return df
+
+
+ def get_detailed_model_data(model_name: str) -> Dict:
+     """
+     Get detailed data for a specific model.
+     """
+     leaderboard_data = load_leaderboard_data(LEADERBOARD_FILE)
+
+     for entry in leaderboard_data.get("entries", []):
+         if entry.get("model_name") == model_name:
+             return entry
+
+     return {}
src/submission/submit.py ADDED
@@ -0,0 +1,105 @@
+ """
+ Handle submissions to the GuardBench leaderboard.
+ """
+
+ import json
+ import os
+ import tempfile
+ import uuid
+ from datetime import datetime
+ from typing import Dict, List, Tuple
+
+ from huggingface_hub import HfApi
+ from datasets import load_dataset, Dataset
+
+ from src.display.formatting import styled_error, styled_message, styled_warning
+ from src.envs import API, RESULTS_DATASET_ID, TOKEN
+ from src.leaderboard.processor import process_jsonl_submission, add_entries_to_leaderboard, load_leaderboard_data
+
+
+ def validate_submission(file_path: str) -> Tuple[bool, str]:
+     """
+     Validate a submission file.
+     """
+     try:
+         entries, message = process_jsonl_submission(file_path)
+         if not entries:
+             return False, message
+
+         # Additional validation could be added here
+
+         return True, "Submission is valid"
+     except Exception as e:
+         return False, f"Error validating submission: {e}"
+
+
+ def submit_to_hub(file_path: str, metadata: Dict, dataset_id: str, token: str) -> Tuple[bool, str]:
+     """
+     Submit results to a HuggingFace dataset repository as individual files.
+     """
+     try:
+         # Process the submission file to validate
+         entries, message = process_jsonl_submission(file_path)
+         if not entries:
+             return False, message
+
+         # Generate a unique submission ID
+         model_name = metadata.get("model_name", "unknown")
+         model_name_safe = model_name.replace("/", "_").replace(" ", "_")
+         timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+         submission_id = f"{model_name_safe}_{timestamp}"
+
+         # Create an API instance
+         api = HfApi(token=token)
+
+         # Create a temporary file with metadata added
+         with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False) as temp_file:
+             # Add metadata to each entry
+             for entry in entries:
+                 # If the entry already has a model_name, don't override it
+                 if "model_name" not in entry:
+                     entry["model_name"] = metadata.get("model_name")
+
+                 # Add other metadata if not present
+                 for key, value in metadata.items():
+                     if key != "model_name" and key not in entry:
+                         entry[key] = value
+
+                 # Write to temp file
+                 temp_file.write(json.dumps(entry) + "\n")
+
+             temp_path = temp_file.name
+
+         # Upload the file directly to the repository
+         submission_path = f"submissions/{submission_id}.jsonl"
+         api.upload_file(
+             path_or_fileobj=temp_path,
+             path_in_repo=submission_path,
+             repo_id=dataset_id,
+             repo_type="dataset",
+             commit_message=f"Add submission for {model_name}"
+         )
+
+         # Clean up the temporary file
+         os.unlink(temp_path)
+
+         return True, f"Successfully uploaded submission for {model_name} to {dataset_id}"
+     except Exception as e:
+         return False, f"Error submitting to dataset: {e}"
+
+
+ def process_submission(file_path: str, metadata: Dict) -> str:
+     """
+     Process a submission to the GuardBench leaderboard.
+     """
+     # Validate submission file
+     is_valid, validation_message = validate_submission(file_path)
+     if not is_valid:
+         return styled_error(validation_message)
+
+     # Submit to HuggingFace dataset repository
+     success, message = submit_to_hub(file_path, metadata, RESULTS_DATASET_ID, TOKEN)
+     if not success:
+         return styled_error(message)
+
+     return styled_message(f"Submission successful! {message}")
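
Finally, the whole submission pipeline can be exercised programmatically. A hedged sketch, assuming a configured .env (so src/envs.py can build its HfApi client) and a results.jsonl in the documented format; the metadata values are placeholders:

```python
# Sketch: validate and upload a local results file without going through the Gradio form.
# Run from the repo root; this writes to the dataset configured in RESULTS_DATASET_ID.
from src.submission.submit import process_submission

metadata = {
    "model_name": "org/my-guard-model",  # hypothetical model id
    "base_model": "",
    "revision": "main",
    "precision": "float16",
    "weight_type": "Original",
    "model_type": "Open Source",
}
print(process_submission("results.jsonl", metadata))  # returns a styled HTML status message
```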