apsys committed · Commit d4d998a · 0 Parent(s)
.env.template ADDED
@@ -0,0 +1,6 @@
+ HF_TOKEN="your_huggingface_write_token"
+ OWNER="your_huggingface_username_or_org"
+ RESULTS_DATASET_ID="your_username/guardbench-results"
+ SUBMITTER_TOKEN="your_secret_submission_token"
+ ADMIN_USERNAME="admin"
+ ADMIN_PASSWORD="password" # Change this!
.gitignore ADDED
@@ -0,0 +1,45 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ env/
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ .venv/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Environment variables
+ .env
+
+ # Virtual Environment
+ venv/
+ ENV/
+
+ # IDE
+ .idea/
+ .vscode/
+ *.swp
+ *.swo
+
+ # OS
+ .DS_Store
+ Thumbs.db
+
+ # Hugging Face cache
+ eval-queue/
+ eval-results/
+ eval-queue-bk/
+ eval-results-bk/
README.md ADDED
@@ -0,0 +1,82 @@
+ # GuardBench Leaderboard
+
+ A HuggingFace leaderboard for the GuardBench project that allows users to submit evaluation results and view the performance of different models on safety guardrails.
+
+ ## Features
+
+ - Display model performance across multiple safety categories
+ - Accept JSONL submissions with evaluation results
+ - Store submissions in a HuggingFace dataset
+ - Secure submission process with token authentication
+ - Automatic data refresh from HuggingFace
+
+ ## Setup
+
+ 1. Clone this repository
+ 2. Install dependencies:
+ ```
+ pip install -r requirements.txt
+ ```
+ 3. Create a `.env` file based on the `.env.template`:
+ ```
+ cp .env.template .env
+ ```
+ 4. Edit the `.env` file with your HuggingFace credentials and settings
+ 5. Run the application:
+ ```
+ python app.py
+ ```
+
+ ## Submission Format
+
+ Submissions should be in JSONL format, with each line containing a JSON object with the following structure:
+
+ ```json
+ {
+   "model_name": "model-name",
+   "per_category_metrics": {
+     "Category Name": {
+       "default_prompts": {
+         "f1_binary": 0.95,
+         "recall_binary": 0.93,
+         "precision_binary": 1.0,
+         "error_ratio": 0.0,
+         "avg_runtime_ms": 3000
+       },
+       "jailbreaked_prompts": { ... },
+       "default_answers": { ... },
+       "jailbreaked_answers": { ... }
+     },
+     ...
+   },
+   "avg_metrics": {
+     "default_prompts": {
+       "f1_binary": 0.97,
+       "recall_binary": 0.95,
+       "precision_binary": 1.0,
+       "error_ratio": 0.0,
+       "avg_runtime_ms": 3000
+     },
+     "jailbreaked_prompts": { ... },
+     "default_answers": { ... },
+     "jailbreaked_answers": { ... }
+   }
+ }
+ ```
+
+ ## Environment Variables
+
+ - `HF_TOKEN`: Your HuggingFace write token
+ - `OWNER`: Your HuggingFace username or organization
+ - `RESULTS_DATASET_ID`: The ID of the dataset to store results (e.g., "username/guardbench-results")
+ - `SUBMITTER_TOKEN`: A secret token required for submissions
+ - `ADMIN_USERNAME`: Username for admin access to the leaderboard
+ - `ADMIN_PASSWORD`: Password for admin access to the leaderboard
+
+ ## Deployment
+
+ This application can be deployed as a HuggingFace Space for public access. Follow the HuggingFace Spaces documentation for deployment instructions.
+
+ ## License
+
+ MIT
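
For reference, a minimal Python sketch that emits one line in the submission format documented in README.md above. The model id, category, and metric values are placeholders, not real results:

```python
# Sketch: write one leaderboard submission entry to results.jsonl.
# Field names follow the schema documented in README.md; numbers are placeholders.
import json

block = {
    "f1_binary": 0.95,
    "recall_binary": 0.93,
    "precision_binary": 1.0,
    "error_ratio": 0.0,
    "avg_runtime_ms": 3000,
}

entry = {
    "model_name": "org/my-guard-model",  # hypothetical model id
    "per_category_metrics": {
        "Safe Prompts": {
            "default_prompts": dict(block),
            "jailbreaked_prompts": dict(block),
            "default_answers": dict(block),
            "jailbreaked_answers": dict(block),
        },
    },
    "avg_metrics": {
        "default_prompts": dict(block),
        "jailbreaked_prompts": dict(block),
        "default_answers": dict(block),
        "jailbreaked_answers": dict(block),
    },
}

with open("results.jsonl", "w") as f:
    f.write(json.dumps(entry) + "\n")  # one JSON object per line
```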
app.py ADDED
@@ -0,0 +1,281 @@
+ """
+ GuardBench Leaderboard Application
+ """
+
+ import os
+ import json
+ import tempfile
+ import logging
+ import gradio as gr
+ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
+ import pandas as pd
+ from apscheduler.schedulers.background import BackgroundScheduler
+
+ from src.about import (
+     CITATION_BUTTON_LABEL,
+     CITATION_BUTTON_TEXT,
+     EVALUATION_QUEUE_TEXT,
+     INTRODUCTION_TEXT,
+     LLM_BENCHMARKS_TEXT,
+     TITLE,
+ )
+ from src.display.css_html_js import custom_css
+ from src.display.utils import (
+     GUARDBENCH_COLUMN,
+     DISPLAY_COLS,
+     METRIC_COLS,
+     HIDDEN_COLS,
+     NEVER_HIDDEN_COLS,
+     CATEGORIES,
+     TEST_TYPES,
+     ModelType,
+     Precision,
+     WeightType
+ )
+ from src.display.formatting import styled_message, styled_error, styled_warning
+ from src.envs import (
+     ADMIN_USERNAME,
+     ADMIN_PASSWORD,
+     RESULTS_DATASET_ID,
+     SUBMITTER_TOKEN,
+     TOKEN,
+     DATA_PATH
+ )
+ from src.populate import get_leaderboard_df, download_leaderboard_data, get_category_leaderboard_df
+ from src.submission.submit import process_submission
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger(__name__)
+
+ # Ensure data directory exists
+ os.makedirs(DATA_PATH, exist_ok=True)
+
+ # Initialize leaderboard data
+ try:
+     logger.info("Initializing leaderboard data...")
+     LEADERBOARD_DF = get_leaderboard_df()
+     logger.info(f"Loaded leaderboard with {len(LEADERBOARD_DF)} entries")
+ except Exception as e:
+     logger.error(f"Error loading leaderboard data: {e}")
+     LEADERBOARD_DF = pd.DataFrame()
+
+
+ def init_leaderboard(dataframe):
+     """
+     Initialize the leaderboard component.
+     """
+     if dataframe is None or dataframe.empty:
+         # Create an empty dataframe with the right columns
+         columns = [getattr(GUARDBENCH_COLUMN, col).name for col in DISPLAY_COLS]
+         dataframe = pd.DataFrame(columns=columns)
+         logger.warning("Initializing empty leaderboard")
+
+     return Leaderboard(
+         value=dataframe,
+         datatype=[getattr(GUARDBENCH_COLUMN, col).type for col in DISPLAY_COLS],
+         select_columns=SelectColumns(
+             default_selection=[getattr(GUARDBENCH_COLUMN, col).name for col in DISPLAY_COLS],
+             cant_deselect=[getattr(GUARDBENCH_COLUMN, col).name for col in NEVER_HIDDEN_COLS],
+             label="Select Columns to Display:",
+         ),
+         search_columns=[GUARDBENCH_COLUMN.model.name],
+         hide_columns=[getattr(GUARDBENCH_COLUMN, col).name for col in HIDDEN_COLS],
+         filter_columns=[
+             ColumnFilter(GUARDBENCH_COLUMN.model_type.name, type="checkboxgroup", label="Model types"),
+         ],
+         interactive=False,
+     )
+
+
+ def submit_results(
+     model_name: str,
+     base_model: str,
+     revision: str,
+     precision: str,
+     weight_type: str,
+     model_type: str,
+     submission_file: tempfile._TemporaryFileWrapper
+ ):
+     """
+     Handle submission of results with model metadata.
+     """
+     if submission_file is None:
+         return styled_error("No submission file provided")
+
+     if not model_name:
+         return styled_error("Model name is required")
+
+     if not model_type:
+         return styled_error("Please select a model type")
+
+     file_path = submission_file.name
+     logger.info(f"Received submission for model {model_name}: {file_path}")
+
+     # Add metadata to the submission
+     metadata = {
+         "model_name": model_name,
+         "base_model": base_model,
+         "revision": revision if revision else "main",
+         "precision": precision,
+         "weight_type": weight_type,
+         "model_type": model_type
+     }
+
+     # Process the submission
+     result = process_submission(file_path, metadata)
+
+     # Refresh the leaderboard data
+     global LEADERBOARD_DF
+     try:
+         logger.info("Refreshing leaderboard data after submission...")
+         LEADERBOARD_DF = get_leaderboard_df()
+         logger.info("Refreshed leaderboard data after submission")
+     except Exception as e:
+         logger.error(f"Error refreshing leaderboard data: {e}")
+
+     return result
+
+
+ def refresh_data():
+     """
+     Refresh the leaderboard data from HuggingFace.
+     """
+     global LEADERBOARD_DF
+     try:
+         logger.info("Performing scheduled refresh of leaderboard data...")
+         LEADERBOARD_DF = get_leaderboard_df()
+         logger.info("Scheduled refresh of leaderboard data completed")
+     except Exception as e:
+         logger.error(f"Error in scheduled refresh: {e}")
+
+
+ # Create Gradio app
+ demo = gr.Blocks(css=custom_css)
+
+ with demo:
+     gr.HTML(TITLE)
+     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+     with gr.Tabs(elem_classes="tab-buttons") as tabs:
+         with gr.TabItem("🏅 Leaderboard", elem_id="guardbench-leaderboard-tab", id=0):
+             refresh_button = gr.Button("Refresh Leaderboard")
+
+             # Create tabs for each category
+             with gr.Tabs(elem_classes="category-tabs") as category_tabs:
+                 # First tab for average metrics across all categories
+                 with gr.TabItem("📊 Overall Performance", elem_id="overall-tab"):
+                     leaderboard = init_leaderboard(LEADERBOARD_DF)
+
+                 # Create a tab for each category
+                 for category in CATEGORIES:
+                     with gr.TabItem(f"{category}", elem_id=f"category-{category.lower().replace(' ', '-')}-tab"):
+                         category_df = get_category_leaderboard_df(category)
+                         category_leaderboard = init_leaderboard(category_df)
+
+             # Refresh button functionality
+             refresh_button.click(
+                 fn=lambda: [
+                     init_leaderboard(get_leaderboard_df()),
+                     *[init_leaderboard(get_category_leaderboard_df(category)) for category in CATEGORIES]
+                 ],
+                 inputs=[],
+                 outputs=[leaderboard] + [category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)]
+             )
+
+         with gr.TabItem("📝 About", elem_id="guardbench-about-tab", id=1):
+             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+         with gr.TabItem("🚀 Submit", elem_id="guardbench-submit-tab", id=2):
+             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+             with gr.Row():
+                 gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")
+
+             with gr.Row():
+                 with gr.Column():
+                     model_name_textbox = gr.Textbox(label="Model name")
+                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+                     model_type = gr.Dropdown(
+                         choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+                         label="Model type",
+                         multiselect=False,
+                         value=None,
+                         interactive=True,
+                     )
+
+                 with gr.Column():
+                     precision = gr.Dropdown(
+                         choices=[i.name for i in Precision if i != Precision.Unknown],
+                         label="Precision",
+                         multiselect=False,
+                         value="float16",
+                         interactive=True,
+                     )
+                     weight_type = gr.Dropdown(
+                         choices=[i.name for i in WeightType],
+                         label="Weights type",
+                         multiselect=False,
+                         value="Original",
+                         interactive=True,
+                     )
+                     base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+
+             with gr.Row():
+                 file_input = gr.File(
+                     label="Upload JSONL Results File",
+                     file_types=[".jsonl"]
+                 )
+
+             submit_button = gr.Button("Submit Results")
+             result_output = gr.Markdown()
+
+             submit_button.click(
+                 fn=submit_results,
+                 inputs=[
+                     model_name_textbox,
+                     base_model_name_textbox,
+                     revision_name_textbox,
+                     precision,
+                     weight_type,
+                     model_type,
+                     file_input
+                 ],
+                 outputs=result_output
+             )
+
+     with gr.Row():
+         with gr.Accordion("📙 Citation", open=False):
+             citation_button = gr.Textbox(
+                 value=CITATION_BUTTON_TEXT,
+                 label=CITATION_BUTTON_LABEL,
+                 lines=10,
+                 elem_id="citation-button",
+                 show_copy_button=True,
+             )
+
+         with gr.Accordion("ℹ️ Dataset Information", open=False):
+             dataset_info = gr.Markdown(f"""
+ ## Dataset Information
+
+ Results are stored in the HuggingFace dataset: [{RESULTS_DATASET_ID}](https://huggingface.co/datasets/{RESULTS_DATASET_ID})
+
+ Last updated: {pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S UTC")}
+ """)
+
+ # Set up scheduler to refresh data periodically
+ scheduler = BackgroundScheduler()
+ scheduler.add_job(refresh_data, 'interval', minutes=30)
+ scheduler.start()
+
+ # Launch the app
+ if __name__ == "__main__":
+     # Set up authentication if credentials are provided
+     if not ADMIN_USERNAME or not ADMIN_PASSWORD:
+         logger.warning("Admin username or password not set. Running without authentication.")
+         auth = None
+     else:
+         auth = (ADMIN_USERNAME, ADMIN_PASSWORD)
+
+     # Launch the app
+     demo.launch(server_name="0.0.0.0", server_port=7860, auth=auth)
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ gradio>=3.50.2
+ huggingface_hub>=0.19.0
+ datasets>=2.16.0
+ pandas>=2.0.0
+ python-dotenv>=1.0.0
+ apscheduler>=3.10.1
+ gradio-leaderboard
src/about.py ADDED
@@ -0,0 +1,75 @@
+ """
+ Text content for the GuardBench Leaderboard.
+ """
+
+ TITLE = """
+ <div style="text-align: center; margin-bottom: 1rem">
+ <h1>GuardBench Leaderboard</h1>
+ </div>
+ """
+
+ INTRODUCTION_TEXT = """
+ ## Introduction
+
+ GuardBench is a comprehensive benchmark for evaluating the safety guardrails of large language models (LLMs).
+ This leaderboard tracks model performance across various safety categories, including harmful content detection,
+ jailbreak resistance, and more.
+
+ Models are evaluated on their ability to properly refuse harmful requests and detect problematic content
+ across multiple categories and test scenarios.
+ """
+
+ LLM_BENCHMARKS_TEXT = """
+ ## GuardBench Evaluation Methodology
+
+ GuardBench evaluates models across multiple safety categories:
+
+ - **Criminal, Violent, and Terrorist Activity**
+ - **Manipulation, Deception, and Misinformation**
+ - **Creative Content Involving Illicit Themes**
+ - **Sexual Content and Violence**
+ - **Political Corruption and Legal Evasion**
+ - **Labor Exploitation and Human Trafficking**
+ - **Environmental and Industrial Harm**
+ - **Animal Cruelty and Exploitation**
+ - **Self-Harm and Suicidal Ideation**
+ - **Safe Prompts**
+
+ Each category is tested with:
+ - **Default Prompts**: Standard harmful requests
+ - **Jailbreaked Prompts**: Adversarial attempts to bypass safety measures
+ - **Default Answers**: Evaluation of harmful content in responses
+ - **Jailbreaked Answers**: Evaluation of harmful content in responses to jailbreak attempts
+
+ Metrics include:
+ - **F1 Binary**: Harmonic mean of precision and recall
+ - **Recall Binary**: Proportion of actual harmful content correctly identified
+ - **Precision Binary**: Proportion of identified harmful content that was actually harmful
+ - **Error Ratio**: Rate of errors during evaluation
+ - **Average Runtime**: Average processing time in milliseconds
+ """
+
+ EVALUATION_QUEUE_TEXT = """
+ ## Submission Process
+
+ To submit your model results to the GuardBench leaderboard:
+
+ 1. Evaluate your model using the [GuardBench framework](https://github.com/huggingface/guard-bench)
+ 2. Format your results as a JSONL file according to our schema
+ 3. Submit your results using the submission form with your authorized token
+
+ Results will be processed and added to the leaderboard once validated.
+ """
+
+ CITATION_BUTTON_LABEL = "Cite GuardBench"
+
+ CITATION_BUTTON_TEXT = """
+ @misc{guardbench2023,
+     author = {GuardBench Team},
+     title = {GuardBench: Comprehensive Benchmark for LLM Safety Guardrails},
+     year = {2023},
+     publisher = {GitHub},
+     journal = {GitHub repository},
+     howpublished = {\\url{https://github.com/huggingface/guard-bench}}
+ }
+ """
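
The binary metrics described in `LLM_BENCHMARKS_TEXT` above match the standard scikit-learn definitions. A small illustrative sketch follows; scikit-learn is an assumption here (it is not in requirements.txt and is not used by the leaderboard app itself, only by whatever produces the results):

```python
# Sketch of the three classification metrics the leaderboard reports.
# Convention assumed: 1 = harmful/unsafe, 0 = safe; predictions are the guard model's verdicts.
from sklearn.metrics import f1_score, precision_score, recall_score

y_true = [1, 1, 0, 1, 0, 0, 1, 0]   # toy ground-truth labels
y_pred = [1, 0, 0, 1, 0, 1, 1, 0]   # toy model verdicts

precision = precision_score(y_true, y_pred)  # flagged items that were truly harmful
recall = recall_score(y_true, y_pred)        # harmful items that were caught
f1 = f1_score(y_true, y_pred)                # harmonic mean of precision and recall
print(f"precision={precision:.2f} recall={recall:.2f} f1={f1:.2f}")
```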
src/display/css_html_js.py ADDED
@@ -0,0 +1,46 @@
+ """
+ CSS and styling for the GuardBench Leaderboard.
+ """
+
+ custom_css = """
+ .markdown-text {
+ font-size: 16px !important;
+ text-align: justify !important;
+ }
+
+ .tab-buttons button.selected {
+ border-color: #2196F3 !important;
+ background: #E3F2FD !important;
+ color: #2196F3 !important;
+ }
+
+ #citation-button textarea {
+ font-family: monospace !important;
+ }
+
+ .leaderboard-container {
+ margin-top: 20px;
+ }
+
+ .category-header {
+ font-weight: bold;
+ background-color: #f5f5f5;
+ padding: 10px;
+ margin-top: 15px;
+ border-radius: 5px;
+ }
+
+ .metric-name {
+ font-weight: bold;
+ color: #2196F3;
+ }
+
+ .model-name {
+ font-weight: bold;
+ }
+
+ .model-link:hover {
+ text-decoration: underline;
+ color: #1976D2;
+ }
+ """
src/display/formatting.py ADDED
@@ -0,0 +1,71 @@
+ """
+ Formatting utilities for the GuardBench Leaderboard.
+ """
+
+ import pandas as pd
+ import numpy as np
+
+
+ def make_clickable_model(model_name: str) -> str:
+     """
+     Create a clickable link for a model name.
+     """
+     return f'<a href="https://huggingface.co/{model_name}" target="_blank">{model_name}</a>'
+
+
+ def has_no_nan_values(df: pd.DataFrame, columns: list) -> pd.Series:
+     """
+     Check if a row has no NaN values in the specified columns.
+     """
+     return ~df[columns].isna().any(axis=1)
+
+
+ def format_percentage(value: float) -> str:
+     """
+     Format a value as a percentage.
+     """
+     if pd.isna(value):
+         return "N/A"
+     return f"{value * 100:.2f}%"
+
+
+ def format_number(value: float, precision: int = 2) -> str:
+     """
+     Format a number with specified precision.
+     """
+     if pd.isna(value):
+         return "N/A"
+     return f"{value:.{precision}f}"
+
+
+ def styled_message(message: str) -> str:
+     """
+     Format a success message with styling.
+     """
+     return f"""
+ <div style="padding: 10px; border-radius: 5px; background-color: #e6f7e6; color: #2e7d32; border: 1px solid #2e7d32;">
+ ✅ {message}
+ </div>
+ """
+
+
+ def styled_warning(message: str) -> str:
+     """
+     Format a warning message with styling.
+     """
+     return f"""
+ <div style="padding: 10px; border-radius: 5px; background-color: #fff8e1; color: #ff8f00; border: 1px solid #ff8f00;">
+ ⚠️ {message}
+ </div>
+ """
+
+
+ def styled_error(message: str) -> str:
+     """
+     Format an error message with styling.
+     """
+     return f"""
+ <div style="padding: 10px; border-radius: 5px; background-color: #ffebee; color: #c62828; border: 1px solid #c62828;">
+ ❌ {message}
+ </div>
+ """
src/display/utils.py ADDED
@@ -0,0 +1,177 @@
+ """
+ Utility classes and functions for the GuardBench Leaderboard display.
+ """
+
+ from dataclasses import dataclass, field, fields
+ from enum import Enum, auto
+ from typing import List, Optional
+
+
+ class ModelType(Enum):
+     """Model types for the leaderboard."""
+     Unknown = auto()
+     OpenSource = auto()
+     ClosedSource = auto()
+     API = auto()
+
+     def to_str(self, separator: str = " ") -> str:
+         """Convert enum to string with separator."""
+         if self == ModelType.Unknown:
+             return "Unknown"
+         elif self == ModelType.OpenSource:
+             return f"Open{separator}Source"
+         elif self == ModelType.ClosedSource:
+             return f"Closed{separator}Source"
+         elif self == ModelType.API:
+             return "API"
+         return "Unknown"
+
+
+ class Precision(Enum):
+     """Model precision types."""
+     Unknown = auto()
+     float16 = auto()
+     bfloat16 = auto()
+     float32 = auto()
+     int8 = auto()
+     int4 = auto()
+
+
+ class WeightType(Enum):
+     """Model weight types."""
+     Original = auto()
+     Delta = auto()
+     Adapter = auto()
+
+
+ @dataclass
+ class ColumnInfo:
+     """Information about a column in the leaderboard."""
+     name: str
+     display_name: str
+     type: str = "text"
+     hidden: bool = False
+     never_hidden: bool = False
+     displayed_by_default: bool = True
+
+
+ @dataclass
+ class GuardBenchColumn:
+     """Columns for the GuardBench leaderboard."""
+     model: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+         name="model_name",
+         display_name="Model",
+         never_hidden=True,
+         displayed_by_default=True
+     ))
+
+     model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+         name="model_type",
+         display_name="Type",
+         displayed_by_default=True
+     ))
+
+     # Metrics for all categories
+     default_prompts_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+         name="default_prompts_f1",
+         display_name="Default Prompts F1",
+         type="number",
+         displayed_by_default=True
+     ))
+
+     jailbreaked_prompts_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+         name="jailbreaked_prompts_f1",
+         display_name="Jailbreaked Prompts F1",
+         type="number",
+         displayed_by_default=True
+     ))
+
+     default_answers_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+         name="default_answers_f1",
+         display_name="Default Answers F1",
+         type="number",
+         displayed_by_default=True
+     ))
+
+     jailbreaked_answers_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+         name="jailbreaked_answers_f1",
+         display_name="Jailbreaked Answers F1",
+         type="number",
+         displayed_by_default=True
+     ))
+
+     # Average metrics
+     average_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+         name="average_f1",
+         display_name="Average F1",
+         type="number",
+         displayed_by_default=True,
+         never_hidden=True
+     ))
+
+     average_recall: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+         name="average_recall",
+         display_name="Average Recall",
+         type="number",
+         displayed_by_default=False
+     ))
+
+     average_precision: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+         name="average_precision",
+         display_name="Average Precision",
+         type="number",
+         displayed_by_default=False
+     ))
+
+     # Additional metadata
+     submission_date: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+         name="submission_date",
+         display_name="Submission Date",
+         displayed_by_default=False
+     ))
+
+
+ # Create instances for easy access
+ GUARDBENCH_COLUMN = GuardBenchColumn()
+
+ # Extract column lists for different views. These hold dataclass field names;
+ # app.py resolves them to display names/types via getattr on GUARDBENCH_COLUMN.
+ COLS = [f.name for f in fields(GUARDBENCH_COLUMN)]
+ DISPLAY_COLS = [f.name for f in fields(GUARDBENCH_COLUMN)
+                 if getattr(GUARDBENCH_COLUMN, f.name).displayed_by_default]
+ METRIC_COLS = [f.name for f in fields(GUARDBENCH_COLUMN)
+                if getattr(GUARDBENCH_COLUMN, f.name).type == "number"]
+ HIDDEN_COLS = [f.name for f in fields(GUARDBENCH_COLUMN)
+                if getattr(GUARDBENCH_COLUMN, f.name).hidden]
+ NEVER_HIDDEN_COLS = [f.name for f in fields(GUARDBENCH_COLUMN)
+                      if getattr(GUARDBENCH_COLUMN, f.name).never_hidden]
+
+ # Categories in GuardBench
+ CATEGORIES = [
+     "Criminal, Violent, and Terrorist Activity",
+     "Manipulation, Deception, and Misinformation",
+     "Creative Content Involving Illicit Themes",
+     "Sexual Content and Violence",
+     "Political Corruption and Legal Evasion",
+     "Labor Exploitation and Human Trafficking",
+     "Environmental and Industrial Harm",
+     "Animal Cruelty and Exploitation",
+     "Self-Harm and Suicidal Ideation",
+     "Safe Prompts"
+ ]
+
+ # Test types in GuardBench
+ TEST_TYPES = [
+     "default_prompts",
+     "jailbreaked_prompts",
+     "default_answers",
+     "jailbreaked_answers"
+ ]
+
+ # Metrics in GuardBench
+ METRICS = [
+     "f1_binary",
+     "recall_binary",
+     "precision_binary",
+     "error_ratio",
+     "avg_runtime_ms"
+ ]
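
As a quick sanity check of the column machinery above, the field-name lists resolve to the DataFrame column names via getattr, which is how app.py consumes them. A small sketch, assuming it is run from the repository root:

```python
# Sketch: resolve the field-name lists to the display column names app.py builds.
from src.display.utils import GUARDBENCH_COLUMN, DISPLAY_COLS, NEVER_HIDDEN_COLS

print([getattr(GUARDBENCH_COLUMN, col).name for col in DISPLAY_COLS])
# expected: ['model_name', 'model_type', 'default_prompts_f1', 'jailbreaked_prompts_f1',
#            'default_answers_f1', 'jailbreaked_answers_f1', 'average_f1']
print([getattr(GUARDBENCH_COLUMN, col).name for col in NEVER_HIDDEN_COLS])
# expected: ['model_name', 'average_f1']
```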
src/envs.py ADDED
@@ -0,0 +1,27 @@
+ import os
+ from huggingface_hub import HfApi
+ from dotenv import load_dotenv
+
+ # Load environment variables
+ load_dotenv()
+
+ # Hugging Face configuration
+ TOKEN = os.environ.get("HF_TOKEN")  # A read/write token for your org
+ OWNER = os.environ.get("OWNER", "guard-bench")  # Change to your org
+ SUBMITTER_TOKEN = os.environ.get("SUBMITTER_TOKEN")
+ ADMIN_USERNAME = os.environ.get("ADMIN_USERNAME")
+ ADMIN_PASSWORD = os.environ.get("ADMIN_PASSWORD")
+
+ # Repository IDs
+ REPO_ID = f"{OWNER}/leaderboard"
+ RESULTS_DATASET_ID = os.environ.get("RESULTS_DATASET_ID", f"{OWNER}/guardbench-results")
+
+ # Cache paths
+ CACHE_PATH = os.getenv("HF_HOME", ".")
+ DATA_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data")
+
+ # Local data paths
+ LEADERBOARD_FILE = os.path.join(DATA_PATH, "leaderboard.json")
+
+ # HF API instance
+ API = HfApi(token=TOKEN)
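
src/envs.py reads all of its configuration from environment variables, which on a HuggingFace Space are normally provided as Space secrets (see the README's Deployment section). One possible way to script that with huggingface_hub is sketched below; the Space id is a placeholder, and the calls used (create_repo with space_sdk, upload_folder, add_space_secret) should be checked against the installed huggingface_hub version:

```python
# Sketch: create a Gradio Space, upload this repo, and set the secrets envs.py expects.
# SPACE_ID is a placeholder; run from the repo root with a write token in HF_TOKEN.
import os
from huggingface_hub import HfApi

api = HfApi(token=os.environ["HF_TOKEN"])
SPACE_ID = "your-org/guardbench-leaderboard"  # placeholder

api.create_repo(repo_id=SPACE_ID, repo_type="space", space_sdk="gradio", exist_ok=True)
api.upload_folder(folder_path=".", repo_id=SPACE_ID, repo_type="space",
                  ignore_patterns=[".env", "data/*", "venv/*"])

for key in ["HF_TOKEN", "OWNER", "RESULTS_DATASET_ID", "SUBMITTER_TOKEN",
            "ADMIN_USERNAME", "ADMIN_PASSWORD"]:
    api.add_space_secret(repo_id=SPACE_ID, key=key, value=os.environ[key])
```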
src/leaderboard/processor.py ADDED
@@ -0,0 +1,180 @@
+ """
+ Process and transform GuardBench leaderboard data.
+ """
+
+ import json
+ import os
+ import pandas as pd
+ from datetime import datetime
+ from typing import Dict, List, Any, Tuple
+
+ from src.display.utils import CATEGORIES, TEST_TYPES, METRICS
+
+
+ def load_leaderboard_data(file_path: str) -> Dict:
+     """
+     Load the leaderboard data from a JSON file.
+     """
+     if not os.path.exists(file_path):
+         return {"entries": [], "last_updated": datetime.now().isoformat()}
+
+     with open(file_path, 'r') as f:
+         data = json.load(f)
+
+     return data
+
+
+ def save_leaderboard_data(data: Dict, file_path: str) -> None:
+     """
+     Save the leaderboard data to a JSON file.
+     """
+     # Ensure the directory exists
+     os.makedirs(os.path.dirname(file_path), exist_ok=True)
+
+     # Update the last_updated timestamp
+     data["last_updated"] = datetime.now().isoformat()
+
+     with open(file_path, 'w') as f:
+         json.dump(data, f, indent=2)
+
+
+ def process_submission(submission_data: List[Dict]) -> List[Dict]:
+     """
+     Process submission data and convert it to leaderboard entries.
+     """
+     entries = []
+
+     for item in submission_data:
+         # Create a new entry for the leaderboard
+         entry = {
+             "model_name": item.get("model_name", "Unknown Model"),
+             "per_category_metrics": {},
+             "avg_metrics": {},
+             "submission_date": datetime.now().isoformat()
+         }
+
+         # Process per-category metrics
+         if "per_category_metrics" in item:
+             entry["per_category_metrics"] = item["per_category_metrics"]
+
+         # Process average metrics
+         if "avg_metrics" in item:
+             entry["avg_metrics"] = item["avg_metrics"]
+
+         entries.append(entry)
+
+     return entries
+
+
+ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
+     """
+     Convert leaderboard data to a pandas DataFrame for display.
+     """
+     rows = []
+
+     for entry in leaderboard_data.get("entries", []):
+         model_name = entry.get("model_name", "Unknown Model")
+
+         # Extract average metrics for main display
+         row = {
+             "model_name": model_name,
+             "model_type": entry.get("model_type", "Unknown"),
+             "submission_date": entry.get("submission_date", "")
+         }
+
+         # Add average metrics
+         avg_metrics = entry.get("avg_metrics", {})
+         for test_type in TEST_TYPES:
+             if test_type in avg_metrics:
+                 for metric in METRICS:
+                     if metric in avg_metrics[test_type]:
+                         col_name = f"{test_type}_{metric}"
+                         row[col_name] = avg_metrics[test_type][metric]
+
+         # Calculate overall averages for key metrics
+         f1_values = []
+         recall_values = []
+         precision_values = []
+
+         for test_type in TEST_TYPES:
+             if test_type in avg_metrics and "f1_binary" in avg_metrics[test_type]:
+                 f1_values.append(avg_metrics[test_type]["f1_binary"])
+             if test_type in avg_metrics and "recall_binary" in avg_metrics[test_type]:
+                 recall_values.append(avg_metrics[test_type]["recall_binary"])
+             if test_type in avg_metrics and "precision_binary" in avg_metrics[test_type]:
+                 precision_values.append(avg_metrics[test_type]["precision_binary"])
+
+         # Add overall averages
+         if f1_values:
+             row["average_f1"] = sum(f1_values) / len(f1_values)
+         if recall_values:
+             row["average_recall"] = sum(recall_values) / len(recall_values)
+         if precision_values:
+             row["average_precision"] = sum(precision_values) / len(precision_values)
+
+         # Add specific test type F1 scores for display
+         if "default_prompts" in avg_metrics and "f1_binary" in avg_metrics["default_prompts"]:
+             row["default_prompts_f1"] = avg_metrics["default_prompts"]["f1_binary"]
+         if "jailbreaked_prompts" in avg_metrics and "f1_binary" in avg_metrics["jailbreaked_prompts"]:
+             row["jailbreaked_prompts_f1"] = avg_metrics["jailbreaked_prompts"]["f1_binary"]
+         if "default_answers" in avg_metrics and "f1_binary" in avg_metrics["default_answers"]:
+             row["default_answers_f1"] = avg_metrics["default_answers"]["f1_binary"]
+         if "jailbreaked_answers" in avg_metrics and "f1_binary" in avg_metrics["jailbreaked_answers"]:
+             row["jailbreaked_answers_f1"] = avg_metrics["jailbreaked_answers"]["f1_binary"]
+
+         rows.append(row)
+
+     # Create DataFrame and sort by average F1 score
+     df = pd.DataFrame(rows)
+     if not df.empty and "average_f1" in df.columns:
+         df = df.sort_values(by="average_f1", ascending=False)
+
+     return df
+
+
+ def add_entries_to_leaderboard(leaderboard_data: Dict, new_entries: List[Dict]) -> Dict:
+     """
+     Add new entries to the leaderboard, replacing any with the same model name.
+     """
+     # Create a mapping of existing entries by model name
+     existing_entries = {entry["model_name"]: i for i, entry in enumerate(leaderboard_data.get("entries", []))}
+
+     # Process each new entry
+     for new_entry in new_entries:
+         model_name = new_entry.get("model_name")
+
+         if model_name in existing_entries:
+             # Replace existing entry
+             leaderboard_data["entries"][existing_entries[model_name]] = new_entry
+         else:
+             # Add new entry
+             if "entries" not in leaderboard_data:
+                 leaderboard_data["entries"] = []
+             leaderboard_data["entries"].append(new_entry)
+
+     # Update the last_updated timestamp
+     leaderboard_data["last_updated"] = datetime.now().isoformat()
+
+     return leaderboard_data
+
+
+ def process_jsonl_submission(file_path: str) -> Tuple[List[Dict], str]:
+     """
+     Process a JSONL submission file and extract entries.
+     """
+     entries = []
+     try:
+         with open(file_path, 'r') as f:
+             for line in f:
+                 try:
+                     entry = json.loads(line)
+                     entries.append(entry)
+                 except json.JSONDecodeError as e:
+                     return [], f"Invalid JSON in submission file: {e}"
+
+         if not entries:
+             return [], "Submission file is empty"
+
+         return entries, "Successfully processed submission"
+     except Exception as e:
+         return [], f"Error processing submission file: {e}"
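
To make the data flow in src/leaderboard/processor.py concrete, here is a short usage sketch that round-trips a submission file into the display DataFrame using the module's own functions; results.jsonl is a hypothetical file in the README's submission format:

```python
# Sketch: parse a submission, merge it into an (empty) leaderboard, and build the display table.
from src.leaderboard.processor import (
    process_jsonl_submission,
    add_entries_to_leaderboard,
    leaderboard_to_dataframe,
)

entries, message = process_jsonl_submission("results.jsonl")
print(message)

leaderboard = {"entries": []}                         # start from an empty board
leaderboard = add_entries_to_leaderboard(leaderboard, entries)
df = leaderboard_to_dataframe(leaderboard)            # one row per model, sorted by average_f1
print(df[["model_name", "average_f1"]].head())        # average_f1 exists when f1_binary values are present
```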
src/populate.py ADDED
@@ -0,0 +1,211 @@
+ """
+ Populate the GuardBench leaderboard from HuggingFace datasets.
+ """
+
+ import json
+ import os
+ import pandas as pd
+ import tempfile
+ from typing import Dict, Tuple, List
+ from glob import glob
+
+ from huggingface_hub import snapshot_download, hf_hub_download, HfApi
+ from datasets import load_dataset
+
+ from src.display.utils import GUARDBENCH_COLUMN, DISPLAY_COLS, CATEGORIES
+ from src.envs import RESULTS_DATASET_ID, TOKEN, LEADERBOARD_FILE, CACHE_PATH
+ from src.leaderboard.processor import leaderboard_to_dataframe, load_leaderboard_data, save_leaderboard_data, process_jsonl_submission, add_entries_to_leaderboard
+
+
+ def download_leaderboard_data() -> bool:
+     """
+     Download the latest leaderboard data from HuggingFace.
+     """
+     try:
+         # Create a temporary directory to download the submissions
+         temp_dir = os.path.join(CACHE_PATH, "temp_submissions")
+         os.makedirs(temp_dir, exist_ok=True)
+
+         # Download the entire repository
+         try:
+             snapshot_path = snapshot_download(
+                 repo_id=RESULTS_DATASET_ID,
+                 repo_type="dataset",
+                 local_dir=temp_dir,
+                 token=TOKEN,
+                 ignore_patterns=["*.md", ".*"],
+                 etag_timeout=30
+             )
+
+             # Process all submission files
+             all_entries = []
+             submission_files = []
+
+             # Look for submission files in the submissions directory
+             submissions_dir = os.path.join(snapshot_path, "submissions")
+             if os.path.exists(submissions_dir):
+                 submission_files.extend(glob(os.path.join(submissions_dir, "*.jsonl")))
+
+             # Also look for any JSONL files in the root
+             submission_files.extend(glob(os.path.join(snapshot_path, "*.jsonl")))
+
+             # Process each submission file
+             for file_path in submission_files:
+                 entries, _ = process_jsonl_submission(file_path)
+                 all_entries.extend(entries)
+
+             # Create leaderboard data structure
+             leaderboard_data = {
+                 "entries": all_entries,
+                 "last_updated": pd.Timestamp.now().isoformat()
+             }
+
+             # Save to local file
+             save_leaderboard_data(leaderboard_data, LEADERBOARD_FILE)
+
+             return True
+         except Exception as e:
+             print(f"Error downloading repository: {e}")
+
+             # If we can't download the repository, try to download individual files
+             try:
+                 api = HfApi(token=TOKEN)
+                 files = api.list_repo_files(repo_id=RESULTS_DATASET_ID, repo_type="dataset")
+
+                 submission_files = [f for f in files if f.endswith('.jsonl')]
+                 all_entries = []
+
+                 for file_path in submission_files:
+                     try:
+                         local_path = hf_hub_download(
+                             repo_id=RESULTS_DATASET_ID,
+                             filename=file_path,
+                             repo_type="dataset",
+                             token=TOKEN
+                         )
+                         entries, _ = process_jsonl_submission(local_path)
+                         all_entries.extend(entries)
+                     except Exception as file_error:
+                         print(f"Error downloading file {file_path}: {file_error}")
+
+                 # Create leaderboard data structure
+                 leaderboard_data = {
+                     "entries": all_entries,
+                     "last_updated": pd.Timestamp.now().isoformat()
+                 }
+
+                 # Save to local file
+                 save_leaderboard_data(leaderboard_data, LEADERBOARD_FILE)
+
+                 return True
+             except Exception as list_error:
+                 print(f"Error listing repository files: {list_error}")
+
+                 # If we can't download anything, create an empty leaderboard
+                 if not os.path.exists(LEADERBOARD_FILE):
+                     empty_data = {"entries": [], "last_updated": pd.Timestamp.now().isoformat()}
+                     save_leaderboard_data(empty_data, LEADERBOARD_FILE)
+
+                 return False
+     except Exception as e:
+         print(f"Error downloading leaderboard data: {e}")
+
+         # Ensure we have at least an empty leaderboard file
+         if not os.path.exists(LEADERBOARD_FILE):
+             empty_data = {"entries": [], "last_updated": pd.Timestamp.now().isoformat()}
+             save_leaderboard_data(empty_data, LEADERBOARD_FILE)
+
+         return False
+
+
+ def get_leaderboard_df() -> pd.DataFrame:
+     """
+     Get the leaderboard data as a DataFrame.
+     """
+     # Try to download the latest data
+     download_leaderboard_data()
+
+     # Load from local file
+     leaderboard_data = load_leaderboard_data(LEADERBOARD_FILE)
+
+     # Convert to DataFrame
+     df = leaderboard_to_dataframe(leaderboard_data)
+
+     return df
+
+
+ def get_category_leaderboard_df(category: str) -> pd.DataFrame:
+     """
+     Get the leaderboard data filtered by a specific category.
+
+     Args:
+         category: The category to filter by (e.g., "Criminal, Violent, and Terrorist Activity")
+
+     Returns:
+         DataFrame with metrics for the specified category
+     """
+     # Load the leaderboard data
+     leaderboard_data = load_leaderboard_data(LEADERBOARD_FILE)
+
+     # Filter entries to only include those with data for the specified category
+     filtered_entries = []
+
+     for entry in leaderboard_data.get("entries", []):
+         # Check if the entry has data for this category
+         if "per_category_metrics" in entry and category in entry["per_category_metrics"]:
+             # Create a new entry with just the overall info and this category's metrics
+             filtered_entry = {
+                 "model_name": entry.get("model_name", "Unknown Model"),
+                 "model_type": entry.get("model_type", "Unknown"),
+                 "submission_date": entry.get("submission_date", ""),
+             }
+
+             # Extract metrics for this category
+             category_metrics = entry["per_category_metrics"][category]
+
+             # Add metrics for each test type
+             for test_type in category_metrics:
+                 if test_type and isinstance(category_metrics[test_type], dict):
+                     for metric, value in category_metrics[test_type].items():
+                         col_name = f"{test_type}_{metric}"
+                         filtered_entry[col_name] = value
+
+             # Calculate average F1 for this category
+             f1_values = []
+             for test_type in category_metrics:
+                 if test_type and isinstance(category_metrics[test_type], dict) and "f1_binary" in category_metrics[test_type]:
+                     f1_values.append(category_metrics[test_type]["f1_binary"])
+
+             if f1_values:
+                 filtered_entry["average_f1"] = sum(f1_values) / len(f1_values)
+
+             # Add specific test type F1 scores for display
+             for test_type in ["default_prompts", "jailbreaked_prompts", "default_answers", "jailbreaked_answers"]:
+                 if test_type in category_metrics and "f1_binary" in category_metrics[test_type]:
+                     filtered_entry[f"{test_type}_f1"] = category_metrics[test_type]["f1_binary"]
+
+             filtered_entries.append(filtered_entry)
+
+     # Create a new leaderboard data structure with the filtered entries
+     filtered_leaderboard = {
+         "entries": filtered_entries,
+         "last_updated": leaderboard_data.get("last_updated", pd.Timestamp.now().isoformat())
+     }
+
+     # Convert to DataFrame
+     df = leaderboard_to_dataframe(filtered_leaderboard)
+
+     return df
+
+
+ def get_detailed_model_data(model_name: str) -> Dict:
+     """
+     Get detailed data for a specific model.
+     """
+     leaderboard_data = load_leaderboard_data(LEADERBOARD_FILE)
+
+     for entry in leaderboard_data.get("entries", []):
+         if entry.get("model_name") == model_name:
+             return entry
+
+     return {}
src/submission/submit.py ADDED
@@ -0,0 +1,105 @@
+ """
+ Handle submissions to the GuardBench leaderboard.
+ """
+
+ import json
+ import os
+ import tempfile
+ import uuid
+ from datetime import datetime
+ from typing import Dict, List, Tuple
+
+ from huggingface_hub import HfApi
+ from datasets import load_dataset, Dataset
+
+ from src.display.formatting import styled_error, styled_message, styled_warning
+ from src.envs import API, RESULTS_DATASET_ID, TOKEN
+ from src.leaderboard.processor import process_jsonl_submission, add_entries_to_leaderboard, load_leaderboard_data
+
+
+ def validate_submission(file_path: str) -> Tuple[bool, str]:
+     """
+     Validate a submission file.
+     """
+     try:
+         entries, message = process_jsonl_submission(file_path)
+         if not entries:
+             return False, message
+
+         # Additional validation could be added here
+
+         return True, "Submission is valid"
+     except Exception as e:
+         return False, f"Error validating submission: {e}"
+
+
+ def submit_to_hub(file_path: str, metadata: Dict, dataset_id: str, token: str) -> Tuple[bool, str]:
+     """
+     Submit results to a HuggingFace dataset repository as individual files.
+     """
+     try:
+         # Process the submission file to validate
+         entries, message = process_jsonl_submission(file_path)
+         if not entries:
+             return False, message
+
+         # Generate a unique submission ID
+         model_name = metadata.get("model_name", "unknown")
+         model_name_safe = model_name.replace("/", "_").replace(" ", "_")
+         timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+         submission_id = f"{model_name_safe}_{timestamp}"
+
+         # Create an API instance
+         api = HfApi(token=token)
+
+         # Create a temporary file with metadata added
+         with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False) as temp_file:
+             # Add metadata to each entry
+             for entry in entries:
+                 # If the entry already has a model_name, don't override it
+                 if "model_name" not in entry:
+                     entry["model_name"] = metadata.get("model_name")
+
+                 # Add other metadata if not present
+                 for key, value in metadata.items():
+                     if key != "model_name" and key not in entry:
+                         entry[key] = value
+
+                 # Write to temp file
+                 temp_file.write(json.dumps(entry) + "\n")
+
+             temp_path = temp_file.name
+
+         # Upload the file directly to the repository
+         submission_path = f"submissions/{submission_id}.jsonl"
+         api.upload_file(
+             path_or_fileobj=temp_path,
+             path_in_repo=submission_path,
+             repo_id=dataset_id,
+             repo_type="dataset",
+             commit_message=f"Add submission for {model_name}"
+         )
+
+         # Clean up the temporary file
+         os.unlink(temp_path)
+
+         return True, f"Successfully uploaded submission for {model_name} to {dataset_id}"
+     except Exception as e:
+         return False, f"Error submitting to dataset: {e}"
+
+
+ def process_submission(file_path: str, metadata: Dict) -> str:
+     """
+     Process a submission to the GuardBench leaderboard.
+     """
+     # Validate submission file
+     is_valid, validation_message = validate_submission(file_path)
+     if not is_valid:
+         return styled_error(validation_message)
+
+     # Submit to HuggingFace dataset repository
+     success, message = submit_to_hub(file_path, metadata, RESULTS_DATASET_ID, TOKEN)
+     if not success:
+         return styled_error(message)
+
+     return styled_message(f"Submission successful! {message}")
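
Finally, the whole submission pipeline can be exercised programmatically. A hedged sketch, assuming a configured .env (so src/envs.py can build its HfApi client) and a results.jsonl in the documented format; the metadata values are placeholders:

```python
# Sketch: validate and upload a local results file without going through the Gradio form.
# Run from the repo root; this writes to the dataset configured in RESULTS_DATASET_ID.
from src.submission.submit import process_submission

metadata = {
    "model_name": "org/my-guard-model",  # hypothetical model id
    "base_model": "",
    "revision": "main",
    "precision": "float16",
    "weight_type": "Original",
    "model_type": "Open Source",
}
print(process_submission("results.jsonl", metadata))  # returns a styled HTML status message
```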