Spaces:

whitecircle-ai
/

circle-guard-bench

Running

File size: 18,564 Bytes

"""
GuardBench Leaderboard Application
"""

import os
import json
import tempfile
import logging
import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from apscheduler.schedulers.background import BackgroundScheduler

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    GUARDBENCH_COLUMN,
    DISPLAY_COLS,
    METRIC_COLS,
    HIDDEN_COLS,
    NEVER_HIDDEN_COLS,
    CATEGORIES,
    TEST_TYPES,
    ModelType,
    Precision,
    WeightType,
    GuardModelType
)
from src.display.formatting import styled_message, styled_error, styled_warning
from src.envs import (
    ADMIN_USERNAME,
    ADMIN_PASSWORD,
    RESULTS_DATASET_ID,
    SUBMITTER_TOKEN,
    TOKEN,
    DATA_PATH
)
from src.populate import get_leaderboard_df, get_category_leaderboard_df
from src.submission.submit import process_submission

# Configure logging once for the whole application; every message below goes
# through this module-level logger.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Ensure data directory exists
os.makedirs(DATA_PATH, exist_ok=True)

# Available benchmark versions
BENCHMARK_VERSIONS = ["v0"]
CURRENT_VERSION = "v0"

# Initialize leaderboard data. Any failure here (including a loader that
# returns something without a length) falls back to an empty frame so the UI
# can still be built.
try:
    logger.info("Initializing leaderboard data...")
    LEADERBOARD_DF = get_leaderboard_df(version=CURRENT_VERSION)
    logger.info(f"Loaded leaderboard with {len(LEADERBOARD_DF)} entries")
except Exception as e:
    # logger.exception keeps the traceback, which a bare error message loses
    logger.exception(f"Error loading leaderboard data: {e}")
    LEADERBOARD_DF = pd.DataFrame()

# Report the configured display columns through the logger rather than a bare
# print so the line carries the same timestamp/level format as everything else.
logger.info("Display columns: %s", DISPLAY_COLS)

def init_leaderboard(dataframe):
    """
    Build the Leaderboard widget for a results table.

    If `dataframe` is None or has no rows, an empty frame carrying the
    display column names is substituted and a warning is logged.
    """
    def _names_for(keys):
        # Resolve GUARDBENCH_COLUMN attribute keys to their `.name` values
        return [getattr(GUARDBENCH_COLUMN, key).name for key in keys]

    has_rows = dataframe is not None and not dataframe.empty
    if not has_rows:
        # Stand-in table: no rows, but the expected column headers
        dataframe = pd.DataFrame(columns=_names_for(DISPLAY_COLS))
        logger.warning("Initializing empty leaderboard")

    column_types = [getattr(GUARDBENCH_COLUMN, key).type for key in DISPLAY_COLS]
    column_picker = SelectColumns(
        default_selection=_names_for(DISPLAY_COLS),
        cant_deselect=_names_for(NEVER_HIDDEN_COLS),
        label="Select Columns to Display:",
    )

    return Leaderboard(
        value=dataframe,
        datatype=column_types,
        select_columns=column_picker,
        search_columns=[GUARDBENCH_COLUMN.model_name.name],
        hide_columns=_names_for(HIDDEN_COLS),
        filter_columns=[
            ColumnFilter(GUARDBENCH_COLUMN.model_type.name, type="checkboxgroup", label="Model types"),
        ],
        interactive=False,
    )


def submit_results(
    model_name: str,
    base_model: str,
    revision: str,
    precision: str,
    weight_type: str,
    model_type: str,
    submission_file: "tempfile._TemporaryFileWrapper | str",
    version: str,
    guard_model_type: str
):
    """
    Handle submission of results with model metadata.

    Args:
        model_name: Name of the submitted model; surrounding whitespace is
            stripped and the result must be non-empty.
        base_model: Base model name (for delta or adapter weights).
        revision: Revision of the model; blank values default to "main".
        precision: Selected precision name.
        weight_type: Selected weight type name.
        model_type: Selected model type; must be non-empty.
        submission_file: The uploaded results file, either as a filesystem
            path string or as an object exposing the path via `.name`.
        version: Benchmark version the submission targets.
        guard_model_type: Selected guard model type (the submit form passes
            the enum member's name as a string).

    Returns:
        A styled error message when validation fails, otherwise whatever
        `process_submission` returns.
    """
    if submission_file is None:
        return styled_error("No submission file provided")

    # Reject names that are empty or consist only of whitespace
    model_name = model_name.strip() if model_name else ""
    if not model_name:
        return styled_error("Model name is required")

    if not model_type:
        return styled_error("Please select a model type")

    # Accept both a plain path string and a file wrapper carrying `.name`
    if isinstance(submission_file, str):
        file_path = submission_file
    else:
        file_path = submission_file.name
    logger.info(f"Received submission for model {model_name}: {file_path}")

    # Add metadata to the submission
    metadata = {
        "model_name": model_name,
        "base_model": base_model,
        "revision": (revision or "").strip() or "main",
        "precision": precision,
        "weight_type": weight_type,
        "model_type": model_type,
        "version": version,
        "guard_model_type": guard_model_type
    }

    # Process the submission
    result = process_submission(file_path, metadata, version=version)

    # Refresh the leaderboard data; a failure here must not hide the
    # submission result from the user, so it is logged and swallowed.
    global LEADERBOARD_DF
    try:
        logger.info(f"Refreshing leaderboard data after submission for version {version}...")
        LEADERBOARD_DF = get_leaderboard_df(version=version)
        logger.info("Refreshed leaderboard data after submission")
    except Exception as e:
        logger.exception(f"Error refreshing leaderboard data: {e}")

    return result


def refresh_data(version=CURRENT_VERSION):
    """
    Reload the leaderboard table for `version` into LEADERBOARD_DF.

    A non-empty result replaces the cached table. An empty result or an
    exception leaves the cached table in place; if the cache itself is None
    or empty, it is replaced by an empty frame with the display columns.
    Returns the (possibly updated) LEADERBOARD_DF.
    """
    global LEADERBOARD_DF

    def _ensure_placeholder():
        # Guarantee the cache is at least an empty frame with the right columns
        global LEADERBOARD_DF
        if LEADERBOARD_DF is not None and not LEADERBOARD_DF.empty:
            return
        header = [getattr(GUARDBENCH_COLUMN, key).name for key in DISPLAY_COLS]
        LEADERBOARD_DF = pd.DataFrame(columns=header)

    try:
        logger.info(f"Performing scheduled refresh of leaderboard data for version {version}...")
        fresh = get_leaderboard_df(version=version)
        if fresh is None or fresh.empty:
            logger.warning("Refresh returned empty data, keeping existing data")
            _ensure_placeholder()
        else:
            LEADERBOARD_DF = fresh
            logger.info("Scheduled refresh of leaderboard data completed")
    except Exception as e:
        logger.error(f"Error in scheduled refresh: {e}")
        _ensure_placeholder()

    return LEADERBOARD_DF


def update_leaderboards(version):
    """
    Rebuild every leaderboard widget for the chosen benchmark version.

    Returns a list whose first item is the overall leaderboard, followed by
    one leaderboard per entry of CATEGORIES, in order.
    """
    # Load all tables first (overall, then each category) ...
    tables = [get_leaderboard_df(version=version)]
    tables.extend(get_category_leaderboard_df(name, version=version) for name in CATEGORIES)
    # ... then wrap each one in a widget, preserving that order
    return [init_leaderboard(table) for table in tables]


def create_performance_plot(selected_models, category, metric="f1_binary", version=CURRENT_VERSION):
    """
    Create a radar plot comparing model performance for selected models.

    Args:
        selected_models: Model names to draw; one trace is added per model
            that has a row in the data. None is treated as no selection.
        category: "📊 Overall Performance" for the overall leaderboard,
            otherwise a category passed to `get_category_leaderboard_df`.
        metric: Substring used to pick the metric columns; every column whose
            name contains it becomes one axis of the radar plot.
        version: Benchmark version to load.

    Returns:
        A plotly Figure. An empty default Figure is returned when no data is
        available or when no column matches `metric`.
    """
    if category == "📊 Overall Performance":
        df = get_leaderboard_df(version=version)
    else:
        df = get_category_leaderboard_df(category, version=version)

    # Nothing to plot when the loader yields no table at all or no rows
    if df is None or df.empty:
        return go.Figure()

    selected_models = selected_models or []

    # Filter for selected models
    df = df[df['model_name'].isin(selected_models)]

    # Get the relevant metric columns
    metric_cols = [col for col in df.columns if metric in col]
    if not metric_cols:
        # Without this guard, closing the polygon below would index an empty list
        logger.warning(f"No columns match metric '{metric}' for category '{category}'")
        return go.Figure()

    # Axis labels are the same for every model, so build them once.
    # Clean up test type names and repeat the first label to close the polygon.
    axis_labels = [col.replace(f'_{metric}', '') for col in metric_cols]
    axis_labels.append(axis_labels[0])

    # Create figure
    fig = go.Figure()

    # Custom colors for different models
    colors = ['#8FCCCC', '#C2A4B6', '#98B4A6', '#B68F7C']  # Pale Cyan, Pale Pink, Pale Green, Pale Orange

    # Add traces for each model
    for idx, model in enumerate(selected_models):
        model_data = df[df['model_name'] == model]
        if model_data.empty:
            continue

        # First matching row supplies the values; repeat the first value at
        # the end to complete the polygon
        values = model_data[metric_cols].values[0].tolist()
        values.append(values[0])

        fig.add_trace(go.Scatterpolar(
            r=values,
            theta=axis_labels,
            name=model,
            # Cycle through the palette when there are more models than colors
            line_color=colors[idx % len(colors)],
            fill='toself'
        ))

    # Update layout with all settings at once
    fig.update_layout(
        paper_bgcolor='#000000',
        plot_bgcolor='#000000',
        font={'color': '#ffffff'},
        title={
            'text': f'{category} - {metric.upper()} Score Comparison',
            'font': {'color': '#ffffff', 'size': 24}
        },
        polar=dict(
            bgcolor='#000000',
            radialaxis=dict(
                visible=True,
                range=[0, 1],
                gridcolor='#333333',
                linecolor='#333333',
                tickfont={'color': '#ffffff'},
            ),
            angularaxis=dict(
                gridcolor='#333333',
                linecolor='#333333',
                tickfont={'color': '#ffffff'},
            )
        ),
        height=600,
        showlegend=True,
        legend=dict(
            yanchor="top",
            y=0.99,
            xanchor="right",
            x=0.99,
            bgcolor='rgba(0,0,0,0.5)',
            font={'color': '#ffffff'}
        )
    )

    return fig


def update_model_choices(version):
    """
    Return the sorted list of distinct model names for the given version.

    An empty list is returned when the leaderboard loader yields no table or
    no rows. Missing (NaN) model names are skipped so that sorting never has
    to compare a float against a string.
    """
    df = get_leaderboard_df(version=version)
    if df is None or df.empty:
        return []
    return sorted(df['model_name'].dropna().unique().tolist())


def update_visualization(selected_models, selected_category, selected_metric, version):
    """
    Produce the comparison figure for the current selector values.

    With at least one model selected the radar plot is built; with no
    selection a blank figure is returned.
    """
    if selected_models:
        return create_performance_plot(selected_models, selected_category, selected_metric, version)
    return go.Figure()


def _refresh_model_selector(version, selected_models):
    """
    Build the update for the model dropdown after a version change.

    The dropdown's choices are replaced with the models available for
    `version`; previously selected models are kept only if they still exist.
    """
    choices = update_model_choices(version)
    still_valid = [model for model in (selected_models or []) if model in choices]
    return gr.update(choices=choices, value=still_valid)


# Create Gradio app
demo = gr.Blocks(css=custom_css)

with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Row():
        tabs = gr.Tabs(elem_classes="tab-buttons")

        with tabs:
            with gr.TabItem("🏅 Leaderboard", elem_id="guardbench-leaderboard-tab", id=0):
                with gr.Row():
                    refresh_button = gr.Button("Refresh Leaderboard", scale=3)
                    version_selector = gr.Dropdown(
                        choices=BENCHMARK_VERSIONS,
                        label="Benchmark Version",
                        value=CURRENT_VERSION,
                        interactive=True,
                        elem_classes="version-selector",
                        scale=1
                    )

                # Create tabs for each category
                with gr.Tabs(elem_classes="category-tabs") as category_tabs:
                    # First tab for average metrics across all categories
                    with gr.TabItem("📊 Overall Performance", elem_id="overall-tab"):
                        leaderboard = init_leaderboard(LEADERBOARD_DF)

                    # Create a tab for each category, remembering every
                    # leaderboard so event handlers can target it directly
                    category_leaderboards = []
                    for category in CATEGORIES:
                        with gr.TabItem(f"{category}", elem_id=f"category-{category.lower().replace(' ', '-')}-tab"):
                            category_df = get_category_leaderboard_df(category, version=CURRENT_VERSION)
                            category_leaderboard = init_leaderboard(category_df)
                            category_leaderboards.append(category_leaderboard)

                # Overall leaderboard first, then one per category: the same
                # order in which update_leaderboards returns its results
                leaderboard_outputs = [leaderboard] + category_leaderboards

                # Refresh button functionality. The selected version is passed
                # as an event input; reading `version_selector.value` inside
                # the handler would only ever see the value set at build time.
                refresh_button.click(
                    fn=update_leaderboards,
                    inputs=[version_selector],
                    outputs=leaderboard_outputs
                )

            with gr.TabItem("📊 Visualize", elem_id="guardbench-viz-tab", id=1):
                with gr.Row():
                    with gr.Column():
                        viz_version_selector = gr.Dropdown(
                            choices=BENCHMARK_VERSIONS,
                            label="Benchmark Version",
                            value=CURRENT_VERSION,
                            interactive=True
                        )
                        model_selector = gr.Dropdown(
                            choices=update_model_choices(CURRENT_VERSION),
                            label="Select Models to Compare",
                            multiselect=True,
                            interactive=True
                        )
                    with gr.Column():
                        # Add Overall Performance to categories
                        viz_categories = ["📊 Overall Performance"] + CATEGORIES
                        category_selector = gr.Dropdown(
                            choices=viz_categories,
                            label="Select Category",
                            value=viz_categories[0],
                            interactive=True
                        )
                        metric_selector = gr.Dropdown(
                            choices=["f1_binary", "precision_binary", "recall_binary"],
                            label="Select Metric",
                            value="f1_binary",
                            interactive=True
                        )

                plot_output = gr.Plot()

                # Update visualization when any selector changes
                for control in [viz_version_selector, model_selector, category_selector, metric_selector]:
                    control.change(
                        fn=update_visualization,
                        inputs=[model_selector, category_selector, metric_selector, viz_version_selector],
                        outputs=plot_output
                    )

                # Update model choices when version changes. Returning a bare
                # list would set the dropdown's selected value, so an explicit
                # update carrying `choices` is returned instead.
                viz_version_selector.change(
                    fn=_refresh_model_selector,
                    inputs=[viz_version_selector, model_selector],
                    outputs=[model_selector]
                )

            with gr.TabItem("📝 About", elem_id="guardbench-about-tab", id=2):
                gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

            with gr.TabItem("🚀 Submit", elem_id="guardbench-submit-tab", id=3):
                gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

                with gr.Row():
                    with gr.Column(scale=3):
                        gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")
                    with gr.Column(scale=1):
                        # Add version selector specifically for the submission tab
                        submission_version_selector = gr.Dropdown(
                            choices=BENCHMARK_VERSIONS,
                            label="Benchmark Version",
                            value=CURRENT_VERSION,
                            interactive=True,
                            elem_classes="version-selector"
                        )

                with gr.Row():
                    with gr.Column():
                        model_name_textbox = gr.Textbox(label="Model name")
                        revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                        model_type = gr.Dropdown(
                            choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
                            label="Model type",
                            multiselect=False,
                            value=None,
                            interactive=True,
                        )
                        guard_model_type = gr.Dropdown(
                            choices=[t.name for t in GuardModelType],
                            label="Guard model type",
                            multiselect=False,
                            value=GuardModelType.LLM_REGEXP.name,
                            interactive=True,
                        )

                    with gr.Column():
                        precision = gr.Dropdown(
                            choices=[i.name for i in Precision if i != Precision.Unknown],
                            label="Precision",
                            multiselect=False,
                            value="float16",
                            interactive=True,
                        )
                        weight_type = gr.Dropdown(
                            choices=[i.name for i in WeightType],
                            label="Weights type",
                            multiselect=False,
                            value="Original",
                            interactive=True,
                        )
                        base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

                with gr.Row():
                    file_input = gr.File(
                        label="Upload JSONL Results File",
                        file_types=[".jsonl"]
                    )

                submit_button = gr.Button("Submit Results")
                result_output = gr.Markdown()

                # Input order must match the positional parameters of submit_results
                submit_button.click(
                    fn=submit_results,
                    inputs=[
                        model_name_textbox,
                        base_model_name_textbox,
                        revision_name_textbox,
                        precision,
                        weight_type,
                        model_type,
                        file_input,
                        submission_version_selector,
                        guard_model_type
                    ],
                    outputs=result_output
                )

    # Version selector functionality
    version_selector.change(
        fn=update_leaderboards,
        inputs=[version_selector],
        outputs=leaderboard_outputs
    )

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=10,
                elem_id="citation-button",
                show_copy_button=True,
            )

        with gr.Accordion("ℹ️ Dataset Information", open=False):
            # The label says UTC, so take the timestamp in UTC rather than in
            # the server's local timezone. It is evaluated once, when the UI
            # is built.
            built_at = pd.Timestamp.now(tz="UTC").strftime("%Y-%m-%d %H:%M:%S UTC")
            dataset_info = gr.Markdown(f"""
            ## Dataset Information

            Results are stored in the HuggingFace dataset: [{RESULTS_DATASET_ID}](https://huggingface.co/datasets/{RESULTS_DATASET_ID})

            Last updated: {built_at}
            """)

# Periodic refresh: schedule refresh_data for CURRENT_VERSION at a 30-minute
# interval. Note that the scheduler is created and started at module level,
# i.e. outside the __main__ guard below.
scheduler = BackgroundScheduler()
scheduler.add_job(lambda: refresh_data(version=CURRENT_VERSION), 'interval', minutes=30)
scheduler.start()

# Launch the app only when this file is executed as a script.
# Binds to 0.0.0.0 on port 7860.
# NOTE(review): share=True presumably requests a public Gradio share link —
# confirm it is wanted in the deployment environment.
if __name__ == "__main__":

    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)