"""
GuardBench Leaderboard Application.
"""


import os
import json
import tempfile
import logging

import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from apscheduler.schedulers.background import BackgroundScheduler

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    GUARDBENCH_COLUMN,
    DISPLAY_COLS,
    METRIC_COLS,
    HIDDEN_COLS,
    NEVER_HIDDEN_COLS,
    CATEGORIES,
    TEST_TYPES,
    ModelType,
    Precision,
    WeightType,
    GuardModelType
)
from src.display.formatting import styled_message, styled_error, styled_warning
from src.envs import (
    ADMIN_USERNAME,
    ADMIN_PASSWORD,
    RESULTS_DATASET_ID,
    SUBMITTER_TOKEN,
    TOKEN,
    DATA_PATH
)
from src.populate import get_leaderboard_df, get_category_leaderboard_df
from src.submission.submit import process_submission

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

os.makedirs(DATA_PATH, exist_ok=True)

BENCHMARK_VERSIONS = ["v0"]
CURRENT_VERSION = "v0"

try:
    logger.info("Initializing leaderboard data...")
    LEADERBOARD_DF = get_leaderboard_df(version=CURRENT_VERSION)
    logger.info(f"Loaded leaderboard with {len(LEADERBOARD_DF)} entries")
except Exception as e:
    logger.error(f"Error loading leaderboard data: {e}")
    LEADERBOARD_DF = pd.DataFrame()

logger.debug(f"Display columns: {DISPLAY_COLS}")


def init_leaderboard(dataframe):
    """
    Initialize the leaderboard component.
    """
    if dataframe is None or dataframe.empty:
        columns = [getattr(GUARDBENCH_COLUMN, col).name for col in DISPLAY_COLS]
        dataframe = pd.DataFrame(columns=columns)
        logger.warning("Initializing empty leaderboard")

    return Leaderboard(
        value=dataframe,
        datatype=[getattr(GUARDBENCH_COLUMN, col).type for col in DISPLAY_COLS],
        select_columns=SelectColumns(
            default_selection=[getattr(GUARDBENCH_COLUMN, col).name for col in DISPLAY_COLS],
            cant_deselect=[getattr(GUARDBENCH_COLUMN, col).name for col in NEVER_HIDDEN_COLS],
            label="Select Columns to Display:",
        ),
        search_columns=[GUARDBENCH_COLUMN.model_name.name],
        hide_columns=[getattr(GUARDBENCH_COLUMN, col).name for col in HIDDEN_COLS],
        filter_columns=[
            ColumnFilter(GUARDBENCH_COLUMN.model_type.name, type="checkboxgroup", label="Model types"),
        ],
        interactive=False,
    )
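
# Each entry in DISPLAY_COLS / HIDDEN_COLS / NEVER_HIDDEN_COLS names an attribute of
# GUARDBENCH_COLUMN; its .name and .type fields (defined in src.display.utils) drive
# the column headers and datatypes used above.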


def submit_results(
    model_name: str,
    base_model: str,
    revision: str,
    precision: str,
    weight_type: str,
    model_type: str,
    submission_file: tempfile._TemporaryFileWrapper,
    version: str,
    guard_model_type: str
):
    """
    Handle submission of results with model metadata.
    """
    if submission_file is None:
        return styled_error("No submission file provided")

    if not model_name:
        return styled_error("Model name is required")

    if not model_type:
        return styled_error("Please select a model type")

    file_path = submission_file.name
    logger.info(f"Received submission for model {model_name}: {file_path}")

    metadata = {
        "model_name": model_name,
        "base_model": base_model,
        "revision": revision if revision else "main",
        "precision": precision,
        "weight_type": weight_type,
        "model_type": model_type,
        "version": version,
        "guard_model_type": guard_model_type
    }

    result = process_submission(file_path, metadata, version=version)

    # Refresh the cached leaderboard so a successful submission shows up immediately.
    global LEADERBOARD_DF
    try:
        logger.info(f"Refreshing leaderboard data after submission for version {version}...")
        LEADERBOARD_DF = get_leaderboard_df(version=version)
        logger.info("Refreshed leaderboard data after submission")
    except Exception as e:
        logger.error(f"Error refreshing leaderboard data: {e}")

    return result


def refresh_data(version=CURRENT_VERSION):
    """
    Refresh the leaderboard data from HuggingFace.
    """
    global LEADERBOARD_DF
    try:
        logger.info(f"Performing scheduled refresh of leaderboard data for version {version}...")
        new_df = get_leaderboard_df(version=version)
        if new_df is not None and not new_df.empty:
            LEADERBOARD_DF = new_df
            logger.info("Scheduled refresh of leaderboard data completed")
        else:
            logger.warning("Refresh returned empty data, keeping existing data")
    except Exception as e:
        logger.error(f"Error in scheduled refresh: {e}")

    # Fall back to an empty frame with the expected columns if nothing could be loaded.
    if LEADERBOARD_DF is None or LEADERBOARD_DF.empty:
        columns = [getattr(GUARDBENCH_COLUMN, col).name for col in DISPLAY_COLS]
        LEADERBOARD_DF = pd.DataFrame(columns=columns)
    return LEADERBOARD_DF
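
# refresh_data is the target of the background APScheduler job registered at the
# bottom of this module (every 30 minutes, for CURRENT_VERSION).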


def update_leaderboards(version):
    """
    Update all leaderboard components with data for the selected version.
    """
    new_df = get_leaderboard_df(version=version)
    category_dfs = [get_category_leaderboard_df(category, version=version) for category in CATEGORIES]
    return [init_leaderboard(new_df)] + [init_leaderboard(df) for df in category_dfs]
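
# The returned list is ordered as [overall leaderboard] + one leaderboard per entry
# in CATEGORIES; the `outputs=` lists wired to the refresh button and the version
# selector below rely on this order.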


def create_performance_plot(selected_models, category, metric="f1_binary", version=CURRENT_VERSION):
    """
    Create a radar plot comparing model performance for selected models.
    """
    if category == "📊 Overall Performance":
        df = get_leaderboard_df(version=version)
    else:
        df = get_category_leaderboard_df(category, version=version)

    if df.empty:
        return go.Figure()

    df = df[df['model_name'].isin(selected_models)]

    # All columns for the chosen metric (e.g. every "<category>_f1_binary" column).
    metric_cols = [col for col in df.columns if metric in col]

    fig = go.Figure()

    colors = ['#8FCCCC', '#C2A4B6', '#98B4A6', '#B68F7C']

    for idx, model in enumerate(selected_models):
        model_data = df[df['model_name'] == model]
        if not model_data.empty:
            values = model_data[metric_cols].values[0].tolist()
            # Repeat the first point so the radar trace closes into a loop.
            values = values + [values[0]]
            categories = [col.replace(f'_{metric}', '') for col in metric_cols]
            categories = categories + [categories[0]]

            fig.add_trace(go.Scatterpolar(
                r=values,
                theta=categories,
                name=model,
                line_color=colors[idx % len(colors)],
                fill='toself'
            ))

    fig.update_layout(
        paper_bgcolor='#000000',
        plot_bgcolor='#000000',
        font={'color': '#ffffff'},
        title={
            'text': f'{category} - {metric.upper()} Score Comparison',
            'font': {'color': '#ffffff', 'size': 24}
        },
        polar=dict(
            bgcolor='#000000',
            radialaxis=dict(
                visible=True,
                range=[0, 1],
                gridcolor='#333333',
                linecolor='#333333',
                tickfont={'color': '#ffffff'},
            ),
            angularaxis=dict(
                gridcolor='#333333',
                linecolor='#333333',
                tickfont={'color': '#ffffff'},
            )
        ),
        height=600,
        showlegend=True,
        legend=dict(
            yanchor="top",
            y=0.99,
            xanchor="right",
            x=0.99,
            bgcolor='rgba(0,0,0,0.5)',
            font={'color': '#ffffff'}
        )
    )

    return fig


def update_model_choices(version):
    """
    Update the list of available models for the given version.
    """
    df = get_leaderboard_df(version=version)
    if df.empty:
        return []
    return sorted(df['model_name'].unique().tolist())


def update_visualization(selected_models, selected_category, selected_metric, version):
    """
    Update the visualization based on user selections.
    """
    if not selected_models:
        return go.Figure()
    return create_performance_plot(selected_models, selected_category, selected_metric, version)


demo = gr.Blocks(css=custom_css)

with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Row():
        tabs = gr.Tabs(elem_classes="tab-buttons")

    with tabs:
        with gr.TabItem("🏅 Leaderboard", elem_id="guardbench-leaderboard-tab", id=0):
            with gr.Row():
                refresh_button = gr.Button("Refresh Leaderboard", scale=3)
                version_selector = gr.Dropdown(
                    choices=BENCHMARK_VERSIONS,
                    label="Benchmark Version",
                    value=CURRENT_VERSION,
                    interactive=True,
                    elem_classes="version-selector",
                    scale=1
                )

            with gr.Tabs(elem_classes="category-tabs") as category_tabs:
                with gr.TabItem("📊 Overall Performance", elem_id="overall-tab"):
                    leaderboard = init_leaderboard(LEADERBOARD_DF)

                # One sub-tab per benchmark category; collect the components so the
                # refresh button and version selector can update them directly.
                category_leaderboards = []
                for category in CATEGORIES:
                    with gr.TabItem(f"{category}", elem_id=f"category-{category.lower().replace(' ', '-')}-tab"):
                        category_df = get_category_leaderboard_df(category, version=CURRENT_VERSION)
                        category_leaderboards.append(init_leaderboard(category_df))

            refresh_button.click(
                fn=update_leaderboards,
                inputs=[version_selector],
                outputs=[leaderboard] + category_leaderboards
            )

        with gr.TabItem("📊 Visualize", elem_id="guardbench-viz-tab", id=1):
            with gr.Row():
                with gr.Column():
                    viz_version_selector = gr.Dropdown(
                        choices=BENCHMARK_VERSIONS,
                        label="Benchmark Version",
                        value=CURRENT_VERSION,
                        interactive=True
                    )
                    model_selector = gr.Dropdown(
                        choices=update_model_choices(CURRENT_VERSION),
                        label="Select Models to Compare",
                        multiselect=True,
                        interactive=True
                    )
                with gr.Column():
                    viz_categories = ["📊 Overall Performance"] + CATEGORIES
                    category_selector = gr.Dropdown(
                        choices=viz_categories,
                        label="Select Category",
                        value=viz_categories[0],
                        interactive=True
                    )
                    metric_selector = gr.Dropdown(
                        choices=["f1_binary", "precision_binary", "recall_binary"],
                        label="Select Metric",
                        value="f1_binary",
                        interactive=True
                    )

            plot_output = gr.Plot()

            # Redraw the plot whenever any of the selectors changes.
            for control in [viz_version_selector, model_selector, category_selector, metric_selector]:
                control.change(
                    fn=update_visualization,
                    inputs=[model_selector, category_selector, metric_selector, viz_version_selector],
                    outputs=plot_output
                )

            # Switching versions also repopulates the model list.
            viz_version_selector.change(
                fn=update_model_choices,
                inputs=[viz_version_selector],
                outputs=[model_selector]
            )

        with gr.TabItem("📝 About", elem_id="guardbench-about-tab", id=2):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        with gr.TabItem("🚀 Submit", elem_id="guardbench-submit-tab", id=3):
            gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

            with gr.Row():
                with gr.Column(scale=3):
                    gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")
                with gr.Column(scale=1):
                    submission_version_selector = gr.Dropdown(
                        choices=BENCHMARK_VERSIONS,
                        label="Benchmark Version",
                        value=CURRENT_VERSION,
                        interactive=True,
                        elem_classes="version-selector"
                    )

            with gr.Row():
                with gr.Column():
                    model_name_textbox = gr.Textbox(label="Model name")
                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                    model_type = gr.Dropdown(
                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
                        label="Model type",
                        multiselect=False,
                        value=None,
                        interactive=True,
                    )
                    guard_model_type = gr.Dropdown(
                        choices=[t.name for t in GuardModelType],
                        label="Guard model type",
                        multiselect=False,
                        value=GuardModelType.LLM_REGEXP.name,
                        interactive=True,
                    )

                with gr.Column():
                    precision = gr.Dropdown(
                        choices=[i.name for i in Precision if i != Precision.Unknown],
                        label="Precision",
                        multiselect=False,
                        value="float16",
                        interactive=True,
                    )
                    weight_type = gr.Dropdown(
                        choices=[i.name for i in WeightType],
                        label="Weights type",
                        multiselect=False,
                        value="Original",
                        interactive=True,
                    )
                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

            with gr.Row():
                file_input = gr.File(
                    label="Upload JSONL Results File",
                    file_types=[".jsonl"]
                )

            submit_button = gr.Button("Submit Results")
            result_output = gr.Markdown()

            submit_button.click(
                fn=submit_results,
                inputs=[
                    model_name_textbox,
                    base_model_name_textbox,
                    revision_name_textbox,
                    precision,
                    weight_type,
                    model_type,
                    file_input,
                    submission_version_selector,
                    guard_model_type
                ],
                outputs=result_output
            )
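
            # The inputs above are passed positionally, so their order must match the
            # parameter order of submit_results (model_name, base_model, revision,
            # precision, weight_type, model_type, submission_file, version, guard_model_type).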

    version_selector.change(
        fn=update_leaderboards,
        inputs=[version_selector],
        outputs=[leaderboard] + category_leaderboards
    )

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=10,
                elem_id="citation-button",
                show_copy_button=True,
            )

        with gr.Accordion("ℹ️ Dataset Information", open=False):
            dataset_info = gr.Markdown(f"""
            ## Dataset Information

            Results are stored in the HuggingFace dataset: [{RESULTS_DATASET_ID}](https://huggingface.co/datasets/{RESULTS_DATASET_ID})

            Last updated: {pd.Timestamp.now(tz="UTC").strftime("%Y-%m-%d %H:%M:%S UTC")}
            """)


scheduler = BackgroundScheduler()
scheduler.add_job(lambda: refresh_data(version=CURRENT_VERSION), 'interval', minutes=30)
scheduler.start()
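
# Note: the scheduled job only refreshes the module-level LEADERBOARD_DF cache;
# leaderboards already rendered in a user's browser are updated via the Refresh
# button or the version selector, not by this background job.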


if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
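
# Assumption: this module is the app's entry point (run locally with e.g.
# `python <this_file>.py`); when deployed as a Hugging Face Space, the file is
# executed the same way, so launch() runs with the settings above.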