import gradio as gr
import pandas as pd

all_results = pd.read_pickle("all_results.pkl")
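# all_results is assumed to hold one row per (model, question) with at least the
# columns used below: model_name, difficulty_level, and parsed_judge_response.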


def get_accuracy_dataframe(df):
    # Calculate overall model accuracy
    df['parsed_judge_response'] = df['parsed_judge_response'].astype(float)
    model_accuracy = df.groupby('model_name')['parsed_judge_response'].mean().reset_index()
    
    # Calculate model accuracy per difficulty level
    df['difficulty_level'] = df['difficulty_level'].astype(int)
    model_accuracy_per_level = df.groupby(['model_name', 'difficulty_level'])['parsed_judge_response'].mean().reset_index()
    model_accuracy_per_level_df = model_accuracy_per_level.pivot(index='model_name', columns='difficulty_level', values='parsed_judge_response')
    
    # Merge overall accuracy and level-based accuracy into a single DataFrame
    model_accuracy_df = model_accuracy.merge(model_accuracy_per_level_df, on='model_name')
    # Only difficulty levels 1-4 are displayed below; keep this rename map in sync
    # with the header list assigned to the columns a few lines down.
    model_accuracy_df.rename(columns={1: 'level_1', 2: 'level_2', 3: 'level_3', 4: 'level_4'}, inplace=True)
    model_accuracy_df.rename(columns={'parsed_judge_response': 'Accuracy'}, inplace=True)
    
    # Multiply by 100 and format to one decimal point
    model_accuracy_df = model_accuracy_df.applymap(lambda x: round(x * 100, 1) if isinstance(x, float) else x)
    
    # Add headers with icons
    model_accuracy_df.columns = [
        "πŸ€– Model Name",
        "⭐ Overall",
        "πŸ“ˆ Level 1",
        "πŸ” Level 2",
        "πŸ“˜ Level 3",
        "πŸ”¬ Level 4",
    ]

    model_accuracy_df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
    
    # Add a new column at the beginning for the rank
    model_accuracy_df.insert(0, '#', range(1, len(model_accuracy_df) + 1))
    
    return model_accuracy_df


accuracy_df = get_accuracy_dataframe(all_results)


# Define the column names with icons
headers_with_icons = [
    "πŸ€– Model Name",
    "⭐ Overall",
    "πŸ“ˆ Level 1",
    "πŸ” Level 2",
    "πŸ“˜ Level 3",
    "πŸ”¬ Level 4",
]
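# Passed as the headers argument to the leaderboard gr.Dataframe below; keep this
# list in sync with the column names assigned inside get_accuracy_dataframe.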

column_names = [
    "Model Name",
    "Overall Accuracy",
    "Level 1 Accuracy",
    "Level 2 Accuracy",
    "Level 3 Accuracy",
    "Level 4 Accuracy",
]


# # Function to process data
# def process_data(data):
#     data_for_df = []
#     for file, df in data.items():
#         overall_accuracy = round(calculate_accuracy(df), 2)
#         breakdown_accuracy = [round(acc, 2) for acc in accuracy_breakdown(df)]
#         model_name = file.split("/")[-1].replace(".pkl", "")
#         data_for_df.append([model_name, overall_accuracy] + breakdown_accuracy)
#     return data_for_df


# # Function to finalize DataFrame
# def finalize_df(df):
#     df = df.round(1)  # Round to one decimal place
#     df = df.applymap(lambda x: f"{x:.1f}" if isinstance(x, (int, float)) else x)
#     df.columns = headers_with_icons
#     df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
#     # add a new column with the order (index)
#     df["#"] = range(1, len(df) + 1)
#     # bring rank to the first column
#     cols = df.columns.tolist()
#     cols = cols[-1:] + cols[:-1]
#     df = df[cols]

#     return df


def load_heatmap(evt: gr.SelectData):
    # evt.value is the text of the selected leaderboard cell; a pre-rendered heatmap
    # is expected at results/<model name>.jpg, so selecting a model-name cell loads it.
    heatmap_image = gr.Image(f"results/{evt.value}.jpg")
    return heatmap_image


with gr.Blocks() as demo:
    gr.Markdown("# FSM Benchmark Leaderboard")
    with gr.Tab("Text-only Benchmark"):
        leader_board = gr.Dataframe(accuracy_df, headers=headers_with_icons)
        gr.Markdown("## Heatmap")
        heatmap_image_text = gr.Image(label="", show_label=False)
        leader_board.select(fn=load_heatmap, outputs=[heatmap_image_text])

    # with gr.Tab("Vision Benchmark", visible=False):
    #     gr.Markdown("# Vision Benchmark Leaderboard")
    #     leader_board_vision = gr.Dataframe(
    #         vision_accuracy_df, headers=headers_with_icons
    #     )
    #     gr.Markdown("## Heatmap")
    #     heatmap_image_vision = gr.Image(label="", show_label=False)
    #     leader_board_vision.select(
    #         fn=load_vision_heatmap, outputs=[heatmap_image_vision]
    #     )

    # with gr.Tab("Text-only Benchmark (CoT)", visible=False):
    #     gr.Markdown("# Text-only Leaderboard (CoT)")
    #     cot_leader_board_text = gr.Dataframe(
    #         cot_text_accuracy_df, headers=headers_with_icons
    #     )
    #     gr.Markdown("## Heatmap")
    #     cot_heatmap_image_text = gr.Image(label="", show_label=False)
    #     cot_leader_board_text.select(
    #         fn=load_cot_heatmap, outputs=[cot_heatmap_image_text]
    #     )

    # with gr.Tab("Constraint Text-only Results (CoT)", visible=False):
    #     gr.Markdown("## Constraint Text-only Leaderboard by first substring (CoT)")
    #     included_models_cot = gr.CheckboxGroup(
    #         label="Models to include",
    #         choices=all_cot_text_only_models,
    #         value=all_cot_text_only_models,
    #         interactive=True,
    #     )
    #     with gr.Row():
    #         number_of_queries_cot = gr.Textbox(label="Number of included queries")
    #         number_of_fsms_cot = gr.Textbox(label="Number of included FSMs")

    #     constrained_leader_board_text_cot = gr.Dataframe()
    #     constrained_leader_board_plot_cot = gr.Plot()

    # with gr.Tab("Majority Vote (Subset 1)", visible=False):
    #     gr.Markdown("## Majority Vote (Subset 1)")
    #     intersection_leader_board = gr.Dataframe(
    #         intersection_df_acc, headers=headers_with_icons
    #     )
    #     heatmap_image = gr.Plot(label="Model Heatmap")

    # with gr.Tab("Text-only Benchmark (deprecated)", visible=False):
    #     gr.Markdown("# Text-only Leaderboard")
    #     leader_board = gr.Dataframe(accuracy_df, headers=headers_with_icons)
    #     gr.Markdown("## Heatmap")
    #     heatmap_image = gr.Image(label="", show_label=False)
    #     leader_board.select(fn=load_heatmap, outputs=[heatmap_image])

    # # ============ Callbacks ============

    # included_models_cot.select(
    #     fn=calculate_order_by_first_substring_cot,
    #     inputs=[included_models_cot],
    #     outputs=[
    #         constrained_leader_board_text_cot,
    #         number_of_queries_cot,
    #         number_of_fsms_cot,
    #     ],
    #     queue=True,
    # )

    # constrained_leader_board_text.select(
    #     fn=show_constraint_heatmap, outputs=[constrained_leader_board_plot]
    # )

    # constrained_leader_board_text_cot.select(
    #     fn=show_constraint_heatmap_cot, outputs=[constrained_leader_board_plot_cot]
    # )

    # intersection_leader_board.select(
    #     fn=show_intersection_heatmap, outputs=[heatmap_image]
    # )

demo.launch()
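# If the app needs to be reached externally when run locally, launch() also accepts
# options such as share=True (temporary public link) or server_port=7860.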