import gradio as gr
import pandas as pd

all_results = pd.read_pickle("all_results.pkl")
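# all_results is assumed to hold one row per (model, question) with at least the
# columns used below: model_name, difficulty_level, and parsed_judge_response.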


def get_accuracy_dataframe(df):
    # Calculate overall model accuracy
    df['parsed_judge_response'] = df['parsed_judge_response'].astype(float)
    model_accuracy = df.groupby('model_name')['parsed_judge_response'].mean().reset_index()
    
    # Calculate model accuracy per difficulty level
    df['difficulty_level'] = df['difficulty_level'].astype(int)
    model_accuracy_per_level = df.groupby(['model_name', 'difficulty_level'])['parsed_judge_response'].mean().reset_index()
    model_accuracy_per_level_df = model_accuracy_per_level.pivot(index='model_name', columns='difficulty_level', values='parsed_judge_response')
    
    # Merge overall accuracy and level-based accuracy into a single DataFrame
    model_accuracy_df = model_accuracy.merge(model_accuracy_per_level_df, on='model_name')
    # Only difficulty levels 1-4 are displayed below; keep this rename map in sync
    # with the header list assigned to the columns a few lines down.
    model_accuracy_df.rename(columns={1: 'level_1', 2: 'level_2', 3: 'level_3', 4: 'level_4'}, inplace=True)
    model_accuracy_df.rename(columns={'parsed_judge_response': 'Accuracy'}, inplace=True)
    
    # Multiply by 100 and format to one decimal point
    model_accuracy_df = model_accuracy_df.applymap(lambda x: round(x * 100, 1) if isinstance(x, float) else x)
    
    # Add headers with icons
    model_accuracy_df.columns = [
        "πŸ€– Model Name",
        "⭐ Overall",
        "πŸ“ˆ Level 1",
        "πŸ” Level 2",
        "πŸ“˜ Level 3",
        "πŸ”¬ Level 4",
    ]

    model_accuracy_df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
    
    # Add a new column at the beginning for the rank
    model_accuracy_df.insert(0, '#', range(1, len(model_accuracy_df) + 1))
    
    return model_accuracy_df


accuracy_df = get_accuracy_dataframe(all_results)


# Define the column names with icons
headers_with_icons = [
    "πŸ€– Model Name",
    "⭐ Overall",
    "πŸ“ˆ Level 1",
    "πŸ” Level 2",
    "πŸ“˜ Level 3",
    "πŸ”¬ Level 4",
]
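# Passed as the headers argument to the leaderboard gr.Dataframe below; keep this
# list in sync with the column names assigned inside get_accuracy_dataframe.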

column_names = [
    "Model Name",
    "Overall Accuracy",
    "Level 1 Accuracy",
    "Level 2 Accuracy",
    "Level 3 Accuracy",
    "Level 4 Accuracy",
]


# # Function to process data
# def process_data(data):
#     data_for_df = []
#     for file, df in data.items():
#         overall_accuracy = round(calculate_accuracy(df), 2)
#         breakdown_accuracy = [round(acc, 2) for acc in accuracy_breakdown(df)]
#         model_name = file.split("/")[-1].replace(".pkl", "")
#         data_for_df.append([model_name, overall_accuracy] + breakdown_accuracy)
#     return data_for_df


# # Function to finalize DataFrame
# def finalize_df(df):
#     df = df.round(1)  # Round to one decimal place
#     df = df.applymap(lambda x: f"{x:.1f}" if isinstance(x, (int, float)) else x)
#     df.columns = headers_with_icons
#     df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
#     # add a new column with the order (index)
#     df["#"] = range(1, len(df) + 1)
#     # bring rank to the first column
#     cols = df.columns.tolist()
#     cols = cols[-1:] + cols[:-1]
#     df = df[cols]

#     return df


def load_heatmap(evt: gr.SelectData):
    # evt.value is the text of the selected leaderboard cell; a pre-rendered heatmap
    # is expected at results/<model name>.jpg, so selecting a model-name cell loads it.
    heatmap_image = gr.Image(f"results/{evt.value}.jpg")
    return heatmap_image


with gr.Blocks() as demo:
    gr.Markdown("# FSM Benchmark Leaderboard")
    with gr.Tab("Text-only Benchmark"):
        leader_board = gr.Dataframe(accuracy_df, headers=headers_with_icons)
        gr.Markdown("## Heatmap")
        heatmap_image_text = gr.Image(label="", show_label=False)
        leader_board.select(fn=load_heatmap, outputs=[heatmap_image_text])

    # with gr.Tab("Vision Benchmark", visible=False):
    #     gr.Markdown("# Vision Benchmark Leaderboard")
    #     leader_board_vision = gr.Dataframe(
    #         vision_accuracy_df, headers=headers_with_icons
    #     )
    #     gr.Markdown("## Heatmap")
    #     heatmap_image_vision = gr.Image(label="", show_label=False)
    #     leader_board_vision.select(
    #         fn=load_vision_heatmap, outputs=[heatmap_image_vision]
    #     )

    # with gr.Tab("Text-only Benchmark (CoT)", visible=False):
    #     gr.Markdown("# Text-only Leaderboard (CoT)")
    #     cot_leader_board_text = gr.Dataframe(
    #         cot_text_accuracy_df, headers=headers_with_icons
    #     )
    #     gr.Markdown("## Heatmap")
    #     cot_heatmap_image_text = gr.Image(label="", show_label=False)
    #     cot_leader_board_text.select(
    #         fn=load_cot_heatmap, outputs=[cot_heatmap_image_text]
    #     )

    # with gr.Tab("Constraint Text-only Results (CoT)", visible=False):
    #     gr.Markdown("## Constraint Text-only Leaderboard by first substring (CoT)")
    #     included_models_cot = gr.CheckboxGroup(
    #         label="Models to include",
    #         choices=all_cot_text_only_models,
    #         value=all_cot_text_only_models,
    #         interactive=True,
    #     )
    #     with gr.Row():
    #         number_of_queries_cot = gr.Textbox(label="Number of included queries")
    #         number_of_fsms_cot = gr.Textbox(label="Number of included FSMs")

    #     constrained_leader_board_text_cot = gr.Dataframe()
    #     constrained_leader_board_plot_cot = gr.Plot()

    # with gr.Tab("Majority Vote (Subset 1)", visible=False):
    #     gr.Markdown("## Majority Vote (Subset 1)")
    #     intersection_leader_board = gr.Dataframe(
    #         intersection_df_acc, headers=headers_with_icons
    #     )
    #     heatmap_image = gr.Plot(label="Model Heatmap")

    # with gr.Tab("Text-only Benchmark (deprecated)", visible=False):
    #     gr.Markdown("# Text-only Leaderboard")
    #     leader_board = gr.Dataframe(accuracy_df, headers=headers_with_icons)
    #     gr.Markdown("## Heatmap")
    #     heatmap_image = gr.Image(label="", show_label=False)
    #     leader_board.select(fn=load_heatmap, outputs=[heatmap_image])

    # # ============ Callbacks ============

    # included_models_cot.select(
    #     fn=calculate_order_by_first_substring_cot,
    #     inputs=[included_models_cot],
    #     outputs=[
    #         constrained_leader_board_text_cot,
    #         number_of_queries_cot,
    #         number_of_fsms_cot,
    #     ],
    #     queue=True,
    # )

    # constrained_leader_board_text.select(
    #     fn=show_constraint_heatmap, outputs=[constrained_leader_board_plot]
    # )

    # constrained_leader_board_text_cot.select(
    #     fn=show_constraint_heatmap_cot, outputs=[constrained_leader_board_plot_cot]
    # )

    # intersection_leader_board.select(
    #     fn=show_intersection_heatmap, outputs=[heatmap_image]
    # )

demo.launch()
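# If the app needs to be reached externally when run locally, launch() also accepts
# options such as share=True (temporary public link) or server_port=7860.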