import os
from glob import glob

import gradio as gr
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib.colors import BoundaryNorm, ListedColormap

# Note: os, glob, matplotlib, seaborn, BoundaryNorm and ListedColormap are not
# used by the live UI below; they are kept so the commented-out sections
# further down can be re-enabled without touching the imports.
all_results = pd.read_pickle("all_results.pkl")
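
# Expected schema of all_results (inferred from the code below): one row per
# model answer, with at least 'model_name', 'difficulty_level' (1-4) and
# 'parsed_judge_response' (1.0 for a correct answer, 0.0 otherwise).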

def get_accuracy_dataframe(df):
    # Calculate overall model accuracy
    df['parsed_judge_response'] = df['parsed_judge_response'].astype(float)
    model_accuracy = df.groupby('model_name')['parsed_judge_response'].mean().reset_index()

    # Calculate model accuracy per difficulty level, pivoted to one column per level
    df['difficulty_level'] = df['difficulty_level'].astype(int)
    model_accuracy_per_level = df.groupby(['model_name', 'difficulty_level'])['parsed_judge_response'].mean().reset_index()
    model_accuracy_per_level_df = model_accuracy_per_level.pivot(index='model_name', columns='difficulty_level', values='parsed_judge_response')

    # Merge overall accuracy and level-based accuracy into a single DataFrame
    model_accuracy_df = model_accuracy.merge(model_accuracy_per_level_df, on='model_name')
    model_accuracy_df.rename(columns={1: 'level_1', 2: 'level_2', 3: 'level_3', 4: 'level_4', 5: 'level_5'}, inplace=True)
    model_accuracy_df.rename(columns={'parsed_judge_response': 'Accuracy'}, inplace=True)

    # Convert to percentages, rounded to one decimal place
    model_accuracy_df = model_accuracy_df.applymap(lambda x: round(x * 100, 1) if isinstance(x, float) else x)

    # Add headers with icons; the six headers assume the data contains
    # difficulty levels 1-4 only (the level_5 rename above is then a no-op)
    model_accuracy_df.columns = [
        "🤖 Model Name",
        "⭐ Overall",
        "📊 Level 1",
        "📊 Level 2",
        "📊 Level 3",
        "🔬 Level 4",
    ]
    model_accuracy_df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
    return model_accuracy_df

accuracy_df = get_accuracy_dataframe(all_results)
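# Sanity check (hypothetical, not part of the app): the frame is sorted, so
# the first row is the best-scoring model.
# assert accuracy_df["⭐ Overall"].is_monotonic_decreasing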
# Define the column names with icons (must match get_accuracy_dataframe)
headers_with_icons = [
    "🤖 Model Name",
    "⭐ Overall",
    "📊 Level 1",
    "📊 Level 2",
    "📊 Level 3",
    "🔬 Level 4",
]
# Plain-text column names (currently unused by the live UI)
column_names = [
    "Model Name",
    "Overall Accuracy",
    "Level 1 Accuracy",
    "Level 2 Accuracy",
    "Level 3 Accuracy",
    "Level 4 Accuracy",
]

def load_heatmap(evt: gr.SelectData):
    # Load the pre-rendered heatmap image for the clicked leaderboard cell
    heatmap_image = gr.Image(f"results/{evt.value}.jpg")
    return heatmap_image
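
# Hypothetical defensive variant (a sketch, not wired into the UI): return
# None when no pre-rendered heatmap exists, so the image component is cleared
# instead of erroring on a missing file.
# def load_heatmap_safe(evt: gr.SelectData):
#     path = f"results/{evt.value}.jpg"
#     return gr.Image(path) if os.path.exists(path) else None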

# # Function to process data
# def process_data(data):
#     data_for_df = []
#     for file, df in data.items():
#         overall_accuracy = round(calculate_accuracy(df), 2)
#         breakdown_accuracy = [round(acc, 2) for acc in accuracy_breakdown(df)]
#         model_name = file.split("/")[-1].replace(".pkl", "")
#         data_for_df.append([model_name, overall_accuracy] + breakdown_accuracy)
#     return data_for_df

# # Function to finalize DataFrame
# def finalize_df(df):
#     df = df.round(1)  # Round to one decimal place
#     df = df.applymap(lambda x: f"{x:.1f}" if isinstance(x, (int, float)) else x)
#     df.columns = headers_with_icons
#     df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
#     # add a new column with the order (index)
#     df["#"] = range(1, len(df) + 1)
#     # bring rank to the first column
#     cols = df.columns.tolist()
#     cols = cols[-1:] + cols[:-1]
#     df = df[cols]
#     return df

with gr.Blocks() as demo:
    gr.Markdown("# FSM Benchmark Leaderboard")

    with gr.Tab("Text-only Benchmark"):
        leader_board = gr.Dataframe(accuracy_df, headers=headers_with_icons)
        gr.Markdown("## Heatmap")
        heatmap_image_qwen = gr.Image(label="", show_label=False)
        leader_board.select(fn=load_heatmap, outputs=[heatmap_image_qwen])
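        # Note: evt.value is the text of the clicked cell, so the heatmap only
        # resolves when a "🤖 Model Name" cell is clicked; clicks on accuracy
        # cells point at a nonexistent results/<value>.jpg.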
    # with gr.Tab("Vision Benchmark", visible=False):
    #     gr.Markdown("# Vision Benchmark Leaderboard")
    #     leader_board_vision = gr.Dataframe(
    #         vision_accuracy_df, headers=headers_with_icons
    #     )
    #     gr.Markdown("## Heatmap")
    #     heatmap_image_vision = gr.Image(label="", show_label=False)
    #     leader_board_vision.select(
    #         fn=load_vision_heatmap, outputs=[heatmap_image_vision]
    #     )

    # with gr.Tab("Text-only Benchmark (CoT)", visible=False):
    #     gr.Markdown("# Text-only Leaderboard (CoT)")
    #     cot_leader_board_text = gr.Dataframe(
    #         cot_text_accuracy_df, headers=headers_with_icons
    #     )
    #     gr.Markdown("## Heatmap")
    #     cot_heatmap_image_text = gr.Image(label="", show_label=False)
    #     cot_leader_board_text.select(
    #         fn=load_cot_heatmap, outputs=[cot_heatmap_image_text]
    #     )

    # with gr.Tab("Constraint Text-only Results (CoT)", visible=False):
    #     gr.Markdown("## Constraint Text-only Leaderboard by first substring (CoT)")
    #     included_models_cot = gr.CheckboxGroup(
    #         label="Models to include",
    #         choices=all_cot_text_only_models,
    #         value=all_cot_text_only_models,
    #         interactive=True,
    #     )
    #     with gr.Row():
    #         number_of_queries_cot = gr.Textbox(label="Number of included queries")
    #         number_of_fsms_cot = gr.Textbox(label="Number of included FSMs")
    #     constrained_leader_board_text_cot = gr.Dataframe()
    #     constrained_leader_board_plot_cot = gr.Plot()

    # with gr.Tab("Majority Vote (Subset 1)", visible=False):
    #     gr.Markdown("## Majority Vote (Subset 1)")
    #     intersection_leader_board = gr.Dataframe(
    #         intersection_df_acc, headers=headers_with_icons
    #     )
    #     heatmap_image = gr.Plot(label="Model Heatmap")

    # with gr.Tab("Text-only Benchmark (deprecated)", visible=False):
    #     gr.Markdown("# Text-only Leaderboard")
    #     leader_board = gr.Dataframe(accuracy_df, headers=headers_with_icons)
    #     gr.Markdown("## Heatmap")
    #     heatmap_image = gr.Image(label="", show_label=False)
    #     leader_board.select(fn=load_heatmap, outputs=[heatmap_image])

    # # ============ Callbacks ============
    # included_models_cot.select(
    #     fn=calculate_order_by_first_substring_cot,
    #     inputs=[included_models_cot],
    #     outputs=[
    #         constrained_leader_board_text_cot,
    #         number_of_queries_cot,
    #         number_of_fsms_cot,
    #     ],
    #     queue=True,
    # )
    # constrained_leader_board_text.select(
    #     fn=show_constraint_heatmap, outputs=[constrained_leader_board_plot]
    # )
    # constrained_leader_board_text_cot.select(
    #     fn=show_constraint_heatmap_cot, outputs=[constrained_leader_board_plot_cot]
    # )
    # intersection_leader_board.select(
    #     fn=show_intersection_heatmap, outputs=[heatmap_image]
    # )
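
# For local debugging one could use demo.launch(share=True) instead, which
# also opens a temporary public link (standard Gradio option); on Spaces the
# bare launch() suffices.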
demo.launch()