import gradio as gr
import pandas as pd
from glob import glob

# Collect the pickled result files for each benchmark variant
text_results = glob("results/*.pkl")
vision_results = glob("results-vision/*.pkl")
cot_text_results = glob("results-cot/*.pkl")
cot_vision_results = glob("results-vision-CoT/*.pkl")

# Load each pickle into a dict keyed by file path
data = {file: pd.read_pickle(file) for file in text_results}
vision_data = {file: pd.read_pickle(file) for file in vision_results}
cot_text_data = {file: pd.read_pickle(file) for file in cot_text_results}
cot_vision_data = {file: pd.read_pickle(file) for file in cot_vision_results}


def calculate_accuracy(df):
    # "parsed_judge_response" holds 0/1 correctness judgements
    return df["parsed_judge_response"].mean() * 100


def accuracy_breakdown(df):
    # Per-level accuracy across the four difficulty levels
    return (df.groupby("difficulty_level")["parsed_judge_response"].mean() * 100).values


# Column names with icons, used for display
headers_with_icons = [
    "🤖 Model Name",
    "⭐ Overall",
    "📈 Level 1",
    "🔍 Level 2",
    "📘 Level 3",
    "🔬 Level 4",
]

# Plain column names, used while the values are still numeric
column_names = [
    "Model Name",
    "Overall Accuracy",
    "Level 1 Accuracy",
    "Level 2 Accuracy",
    "Level 3 Accuracy",
    "Level 4 Accuracy",
]


def process_data(data):
    # Build one leaderboard row per model: [name, overall, level 1..4]
    data_for_df = []
    for file, df in data.items():
        overall_accuracy = round(calculate_accuracy(df), 2)
        breakdown_accuracy = [round(acc, 2) for acc in accuracy_breakdown(df)]
        model_name = file.split("/")[-1].replace(".pkl", "")
        data_for_df.append([model_name, overall_accuracy] + breakdown_accuracy)
    return data_for_df


# Process all benchmark variants
text_data_for_df = process_data(data)
vision_data_for_df = process_data(vision_data)
cot_text_data_for_df = process_data(cot_text_data)
cot_vision_data_for_df = process_data(cot_vision_data)

# Create the leaderboard DataFrames
accuracy_df = pd.DataFrame(text_data_for_df, columns=column_names)
vision_accuracy_df = pd.DataFrame(vision_data_for_df, columns=column_names)
cot_text_accuracy_df = pd.DataFrame(cot_text_data_for_df, columns=column_names)
cot_vision_accuracy_df = pd.DataFrame(cot_vision_data_for_df, columns=column_names)


def finalize_df(df):
    # Sort numerically before formatting the values as strings; sorting
    # afterwards would order them lexicographically (e.g. "9.5" above "85.0").
    df = df.sort_values(by="Overall Accuracy", ascending=False)
    df = df.round(1)  # Round to one decimal place
    # Note: DataFrame.applymap is deprecated in pandas >= 2.1 in favour of DataFrame.map
    df = df.applymap(lambda x: f"{x:.1f}" if isinstance(x, (int, float)) else x)
    df.columns = headers_with_icons
    return df


# Finalize all DataFrames for display
accuracy_df = finalize_df(accuracy_df)
vision_accuracy_df = finalize_df(vision_accuracy_df)
cot_text_accuracy_df = finalize_df(cot_text_accuracy_df)
cot_vision_accuracy_df = finalize_df(cot_vision_accuracy_df)


# evt.value is the text of the selected cell, so clicking a model-name
# cell loads that model's heatmap image from the matching results folder.
def load_heatmap(evt: gr.SelectData):
    return gr.Image(f"results/{evt.value}.jpg")


def load_vision_heatmap(evt: gr.SelectData):
    return gr.Image(f"results-vision/{evt.value}.jpg")


def load_cot_heatmap(evt: gr.SelectData):
    return gr.Image(f"results-cot/{evt.value}.jpg")


def load_cot_vision_heatmap(evt: gr.SelectData):
    return gr.Image(f"results-vision-CoT/{evt.value}.jpg")


with gr.Blocks() as demo:
    gr.Markdown("# FSM Benchmark Leaderboard")

    with gr.Tab("Text-only Benchmark"):
        gr.Markdown("# Text-only Leaderboard")
        leader_board = gr.Dataframe(accuracy_df, headers=headers_with_icons)
        gr.Markdown("## Heatmap")
        heatmap_image = gr.Image(label="", show_label=False)
        leader_board.select(fn=load_heatmap, outputs=[heatmap_image])

    with gr.Tab("Vision Benchmark"):
        gr.Markdown("# Vision Benchmark Leaderboard")
        leader_board_vision = gr.Dataframe(
            vision_accuracy_df, headers=headers_with_icons
        )
        gr.Markdown("## Heatmap")
        heatmap_image_vision = gr.Image(label="", show_label=False)
        leader_board_vision.select(
            fn=load_vision_heatmap, outputs=[heatmap_image_vision]
        )

    with gr.Tab("CoT Text-only Benchmark"):
        gr.Markdown("# CoT Text-only Leaderboard")
        cot_leader_board_text = gr.Dataframe(
            cot_text_accuracy_df, headers=headers_with_icons
        )
        gr.Markdown("## Heatmap")
        cot_heatmap_image_text = gr.Image(label="", show_label=False)
        cot_leader_board_text.select(
            fn=load_cot_heatmap, outputs=[cot_heatmap_image_text]
        )

    with gr.Tab("CoT Vision Benchmark"):
        gr.Markdown("# CoT Vision Benchmark Leaderboard")
        cot_leader_board_vision = gr.Dataframe(
            cot_vision_accuracy_df, headers=headers_with_icons
        )
        gr.Markdown("## Heatmap")
        cot_heatmap_image_vision = gr.Image(label="", show_label=False)
        cot_leader_board_vision.select(
            fn=load_cot_vision_heatmap, outputs=[cot_heatmap_image_vision]
        )

demo.launch()
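
# ----------------------------------------------------------------------
# Illustrative sketch, not part of the app: judging from the accuracy
# helpers above, each results/*.pkl file is assumed to hold a DataFrame
# with a 0/1 "parsed_judge_response" column and a "difficulty_level"
# column spanning four levels. A dummy file for local testing could be
# generated like this (the name "example-model" is a placeholder):
#
#   import numpy as np
#   dummy = pd.DataFrame({
#       "parsed_judge_response": np.random.randint(0, 2, size=100),
#       "difficulty_level": np.random.randint(1, 5, size=100),
#   })
#   dummy.to_pickle("results/example-model.pkl")
# ----------------------------------------------------------------------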