import json
import gzip

import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
from io import StringIO

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    BENCHMARK_COLS,
    BENCHMARK_COLS_MULTIMODAL,
    BENCHMARK_COLS_MIB_SUBGRAPH,
    BENCHMARK_COLS_MIB_CAUSALGRAPH,
    COLS,
    COLS_MIB_SUBGRAPH,
    COLS_MIB_CAUSALGRAPH,
    COLS_MULTIMODAL,
    EVAL_COLS,
    EVAL_TYPES,
    AutoEvalColumn,
    AutoEvalColumn_mib_subgraph,
    AutoEvalColumn_mib_causalgraph,
    fields,
)
from src.envs import (
    API,
    EVAL_REQUESTS_PATH,
    QUEUE_REPO,
    REPO_ID,
    TOKEN,
    RESULTS_REPO_MIB_SUBGRAPH,
    EVAL_RESULTS_MIB_SUBGRAPH_PATH,
    RESULTS_REPO_MIB_CAUSALGRAPH,
    EVAL_RESULTS_MIB_CAUSALGRAPH_PATH,
)
from src.populate import (
    get_evaluation_queue_df,
    get_leaderboard_df,
    get_leaderboard_df_mib_subgraph,
    get_leaderboard_df_mib_causalgraph,
)
from src.submission.submit import add_new_eval


def restart_space():
    API.restart_space(repo_id=REPO_ID)


### Space initialisation
try:
    # print(EVAL_REQUESTS_PATH)
    snapshot_download(
        repo_id=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH,
        repo_type="dataset",
        tqdm_class=None,
        etag_timeout=30,
        token=TOKEN,
    )
except Exception:
    restart_space()

try:
    # print(RESULTS_REPO_MIB_SUBGRAPH)
    snapshot_download(
        repo_id=RESULTS_REPO_MIB_SUBGRAPH,
        local_dir=EVAL_RESULTS_MIB_SUBGRAPH_PATH,
        repo_type="dataset",
        tqdm_class=None,
        etag_timeout=30,
        token=TOKEN,
    )
except Exception:
    restart_space()

try:
    # print(RESULTS_REPO_MIB_CAUSALGRAPH)
    snapshot_download(
        repo_id=RESULTS_REPO_MIB_CAUSALGRAPH,
        local_dir=EVAL_RESULTS_MIB_CAUSALGRAPH_PATH,
        repo_type="dataset",
        tqdm_class=None,
        etag_timeout=30,
        token=TOKEN,
    )
except Exception:
    restart_space()


LEADERBOARD_DF_MIB_SUBGRAPH = get_leaderboard_df_mib_subgraph(
    EVAL_RESULTS_MIB_SUBGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_SUBGRAPH, BENCHMARK_COLS_MIB_SUBGRAPH
)

# LEADERBOARD_DF_MIB_CAUSALGRAPH = get_leaderboard_df_mib_causalgraph(EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_CAUSALGRAPH, BENCHMARK_COLS_MIB_CAUSALGRAPH)
LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AGGREGATED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AVERAGED = get_leaderboard_df_mib_causalgraph(
    EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_CAUSALGRAPH, BENCHMARK_COLS_MIB_CAUSALGRAPH
)

# LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
# LEADERBOARD_DF_MULTIMODAL = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS_MULTIMODAL, BENCHMARK_COLS_MULTIMODAL)

(
    finished_eval_queue_df,
    running_eval_queue_df,
    pending_eval_queue_df,
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
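# The subgraph leaderboard code below assumes the results DataFrame exposes a "Method"
# column, an "Average" column, and one score column per benchmark/model pair named
# "<benchmark>_<model>". A minimal sketch of that assumed layout (the column names
# "ioi_gpt2" and "mcqa_qwen2.5" are hypothetical examples, not guaranteed MIB keys):
#
#     pd.DataFrame({
#         "Method": ["example-method"],
#         "Average": [0.42],
#         "ioi_gpt2": [0.39],
#         "mcqa_qwen2.5": [0.45],
#     })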
from src.about import TasksMib_Subgraph
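# Illustrative sketch only (not used by the app): given the TasksMib_Subgraph enum
# imported above, the expected "<benchmark>_<model>" result columns can be enumerated
# as below, assuming each member exposes `value.benchmark` and `value.models` the way
# the filtering code in this file does. `_expected_subgraph_columns` is a hypothetical
# helper, included purely for documentation.
def _expected_subgraph_columns():
    return [
        f"{task.value.benchmark}_{model}"
        for task in TasksMib_Subgraph
        for model in task.value.models
    ]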
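# Active implementation below: parses the "<benchmark>_<model>" result columns to build
# selection groups keyed by benchmark and by model, which are then offered as
# SelectColumns choices on the subgraph leaderboard.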
# """ # if dataframe is None or dataframe.empty: # raise ValueError("Leaderboard DataFrame is empty or None.") # # Get all task-model combinations that actually exist in our data # task_model_columns = [] # for task in TasksMib_Subgraph: # for model in task.value.models: # col_name = f"{task.value.benchmark}_{model}" # if col_name in dataframe.columns: # task_model_columns.append(col_name) # return Leaderboard( # value=dataframe, # datatype=[c.type for c in fields(AutoEvalColumn_mib_subgraph)], # select_columns=SelectColumns( # default_selection=task_model_columns, # label="Select Task-Model Combinations:", # ), # search_columns=["Method"], # Keep Method searchable but not in column selection # hide_columns=[], # We don't need to hide any columns # bool_checkboxgroup_label="Hide models", # interactive=False, # ) # def init_leaderboard_mib_subgraph(dataframe, track): # """Initialize the subgraph leaderboard with verified task/model column selection""" # if dataframe is None or dataframe.empty: # raise ValueError("Leaderboard DataFrame is empty or None.") # # First, let's identify which columns actually exist in our dataframe # print("Available columns in dataframe:", dataframe.columns.tolist()) # # Create task selections based on TasksMib_Subgraph definition # task_selections = [] # for task in TasksMib_Subgraph: # task_cols = [] # for model in task.value.models: # col_name = f"{task.value.benchmark}_{model}" # if col_name in dataframe.columns: # task_cols.append(col_name) # if task_cols: # Only add tasks that have data # print(f"Task {task.value.benchmark} has columns:", task_cols) # task_selections.append(f"Task: {task.value.benchmark}") # # Create model selections by checking which models appear in columns # model_selections = [] # all_models = list(set(model for task in TasksMib_Subgraph for model in task.value.models)) # for model in all_models: # model_cols = [] # for task in TasksMib_Subgraph: # if model in task.value.models: # col_name = f"{task.value.benchmark}_{model}" # if col_name in dataframe.columns: # model_cols.append(col_name) # if model_cols: # Only add models that have data # print(f"Model {model} has columns:", model_cols) # model_selections.append(f"Model: {model}") # # Combine all selections # selections = task_selections + model_selections # print("Final selection options:", selections) # # Print DataFrame information # print("\nDebugging DataFrame:") # print("DataFrame columns:", dataframe.columns.tolist()) # print("DataFrame shape:", dataframe.shape) # print("DataFrame head:\n", dataframe.head()) # return Leaderboard( # value=dataframe, # datatype=[c.type for c in fields(AutoEvalColumn_mib_subgraph)], # select_columns=SelectColumns( # default_selection=selections, # label="Select Tasks or Models:" # ), # search_columns=["Method"], # hide_columns=[c.name for c in fields(AutoEvalColumn_mib_subgraph) if c.hidden], # bool_checkboxgroup_label="Hide models", # interactive=False, # ) def init_leaderboard_mib_subgraph(dataframe, track): """Initialize the subgraph leaderboard with benchmark and model filtering capabilities.""" if dataframe is None or dataframe.empty: raise ValueError("Leaderboard DataFrame is empty or None.") # Print DataFrame information for debugging print("\nDebugging DataFrame columns:", dataframe.columns.tolist()) # Get result columns (excluding Method and Average) result_columns = [col for col in dataframe.columns if col not in ['Method', 'Average'] and '_' in col] # Create benchmark and model selections benchmarks = set() models = set() 
print(f"\nDebugging Result Columns: {result_columns}") # Extract unique benchmarks and models from column names for col in result_columns: print(f"col is {col}") benchmark, model = col.split('_') benchmarks.add(benchmark) models.add(model) print(f"benchmark is {benchmark} and model is {model}") # Create selection groups benchmark_selections = { # For each benchmark, store which columns should be shown benchmark: [col for col in result_columns if col.startswith(f"{benchmark}_")] for benchmark in benchmarks } model_selections = { # For each model, store which columns should be shown model: [col for col in result_columns if col.startswith(f"_{model}")] for model in models } # Combine the selection mappings selection_groups = { **benchmark_selections, **model_selections } print("\nDebugging Selection Groups:") print("Benchmarks:", benchmark_selections.keys()) print("Models:", model_selections.keys()) # Convert keys to list for selection options selection_options = list(selection_groups.keys()) return Leaderboard( value=dataframe, datatype=[c.type for c in fields(AutoEvalColumn_mib_subgraph)], select_columns=SelectColumns( default_selection=selection_options, # Show all options by default label="Filter by Benchmark or Model:" ), search_columns=["Method"], hide_columns=[], interactive=False, ) def init_leaderboard_mib_causalgraph(dataframe, track): # print("Debugging column issues:") # print("\nActual DataFrame columns:") # print(dataframe.columns.tolist()) # print("\nExpected columns for Leaderboard:") expected_cols = [c.name for c in fields(AutoEvalColumn_mib_causalgraph)] # print(expected_cols) # print("\nMissing columns:") missing_cols = [col for col in expected_cols if col not in dataframe.columns] # print(missing_cols) # print("\nSample of DataFrame content:") # print(dataframe.head().to_string()) return Leaderboard( value=dataframe, datatype=[c.type for c in fields(AutoEvalColumn_mib_causalgraph)], select_columns=SelectColumns( default_selection=[c.name for c in fields(AutoEvalColumn_mib_causalgraph) if c.displayed_by_default], cant_deselect=[c.name for c in fields(AutoEvalColumn_mib_causalgraph) if c.never_hidden], label="Select Columns to Display:", ), search_columns=["Method"], hide_columns=[c.name for c in fields(AutoEvalColumn_mib_causalgraph) if c.hidden], bool_checkboxgroup_label="Hide models", interactive=False, ) def init_leaderboard_mib_causalgraph(dataframe, track): # print("Debugging column issues:") # print("\nActual DataFrame columns:") # print(dataframe.columns.tolist()) # Create only necessary columns return Leaderboard( value=dataframe, datatype=[c.type for c in fields(AutoEvalColumn_mib_causalgraph)], select_columns=SelectColumns( default_selection=["Method"], # Start with just Method column cant_deselect=["Method"], # Method column should always be visible label="Select Columns to Display:", ), search_columns=["Method"], hide_columns=[], bool_checkboxgroup_label="Hide models", interactive=False, ) def init_leaderboard(dataframe, track): if dataframe is None or dataframe.empty: raise ValueError("Leaderboard DataFrame is empty or None.") # filter for correct track dataframe = dataframe.loc[dataframe["Track"] == track] # print(f"\n\n\n dataframe is {dataframe}\n\n\n") return Leaderboard( value=dataframe, datatype=[c.type for c in fields(AutoEvalColumn)], select_columns=SelectColumns( default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default], cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden], label="Select Columns to 
Display:", ), search_columns=[AutoEvalColumn.model.name], hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden], bool_checkboxgroup_label="Hide models", interactive=False, ) def process_json(temp_file): if temp_file is None: return {} # Handle file upload try: file_path = temp_file.name if file_path.endswith('.gz'): with gzip.open(file_path, 'rt') as f: data = json.load(f) else: with open(file_path, 'r') as f: data = json.load(f) except Exception as e: raise gr.Error(f"Error processing file: {str(e)}") gr.Markdown("Upload successful!") return data demo = gr.Blocks(css=custom_css) with demo: gr.HTML(TITLE) gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text") with gr.Tabs(elem_classes="tab-buttons") as tabs: # with gr.TabItem("Strict", elem_id="strict-benchmark-tab-table", id=0): # leaderboard = init_leaderboard(LEADERBOARD_DF, "strict") # with gr.TabItem("Strict-small", elem_id="strict-small-benchmark-tab-table", id=1): # leaderboard = init_leaderboard(LEADERBOARD_DF, "strict-small") # with gr.TabItem("Multimodal", elem_id="multimodal-benchmark-tab-table", id=2): # leaderboard = init_leaderboard(LEADERBOARD_DF_MULTIMODAL, "multimodal") # with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=4): # gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text") # with gr.TabItem("👶 Submit", elem_id="llm-benchmark-tab-table", id=5): # with gr.Column(): # with gr.Row(): # gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text") # with gr.TabItem("Subgraph", elem_id="subgraph", id=0): # leaderboard = init_leaderboard_mib_subgraph(LEADERBOARD_DF_MIB_SUBGRAPH, "Subgraph") with gr.TabItem("Subgraph", elem_id="subgraph", id=0): # Add description for filters gr.Markdown(""" ### Filtering Options Use the dropdown menus below to filter results by specific tasks or models. You can combine filters to see specific task-model combinations. """) leaderboard = init_leaderboard_mib_subgraph(LEADERBOARD_DF_MIB_SUBGRAPH, "Subgraph") # Then modify the Causal Graph tab section with gr.TabItem("Causal Graph", elem_id="causalgraph", id=1): with gr.Tabs() as causalgraph_tabs: with gr.TabItem("Detailed View", id=0): leaderboard_detailed = init_leaderboard_mib_causalgraph( LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED, "Causal Graph" ) with gr.TabItem("Aggregated View", id=1): leaderboard_aggregated = init_leaderboard_mib_causalgraph( LEADERBOARD_DF_MIB_CAUSALGRAPH_AGGREGATED, "Causal Graph" ) with gr.TabItem("Intervention Averaged", id=2): leaderboard_averaged = init_leaderboard_mib_causalgraph( LEADERBOARD_DF_MIB_CAUSALGRAPH_AVERAGED, "Causal Graph" ) # with gr.Row(): # with gr.Accordion("📙 Citation", open=False): # citation_button = gr.Textbox( # value=CITATION_BUTTON_TEXT, # label=CITATION_BUTTON_LABEL, # lines=20, # elem_id="citation-button", # show_copy_button=True, # ) scheduler = BackgroundScheduler() scheduler.add_job(restart_space, "interval", seconds=1800) scheduler.start() demo.launch(share=True, ssr_mode=False)