Spaces:

mib-bench
/

leaderboard

Running

App Files Community

Aaron Mueller committed on May 21

Commit

f59c752

2 Parent(s): 33ddef9 475701c

Merge branch 'main' of https://huggingface.co/spaces/mech-interp-bench/leaderboard

Browse files

Files changed (3) hide show

app.py +90 -18
src/about.py +7 -4
src/leaderboard/read_evals.py +251 -297

app.py CHANGED Viewed

@@ -399,34 +399,76 @@ def init_leaderboard_mib_subgraph(dataframe, track):
 def init_leaderboard_mib_causalgraph(dataframe, track):
     model_name_mapping = {
         "Qwen2ForCausalLM": "Qwen-2.5",
         "GPT2ForCausalLM": "GPT-2",
         "Gemma2ForCausalLM": "Gemma-2",
         "LlamaForCausalLM": "Llama-3.1"
     }
     benchmark_mapping = {
-        "IOI": "IOI",
-        "MCQA": "MCQA",
         "arithmetic_addition": "Arithmetic (+)",
         "arithmetic_subtraction": "Arithmetic (-)",
-        "arc_easy": "ARC (Easy)",
-        "arc_challenge": "ARC (Challenge)"
     }
     display_mapping = {}
     for task in TasksMib_Causalgraph:
         for model in task.value.models:
-            field_name = f"{task.value.col_name}_{model}"
-            display_name = f"{benchmark_mapping[task.value.col_name]} - {model_name_mapping[model]}"
-            display_mapping[field_name] = display_name
     renamed_df = dataframe.rename(columns=display_mapping)
-    print(renamed_df)
     # Create only necessary columns
     return Leaderboard(
@@ -488,8 +530,10 @@ def get_hf_username(hf_repo):
 # Define the preset substrings for filtering
 PRESET_SUBSTRINGS = ["IOI", "MCQA", "Arithmetic", "ARC", "GPT-2", "Qwen-2.5", "Gemma-2", "Llama-3.1"]
 TASK_SUBSTRINGS = ["IOI", "MCQA", "Arithmetic", "ARC"]
 MODEL_SUBSTRINGS = ["GPT-2", "Qwen-2.5", "Gemma-2", "Llama-3.1"]
 def filter_columns_by_substrings(dataframe: pd.DataFrame, selected_task_substrings: List[str],
                                  selected_model_substrings: List[str]) -> pd.DataFrame:
     """
@@ -648,21 +692,21 @@ with demo:
         # Then modify the Causal Graph tab section
         with gr.TabItem("Causal Variable Localization", elem_id="causalgraph", id=1):
             with gr.Tabs() as causalgraph_tabs:
-                with gr.TabItem("Detailed View", id=0):
-                    leaderboard_detailed, data = init_leaderboard_mib_causalgraph(
-                        LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED,
-                        "Causal Graph"
-                    )
-                with gr.TabItem("Aggregated View", id=1):
                     gr.Markdown("""
                     ### Filtering Options
                     Use the dropdown menus below to filter results by specific tasks or models.
                     You can combine filters to see specific task-model combinations.
                     """)
                     task_substring_checkbox = gr.CheckboxGroup(
-                        choices=TASK_SUBSTRINGS,
                         label="View tasks:",
-                        value=TASK_SUBSTRINGS,  # Default to all substrings selected
                     )
                     model_substring_checkbox = gr.CheckboxGroup(
                         choices = MODEL_SUBSTRINGS,
@@ -685,11 +729,39 @@ with demo:
                         inputs=[original_leaderboard, task_substring_checkbox, model_substring_checkbox],
                         outputs=leaderboard_aggregated
                     )
-                with gr.TabItem("Intervention Averaged", id=2):
                     leaderboard_averaged, data = init_leaderboard_mib_causalgraph(
                         LEADERBOARD_DF_MIB_CAUSALGRAPH_AVERAGED,
                         "Causal Graph"
                     )
         with gr.TabItem("Submit", elem_id="llm-benchmark-tab-table", id=2):
             # Track selection

+# @dataclass
+# class TaskMIB_Causalgraph:
+#     benchmark: str      # task name in json (ioi/arithmetic)
+#     models: list[str]   # list of models to show as sub-columns
+#     col_name: str       # display name in leaderboard
+#     metrics: list[str]  # metrics to store (average_score)
+# class TasksMib_Causalgraph(Enum):
+#     task0 = TaskMIB_Subgraph("ioi", ["GPT2ForCausalLM"], "ioi_task", ["average_score"])
+#     task1 = TaskMIB_Subgraph("mcqa", ["Qwen2ForCausalLM", "Gemma2ForCausalLM", "LlamaForCausalLM"], "4_answer_MCQA", ["average_score"])
+#     task2 = TaskMIB_Subgraph("arithmetic_addition", ["Gemma2ForCausalLM", "LlamaForCausalLM"], "arithmetic_addition", ["average_score"])
+#     task3 = TaskMIB_Subgraph("arc_easy", ["Gemma2ForCausalLM", "LlamaForCausalLM"], "arc_easy", ["average_score"])
+#     @classmethod
+#     def get_all_tasks(cls):
+#         """Returns a list of all task benchmarks"""
+#         return [task.value.benchmark for task in cls]
+#     @classmethod
+#     def get_all_models(cls):
+#         """Returns a list of all unique models across all tasks"""
+#         models = set()
+#         for task in cls:
+#             models.update(task.value.models)
+#         return sorted(list(models))
+# ioi_task
+# 4_answer_MCQA
 def init_leaderboard_mib_causalgraph(dataframe, track):
     model_name_mapping = {
         "Qwen2ForCausalLM": "Qwen-2.5",
         "GPT2ForCausalLM": "GPT-2",
+        "GPT2LMHeadModel": "GPT-2",
         "Gemma2ForCausalLM": "Gemma-2",
         "LlamaForCausalLM": "Llama-3.1"
     }
     benchmark_mapping = {
+        "ioi_task": "IOI",
+        "4_answer_MCQA": "MCQA",
         "arithmetic_addition": "Arithmetic (+)",
         "arithmetic_subtraction": "Arithmetic (-)",
+        "ARC_easy": "ARC (Easy)",
+        "RAVEL_task": "RAVEL"
+    }
+    target_variables_mapping = {
+        "output_token": "Output Token",
+        "output_position": "Output Position",
+        "answer_pointer": "Answer Pointer",
+        "answer": "Answer",
+        "Continent": "Continent",
+        "Language": "Language",
+        "Country": "Country",
+        "Language": "Language"
     }
     display_mapping = {}
     for task in TasksMib_Causalgraph:
         for model in task.value.models:
+            for target_variables in task.value.target_variables:
+                field_name = f"{model}_{task.value.col_name}_{target_variables}"
+                display_name = f"{benchmark_mapping[task.value.col_name]} - {model_name_mapping[model]} - {target_variables_mapping[target_variables]}"
+                display_mapping[field_name] = display_name
     renamed_df = dataframe.rename(columns=display_mapping)
     # Create only necessary columns
     return Leaderboard(
 # Define the preset substrings for filtering
 PRESET_SUBSTRINGS = ["IOI", "MCQA", "Arithmetic", "ARC", "GPT-2", "Qwen-2.5", "Gemma-2", "Llama-3.1"]
 TASK_SUBSTRINGS = ["IOI", "MCQA", "Arithmetic", "ARC"]
+TASK_CAUSAL_SUBSTRINGS = ["IOI", "MCQA", "ARC (Easy)", "RAVEL"]
 MODEL_SUBSTRINGS = ["GPT-2", "Qwen-2.5", "Gemma-2", "Llama-3.1"]
 def filter_columns_by_substrings(dataframe: pd.DataFrame, selected_task_substrings: List[str],
                                  selected_model_substrings: List[str]) -> pd.DataFrame:
     """
         # Then modify the Causal Graph tab section
         with gr.TabItem("Causal Variable Localization", elem_id="causalgraph", id=1):
             with gr.Tabs() as causalgraph_tabs:
+                # with gr.TabItem("Detailed View", id=0):
+                #     leaderboard_detailed, data = init_leaderboard_mib_causalgraph(
+                #         LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED,
+                #         "Causal Graph"
+                #     )
+                with gr.TabItem("Highest View", id=0):
                     gr.Markdown("""
                     ### Filtering Options
                     Use the dropdown menus below to filter results by specific tasks or models.
                     You can combine filters to see specific task-model combinations.
                     """)
                     task_substring_checkbox = gr.CheckboxGroup(
+                        choices=TASK_CAUSAL_SUBSTRINGS,
                         label="View tasks:",
+                        value=TASK_CAUSAL_SUBSTRINGS,  # Default to all substrings selected
                     )
                     model_substring_checkbox = gr.CheckboxGroup(
                         choices = MODEL_SUBSTRINGS,
                         inputs=[original_leaderboard, task_substring_checkbox, model_substring_checkbox],
                         outputs=leaderboard_aggregated
                     )
+                with gr.TabItem("Averaged View", id=1):
+                    task_substring_checkbox = gr.CheckboxGroup(
+                        choices=TASK_CAUSAL_SUBSTRINGS,
+                        label="View tasks:",
+                        value=TASK_CAUSAL_SUBSTRINGS,  # Default to all substrings selected
+                    )
+                    model_substring_checkbox = gr.CheckboxGroup(
+                        choices = MODEL_SUBSTRINGS,
+                        label = "View models:",
+                        value = MODEL_SUBSTRINGS
+                    )
                     leaderboard_averaged, data = init_leaderboard_mib_causalgraph(
                         LEADERBOARD_DF_MIB_CAUSALGRAPH_AVERAGED,
                         "Causal Graph"
                     )
+                    original_leaderboard = gr.State(value=data)
+                    task_substring_checkbox.change(
+                        fn=update_leaderboard,
+                        inputs=[original_leaderboard, task_substring_checkbox, model_substring_checkbox],
+                        outputs=leaderboard_averaged
+                    )
+                    model_substring_checkbox.change(
+                        fn=update_leaderboard,
+                        inputs=[original_leaderboard, task_substring_checkbox, model_substring_checkbox],
+                        outputs=leaderboard_averaged
+                    )
+                    # leaderboard_averaged, data = init_leaderboard_mib_causalgraph(
+                    #     LEADERBOARD_DF_MIB_CAUSALGRAPH_AVERAGED,
+                    #     "Causal Graph"
+                    # )
         with gr.TabItem("Submit", elem_id="llm-benchmark-tab-table", id=2):
             # Track selection

src/about.py CHANGED Viewed

@@ -78,12 +78,15 @@ class TaskMIB_Causalgraph:
     models: list[str]   # list of models to show as sub-columns
     col_name: str       # display name in leaderboard
     metrics: list[str]  # metrics to store (average_score)
 class TasksMib_Causalgraph(Enum):
-    task0 = TaskMIB_Subgraph("ioi", ["GPT2ForCausalLM"], "IOI", ["average_score"])
-    task1 = TaskMIB_Subgraph("mcqa", ["Qwen2ForCausalLM", "Gemma2ForCausalLM", "LlamaForCausalLM"], "MCQA", ["average_score"])
-    task2 = TaskMIB_Subgraph("arithmetic_addition", ["Gemma2ForCausalLM", "LlamaForCausalLM"], "arithmetic_addition", ["average_score"])
-    task3 = TaskMIB_Subgraph("arc_easy", ["Gemma2ForCausalLM", "LlamaForCausalLM"], "arc_easy", ["average_score"])
     @classmethod
     def get_all_tasks(cls):

     models: list[str]   # list of models to show as sub-columns
     col_name: str       # display name in leaderboard
     metrics: list[str]  # metrics to store (average_score)
+    target_variables: list[str]
 class TasksMib_Causalgraph(Enum):
+    task0 = TaskMIB_Causalgraph("ioi", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "ioi_task", ["average_score"], ["output_token", "output_position"])
+    task1 = TaskMIB_Causalgraph("mcqa", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "4_answer_MCQA", ["average_score"], ["answer_pointer", "answer"])
+    task2 = TaskMIB_Causalgraph("ravel", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "RAVEL_task", ["average_score"], ["Continent", "Language", "Country", "Language"])
+    task3 = TaskMIB_Causalgraph("arc_easy", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "ARC_easy", ["average_score"], ["answer_pointer", "answer"])
     @classmethod
     def get_all_tasks(cls):

src/leaderboard/read_evals.py CHANGED Viewed

@@ -2,19 +2,22 @@ import glob
 import json
 import math
 import os
 from dataclasses import dataclass
 import dateutil
 import numpy as np
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal
 from src.submission.check_validity import is_model_on_hub
-from src.about import TasksMib_Subgraph
-from typing import List, Dict, Any
-from collections import defaultdict
-import pandas as pd
@@ -205,224 +208,10 @@ def get_raw_eval_results_mib_subgraph(results_path: str) -> List[EvalResult_MIB_
-# def process_single_json(json_file: Dict[str, Any], method_counter: int) -> pd.DataFrame:
-#     """
-#     Process a single JSON file and convert it to a DataFrame.
-#     Args:
-#         json_file: Dictionary containing the analysis results
-#         method_counter: Counter for handling duplicate method names
-#     Returns:
-#         pd.DataFrame: DataFrame for single method with MODEL_TASK_INTERVENTION as columns
-#     """
-#     method_name = json_file['method_name']
-#     unique_method_name = f"{method_name}_{method_counter}"
-#     method_scores = []
-#     for result in json_file['results']:
-#         model = result['model_id']
-#         for task, scores in result['task_scores'].items():
-#             # Process each layer's data
-#             intervention_scores = defaultdict(list)
-#             for layer_data in scores:
-#                 for intervention_data in layer_data['layer_scores']:
-#                     # Calculate average score for counterfactuals
-#                     avg_cf_score = np.mean([
-#                         cf['score']
-#                         for cf in intervention_data['counterfactual_scores']
-#                     ])
-#                     if np.isnan(avg_cf_score):
-#                         avg_cf_score = 0.0
-#                     # Group scores by intervention
-#                     intervention_key = '_'.join(intervention_data['intervention'])
-#                     intervention_scores[intervention_key].append(avg_cf_score)
-#             # Average across layers for each intervention
-#             for intervention, layer_scores in intervention_scores.items():
-#                 column = f"{model}_{task}_{intervention}"
-#                 avg_score = np.mean(layer_scores) if layer_scores else 0.0
-#                 method_scores.append((column, f"{avg_score:.3f}"))
-#     # Sort by column names for consistency
-#     method_scores.sort(key=lambda x: x[0])
-#     data = {
-#         unique_method_name: {
-#             col: score for col, score in method_scores
-#         }
-#     }
-#     return pd.DataFrame.from_dict(data, orient='index')
-# def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
-#     model_result_filepaths = []
-#     # print(f"Scanning directory: {results_path}")
-#     for root, dirnames, files in os.walk(results_path):
-#         # print(f"Current directory: {root}")
-#         # print(f"Found files: {files}")
-#         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
-#             continue
-#         try:
-#             files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-#         except dateutil.parser._parser.ParserError:
-#             files = [files[-1]]
-#         for file in files:
-#             model_result_filepaths.append(os.path.join(root, file))
-#     # print(f"Found json files: {model_result_filepaths}")
-#     method_counters = defaultdict(int)
-#     dataframes = []
-#     for json_file in model_result_filepaths:
-#         try:
-#             with open(filepath, 'r') as f:
-#                 json_data = json.load(f)
-#                 method_name = json_data['method_name']
-#                 method_counters[method_name] += 1
-#             # Process single JSON file
-#             df = process_single_json(json_data, method_counters[method_name])
-#             dataframes.append(df)
-#         except Exception as e:
-#             print(f"Error processing {json_file}: {e}")
-#             continue
-#     return dataframes
-from dataclasses import dataclass
-import json
-import numpy as np
-import pandas as pd
-from typing import Dict, List, Any
-import os
-from datetime import datetime
-import dateutil
-from collections import defaultdict
-@dataclass
-class EvalResult_MIB_CAUSALGRAPH:
-    """Represents one full evaluation for a method across all models in MIB for causal graph track."""
-    method_name: str      # name of the interpretation method
-    results: Dict         # nested dict of results for each model and task
-    def init_from_json_file(self, json_filepath: str):
-        """Inits results from the method result file"""
-        with open(json_filepath) as fp:
-            data = json.load(fp)
-        method_name = data.get("method_name")
-        # Initialize results dictionary
-        results = {}
-        for task in ["IOI", "MCQA", "arithmetic", "ARC-easy"]:
-            results[task] = {}
-        # Process each model's results
-        for result in data.get("results", []):
-            model_id = result.get("model_id", "")
-            model_name = model_id.replace(".", "_")
-            for task, scores in result.get("task_scores", {}).items():
-                intervention_scores = defaultdict(list)
-                for layer_data in scores:
-                    for intervention_data in layer_data['layer_scores']:
-                        # Calculate average score for counterfactuals
-                        avg_cf_score = np.mean([
-                            cf['score'] if 'score' in cf else 0
-                            for cf in intervention_data['counterfactual_scores']
-                        ])
-                        if np.isnan(avg_cf_score):
-                            avg_cf_score = 0.0
-                        intervention_key = '_'.join(intervention_data['intervention'])
-                        intervention_scores[intervention_key].append(avg_cf_score)
-                # Average across layers for each intervention
-                results[task][model_name] = {
-                    interv: np.mean(scores) if scores else 0.0
-                    for interv, scores in intervention_scores.items()
-                }
-        return EvalResult_MIB_CAUSALGRAPH(
-            method_name=method_name,
-            results=results
-        )
-    def to_dict(self, metric_type="average"):
-        """Converts the Eval Result to a dict for dataframe display"""
-        data_dict = {
-            "Method": self.method_name,
-            "Average": "-"  # Initialize first to make the order consistent
-        }
-        # Initialize columns for all task-model combinations
-        all_scores = []
-        for task, task_results in self.results.items():
-            for model, intervention_scores in task_results.items():
-                if not intervention_scores:
-                    continue
-                col_name = f"{task}_{model}"
-                scores = list(intervention_scores.values())
-                if not scores:
-                    data_dict[col_name] = '-'
-                    continue
-                avg_score = np.mean(scores)
-                data_dict[col_name] = f"{avg_score:.3f}"
-                all_scores.append(avg_score)
-        data_dict["Average"] = f"{np.mean(all_scores):.3f}"
-        return data_dict
-# def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
-#     """
-#     Aggregates rows with the same base method name by taking the max value for each column.
-#     Works with Method as a regular column instead of index.
-#     """
-#     df_copy = df.copy()
-#     print("\nBase methods extraction:")
-#     base_methods = [name.split('_')[0] if '_' in name and name.split('_')[-1].isdigit()
-#                    else name for name in df_copy['Method']]
-#     print(f"Original methods: {df_copy['Method'].tolist()}")
-#     print(f"Base methods: {base_methods}")
-#     df_copy['base_method'] = base_methods
-#     # Convert scores to numeric values
-#     score_columns = [col for col in df_copy.columns if col not in ['Method', 'base_method']]
-#     for col in score_columns:
-#         df_copy[col] = df_copy[col].apply(lambda x: float(x) if isinstance(x, str) else x)
-#     # Group by base method name and take the max
-#     aggregated_df = df_copy.groupby('base_method')[score_columns].max().round(3)
-#     # Reset index to make base_method a regular column and rename it to Method
-#     aggregated_df = aggregated_df.reset_index()
-#     aggregated_df = aggregated_df.rename(columns={'base_method': 'Method'})
-#     # Convert back to string format
-#     for col in score_columns:
-#         aggregated_df[col] = aggregated_df[col].apply(lambda x: f"{x:.3f}")
-#     return aggregated_df
 def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
     """
     Aggregates rows with the same base method name by taking the max value for each column.
@@ -444,21 +233,21 @@ def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
     # Convert scores to numeric values
     score_columns = [col for col in df_copy.columns if col not in ['Method', 'base_method']]
     for col in score_columns:
-        df_copy[col] = df_copy[col].apply(lambda x: float(x) if isinstance(x, str) and not pd.isna(x) else x)
     # Group by base method name and take the max, handling NaN values
-    aggregated_df = df_copy.groupby('base_method')[score_columns].agg(lambda x: np.nanmax(x)).round(2)
-    # Convert back to string format and reset index
     aggregated_df = aggregated_df.reset_index()
     aggregated_df = aggregated_df.rename(columns={'base_method': 'Method'})
-    # Convert numeric values back to strings with 3 decimal places
-    for col in score_columns:
-        aggregated_df[col] = aggregated_df[col].apply(lambda x: f"{x:.3f}" if not pd.isna(x) else x)
     return aggregated_df
 def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
     """
     Creates a DataFrame where columns are model_task and cells are averaged over interventions.
@@ -467,99 +256,264 @@ def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
     # Create a copy of the DataFrame
     df_copy = df.copy()
-    # Remove the Average column if it exists
-    if 'Average' in df_copy.columns:
-        df_copy = df_copy.drop('Average', axis=1)
-    # Get score columns (excluding Method)
-    score_columns = [col for col in df_copy.columns if col != 'Method']
-    # Convert all scores to numeric values
-    for col in score_columns:
-        df_copy[col] = df_copy[col].apply(lambda x: float(x) if isinstance(x, str) else x)
-    # Group columns by model_task
-    model_task_groups = {}
-    for col in score_columns:
-        model_task = '_'.join(col.split('_')[:2])  # Get model_task part
-        if model_task not in model_task_groups:
-            model_task_groups[model_task] = []
-        model_task_groups[model_task].append(col)
-    # Create new DataFrame with Method column and averaged intervention scores
     averaged_data = []
     for _, row in df_copy.iterrows():
-        averaged_row = {'Method': row['Method']}
         for model_task, cols in model_task_groups.items():
-            averaged_row[model_task] = np.mean([row[col] for col in cols]).round(2)
-        averaged_row['Average'] = np.mean([averaged_row[mt] for mt in model_task_groups.keys()]).round(2)
-        averaged_data.append(averaged_row)
     averaged_df = pd.DataFrame(averaged_data)
-    # Sort by Average column
-    averaged_df = averaged_df.sort_values('Average', ascending=False)
     return averaged_df
-def get_raw_eval_results_mib_causalgraph(results_path: str) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
-    """From the path of the results folder root, extract all needed info for MIB causal graph results"""
-    model_result_filepaths = []
-    for root, dirnames, files in os.walk(results_path):
-        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
-            continue
-        try:
-            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-        except dateutil.parser._parser.ParserError:
-            files = [files[-1]]
-        for file in files:
-            model_result_filepaths.append(os.path.join(root, file))
-    method_counters = defaultdict(int)
-    data_dicts = []
-    for filepath in model_result_filepaths:
-        with open(filepath, 'r') as f:
-            json_data = json.load(f)
-            method_name = json_data['method_name']
-            method_counters[method_name] += 1
-        eval_result = EvalResult_MIB_CAUSALGRAPH("", {})
-        result = eval_result.init_from_json_file(filepath)
-        data_dict = result.to_dict()
-        # print(f"data_dict.keys(): {data_dict.keys()}")
-        # Add method counter to the method name if it's not the first instance
-        if method_counters[method_name] > 1:
-            data_dict["Method"] = f"{method_name}_{method_counters[method_name]}"
-        data_dicts.append(data_dict)
-    if not data_dicts:
-        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
-    # Create the detailed DataFrame
-    detailed_df = pd.DataFrame(data_dicts)
-    # detailed_df.set_index("Method", inplace=True)
-    # print(f"detailed_df coluns are {detailed_df.columns.tolist()}")
-    # if "eval_name" in detailed_df.columns:
-    #     detailed_df.drop("eval_name", axis=1, inplace=True)
-    print("Before aggregation:")
-    print(detailed_df)
-    # Create aggregated DataFrame
-    aggregated_df = aggregate_methods(detailed_df)
-    # Create intervention-averaged DataFrame
-    intervention_averaged_df = create_intervention_averaged_df(aggregated_df)
-    return detailed_df, aggregated_df, intervention_averaged_df

 import json
 import math
 import os
+import re
+import ast
 from dataclasses import dataclass
+from datetime import datetime
+from typing import List, Dict, Any, Tuple
+from collections import defaultdict
 import dateutil
 import numpy as np
+import pandas as pd
 from src.display.formatting import make_clickable_model
+from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal, AutoEvalColumn_mib_causalgraph
 from src.submission.check_validity import is_model_on_hub
+from src.about import TasksMib_Subgraph, TasksMib_Causalgraph
 def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
     """
     Aggregates rows with the same base method name by taking the max value for each column.
     # Convert scores to numeric values
     score_columns = [col for col in df_copy.columns if col not in ['Method', 'base_method']]
     for col in score_columns:
+        df_copy[col] = pd.to_numeric(df_copy[col], errors='coerce')
     # Group by base method name and take the max, handling NaN values
+    aggregated_df = df_copy.groupby('base_method')[score_columns].agg(lambda x: np.nanmax(x)).round(3)
+    # Reset index to make base_method a regular column and rename it to Method
     aggregated_df = aggregated_df.reset_index()
     aggregated_df = aggregated_df.rename(columns={'base_method': 'Method'})
     return aggregated_df
 def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
     """
     Creates a DataFrame where columns are model_task and cells are averaged over interventions.
     # Create a copy of the DataFrame
     df_copy = df.copy()
+    # Get all columns except Method and Average
+    columns_to_process = [col for col in df_copy.columns if col not in ['Method', 'Average']]
+    # Extract model and task information from column names
+    model_task_groups = defaultdict(list)
+    for col in columns_to_process:
+        # Split by underscore and extract model, task
+        parts = col.split('_')
+        if len(parts) >= 2:
+            model_task = f"{parts[0]}_{parts[1]}"
+            model_task_groups[model_task].append(col)
+    # Create new DataFrame with Method and averaged columns
     averaged_data = []
     for _, row in df_copy.iterrows():
+        new_row = {'Method': row['Method']}
+        # Calculate average for each model_task group
         for model_task, cols in model_task_groups.items():
+            values = [row[col] for col in cols if pd.notna(row[col])]
+            if values:
+                new_row[model_task] = round(np.mean(values), 3)
+            else:
+                new_row[model_task] = np.nan
+        # Calculate overall average
+        model_task_values = [v for k, v in new_row.items() if k != 'Method' and pd.notna(v)]
+        if model_task_values:
+            new_row['Average'] = round(np.mean(model_task_values), 3)
+        else:
+            new_row['Average'] = np.nan
+        averaged_data.append(new_row)
+    # Create DataFrame and sort by Average
     averaged_df = pd.DataFrame(averaged_data)
+    if 'Average' in averaged_df.columns:
+        averaged_df = averaged_df.sort_values('Average', ascending=False)
     return averaged_df
+@dataclass
+class EvalResult_MIB_CAUSALGRAPH:
+    """Represents one full evaluation for a method across all models for causal variable localization."""
+    eval_name: str        # method name as identifier
+    method_name: str      # name of the interpretation method
+    model_name: str       # name of the model
+    task_name: str        # name of the task
+    target_variables: str # target variables (e.g., "answer", "answer_pointer")
+    average_accuracy: float  # average accuracy score
+    highest_accuracy: float  # highest accuracy score
+    @staticmethod
+    def init_from_consolidated_json(json_data: Dict):
+        """
+        Initialize results from the consolidated JSON format, treating each entry as a separate result
+        Args:
+            json_data: The parsed JSON data with tuple keys
+        Returns:
+            List of EvalResult_MIB_CAUSALGRAPH objects
+        """
+        results = []
+        for key, entry in json_data.items():
+            try:
+                # Parse tuple key: "('method', 'model', 'task', 'variable')"
+                try:
+                    key_tuple = ast.literal_eval(key)
+                    method_name, model_name, task_name, target_variable = key_tuple
+                except:
+                    # Alternative parsing with regex
+                    pattern = r"\('([^']+)', '([^']+)', '([^']+)', '([^']+)'\)"
+                    match = re.match(pattern, key)
+                    if match:
+                        method_name, model_name, task_name, target_variable = match.groups()
+                    else:
+                        print(f"Couldn't parse key: {key}")
+                        continue
+                # Get average and highest accuracy
+                average_accuracy = entry.get("average_accuracy", 0.0)
+                highest_accuracy = entry.get("highest_accuracy", 0.0)
+                # Create a result object for this entry
+                result = EvalResult_MIB_CAUSALGRAPH(
+                    eval_name=f"{method_name}_{model_name}_{task_name}_{target_variable}",
+                    method_name=method_name,
+                    model_name=model_name,
+                    task_name=task_name,
+                    target_variables=target_variable,
+                    average_accuracy=average_accuracy,
+                    highest_accuracy=highest_accuracy
+                )
+                results.append(result)
+            except Exception as e:
+                print(f"Error processing entry {key}: {e}")
+                continue
+        return results
+    def to_dict(self, metric_type="Highest"):
+        """
+        Build the flat dict that represents this result as one dataframe row
+        Args:
+            metric_type: "Mean" selects average_accuracy; any other value
+                (including the default "Highest") selects highest_accuracy
+        Returns:
+            Dict with "eval_name", "Method" and a single score entry keyed by
+            "<model_name>_<task_name>_<target_variables>"
+        """
+        # The score column is identified by model, task and target variable
+        score_column = f"{self.model_name}_{self.task_name}_{self.target_variables}"
+        print(f"col_name is {score_column}")
+        # Pick which of the two stored accuracies is reported
+        if metric_type == "Mean":
+            selected_score = self.average_accuracy
+        else:
+            selected_score = self.highest_accuracy
+        return {
+            "eval_name": self.eval_name,
+            "Method": self.method_name,
+            score_column: selected_score,
+        }
+def _group_scores_by_method(result_dicts):
+    """
+    Merge per-entry result dicts into one DataFrame row per method
+    Args:
+        result_dicts: Dicts holding "eval_name", "Method" and score columns
+    Returns:
+        DataFrame with one row per method; when two entries of the same method
+        share a score column, the later entry wins
+    """
+    grouped = {}
+    for result_dict in result_dicts:
+        method = result_dict["Method"]
+        # The method name doubles as the row's eval_name
+        record = grouped.setdefault(method, {"eval_name": method, "Method": method})
+        # Copy all score columns to the method's record
+        for key, value in result_dict.items():
+            if key not in ("eval_name", "Method"):
+                record[key] = value
+    return pd.DataFrame(list(grouped.values()))
+def _add_average_and_sort(df):
+    """
+    Add a per-row "Average" of the score columns (rounded to 3 places) and sort by it, descending
+    Args:
+        df: Per-method DataFrame with "eval_name", "Method" and score columns
+    Returns:
+        The sorted DataFrame, or df unchanged when it has no score columns
+    """
+    score_columns = [col for col in df.columns if col not in ("eval_name", "Method")]
+    if not score_columns:
+        return df
+    df["Average"] = df[score_columns].mean(axis=1).round(3)
+    return df.sort_values("Average", ascending=False)
+def get_raw_eval_results_mib_causalgraph(results_path: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+    """
+    Processes the consolidated JSON format for causal variable localization results
+    Treats each entry as a separate result and then combines them by method
+    Args:
+        results_path: Path to the directory containing results (searched recursively for .json files)
+    Returns:
+        Tuple of three DataFrames, on every path:
+        - detailed_df_highest: Detailed view with highest accuracy scores
+        - detailed_df_highest: The same frame again (second slot)
+        - detailed_df_mean: Detailed view with mean accuracy scores
+        Three empty DataFrames are returned when no usable data is found, so
+        callers can unpack the result the same way on success and on failure
+    """
+    # Find every JSON file below results_path
+    json_files = [
+        os.path.join(root, file)
+        for root, _, files in os.walk(results_path)
+        for file in files
+        if file.endswith('.json')
+    ]
+    if not json_files:
+        print(f"No JSON files found in {results_path}")
+        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
+    # Load the first file that looks like the consolidated format
+    raw_data = None
+    for json_file in json_files:
+        try:
+            with open(json_file, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+        except Exception as e:
+            # Best effort: an unreadable or malformed file must not hide the others
+            print(f"Error reading {json_file}: {e}")
+            continue
+        # Only a JSON object can carry the tuple-like keys; its entries are read with .items() later
+        if not isinstance(data, dict):
+            continue
+        # Check if this is the consolidated format by examining a sample key
+        sample_key = next(iter(data), None)
+        if sample_key and isinstance(sample_key, str) and '(' in sample_key and ')' in sample_key:
+            raw_data = data
+            print(f"Found consolidated data file: {json_file}")
+            break
+    if raw_data is None:
+        print("No valid consolidated JSON file found")
+        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
+    # Get all results
+    eval_results = EvalResult_MIB_CAUSALGRAPH.init_from_consolidated_json(raw_data)
+    if not eval_results:
+        print("No results could be extracted from the JSON data")
+        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
+    # Create two sets of dictionaries - one for highest accuracy and one for mean accuracy
+    highest_results = [result.to_dict(metric_type="Highest") for result in eval_results]
+    mean_results = [result.to_dict(metric_type="Mean") for result in eval_results]
+    # Collapse the per-entry dicts into one row per method for each metric
+    detailed_df_highest = _group_scores_by_method(highest_results)
+    detailed_df_mean = _group_scores_by_method(mean_results)
+    if detailed_df_highest.empty or detailed_df_mean.empty:
+        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
+    # Calculate the Average column and sort by it, descending, for both DataFrames
+    detailed_df_highest = _add_average_and_sort(detailed_df_highest)
+    detailed_df_mean = _add_average_and_sort(detailed_df_mean)
+    # NOTE(review): intervention-averaged frames are not computed here; the highest-accuracy
+    # frame is returned in the first two slots - confirm this is what callers expect
+    return detailed_df_highest, detailed_df_highest, detailed_df_mean