Commit 53c7136 ("debug") · jasonshaoshun · parent: 202dbe2

Files changed:
- app.py +2 -2
- src/about.py +3 -3
- src/display/utils.py +30 -8
- src/leaderboard/read_evals.py +26 -31
- src/populate.py +51 -18
app.py
CHANGED

@@ -75,7 +75,6 @@
 LEADERBOARD_DF_MIB_SUBGRAPH = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_SUBGRAPH, BENCHMARK_COLS_MIB_SUBGRAPH)
 
 # LEADERBOARD_DF_MIB_CAUSALGRAPH = get_leaderboard_df_mib_causalgraph(EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_CAUSALGRAPH, BENCHMARK_COLS_MIB_CAUSALGRAPH)
-
 # In app.py, modify the LEADERBOARD initialization
 LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AGGREGATED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AVERAGED = get_leaderboard_df_mib_causalgraph(
     EVAL_RESULTS_MIB_CAUSALGRAPH_PATH,
@@ -84,6 +83,7 @@ LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AGGREGAT
     BENCHMARK_COLS_MIB_CAUSALGRAPH
 )
 
+
 # LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 # LEADERBOARD_DF_MULTIMODAL = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS_MULTIMODAL, BENCHMARK_COLS_MULTIMODAL)
 
@@ -210,7 +210,7 @@ with demo:
 
         with gr.TabItem("Subgraph", elem_id="subgraph", id=0):
             leaderboard = init_leaderboard_mib_subgraph(LEADERBOARD_DF_MIB_SUBGRAPH, "Subgraph")
-
+
         # Then modify the Causal Graph tab section
         with gr.TabItem("Causal Graph", elem_id="causalgraph", id=1):
            with gr.Tabs() as causalgraph_tabs:
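For context, the headline change in app.py is the three-way unpack from get_leaderboard_df_mib_causalgraph. Below is a minimal sketch of that return shape, with toy data and a hypothetical _sketch helper standing in for the real loader in src/populate.py, which reads evaluation files from EVAL_RESULTS_MIB_CAUSALGRAPH_PATH.

import pandas as pd

def get_leaderboard_df_mib_causalgraph_sketch():
    # Toy stand-in for the detailed per-layer/per-counterfactual frame.
    detailed = pd.DataFrame({
        "Method": ["DAS"],
        "layer0_output_token_symbol_counterfactual": [0.71],
        "layer0_output_token_randomLetter_counterfactual": [0.64],
    })
    score_cols = [c for c in detailed.columns if c != "Method"]
    # Aggregated view: collapse counterfactuals into one column per intervention.
    aggregated = detailed[["Method"]].copy()
    aggregated["layer0_output_token"] = detailed[score_cols].mean(axis=1).round(3)
    # Averaged view: with a single intervention in this toy case, it coincides.
    averaged = aggregated.copy()
    return detailed, aggregated, averaged

detailed_df, aggregated_df, averaged_df = get_leaderboard_df_mib_causalgraph_sketch()
print(aggregated_df)  # Method column plus one averaged score column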
src/about.py
CHANGED

@@ -47,7 +47,7 @@ class TasksMib_Subgraph(Enum):
 @dataclass
 class TaskMIB_Causalgraph:
     benchmark: str  # MCQA
-    models: list[str]  #
+    models: list[str]  # List of all models
     layers: list[str]  # 0-31
     col_name: str  # display name in leaderboard
     interventions: list[str]  # output_token, output_location
@@ -57,8 +57,8 @@ class TaskMIB_Causalgraph:
 class TasksMib_Causalgraph(Enum):
     task0 = TaskMIB_Causalgraph(
         "MCQA",
-        ["LlamaForCausalLM"],
-        [str(i) for i in range(32)],
+        ["LlamaForCausalLM", "Qwen2ForCausalLM", "Gemma2ForCausalLM"],  # Updated model list
+        [str(i) for i in range(32)],  # 0-31 layers
         "mcqa",
         ["output_token", "output_location"],
         ["symbol_counterfactual", "randomLetter_counterfactual",
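The enum's positional arguments imply the dataclass field order. A hedged sketch of the full class, assuming a trailing counterfactuals field (src/display/utils.py reads task.value.counterfactuals); the counterfactual list is truncated in the diff, so only the two visible entries appear here.

from dataclasses import dataclass
from enum import Enum

@dataclass
class TaskMIB_Causalgraph:
    benchmark: str              # e.g. "MCQA"
    models: list[str]           # architectures evaluated
    layers: list[str]           # "0" through "31"
    col_name: str               # display name in leaderboard
    interventions: list[str]    # e.g. output_token, output_location
    counterfactuals: list[str]  # assumed final field; utils.py reads task.value.counterfactuals

class TasksMib_Causalgraph(Enum):
    task0 = TaskMIB_Causalgraph(
        "MCQA",
        ["LlamaForCausalLM", "Qwen2ForCausalLM", "Gemma2ForCausalLM"],
        [str(i) for i in range(32)],
        "mcqa",
        ["output_token", "output_location"],
        ["symbol_counterfactual", "randomLetter_counterfactual"],  # truncated in the diff
    )

print(TasksMib_Causalgraph.task0.value.models)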
src/display/utils.py
CHANGED

@@ -102,7 +102,22 @@ BENCHMARK_COLS_MIB_CAUSALGRAPH = []
 
 
 
-# Initialize the MIB causal graph columns
+# # Initialize the MIB causal graph columns
+# auto_eval_column_dict_mib_causalgraph = []
+
+# # Method name column
+# auto_eval_column_dict_mib_causalgraph.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)])
+
+# # For each model-task-intervention combination
+# for task in TasksMib_Causalgraph:
+#     for model in task.value.models:
+#         for intervention in task.value.interventions:
+#             col_name = f"{model}_{task.value.benchmark}_{intervention}".lower()
+#             auto_eval_column_dict_mib_causalgraph.append([
+#                 col_name,
+#                 ColumnContent,
+#                 ColumnContent(col_name, "number", True)
+#             ])
 auto_eval_column_dict_mib_causalgraph = []
 
 # Method name column
@@ -111,13 +126,20 @@ auto_eval_column_dict_mib_causalgraph.append(["method", ColumnContent, ColumnCon
 # For each model-task-intervention combination
 for task in TasksMib_Causalgraph:
     for model in task.value.models:
-        for intervention in task.value.interventions:
-            col_name = f"{model}_{task.value.benchmark}_{intervention}".lower()
-            auto_eval_column_dict_mib_causalgraph.append([
-                col_name,
-                ColumnContent,
-                ColumnContent(col_name, "number", True)
-            ])
+        for layer in task.value.layers:
+            for intervention in task.value.interventions:
+                for counterfactual in task.value.counterfactuals:
+                    col_name = f"layer{layer}_{intervention}_{counterfactual}"
+                    field_name = col_name.lower()
+                    auto_eval_column_dict_mib_causalgraph.append([
+                        field_name,
+                        ColumnContent,
+                        ColumnContent(col_name, "number", True)
+                    ])
+
+
+
+
 
 # Create the dataclass
 AutoEvalColumn_mib_causalgraph = make_dataclass("AutoEvalColumn_mib_causalgraph", auto_eval_column_dict_mib_causalgraph, frozen=True)
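To see what the new triple loop produces, here is a self-contained sketch of the make_dataclass pattern. ColumnContent is stubbed (the real helper is defined earlier in src/display/utils.py and is assumed to be a frozen dataclass, since make_dataclass defaults must be hashable), and the loop is shrunk to two layers.

from dataclasses import dataclass, make_dataclass

# Stub of ColumnContent; an assumption, not the repo's exact definition.
@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    never_hidden: bool = False

auto_eval_column_dict = []
auto_eval_column_dict.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)])

# Same triple loop as the diff, shrunk to 2 layers instead of 32.
for layer in range(2):
    for intervention in ["output_token", "output_location"]:
        for counterfactual in ["symbol_counterfactual"]:
            col_name = f"layer{layer}_{intervention}_{counterfactual}"
            auto_eval_column_dict.append([col_name.lower(), ColumnContent, ColumnContent(col_name, "number", True)])

AutoEvalColumn_sketch = make_dataclass("AutoEvalColumn_sketch", auto_eval_column_dict, frozen=True)
print(list(AutoEvalColumn_sketch.__dataclass_fields__))
# ['method', 'layer0_output_token_symbol_counterfactual', ...]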
src/leaderboard/read_evals.py
CHANGED

@@ -187,46 +187,41 @@ class EvalResult_MIB_CAUSALGRAPH:
     """Represents one full evaluation for a method in MIB causalgraph."""
     eval_name: str
     method_name: str
-    results: Dict
-
+    results: Dict
+
     def init_from_json_file(self, json_filepath):
         """Inits results from the method result file"""
         with open(json_filepath) as fp:
             data = json.load(fp)
-
+
         method_name = data.get("method_name")
         results = {}
 
-        # Get results for each model
-        for model_result in data.get("results", []):
-            model_id = model_result.get("model_id", "")
-            task_scores = model_result.get("task_scores", {})
+        # Get results for each model
+        for model_result in data.get("results", []):
+            model_id = model_result.get("model_id", "")  # Will be one of the three models
+            task_scores = model_result.get("task_scores", {})
 
-            # Process each layer's scores
-            layer_scores = []
-            for layer_data in task_scores.get("MCQA", []):
-                layer = layer_data.get("layer")
-                layer_scores_data = []
+            # Process MCQA task scores
+            mcqa_scores = {}
+            for layer_data in task_scores.get("MCQA", []):
+                layer = layer_data.get("layer")
+                layer_scores = layer_data.get("layer_scores", [])
 
-                for intervention_data in layer_data.get("layer_scores", []):
-                    # Average across counterfactuals
-                    avg_score = np.mean([cf['score'] for cf in intervention_data['counterfactual_scores']])
-                    if np.isnan(avg_score):
-                        avg_score = 0.0
-                    layer_scores_data.append({
-                        'intervention': intervention_data['intervention'][0],
-                        'score': avg_score
-                    })
-
-                layer_scores.append({
-                    'layer': layer,
-                    'scores': layer_scores_data
-                })
-
-            results[model_id] = layer_scores
-
-
-
+                # Store scores for each intervention and counterfactual
+                for intervention_data in layer_scores:
+                    intervention = intervention_data["intervention"][0]
+                    counterfactual_scores = intervention_data["counterfactual_scores"]
+
+                    for cf_score in counterfactual_scores:
+                        counterfactual = cf_score["counterfactual"][0]
+                        score = cf_score["score"]
+
+                        # Create key for this combination
+                        key = f"layer{layer}_{intervention}_{counterfactual}"
+                        mcqa_scores[key] = score
+
+            results[model_id] = mcqa_scores
 
         return EvalResult_MIB_CAUSALGRAPH(
             eval_name=method_name,
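A hedged sketch of the result-file shape this parser now expects, inferred from the keys it reads; the payload values and method name are invented for illustration, and the flattening below mirrors the new init_from_json_file body.

# Invented example payload matching the keys the parser reads.
data = {
    "method_name": "DAS",
    "results": [
        {
            "model_id": "LlamaForCausalLM",
            "task_scores": {
                "MCQA": [
                    {
                        "layer": 0,
                        "layer_scores": [
                            {
                                "intervention": ["output_token"],
                                "counterfactual_scores": [
                                    {"counterfactual": ["symbol_counterfactual"], "score": 0.73},
                                    {"counterfactual": ["randomLetter_counterfactual"], "score": 0.68},
                                ],
                            }
                        ],
                    }
                ]
            },
        }
    ],
}

# Same flattening as the new parsing code: one scalar per
# layer/intervention/counterfactual key, grouped by model.
results = {}
for model_result in data.get("results", []):
    model_id = model_result.get("model_id", "")
    mcqa_scores = {}
    for layer_data in model_result.get("task_scores", {}).get("MCQA", []):
        layer = layer_data.get("layer")
        for intervention_data in layer_data.get("layer_scores", []):
            intervention = intervention_data["intervention"][0]
            for cf_score in intervention_data["counterfactual_scores"]:
                key = f"layer{layer}_{intervention}_{cf_score['counterfactual'][0]}"
                mcqa_scores[key] = cf_score["score"]
    results[model_id] = mcqa_scores

print(results["LlamaForCausalLM"])
# {'layer0_output_token_symbol_counterfactual': 0.73,
#  'layer0_output_token_randomLetter_counterfactual': 0.68}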
src/populate.py
CHANGED

@@ -133,37 +133,70 @@ def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
 
     # return averaged_df
 
+# def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
+#     """Creates a DataFrame where columns are model_task and cells are averaged over interventions"""
+#     df_copy = df.copy()
+
+#     # Store Method column if it exists
+#     method_col = None
+#     if 'Method' in df_copy.columns:
+#         method_col = df_copy['Method']
+#         df_copy = df_copy.drop('Method', axis=1)
+
+#     # Remove eval_name if present
+#     if 'eval_name' in df_copy.columns:
+#         df_copy = df_copy.drop('eval_name', axis=1)
+
+#     # Group columns by model_task
+#     model_task_groups = {}
+#     for col in df_copy.columns:
+#         model_task = '_'.join(col.split('_')[:2])  # Get model_task part
+#         if model_task not in model_task_groups:
+#             model_task_groups[model_task] = []
+#         model_task_groups[model_task].append(col)
+
+#     # Create new DataFrame with averaged intervention scores
+#     averaged_df = pd.DataFrame({
+#         model_task: df_copy[cols].mean(axis=1).round(3)
+#         for model_task, cols in model_task_groups.items()
+#     })
+
+#     # Add Method column back
+#     if method_col is not None:
+#         averaged_df.insert(0, 'Method', method_col)
+
+#     return averaged_df
+
 def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
     """Creates a DataFrame where columns are model_task and cells are averaged over interventions"""
     df_copy = df.copy()
 
-    # Store Method column
+    # Store Method column
     method_col = None
     if 'Method' in df_copy.columns:
         method_col = df_copy['Method']
         df_copy = df_copy.drop('Method', axis=1)
 
-    # Remove eval_name if present
     if 'eval_name' in df_copy.columns:
         df_copy = df_copy.drop('eval_name', axis=1)
 
-    # Group columns by model_task
-    model_task_groups = {}
-    for col in df_copy.columns:
-        model_task = '_'.join(col.split('_')[:2])  # Get model_task part
-        if model_task not in model_task_groups:
-            model_task_groups[model_task] = []
-        model_task_groups[model_task].append(col)
-
-    # Create new DataFrame with averaged intervention scores
-    averaged_df = pd.DataFrame({
-        model_task: df_copy[cols].mean(axis=1).round(3)
-        for model_task, cols in model_task_groups.items()
-    })
-
-    # Add Method column back
+    # Group columns by model and task
+    result_cols = {}
+    for task in TasksMib_Causalgraph:
+        for model in task.value.models:  # Will iterate over all three models
+            model = model.lower()
+            for intervention in task.value.interventions:
+                col_name = f"{model}_{task.value.benchmark.lower()}_{intervention}"
+                matching_cols = [c for c in df_copy.columns if c.startswith(col_name)]
+                if matching_cols:
+                    result_cols[col_name] = matching_cols
+
+    averaged_df = pd.DataFrame()
     if method_col is not None:
-        averaged_df.insert(0, 'Method', method_col)
+        averaged_df['Method'] = method_col
+
+    for col_name, cols in result_cols.items():
+        averaged_df[col_name] = df_copy[cols].mean(axis=1).round(3)
 
     return averaged_df
 
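A usage sketch of the new grouping logic on a toy frame; the column names and scores are invented, and TasksMib_Causalgraph is replaced by a single hard-coded prefix to keep the example self-contained.

import pandas as pd

# Toy detailed frame: two counterfactual columns for one model/task/intervention.
df = pd.DataFrame({
    "Method": ["DAS", "EAP"],
    "llamaforcausallm_mcqa_output_token_symbol_counterfactual": [0.70, 0.55],
    "llamaforcausallm_mcqa_output_token_randomLetter_counterfactual": [0.60, 0.45],
})

# One averaged column per model_task_intervention prefix, as in the diff.
prefix = "llamaforcausallm_mcqa_output_token"
cols = [c for c in df.columns if c.startswith(prefix)]
averaged_df = pd.DataFrame({"Method": df["Method"]})
averaged_df[prefix] = df[cols].mean(axis=1).round(3)
print(averaged_df)
#   Method  llamaforcausallm_mcqa_output_token
# 0    DAS                                0.65
# 1    EAP                                0.50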