Spaces:

mib-bench
/

leaderboard

Running

App Files Files Community

jasonshaoshun committed on Jan 30

Commit

0ab8298

1 Parent(s): 6ea8a3e

causal-track debug

Browse files

Files changed (1) hide show

src/leaderboard/read_evals.py +99 -25

src/leaderboard/read_evals.py CHANGED Viewed

@@ -357,38 +357,112 @@ class EvalResult_MIB_CAUSALGRAPH:
         return data_dict
 def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
     """
     Aggregates rows with the same base method name by taking the max value for each column.
     """
     # Create a copy of the DataFrame
     df_copy = df.copy()
     # Extract base method names (remove _2, _3, etc. suffixes)
     base_methods = [name.split('_')[0] if '_' in name and name.split('_')[-1].isdigit()
-                   else name for name in df_copy.index]
-    df_copy.index = base_methods
     # Convert scores to numeric values
-    def extract_score(score_str):
-        if isinstance(score_str, str):
-            return float(score_str)
-        return 0.0
-    numeric_df = df_copy.applymap(extract_score)
     # Group by base method name and take the max
-    aggregated_df = numeric_df.groupby(level=0).max().round(3)
     # Convert back to string format
-    aggregated_df = aggregated_df.applymap(lambda x: f"{x:.3f}")
     return aggregated_df
 def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
     """
     Creates a DataFrame where columns are model_task and cells are averaged over interventions.
     """
     # Create a copy of the DataFrame
     df_copy = df.copy()
@@ -397,31 +471,31 @@ def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
     if 'Average' in df_copy.columns:
         df_copy = df_copy.drop('Average', axis=1)
-    # Function to extract score value from string
-    def extract_score(score_str):
-        if isinstance(score_str, str):
-            return float(score_str)
-        return 0.0
     # Convert all scores to numeric values
-    numeric_df = df_copy.applymap(extract_score)
     # Group columns by model_task
     model_task_groups = {}
-    for col in numeric_df.columns:
         model_task = '_'.join(col.split('_')[:2])  # Get model_task part
         if model_task not in model_task_groups:
             model_task_groups[model_task] = []
         model_task_groups[model_task].append(col)
-    # Create new DataFrame with averaged intervention scores
-    averaged_df = pd.DataFrame({
-        model_task: numeric_df[cols].mean(axis=1).round(3)
-        for model_task, cols in model_task_groups.items()
-    })
-    # Add overall average column
-    averaged_df['Average'] = averaged_df.mean(axis=1).round(3)
     # Sort by Average column
     averaged_df = averaged_df.sort_values('Average', ascending=False)

         return data_dict
+# def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
+#     """
+#     Aggregates rows with the same base method name by taking the max value for each column.
+#     """
+#     # Create a copy of the DataFrame
+#     df_copy = df.copy()
+#     # Extract base method names (remove _2, _3, etc. suffixes)
+#     base_methods = [name.split('_')[0] if '_' in name and name.split('_')[-1].isdigit()
+#                    else name for name in df_copy.index]
+#     df_copy.index = base_methods
+#     # Convert scores to numeric values
+#     def extract_score(score_str):
+#         if isinstance(score_str, str):
+#             return float(score_str)
+#         return 0.0
+#     numeric_df = df_copy.applymap(extract_score)
+#     # Group by base method name and take the max
+#     aggregated_df = numeric_df.groupby(level=0).max().round(3)
+#     # Convert back to string format
+#     aggregated_df = aggregated_df.applymap(lambda x: f"{x:.3f}")
+#     return aggregated_df
+# def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
+#     """
+#     Creates a DataFrame where columns are model_task and cells are averaged over interventions.
+#     """
+#     # Create a copy of the DataFrame
+#     df_copy = df.copy()
+#     # Remove the Average column if it exists
+#     if 'Average' in df_copy.columns:
+#         df_copy = df_copy.drop('Average', axis=1)
+#     # Function to extract score value from string
+#     def extract_score(score_str):
+#         if isinstance(score_str, str):
+#             return float(score_str)
+#         return 0.0
+#     # Convert all scores to numeric values
+#     numeric_df = df_copy.applymap(extract_score)
+#     # Group columns by model_task
+#     model_task_groups = {}
+#     for col in numeric_df.columns:
+#         model_task = '_'.join(col.split('_')[:2])  # Get model_task part
+#         if model_task not in model_task_groups:
+#             model_task_groups[model_task] = []
+#         model_task_groups[model_task].append(col)
+#     # Create new DataFrame with averaged intervention scores
+#     averaged_df = pd.DataFrame({
+#         model_task: numeric_df[cols].mean(axis=1).round(3)
+#         for model_task, cols in model_task_groups.items()
+#     })
+#     # Add overall average column
+#     averaged_df['Average'] = averaged_df.mean(axis=1).round(3)
+#     # Sort by Average column
+#     averaged_df = averaged_df.sort_values('Average', ascending=False)
+#     return averaged_df
 def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
     """
     Aggregate rows that share a base method name, keeping the max of each score column.

     Works with Method as a regular column instead of index.

     A method name ending in ``_<digits>`` (e.g. ``foo_2``) is grouped with its
     base name (``foo``). Only that trailing numeric suffix is removed, so a
     name that itself contains underscores (``my_method_2``) maps to
     ``my_method`` rather than being cut at the first underscore.

     Args:
         df: DataFrame with a ``Method`` column; every other column is treated
             as a score column. String cells are converted with ``float``,
             non-string cells are used as they are.

     Returns:
         A new DataFrame with one row per base method (in sorted group order),
         a ``Method`` column followed by the score columns, each score being
         the per-group maximum formatted as a string with three decimals.
     """
     def _base_name(name: str) -> str:
         # Split on the LAST underscore only, so just the numeric run suffix is dropped.
         base, sep, suffix = name.rpartition('_')
         return base if sep and suffix.isdigit() else name

     # Work on a copy so the caller's DataFrame is left untouched
     df_copy = df.copy()
     # Extract base method names (remove _2, _3, etc. suffixes)
     df_copy['base_method'] = [_base_name(name) for name in df_copy['Method']]
     # Convert scores to numeric values
     score_columns = [col for col in df_copy.columns if col not in ('Method', 'base_method')]
     for col in score_columns:
         df_copy[col] = df_copy[col].apply(lambda x: float(x) if isinstance(x, str) else x)
     # Group by base method name and take the max
     aggregated_df = df_copy.groupby('base_method')[score_columns].max().round(3)
     # Reset index to make base_method a regular column and rename it to Method
     aggregated_df = aggregated_df.reset_index()
     aggregated_df = aggregated_df.rename(columns={'base_method': 'Method'})
     # Convert back to string format
     for col in score_columns:
         aggregated_df[col] = aggregated_df[col].apply(lambda x: f"{x:.3f}")
     return aggregated_df
 def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
     """
     Creates a DataFrame where columns are model_task and cells are averaged over interventions.
+    Works with Method as a regular column.
     """
     # Create a copy of the DataFrame
     df_copy = df.copy()
     if 'Average' in df_copy.columns:
         df_copy = df_copy.drop('Average', axis=1)
+    # Get score columns (excluding Method)
+    score_columns = [col for col in df_copy.columns if col != 'Method']
     # Convert all scores to numeric values
+    for col in score_columns:
+        df_copy[col] = df_copy[col].apply(lambda x: float(x) if isinstance(x, str) else x)
     # Group columns by model_task
     model_task_groups = {}
+    for col in score_columns:
         model_task = '_'.join(col.split('_')[:2])  # Get model_task part
         if model_task not in model_task_groups:
             model_task_groups[model_task] = []
         model_task_groups[model_task].append(col)
+    # Create new DataFrame with Method column and averaged intervention scores
+    averaged_data = []
+    for _, row in df_copy.iterrows():
+        averaged_row = {'Method': row['Method']}
+        for model_task, cols in model_task_groups.items():
+            averaged_row[model_task] = np.mean([row[col] for col in cols]).round(3)
+        averaged_row['Average'] = np.mean([averaged_row[mt] for mt in model_task_groups.keys()]).round(3)
+        averaged_data.append(averaged_row)
+    averaged_df = pd.DataFrame(averaged_data)
     # Sort by Average column
     averaged_df = averaged_df.sort_values('Average', ascending=False)