Spaces:

mib-bench
/

leaderboard

Running

App Files Files Community

jasonshaoshun committed on Jan 30

Commit

e1a39f1

1 Parent(s): 85f4717

causal-track debug

Browse files

Files changed (1) hide show

src/leaderboard/read_evals.py +7 -73

src/leaderboard/read_evals.py CHANGED Viewed

@@ -358,89 +358,20 @@ class EvalResult_MIB_CAUSALGRAPH:
         return data_dict
-# def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
-#     """
-#     Aggregates rows with the same base method name by taking the max value for each column.
-#     """
-#     # Create a copy of the DataFrame
-#     df_copy = df.copy()
-#     # Extract base method names (remove _2, _3, etc. suffixes)
-#     base_methods = [name.split('_')[0] if '_' in name and name.split('_')[-1].isdigit()
-#                    else name for name in df_copy.index]
-#     df_copy.index = base_methods
-#     # Convert scores to numeric values
-#     def extract_score(score_str):
-#         if isinstance(score_str, str):
-#             return float(score_str)
-#         return 0.0
-#     numeric_df = df_copy.applymap(extract_score)
-#     # Group by base method name and take the max
-#     aggregated_df = numeric_df.groupby(level=0).max().round(3)
-#     # Convert back to string format
-#     aggregated_df = aggregated_df.applymap(lambda x: f"{x:.3f}")
-#     return aggregated_df
-# def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
-#     """
-#     Creates a DataFrame where columns are model_task and cells are averaged over interventions.
-#     """
-#     # Create a copy of the DataFrame
-#     df_copy = df.copy()
-#     # Remove the Average column if it exists
-#     if 'Average' in df_copy.columns:
-#         df_copy = df_copy.drop('Average', axis=1)
-#     # Function to extract score value from string
-#     def extract_score(score_str):
-#         if isinstance(score_str, str):
-#             return float(score_str)
-#         return 0.0
-#     # Convert all scores to numeric values
-#     numeric_df = df_copy.applymap(extract_score)
-#     # Group columns by model_task
-#     model_task_groups = {}
-#     for col in numeric_df.columns:
-#         model_task = '_'.join(col.split('_')[:2])  # Get model_task part
-#         if model_task not in model_task_groups:
-#             model_task_groups[model_task] = []
-#         model_task_groups[model_task].append(col)
-#     # Create new DataFrame with averaged intervention scores
-#     averaged_df = pd.DataFrame({
-#         model_task: numeric_df[cols].mean(axis=1).round(3)
-#         for model_task, cols in model_task_groups.items()
-#     })
-#     # Add overall average column
-#     averaged_df['Average'] = averaged_df.mean(axis=1).round(3)
-#     # Sort by Average column
-#     averaged_df = averaged_df.sort_values('Average', ascending=False)
-#     return averaged_df
 def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
     """
     Aggregates rows with the same base method name by taking the max value for each column.
     Works with Method as a regular column instead of index.
     """
-    # Create a copy of the DataFrame
     df_copy = df.copy()
-    # Extract base method names (remove _2, _3, etc. suffixes)
     base_methods = [name.split('_')[0] if '_' in name and name.split('_')[-1].isdigit()
                    else name for name in df_copy['Method']]
-    df_copy['base_method'] = base_methods
     # Convert scores to numeric values
     score_columns = [col for col in df_copy.columns if col not in ['Method', 'base_method']]
@@ -551,6 +482,9 @@ def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str)
     # if "eval_name" in detailed_df.columns:
     #     detailed_df.drop("eval_name", axis=1, inplace=True)
     # Create aggregated DataFrame
     aggregated_df = aggregate_methods(detailed_df)

         return data_dict
 def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
     """
     Aggregates rows with the same base method name by taking the max value for each column.
     Works with Method as a regular column instead of index.
     """
     df_copy = df.copy()
+    print("\nBase methods extraction:")
     base_methods = [name.split('_')[0] if '_' in name and name.split('_')[-1].isdigit()
                    else name for name in df_copy['Method']]
+    print(f"Original methods: {df_copy['Method'].tolist()}")
+    print(f"Base methods: {base_methods}")
     # Convert scores to numeric values
     score_columns = [col for col in df_copy.columns if col not in ['Method', 'base_method']]
     # if "eval_name" in detailed_df.columns:
     #     detailed_df.drop("eval_name", axis=1, inplace=True)
+    print("Before aggregation:")
+    print(detailed_df)
     # Create aggregated DataFrame
     aggregated_df = aggregate_methods(detailed_df)