Commit 4d9550b · committed by jasonshaoshun · 1 parent: da0827e

    caulsal-track debug

Files changed:
- src/leaderboard/read_evals.py (+173 -157)
src/leaderboard/read_evals.py
CHANGED
@@ -171,172 +171,201 @@ def get_raw_eval_results_mib_subgraph(results_path: str, requests_path: str) ->
-# @dataclass
-# class EvalResult_MIB_CAUSALGRAPH:
-#     eval_name: str
-#     method_name: str
-#     results: Dict
-
-#     def init_from_json_file(self, json_filepath):
-#         """Inits results from the method result file"""
-#         with open(json_filepath) as fp:
-#             data = json.load(fp)
-
-#         unique_method_name = f"{method_name}_{method_counters[method_name]}"
-
-#             for intervention, layer_scores in intervention_scores.items():
-#                 column = f"{model}_{task}_{intervention}"
-#                 avg_score = np.mean(layer_scores) if layer_scores else 0.0
-#                 method_scores.append((column, f"{avg_score:.3f}"))
-
-#         # Sort by column names for consistency
-#         method_scores.sort(key=lambda x: x[0])
-#         data[unique_method_name] = {
-#             col: score for col, score in method_scores
-#         }
-
-#         return pd.DataFrame.from_dict(data, orient='index')
-
-#     def to_dict(self):
-#         """Converts the Eval Result to a dict for dataframe display"""
-#         data_dict = {
-#             "eval_name": self.eval_name,
-#             "Method": self.method_name,
-#         }
-
-#         # Add all results directly
-#         data_dict.update(self.results)
-
-#         return data_dict
+
+
+
+# def process_single_json(json_file: Dict[str, Any], method_counter: int) -> pd.DataFrame:
+#     """
+#     Process a single JSON file and convert it to a DataFrame.
+
+#     Args:
+#         json_file: Dictionary containing the analysis results
+#         method_counter: Counter for handling duplicate method names
+
+#     Returns:
+#         pd.DataFrame: DataFrame for single method with MODEL_TASK_INTERVENTION as columns
+#     """
+#     method_name = json_file['method_name']
+#     unique_method_name = f"{method_name}_{method_counter}"
+#     method_scores = []
+
+#     for result in json_file['results']:
+#         model = result['model_id']
+
+#         for task, scores in result['task_scores'].items():
+#             # Process each layer's data
+#             intervention_scores = defaultdict(list)
+
+#             for layer_data in scores:
+#                 for intervention_data in layer_data['layer_scores']:
+#                     # Calculate average score for counterfactuals
+#                     avg_cf_score = np.mean([
+#                         cf['score']
+#                         for cf in intervention_data['counterfactual_scores']
+#                     ])
+
+#                     if np.isnan(avg_cf_score):
+#                         avg_cf_score = 0.0
+
+#                     # Group scores by intervention
+#                     intervention_key = '_'.join(intervention_data['intervention'])
+#                     intervention_scores[intervention_key].append(avg_cf_score)
+
+#             # Average across layers for each intervention
+#             for intervention, layer_scores in intervention_scores.items():
+#                 column = f"{model}_{task}_{intervention}"
+#                 avg_score = np.mean(layer_scores) if layer_scores else 0.0
+#                 method_scores.append((column, f"{avg_score:.3f}"))
+
+#     # Sort by column names for consistency
+#     method_scores.sort(key=lambda x: x[0])
+#     data = {
+#         unique_method_name: {
+#             col: score for col, score in method_scores
+#         }
+#     }
+
+#     return pd.DataFrame.from_dict(data, orient='index')

+# def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
+#     model_result_filepaths = []
+
+#     # print(f"Scanning directory: {results_path}")
+#     for root, dirnames, files in os.walk(results_path):
+#         # print(f"Current directory: {root}")
+#         # print(f"Found files: {files}")
+#         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
+#             continue
+
+#         try:
+#             files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
+#         except dateutil.parser._parser.ParserError:
+#             files = [files[-1]]
+
+#         for file in files:
+#             model_result_filepaths.append(os.path.join(root, file))
+
+#     # print(f"Found json files: {model_result_filepaths}")
+
+#     method_counters = defaultdict(int)
+#     dataframes = []
+
+#     for json_file in model_result_filepaths:
+#         try:
+#             with open(filepath, 'r') as f:
+#                 json_data = json.load(f)
+#             method_name = json_data['method_name']
+#             method_counters[method_name] += 1
+
+#             # Process single JSON file
+#             df = process_single_json(json_data, method_counters[method_name])
+#             dataframes.append(df)
+#         except Exception as e:
+#             print(f"Error processing {json_file}: {e}")
+#             continue
+
+#     # # Concatenate all DataFrames
+#     # if dataframes:
+#     #     final_df = pd.concat(dataframes, axis=0)
+#     #     return final_df
+#     # else:
+#     #     return pd.DataFrame()
+#     return dataframes

+from dataclasses import dataclass
+import json
+import numpy as np
+from typing import Dict, List, Any
+import os
+from datetime import datetime
+import dateutil

+@dataclass
+class EvalResult_MIB_CAUSALGRAPH:
+    """Represents one full evaluation for a method across all models in MIB for causal graph track."""
+    eval_name: str      # method name as identifier
+    method_name: str    # name of the interpretation method
+    results: Dict       # nested dict of results for each model and task
+
+    def init_from_json_file(self, json_filepath: str):
+        """Inits results from the method result file"""
+        with open(json_filepath) as fp:
+            data = json.load(fp)
+
+        method_name = data.get("method_name")
+
+        # Initialize results dictionary
+        results = {}
+        for task in ["ioi", "mcqa", "arithmetic_addition", "arithmetic_subtraction", "arc_easy", "arc_challenge"]:
+            results[task] = {}
+
+        # Process each model's results
+        for result in data.get("results", []):
+            model_id = result.get("model_id", "")
+            model_name = model_id.replace(".", "_")
+
+            for task, scores in result.get("task_scores", {}).items():
+                intervention_scores = defaultdict(list)
+
+                for layer_data in scores:
+                    for intervention_data in layer_data['layer_scores']:
+                        # Calculate average score for counterfactuals
+                        avg_cf_score = np.mean([
+                            cf['score']
+                            for cf in intervention_data['counterfactual_scores']
+                        ])
+
+                        if np.isnan(avg_cf_score):
+                            avg_cf_score = 0.0
+
+                        intervention_key = '_'.join(intervention_data['intervention'])
+                        intervention_scores[intervention_key].append(avg_cf_score)
+
+                # Average across layers for each intervention
+                results[task][model_name] = {
+                    interv: np.mean(scores) if scores else 0.0
+                    for interv, scores in intervention_scores.items()
+                }
+
+        return EvalResult_MIB_CAUSALGRAPH(
+            eval_name=method_name,
+            method_name=method_name,
+            results=results
+        )
+
+    def to_dict(self, metric_type="average"):
+        """Converts the Eval Result to a dict for dataframe display"""
+        data_dict = {
+            "eval_name": self.eval_name,
+            "Method": self.method_name,
+        }
+
+        # Initialize columns for all task-model combinations
+        all_scores = []
+        for task, task_results in self.results.items():
+            for model, intervention_scores in task_results.items():
+                if not intervention_scores:
+                    continue
+
+                col_name = f"{task}_{model}"
+                scores = list(intervention_scores.values())
+                if not scores:
+                    data_dict[col_name] = '-'
+                    continue
+
+                avg_score = np.mean(scores)
+                data_dict[col_name] = round(avg_score, 3)
+                all_scores.append(avg_score)
+
+        data_dict["Average"] = round(np.mean(all_scores), 3) if all_scores else '-'
+        return data_dict
+
 def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
+    """From the path of the results folder root, extract all needed info for MIB causal graph results"""
     model_result_filepaths = []

-    # print(f"Scanning directory: {results_path}")
     for root, dirnames, files in os.walk(results_path):
-        # print(f"Current directory: {root}")
-        # print(f"Found files: {files}")
         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
             continue
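A minimal sketch of the per-method result file that init_from_json_file consumes. The key names (method_name, results, model_id, task_scores, layer_scores, counterfactual_scores, intervention) come from the code above; the method name, model id, intervention labels, and score values are illustrative placeholders, not data from this repository:

    {
        "method_name": "example_method",                 # illustrative
        "results": [
            {
                "model_id": "example-model-1.0",         # illustrative
                "task_scores": {
                    "ioi": [                             # one entry per layer
                        {
                            "layer_scores": [
                                {
                                    "intervention": ["example_intervention"],  # joined with '_' as the key
                                    "counterfactual_scores": [
                                        {"score": 0.62},                       # illustrative scores
                                        {"score": 0.58}
                                    ]
                                }
                            ]
                        }
                    ]
                }
            }
        ]
    }

Per layer, the counterfactual scores are averaged (NaN falls back to 0.0), grouped by the joined intervention key, and then averaged across layers into results[task][model_name].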
@@ -348,32 +377,19 @@ def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str)
         for file in files:
             model_result_filepaths.append(os.path.join(root, file))

-    method_counters = defaultdict(int)
-    dataframes = []
-
-    for json_file in model_result_filepaths:
+    eval_results = []
+    for model_result_filepath in model_result_filepaths:
         try:
-            with open(filepath, 'r') as f:
-                json_data = json.load(f)
-            method_name = json_data['method_name']
-            method_counters[method_name] += 1
-
-            # Process single JSON file
-            df = process_single_json(json_data, method_counters[method_name])
-            dataframes.append(df)
+            eval_result = EvalResult_MIB_CAUSALGRAPH("", "", {})  # Create empty instance
+            result = eval_result.init_from_json_file(model_result_filepath)
+            # Verify the result can be converted to dict format
+            result.to_dict()
+            eval_results.append(result)
         except Exception as e:
-            print(f"Error processing {json_file}: {e}")
+            print(f"Error processing {model_result_filepath}: {e}")
             continue

-    # if dataframes:
-    #     final_df = pd.concat(dataframes, axis=0)
-    #     return final_df
-    # else:
-    #     return pd.DataFrame()
-    return dataframes
+
+    return eval_results
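A minimal usage sketch for the new loader, assuming pandas is available and using placeholder folder paths (the Space's real results and requests paths may differ):

    import pandas as pd

    # Placeholder paths, not taken from this repository.
    results_path = "eval-results/mib-causalgraph"
    requests_path = "eval-queue"

    eval_results = get_raw_eval_results_mib_causalgraph(results_path, requests_path)

    # Each EvalResult_MIB_CAUSALGRAPH flattens into one leaderboard row via to_dict().
    rows = [result.to_dict() for result in eval_results]
    leaderboard_df = pd.DataFrame(rows)
    print(leaderboard_df.head())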