Spaces:
Running
Running
jasonshaoshun
committed on
Commit
·
691f4a8
1
Parent(s):
0ae72a8
causal-track debug
Browse files
- src/leaderboard/read_evals.py +91 -95
src/leaderboard/read_evals.py
CHANGED
@@ -174,97 +174,97 @@ def get_raw_eval_results_mib_subgraph(results_path: str, requests_path: str) ->
|
|
174 |
|
175 |
|
176 |
|
177 |
-
def process_single_json(json_file: Dict[str, Any], method_counter: int) -> pd.DataFrame:
|
178 |
-
|
179 |
-
|
180 |
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
|
207 |
-
|
208 |
-
|
209 |
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
|
245 |
-
|
246 |
-
|
247 |
|
248 |
-
|
249 |
|
250 |
-
|
251 |
-
|
252 |
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
|
267 |
-
|
268 |
|
269 |
|
270 |
|
@@ -449,24 +449,20 @@ def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str)
|
|
449 |
data_dicts = []
|
450 |
|
451 |
for filepath in model_result_filepaths:
|
452 |
-
|
453 |
-
|
454 |
-
|
455 |
-
|
456 |
-
|
457 |
-
|
458 |
-
|
459 |
-
|
460 |
-
|
461 |
-
|
462 |
-
|
463 |
-
|
464 |
-
|
465 |
-
|
466 |
-
data_dicts.append(data_dict)
|
467 |
-
except Exception as e:
|
468 |
-
print(f"Error processing {filepath}: {e}")
|
469 |
-
continue
|
470 |
|
471 |
if not data_dicts:
|
472 |
return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
|
|
|
174 |
|
175 |
|
176 |
|
177 |
+
# def process_single_json(json_file: Dict[str, Any], method_counter: int) -> pd.DataFrame:
|
178 |
+
# """
|
179 |
+
# Process a single JSON file and convert it to a DataFrame.
|
180 |
|
181 |
+
# Args:
|
182 |
+
# json_file: Dictionary containing the analysis results
|
183 |
+
# method_counter: Counter for handling duplicate method names
|
184 |
|
185 |
+
# Returns:
|
186 |
+
# pd.DataFrame: DataFrame for single method with MODEL_TASK_INTERVENTION as columns
|
187 |
+
# """
|
188 |
+
# method_name = json_file['method_name']
|
189 |
+
# unique_method_name = f"{method_name}_{method_counter}"
|
190 |
+
# method_scores = []
|
191 |
+
|
192 |
+
# for result in json_file['results']:
|
193 |
+
# model = result['model_id']
|
194 |
|
195 |
+
# for task, scores in result['task_scores'].items():
|
196 |
+
# # Process each layer's data
|
197 |
+
# intervention_scores = defaultdict(list)
|
198 |
|
199 |
+
# for layer_data in scores:
|
200 |
+
# for intervention_data in layer_data['layer_scores']:
|
201 |
+
# # Calculate average score for counterfactuals
|
202 |
+
# avg_cf_score = np.mean([
|
203 |
+
# cf['score']
|
204 |
+
# for cf in intervention_data['counterfactual_scores']
|
205 |
+
# ])
|
206 |
|
207 |
+
# if np.isnan(avg_cf_score):
|
208 |
+
# avg_cf_score = 0.0
|
209 |
|
210 |
+
# # Group scores by intervention
|
211 |
+
# intervention_key = '_'.join(intervention_data['intervention'])
|
212 |
+
# intervention_scores[intervention_key].append(avg_cf_score)
|
213 |
|
214 |
+
# # Average across layers for each intervention
|
215 |
+
# for intervention, layer_scores in intervention_scores.items():
|
216 |
+
# column = f"{model}_{task}_{intervention}"
|
217 |
+
# avg_score = np.mean(layer_scores) if layer_scores else 0.0
|
218 |
+
# method_scores.append((column, f"{avg_score:.3f}"))
|
219 |
+
|
220 |
+
# # Sort by column names for consistency
|
221 |
+
# method_scores.sort(key=lambda x: x[0])
|
222 |
+
# data = {
|
223 |
+
# unique_method_name: {
|
224 |
+
# col: score for col, score in method_scores
|
225 |
+
# }
|
226 |
+
# }
|
227 |
+
|
228 |
+
# return pd.DataFrame.from_dict(data, orient='index')
|
229 |
+
|
230 |
+
# def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
|
231 |
+
# model_result_filepaths = []
|
232 |
+
|
233 |
+
# # print(f"Scanning directory: {results_path}")
|
234 |
+
# for root, dirnames, files in os.walk(results_path):
|
235 |
+
# # print(f"Current directory: {root}")
|
236 |
+
# # print(f"Found files: {files}")
|
237 |
+
# if len(files) == 0 or any([not f.endswith(".json") for f in files]):
|
238 |
+
# continue
|
239 |
|
240 |
+
# try:
|
241 |
+
# files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
|
242 |
+
# except dateutil.parser._parser.ParserError:
|
243 |
+
# files = [files[-1]]
|
244 |
|
245 |
+
# for file in files:
|
246 |
+
# model_result_filepaths.append(os.path.join(root, file))
|
247 |
|
248 |
+
# # print(f"Found json files: {model_result_filepaths}")
|
249 |
|
250 |
+
# method_counters = defaultdict(int)
|
251 |
+
# dataframes = []
|
252 |
|
253 |
+
# for json_file in model_result_filepaths:
|
254 |
+
# try:
|
255 |
+
# with open(filepath, 'r') as f:
|
256 |
+
# json_data = json.load(f)
|
257 |
+
# method_name = json_data['method_name']
|
258 |
+
# method_counters[method_name] += 1
|
259 |
|
260 |
+
# # Process single JSON file
|
261 |
+
# df = process_single_json(json_data, method_counters[method_name])
|
262 |
+
# dataframes.append(df)
|
263 |
+
# except Exception as e:
|
264 |
+
# print(f"Error processing {json_file}: {e}")
|
265 |
+
# continue
|
266 |
|
267 |
+
# return dataframes
|
268 |
|
269 |
|
270 |
|
|
|
449 |
data_dicts = []
|
450 |
|
451 |
for filepath in model_result_filepaths:
|
452 |
+
with open(filepath, 'r') as f:
|
453 |
+
json_data = json.load(f)
|
454 |
+
method_name = json_data['method_name']
|
455 |
+
method_counters[method_name] += 1
|
456 |
+
|
457 |
+
eval_result = EvalResult_MIB_CAUSALGRAPH("", "", {})
|
458 |
+
result = eval_result.init_from_json_file(filepath)
|
459 |
+
data_dict = result.to_dict()
|
460 |
+
|
461 |
+
# Add method counter to the method name if it's not the first instance
|
462 |
+
if method_counters[method_name] > 1:
|
463 |
+
data_dict["Method"] = f"{method_name}_{method_counters[method_name]}"
|
464 |
+
|
465 |
+
data_dicts.append(data_dict)
|
|
|
|
|
|
|
|
|
466 |
|
467 |
if not data_dicts:
|
468 |
return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
|