Update src/leaderboard/read_evals.py
src/leaderboard/read_evals.py  CHANGED  (+107 -253)
@@ -16,7 +16,31 @@ from typing import List, Dict
 from src.about import TasksMIB
 
 
+# def compute_area(edge_counts, faithfulnesses, log_scale=True):
+#     percentages = [e / max(edge_counts) for e in edge_counts]
+#     area_under = 0.
+#     area_from_100 = 0.
+#     for i in range(len(faithfulnesses) - 1):
+#         i_1, i_2 = i, i+1
+#         x_1 = percentages[i_1]
+#         x_2 = percentages[i_2]
+#         # area from point to 100
+#         if log_scale:
+#             x_1 = math.log(x_1)
+#             x_2 = math.log(x_2)
+#         trapezoidal = (percentages[i_2] - percentages[i_1]) * \
+#             (((abs(1. - faithfulnesses[i_1])) + (abs(1. - faithfulnesses[i_2]))) / 2)
+#         area_from_100 += trapezoidal
+
+#         trapezoidal = (percentages[i_2] - percentages[i_1]) * ((faithfulnesses[i_1] + faithfulnesses[i_2]) / 2)
+#         area_under += trapezoidal
+#     average = sum(faithfulnesses) / len(faithfulnesses)
+#     return (area_under, area_from_100, average)
 def compute_area(edge_counts, faithfulnesses, log_scale=True):
+    # Return None if either list is empty
+    if not edge_counts or not faithfulnesses:
+        return None, None, None
+
     percentages = [e / max(edge_counts) for e in edge_counts]
     area_under = 0.
     area_from_100 = 0.
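The hunk above changes compute_area's contract: callers now get a (None, None, None) sentinel for empty inputs instead of a crash on max(). A minimal usage sketch, assuming the module is importable as src.leaderboard.read_evals; the edge counts and faithfulness values are made up, not taken from leaderboard data.

```python
# Minimal usage sketch; the import path is an assumption and the numbers
# are illustrative only.
from src.leaderboard.read_evals import compute_area

edge_counts = [10, 100, 1000]        # candidate circuit sizes
faithfulnesses = [0.35, 0.70, 0.95]  # faithfulness measured at each size

area_under, area_from_100, avg = compute_area(edge_counts, faithfulnesses)
print(round(area_under, 4), round(area_from_100, 4), round(avg, 4))

# The new guard returns sentinels for empty inputs instead of raising on max():
assert compute_area([], []) == (None, None, None)
```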
@@ -44,51 +68,7 @@ class EvalResult_MIB:
     method_name: str # name of the interpretation method
     results: Dict # nested dict of results {task: {model: {metric: scores}}}
 
-    # def init_from_json_file(self, json_filepath):
-    #     """Inits results from the method result file"""
-    #     with open(json_filepath) as fp:
-    #         data = json.load(fp)
-
-    #         method_name = data.get("method_name")
-
-    #         def _get_task_metrics(scores, task_name):
-    #             """Extract both edge_counts and faithfulness scores"""
-    #             task_scores = scores.get(task_name, {})
-    #             if not task_scores:
-    #                 return None
-
-    #             edge_counts = task_scores.get("edge_counts", [])
-    #             faithfulness = task_scores.get("faithfulness", [])
-
-    #             if not edge_counts or not faithfulness:
-    #                 return None
-
-    #             # Handle case where faithfulness is a list of lists
-    #             if isinstance(faithfulness[0], list):
-    #                 faithfulness = faithfulness[0]
-
-    #             return {
-    #                 "edge_counts": edge_counts,
-    #                 "faithfulness": faithfulness
-    #             }
-
-    #         # Process results for each model
-    #         results = {}
-    #         for task in TasksMIB:
-    #             results[task.value.benchmark] = {}
-    #             for model_result in data.get("results", []):
-    #                 # model_id = model_result.get("model_id", "").split('/')[-1] # Get last part of model path
-    #                 model_id = model_result.get("model_id", "").split('/')[0]
-    #                 scores = model_result.get("scores", {})
-    #                 metrics = _get_task_metrics(scores, task.value.benchmark)
-    #                 if metrics is not None:
-    #                     results[task.value.benchmark][model_id] = metrics
-
-    #         return EvalResult_MIB(
-    #             eval_name=method_name,
-    #             method_name=method_name,
-    #             results=results
-    #         )
+
     def init_from_json_file(self, json_filepath):
         """Inits results from the method result file"""
         with open(json_filepath) as fp:
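For orientation, this is roughly the JSON shape that init_from_json_file consumes, as implied by the commented-out parser above and by the example results dict removed further down in this diff; every value here is a placeholder, not real leaderboard data.

```python
# Inferred input shape for init_from_json_file; all values are placeholders.
example_result_file = {
    "method_name": "EAP-IG (mean)",
    "results": [
        {
            "model_id": "meta-llama/Llama-3.1-8B",
            "scores": {
                "ioi": {"edge_counts": [10, 100, 1000],
                        "faithfulness": [0.35, 0.70, 0.95]},
                "mcqa": {"edge_counts": [10, 100, 1000],
                         "faithfulness": [0.30, 0.60, 0.90]},
            },
        },
    ],
}
```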
@@ -131,6 +111,7 @@ class EvalResult_MIB:
         )
 
 
+
     # def to_dict(self):
     #     """Converts the Eval Result to a dict for dataframe display"""
     #     data_dict = {
@@ -139,8 +120,14 @@ class EvalResult_MIB:
     #     }
 
     #     all_scores = []
-    #
-    #
+    #     required_entries = {
+    #         'ioi_meta_llama': False,
+    #         'ioi_qwen': False,
+    #         'ioi_gpt2': False,
+    #         'mcqa_meta_llama': False,
+    #         'mcqa_qwen': False,
+    #         'mcqa_gpt2': False
+    #     }
 
     #     # For each task (ioi, mcqa)
     #     for task, task_results in self.results.items():
@@ -148,7 +135,6 @@ class EvalResult_MIB:
     #         models = task_results.keys()
 
     #         for model in models:
-    #             expected_entries += 1
     #             col_name = f"{task}_{model}"
     #             metrics = task_results[model]
     #             if metrics:
@@ -157,29 +143,31 @@ class EvalResult_MIB:
     #                 if isinstance(faithfulness[0], list):
     #                     faithfulness = faithfulness[0]
 
-    #                 # Use compute_area
+    #                 # Use compute_area
     #                 area_under, area_from_100, avg = compute_area(edge_counts, faithfulness)
-    #                 score = area_under * 100
+    #                 score = area_under * 100
     #                 data_dict[col_name] = round(score, 2)
     #                 all_scores.append(score)
-    #
+    #                 required_entries[col_name] = True
     #             else:
     #                 data_dict[col_name] = '-'
 
-    #     # Only show average if all entries are present
-    #     if
+    #     # Only show average if all six required entries are present
+    #     if all(required_entries.values()):
     #         data_dict["Average"] = round(np.mean(all_scores), 2)
     #     else:
     #         data_dict["Average"] = '-'
 
     #     return data_dict
+
+
     def to_dict(self):
         """Converts the Eval Result to a dict for dataframe display"""
         data_dict = {
             "eval_name": self.eval_name,
             "Method": self.method_name,
         }
-
+
         all_scores = []
         required_entries = {
             'ioi_meta_llama': False,
@@ -190,29 +178,32 @@ class EvalResult_MIB:
             'mcqa_gpt2': False
         }
 
-        # For each task (ioi, mcqa)
         for task, task_results in self.results.items():
-            # Get the models that have results for this task
            models = task_results.keys()
 
            for model in models:
                col_name = f"{task}_{model}"
                metrics = task_results[model]
-
-
-
-
-
-
-
-
+
+                # Handle empty lists case
+                if not metrics or not metrics["edge_counts"] or not metrics["faithfulness"]:
+                    data_dict[col_name] = '-'
+                    continue
+
+                faithfulness = metrics["faithfulness"]
+                if isinstance(faithfulness[0], list):
+                    faithfulness = faithfulness[0]
+
+                result = compute_area(metrics["edge_counts"], faithfulness)
+                if result is None or result[0] is None:
+                    data_dict[col_name] = '-'
+                else:
+                    area_under, _, _ = result
                    score = area_under * 100
                    data_dict[col_name] = round(score, 2)
                    all_scores.append(score)
                    required_entries[col_name] = True
-
-                    data_dict[col_name] = '-'
-
+
        # Only show average if all six required entries are present
        if all(required_entries.values()):
            data_dict["Average"] = round(np.mean(all_scores), 2)
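The rewritten to_dict fills the Average column only when all six task_model cells received a score. A standalone sketch of just that gating rule (not the class method itself), with dummy score values:

```python
# Sketch of the averaging rule only; score values are dummies.
import numpy as np

required = ["ioi_meta_llama", "ioi_qwen", "ioi_gpt2",
            "mcqa_meta_llama", "mcqa_qwen", "mcqa_gpt2"]
scored = {"ioi_meta_llama": 61.2, "ioi_qwen": 58.4, "ioi_gpt2": 70.1,
          "mcqa_meta_llama": 55.0, "mcqa_qwen": 49.8}  # mcqa_gpt2 missing

row = {name: scored.get(name, "-") for name in required}
row["Average"] = (round(float(np.mean(list(scored.values()))), 2)
                  if all(name in scored for name in required) else "-")
print(row)  # Average stays "-" because mcqa_gpt2 never got a score
```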
@@ -225,6 +216,56 @@ class EvalResult_MIB:
 
 
 
+
+
+
+def get_raw_eval_results_mib(results_path: str, requests_path: str) -> List[EvalResult_MIB]:
+    """From the path of the results folder root, extract all needed info for MIB results"""
+    model_result_filepaths = []
+
+    print(f"results_path is {results_path}")
+
+    for root, dirnames, files in os.walk(results_path):
+        print(f"root is {root}, dirnames is {dirnames}, files is {files}")
+        # We should only have json files in model results
+        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
+            continue
+
+        # Sort the files by date - keeping original sorting logic
+        try:
+            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
+        except dateutil.parser._parser.ParserError:
+            files = [files[-1]]
+
+        for file in files:
+            model_result_filepaths.append(os.path.join(root, file))
+
+    print(f"model_result_filepaths is {model_result_filepaths}")
+
+    eval_results = []
+    for model_result_filepath in model_result_filepaths:
+        try:
+            eval_result = EvalResult_MIB("", "", {})  # Create empty instance
+            result = eval_result.init_from_json_file(model_result_filepath)
+            print(f"eval_result.init_from_json_file(model_result_filepath) is {result}")
+            # Verify the result can be converted to dict format
+            result.to_dict()
+            eval_results.append(result)
+        except Exception as e:
+            print(f"Error processing {model_result_filepath}: {e}")
+            continue
+
+    return eval_results
+
+
+
+
+
+
+
+
+
+
+
 @dataclass
 class EvalResult:
     """Represents one full evaluation. Built from a combination of the result and request file for a given run.
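A hypothetical call site for the walker added above; the folder names are placeholders, and requests_path is accepted by the signature but never used in the body shown.

```python
# Hypothetical invocation; import path and folder names are assumptions.
from src.leaderboard.read_evals import get_raw_eval_results_mib

results = get_raw_eval_results_mib("eval-results", "eval-queue")
rows = [r.to_dict() for r in results]  # one leaderboard row per method file
print(f"loaded {len(rows)} MIB rows")
```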
@@ -418,190 +459,3 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
 
 
 
-
-
-
-# def get_raw_eval_results_mib(results_path: str) -> List[EvalResult_MIB]:
-#     """Extract all evaluation results from the results folder"""
-#     model_result_filepaths = []
-
-#     print(f"results_path is {results_path}")
-
-#     for root, dirnames, files in os.walk(results_path):
-#         print(f"root is {root}, dirnames is {dirnames}, files is {files}")
-#         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
-#             continue
-
-#         files.sort()
-#         for file in files:
-#             model_result_filepaths.append(os.path.join(root, file))
-
-#     print(f"model_result_filepaths is {model_result_filepaths}")
-
-#     eval_results = []
-#     for model_result_filepath in model_result_filepaths:
-#         try:
-#             eval_result = EvalResult_MIB("", "", {})  # Create empty instance
-#             result = eval_result.init_from_json_file(model_result_filepath)
-#             # Verify the result can be converted to dict format
-#             result.to_dict()
-#             eval_results.append(result)
-#         except Exception as e:
-#             print(f"Error processing {model_result_filepath}: {e}")
-#             continue
-
-#     return eval_results
-
-def get_raw_eval_results_mib(results_path: str, requests_path: str) -> List[EvalResult_MIB]:
-    """From the path of the results folder root, extract all needed info for MIB results"""
-    model_result_filepaths = []
-
-    print(f"results_path is {results_path}")
-
-    for root, dirnames, files in os.walk(results_path):
-        print(f"root is {root}, dirnames is {dirnames}, files is {files}")
-        # We should only have json files in model results
-        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
-            continue
-
-        # Sort the files by date - keeping original sorting logic
-        try:
-            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-        except dateutil.parser._parser.ParserError:
-            files = [files[-1]]
-
-        for file in files:
-            model_result_filepaths.append(os.path.join(root, file))
-
-    print(f"model_result_filepaths is {model_result_filepaths}")
-
-    eval_results = []
-    for model_result_filepath in model_result_filepaths:
-        try:
-            eval_result = EvalResult_MIB("", "", {})  # Create empty instance
-            result = eval_result.init_from_json_file(model_result_filepath)
-            print(f"eval_result.init_from_json_file(model_result_filepath) is {result}")
-            # Verify the result can be converted to dict format
-            result.to_dict()
-            eval_results.append(result)
-        except Exception as e:
-            print(f"Error processing {model_result_filepath}: {e}")
-            continue
-
-    return eval_results
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-# from dataclasses import dataclass
-# from enum import Enum
-# from typing import Dict, List, Any
-
-# @dataclass
-# class Task:
-#     benchmark: str
-#     metrics: list[str]
-#     col_name: str
-
-#     def get_model_ids(self, results: Dict) -> List[str]:
-#         """Extract model IDs from results"""
-#         try:
-#             return [result["model_id"] for result in results["results"]]
-#         except (KeyError, TypeError):
-#             return []
-
-# class TasksMIB(Enum):
-#     task0 = Task("ioi", ["edge_counts", "faithfulness"], "Indirect Object Identification")
-#     task1 = Task("mcqa", ["edge_counts", "faithfulness"], "Multiple Choice QA")
-
-#     @classmethod
-#     def get_models(cls, results: Dict) -> List[str]:
-#         """Class method to get model IDs using any task"""
-#         # Since model IDs are common across tasks, we can use any task to extract them
-#         return cls.task0.value.get_model_ids(results)
-
-# # Example usage:
-# results = {
-#     "method_name": "EAP-IG (mean)",
-#     "results": [
-#         {"model_id": "meta-llama/Llama-3.1-8B", "scores": {}},
-#         {"model_id": "Qwen/Qwen2-1.5B", "scores": {}}
-#     ]
-# }
-
-# # Get models using TasksMIB
-# model_ids = TasksMIB.get_models(results)
-# print(model_ids)  # ['meta-llama/Llama-3.1-8B', 'Qwen/Qwen2-1.5B']
-
-
-from dataclasses import dataclass
-from enum import Enum
-from typing import Dict, List, Tuple
-
-@dataclass
-class Task:
-    benchmark: str
-    metrics: list[str]
-    col_name: str
-
-    def get_method_results(self, results: Dict) -> List[Tuple[str, str, Dict]]:
-        """
-        Extract (method_name, model_id, scores) tuples from results
-
-        Args:
-            results (Dict): Results dictionary containing method_name and results
-
-        Returns:
-            List[Tuple[str, str, Dict]]: List of (method_name, model_id, scores) tuples
-        """
-        method_name = results.get("method_name", "unknown")
-        try:
-            return [
-                (method_name, result["model_id"], result["scores"])
-                for result in results["results"]
-            ]
-        except (KeyError, TypeError):
-            return []
-
-class TasksMIB(Enum):
-    task0 = Task("ioi", ["edge_counts", "faithfulness"], "Indirect Object Identification")
-    task1 = Task("mcqa", ["edge_counts", "faithfulness"], "Multiple Choice QA")
-
-    @classmethod
-    def get_method_model_pairs(cls, results: Dict) -> List[Tuple[str, str]]:
-        """Get all (method_name, model_id) pairs from results"""
-        return [(pair[0], pair[1]) for pair in cls.task0.value.get_method_results(results)]
-
-# Example usage:
-results = {
-    "method_name": "EAP-IG (mean)",
-    "results": [
-        {"model_id": "meta-llama/Llama-3.1-8B", "scores": {}},
-        {"model_id": "Qwen/Qwen2-1.5B", "scores": {}}
-    ]
-}
-
-# Get method-model pairs
-method_model_pairs = TasksMIB.get_method_model_pairs(results)
-print(method_model_pairs)
-# [('EAP-IG (mean)', 'meta-llama/Llama-3.1-8B'), ('EAP-IG (mean)', 'Qwen/Qwen2-1.5B')]
-
-# Get full results including scores
-full_results = TasksMIB.task0.value.get_method_results(results)
-for method_name, model_id, scores in full_results:
-    print(f"Method: {method_name}, Model: {model_id}")
-    print(f"Scores: {scores}")