Update src/leaderboard/read_evals.py
src/leaderboard/read_evals.py  CHANGED  (+107 -253)
@@ -16,7 +16,31 @@ from typing import List, Dict
 from src.about import TasksMIB
 
 
+# def compute_area(edge_counts, faithfulnesses, log_scale=True):
+#     percentages = [e / max(edge_counts) for e in edge_counts]
+#     area_under = 0.
+#     area_from_100 = 0.
+#     for i in range(len(faithfulnesses) - 1):
+#         i_1, i_2 = i, i+1
+#         x_1 = percentages[i_1]
+#         x_2 = percentages[i_2]
+#         # area from point to 100
+#         if log_scale:
+#             x_1 = math.log(x_1)
+#             x_2 = math.log(x_2)
+#         trapezoidal = (percentages[i_2] - percentages[i_1]) * \
+#             (((abs(1. - faithfulnesses[i_1])) + (abs(1. - faithfulnesses[i_2]))) / 2)
+#         area_from_100 += trapezoidal
+
+#         trapezoidal = (percentages[i_2] - percentages[i_1]) * ((faithfulnesses[i_1] + faithfulnesses[i_2]) / 2)
+#         area_under += trapezoidal
+#     average = sum(faithfulnesses) / len(faithfulnesses)
+#     return (area_under, area_from_100, average)
 def compute_area(edge_counts, faithfulnesses, log_scale=True):
+    # Return None if either list is empty
+    if not edge_counts or not faithfulnesses:
+        return None, None, None
+
     percentages = [e / max(edge_counts) for e in edge_counts]
     area_under = 0.
     area_from_100 = 0.
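The hunk above changes compute_area's contract: callers now get a (None, None, None) sentinel for empty inputs instead of a crash on max(). A minimal usage sketch, assuming the module is importable as src.leaderboard.read_evals; the edge counts and faithfulness values are made up, not taken from leaderboard data.

```python
# Minimal usage sketch; the import path is an assumption and the numbers
# are illustrative only.
from src.leaderboard.read_evals import compute_area

edge_counts = [10, 100, 1000]        # candidate circuit sizes
faithfulnesses = [0.35, 0.70, 0.95]  # faithfulness measured at each size

area_under, area_from_100, avg = compute_area(edge_counts, faithfulnesses)
print(round(area_under, 4), round(area_from_100, 4), round(avg, 4))

# The new guard returns sentinels for empty inputs instead of raising on max():
assert compute_area([], []) == (None, None, None)
```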
@@ -44,51 +68,7 @@ class EvalResult_MIB:
     method_name: str # name of the interpretation method
     results: Dict # nested dict of results {task: {model: {metric: scores}}}
 
-    # def init_from_json_file(self, json_filepath):
-    #     """Inits results from the method result file"""
-    #     with open(json_filepath) as fp:
-    #         data = json.load(fp)
-
-    #         method_name = data.get("method_name")
-
-    #         def _get_task_metrics(scores, task_name):
-    #             """Extract both edge_counts and faithfulness scores"""
-    #             task_scores = scores.get(task_name, {})
-    #             if not task_scores:
-    #                 return None
-
-    #             edge_counts = task_scores.get("edge_counts", [])
-    #             faithfulness = task_scores.get("faithfulness", [])
-
-    #             if not edge_counts or not faithfulness:
-    #                 return None
-
-    #             # Handle case where faithfulness is a list of lists
-    #             if isinstance(faithfulness[0], list):
-    #                 faithfulness = faithfulness[0]
-
-    #             return {
-    #                 "edge_counts": edge_counts,
-    #                 "faithfulness": faithfulness
-    #             }
-
-    #         # Process results for each model
-    #         results = {}
-    #         for task in TasksMIB:
-    #             results[task.value.benchmark] = {}
-    #             for model_result in data.get("results", []):
-    #                 # model_id = model_result.get("model_id", "").split('/')[-1] # Get last part of model path
-    #                 model_id = model_result.get("model_id", "").split('/')[0]
-    #                 scores = model_result.get("scores", {})
-    #                 metrics = _get_task_metrics(scores, task.value.benchmark)
-    #                 if metrics is not None:
-    #                     results[task.value.benchmark][model_id] = metrics
-
-    #         return EvalResult_MIB(
-    #             eval_name=method_name,
-    #             method_name=method_name,
-    #             results=results
-    #         )
+
     def init_from_json_file(self, json_filepath):
         """Inits results from the method result file"""
         with open(json_filepath) as fp:
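For orientation, this is roughly the JSON shape that init_from_json_file consumes, as implied by the commented-out parser above and by the example results dict removed further down in this diff; every value here is a placeholder, not real leaderboard data.

```python
# Inferred input shape for init_from_json_file; all values are placeholders.
example_result_file = {
    "method_name": "EAP-IG (mean)",
    "results": [
        {
            "model_id": "meta-llama/Llama-3.1-8B",
            "scores": {
                "ioi": {"edge_counts": [10, 100, 1000],
                        "faithfulness": [0.35, 0.70, 0.95]},
                "mcqa": {"edge_counts": [10, 100, 1000],
                         "faithfulness": [0.30, 0.60, 0.90]},
            },
        },
    ],
}
```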
@@ -131,6 +111,7 @@ class EvalResult_MIB:
         )
 
 
+
     # def to_dict(self):
     #     """Converts the Eval Result to a dict for dataframe display"""
     #     data_dict = {
@@ -139,8 +120,14 @@ class EvalResult_MIB:
     #     }
 
     #     all_scores = []
-    #
-    #
+    #     required_entries = {
+    #         'ioi_meta_llama': False,
+    #         'ioi_qwen': False,
+    #         'ioi_gpt2': False,
+    #         'mcqa_meta_llama': False,
+    #         'mcqa_qwen': False,
+    #         'mcqa_gpt2': False
+    #     }
 
     #     # For each task (ioi, mcqa)
     #     for task, task_results in self.results.items():
@@ -148,7 +135,6 @@ class EvalResult_MIB:
     #         models = task_results.keys()
 
     #         for model in models:
-    #             expected_entries += 1
     #             col_name = f"{task}_{model}"
     #             metrics = task_results[model]
     #             if metrics:
@@ -157,29 +143,31 @@ class EvalResult_MIB:
     #                 if isinstance(faithfulness[0], list):
     #                     faithfulness = faithfulness[0]
 
-    #                 # Use compute_area
+    #                 # Use compute_area
     #                 area_under, area_from_100, avg = compute_area(edge_counts, faithfulness)
-    #                 score = area_under * 100
+    #                 score = area_under * 100
     #                 data_dict[col_name] = round(score, 2)
     #                 all_scores.append(score)
-    #
+    #                 required_entries[col_name] = True
     #             else:
     #                 data_dict[col_name] = '-'
 
-    #     # Only show average if all entries are present
-    #     if
+    #     # Only show average if all six required entries are present
+    #     if all(required_entries.values()):
     #         data_dict["Average"] = round(np.mean(all_scores), 2)
     #     else:
     #         data_dict["Average"] = '-'
 
     #     return data_dict
+
+
     def to_dict(self):
         """Converts the Eval Result to a dict for dataframe display"""
         data_dict = {
             "eval_name": self.eval_name,
             "Method": self.method_name,
         }
-
+
         all_scores = []
         required_entries = {
             'ioi_meta_llama': False,
@@ -190,29 +178,32 @@ class EvalResult_MIB:
             'mcqa_gpt2': False
         }
 
-        # For each task (ioi, mcqa)
         for task, task_results in self.results.items():
-            # Get the models that have results for this task
            models = task_results.keys()
 
            for model in models:
                col_name = f"{task}_{model}"
                metrics = task_results[model]
-
-
-
-
-
-
-
-
+
+                # Handle empty lists case
+                if not metrics or not metrics["edge_counts"] or not metrics["faithfulness"]:
+                    data_dict[col_name] = '-'
+                    continue
+
+                faithfulness = metrics["faithfulness"]
+                if isinstance(faithfulness[0], list):
+                    faithfulness = faithfulness[0]
+
+                result = compute_area(metrics["edge_counts"], faithfulness)
+                if result is None or result[0] is None:
+                    data_dict[col_name] = '-'
+                else:
+                    area_under, _, _ = result
                    score = area_under * 100
                    data_dict[col_name] = round(score, 2)
                    all_scores.append(score)
                    required_entries[col_name] = True
-
-                    data_dict[col_name] = '-'
-
+
        # Only show average if all six required entries are present
        if all(required_entries.values()):
            data_dict["Average"] = round(np.mean(all_scores), 2)
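The rewritten to_dict fills the Average column only when all six task_model cells received a score. A standalone sketch of just that gating rule (not the class method itself), with dummy score values:

```python
# Sketch of the averaging rule only; score values are dummies.
import numpy as np

required = ["ioi_meta_llama", "ioi_qwen", "ioi_gpt2",
            "mcqa_meta_llama", "mcqa_qwen", "mcqa_gpt2"]
scored = {"ioi_meta_llama": 61.2, "ioi_qwen": 58.4, "ioi_gpt2": 70.1,
          "mcqa_meta_llama": 55.0, "mcqa_qwen": 49.8}  # mcqa_gpt2 missing

row = {name: scored.get(name, "-") for name in required}
row["Average"] = (round(float(np.mean(list(scored.values()))), 2)
                  if all(name in scored for name in required) else "-")
print(row)  # Average stays "-" because mcqa_gpt2 never got a score
```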
@@ -225,6 +216,56 @@ class EvalResult_MIB:
 
 
 
+
+
+
+def get_raw_eval_results_mib(results_path: str, requests_path: str) -> List[EvalResult_MIB]:
+    """From the path of the results folder root, extract all needed info for MIB results"""
+    model_result_filepaths = []
+
+    print(f"results_path is {results_path}")
+
+    for root, dirnames, files in os.walk(results_path):
+        print(f"root is {root}, dirnames is {dirnames}, files is {files}")
+        # We should only have json files in model results
+        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
+            continue
+
+        # Sort the files by date - keeping original sorting logic
+        try:
+            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
+        except dateutil.parser._parser.ParserError:
+            files = [files[-1]]
+
+        for file in files:
+            model_result_filepaths.append(os.path.join(root, file))
+
+    print(f"model_result_filepaths is {model_result_filepaths}")
+
+    eval_results = []
+    for model_result_filepath in model_result_filepaths:
+        try:
+            eval_result = EvalResult_MIB("", "", {})  # Create empty instance
+            result = eval_result.init_from_json_file(model_result_filepath)
+            print(f"eval_result.init_from_json_file(model_result_filepath) is {result}")
+            # Verify the result can be converted to dict format
+            result.to_dict()
+            eval_results.append(result)
+        except Exception as e:
+            print(f"Error processing {model_result_filepath}: {e}")
+            continue
+
+    return eval_results
+
+
+
+
+
+
+
+
+
+
+
 @dataclass
 class EvalResult:
     """Represents one full evaluation. Built from a combination of the result and request file for a given run.
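A hypothetical call site for the walker added above; the folder names are placeholders, and requests_path is accepted by the signature but never used in the body shown.

```python
# Hypothetical invocation; import path and folder names are assumptions.
from src.leaderboard.read_evals import get_raw_eval_results_mib

results = get_raw_eval_results_mib("eval-results", "eval-queue")
rows = [r.to_dict() for r in results]  # one leaderboard row per method file
print(f"loaded {len(rows)} MIB rows")
```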
@@ -418,190 +459,3 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
 
 
 
-
-
-
-# def get_raw_eval_results_mib(results_path: str) -> List[EvalResult_MIB]:
-#     """Extract all evaluation results from the results folder"""
-#     model_result_filepaths = []
-
-#     print(f"results_path is {results_path}")
-
-#     for root, dirnames, files in os.walk(results_path):
-#         print(f"root is {root}, dirnames is {dirnames}, files is {files}")
-#         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
-#             continue
-
-#         files.sort()
-#         for file in files:
-#             model_result_filepaths.append(os.path.join(root, file))
-
-#     print(f"model_result_filepaths is {model_result_filepaths}")
-
-#     eval_results = []
-#     for model_result_filepath in model_result_filepaths:
-#         try:
-#             eval_result = EvalResult_MIB("", "", {})  # Create empty instance
-#             result = eval_result.init_from_json_file(model_result_filepath)
-#             # Verify the result can be converted to dict format
-#             result.to_dict()
-#             eval_results.append(result)
-#         except Exception as e:
-#             print(f"Error processing {model_result_filepath}: {e}")
-#             continue
-
-#     return eval_results
-
-def get_raw_eval_results_mib(results_path: str, requests_path: str) -> List[EvalResult_MIB]:
-    """From the path of the results folder root, extract all needed info for MIB results"""
-    model_result_filepaths = []
-
-    print(f"results_path is {results_path}")
-
-    for root, dirnames, files in os.walk(results_path):
-        print(f"root is {root}, dirnames is {dirnames}, files is {files}")
-        # We should only have json files in model results
-        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
-            continue
-
-        # Sort the files by date - keeping original sorting logic
-        try:
-            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-        except dateutil.parser._parser.ParserError:
-            files = [files[-1]]
-
-        for file in files:
-            model_result_filepaths.append(os.path.join(root, file))
-
-    print(f"model_result_filepaths is {model_result_filepaths}")
-
-    eval_results = []
-    for model_result_filepath in model_result_filepaths:
-        try:
-            eval_result = EvalResult_MIB("", "", {})  # Create empty instance
-            result = eval_result.init_from_json_file(model_result_filepath)
-            print(f"eval_result.init_from_json_file(model_result_filepath) is {result}")
-            # Verify the result can be converted to dict format
-            result.to_dict()
-            eval_results.append(result)
-        except Exception as e:
-            print(f"Error processing {model_result_filepath}: {e}")
-            continue
-
-    return eval_results
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-# from dataclasses import dataclass
-# from enum import Enum
-# from typing import Dict, List, Any
-
-# @dataclass
-# class Task:
-#     benchmark: str
-#     metrics: list[str]
-#     col_name: str
-
-#     def get_model_ids(self, results: Dict) -> List[str]:
-#         """Extract model IDs from results"""
-#         try:
-#             return [result["model_id"] for result in results["results"]]
-#         except (KeyError, TypeError):
-#             return []
-
-# class TasksMIB(Enum):
-#     task0 = Task("ioi", ["edge_counts", "faithfulness"], "Indirect Object Identification")
-#     task1 = Task("mcqa", ["edge_counts", "faithfulness"], "Multiple Choice QA")
-
-#     @classmethod
-#     def get_models(cls, results: Dict) -> List[str]:
-#         """Class method to get model IDs using any task"""
-#         # Since model IDs are common across tasks, we can use any task to extract them
-#         return cls.task0.value.get_model_ids(results)
-
-# # Example usage:
-# results = {
-#     "method_name": "EAP-IG (mean)",
-#     "results": [
-#         {"model_id": "meta-llama/Llama-3.1-8B", "scores": {}},
-#         {"model_id": "Qwen/Qwen2-1.5B", "scores": {}}
-#     ]
-# }
-
-# # Get models using TasksMIB
-# model_ids = TasksMIB.get_models(results)
-# print(model_ids)  # ['meta-llama/Llama-3.1-8B', 'Qwen/Qwen2-1.5B']
-
-
-from dataclasses import dataclass
-from enum import Enum
-from typing import Dict, List, Tuple
-
-@dataclass
-class Task:
-    benchmark: str
-    metrics: list[str]
-    col_name: str
-
-    def get_method_results(self, results: Dict) -> List[Tuple[str, str, Dict]]:
-        """
-        Extract (method_name, model_id, scores) tuples from results
-
-        Args:
-            results (Dict): Results dictionary containing method_name and results
-
-        Returns:
-            List[Tuple[str, str, Dict]]: List of (method_name, model_id, scores) tuples
-        """
-        method_name = results.get("method_name", "unknown")
-        try:
-            return [
-                (method_name, result["model_id"], result["scores"])
-                for result in results["results"]
-            ]
-        except (KeyError, TypeError):
-            return []
-
-class TasksMIB(Enum):
-    task0 = Task("ioi", ["edge_counts", "faithfulness"], "Indirect Object Identification")
-    task1 = Task("mcqa", ["edge_counts", "faithfulness"], "Multiple Choice QA")
-
-    @classmethod
-    def get_method_model_pairs(cls, results: Dict) -> List[Tuple[str, str]]:
-        """Get all (method_name, model_id) pairs from results"""
-        return [(pair[0], pair[1]) for pair in cls.task0.value.get_method_results(results)]
-
-# Example usage:
-results = {
-    "method_name": "EAP-IG (mean)",
-    "results": [
-        {"model_id": "meta-llama/Llama-3.1-8B", "scores": {}},
-        {"model_id": "Qwen/Qwen2-1.5B", "scores": {}}
-    ]
-}
-
-# Get method-model pairs
-method_model_pairs = TasksMIB.get_method_model_pairs(results)
-print(method_model_pairs)
-# [('EAP-IG (mean)', 'meta-llama/Llama-3.1-8B'), ('EAP-IG (mean)', 'Qwen/Qwen2-1.5B')]
-
-# Get full results including scores
-full_results = TasksMIB.task0.value.get_method_results(results)
-for method_name, model_id, scores in full_results:
-    print(f"Method: {method_name}, Model: {model_id}")
-    print(f"Scores: {scores}")