shunshao committed
Commit a5e49c6 · verified · 1 Parent(s): d92a3e3

Update src/leaderboard/read_evals.py

Files changed (1)
  1. src/leaderboard/read_evals.py +107 -253
src/leaderboard/read_evals.py CHANGED
@@ -16,7 +16,31 @@ from typing import List, Dict
 from src.about import TasksMIB


+# def compute_area(edge_counts, faithfulnesses, log_scale=True):
+#     percentages = [e / max(edge_counts) for e in edge_counts]
+#     area_under = 0.
+#     area_from_100 = 0.
+#     for i in range(len(faithfulnesses) - 1):
+#         i_1, i_2 = i, i+1
+#         x_1 = percentages[i_1]
+#         x_2 = percentages[i_2]
+#         # area from point to 100
+#         if log_scale:
+#             x_1 = math.log(x_1)
+#             x_2 = math.log(x_2)
+#         trapezoidal = (percentages[i_2] - percentages[i_1]) * \
+#             (((abs(1. - faithfulnesses[i_1])) + (abs(1. - faithfulnesses[i_2]))) / 2)
+#         area_from_100 += trapezoidal
+
+#         trapezoidal = (percentages[i_2] - percentages[i_1]) * ((faithfulnesses[i_1] + faithfulnesses[i_2]) / 2)
+#         area_under += trapezoidal
+#     average = sum(faithfulnesses) / len(faithfulnesses)
+#     return (area_under, area_from_100, average)
 def compute_area(edge_counts, faithfulnesses, log_scale=True):
+    # Return None if either list is empty
+    if not edge_counts or not faithfulnesses:
+        return None, None, None
+
     percentages = [e / max(edge_counts) for e in edge_counts]
     area_under = 0.
     area_from_100 = 0.
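A minimal usage sketch of the guarded compute_area in this hunk; the import path and the sample edge_counts/faithfulness values are illustrative assumptions, not part of the commit:

```python
# Illustrative only: sample values and the module path are assumptions.
from src.leaderboard.read_evals import compute_area

# Normal case: trapezoidal area under the faithfulness-vs-edge-fraction curve,
# plus the area measured from 1.0 and the plain average of faithfulness values.
area_under, area_from_100, avg = compute_area([10, 100, 1000], [0.2, 0.6, 0.9])
print(round(area_under * 100, 2))  # to_dict() scales this by 100 for display

# New guard: empty metric lists now yield (None, None, None) instead of
# raising ValueError from max() on an empty sequence.
assert compute_area([], []) == (None, None, None)
```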
@@ -44,51 +68,7 @@ class EvalResult_MIB:
     method_name: str # name of the interpretation method
     results: Dict # nested dict of results {task: {model: {metric: scores}}}

-    # def init_from_json_file(self, json_filepath):
-    #     """Inits results from the method result file"""
-    #     with open(json_filepath) as fp:
-    #         data = json.load(fp)
-
-    #     method_name = data.get("method_name")
-
-    #     def _get_task_metrics(scores, task_name):
-    #         """Extract both edge_counts and faithfulness scores"""
-    #         task_scores = scores.get(task_name, {})
-    #         if not task_scores:
-    #             return None
-
-    #         edge_counts = task_scores.get("edge_counts", [])
-    #         faithfulness = task_scores.get("faithfulness", [])
-
-    #         if not edge_counts or not faithfulness:
-    #             return None
-
-    #         # Handle case where faithfulness is a list of lists
-    #         if isinstance(faithfulness[0], list):
-    #             faithfulness = faithfulness[0]
-
-    #         return {
-    #             "edge_counts": edge_counts,
-    #             "faithfulness": faithfulness
-    #         }
-
-    #     # Process results for each model
-    #     results = {}
-    #     for task in TasksMIB:
-    #         results[task.value.benchmark] = {}
-    #         for model_result in data.get("results", []):
-    #             # model_id = model_result.get("model_id", "").split('/')[-1]  # Get last part of model path
-    #             model_id = model_result.get("model_id", "").split('/')[0]
-    #             scores = model_result.get("scores", {})
-    #             metrics = _get_task_metrics(scores, task.value.benchmark)
-    #             if metrics is not None:
-    #                 results[task.value.benchmark][model_id] = metrics
-
-    #     return EvalResult_MIB(
-    #         eval_name=method_name,
-    #         method_name=method_name,
-    #         results=results
-    #     )
+
     def init_from_json_file(self, json_filepath):
         """Inits results from the method result file"""
         with open(json_filepath) as fp:
@@ -131,6 +111,7 @@ class EvalResult_MIB:
         )


+
    # def to_dict(self):
    #     """Converts the Eval Result to a dict for dataframe display"""
    #     data_dict = {
@@ -139,8 +120,14 @@ class EvalResult_MIB:
    #     }

    #     all_scores = []
-    #     expected_entries = 0  # Count how many entries we expect
-    #     actual_entries = 0    # Count how many entries we actually got
+    #     required_entries = {
+    #         'ioi_meta_llama': False,
+    #         'ioi_qwen': False,
+    #         'ioi_gpt2': False,
+    #         'mcqa_meta_llama': False,
+    #         'mcqa_qwen': False,
+    #         'mcqa_gpt2': False
+    #     }

    #     # For each task (ioi, mcqa)
    #     for task, task_results in self.results.items():
@@ -148,7 +135,6 @@ class EvalResult_MIB:
    #         models = task_results.keys()

    #         for model in models:
-    #             expected_entries += 1
    #             col_name = f"{task}_{model}"
    #             metrics = task_results[model]
    #             if metrics:
@@ -157,29 +143,31 @@ class EvalResult_MIB:
    #                 if isinstance(faithfulness[0], list):
    #                     faithfulness = faithfulness[0]

-    #                 # Use compute_area instead of simple averaging
+    #                 # Use compute_area
    #                 area_under, area_from_100, avg = compute_area(edge_counts, faithfulness)
-    #                 score = area_under * 100  # Scale up for readability
+    #                 score = area_under * 100
    #                 data_dict[col_name] = round(score, 2)
    #                 all_scores.append(score)
-    #                 actual_entries += 1
+    #                 required_entries[col_name] = True
    #             else:
    #                 data_dict[col_name] = '-'

-    #     # Only show average if all entries are present
-    #     if actual_entries == expected_entries:
+    #     # Only show average if all six required entries are present
+    #     if all(required_entries.values()):
    #         data_dict["Average"] = round(np.mean(all_scores), 2)
    #     else:
    #         data_dict["Average"] = '-'

    #     return data_dict
+
+
    def to_dict(self):
        """Converts the Eval Result to a dict for dataframe display"""
        data_dict = {
            "eval_name": self.eval_name,
            "Method": self.method_name,
        }
-
+
        all_scores = []
        required_entries = {
            'ioi_meta_llama': False,
@@ -190,29 +178,32 @@ class EvalResult_MIB:
            'mcqa_gpt2': False
        }

-        # For each task (ioi, mcqa)
        for task, task_results in self.results.items():
-            # Get the models that have results for this task
            models = task_results.keys()

            for model in models:
                col_name = f"{task}_{model}"
                metrics = task_results[model]
-                if metrics:
-                    edge_counts = metrics["edge_counts"]
-                    faithfulness = metrics["faithfulness"]
-                    if isinstance(faithfulness[0], list):
-                        faithfulness = faithfulness[0]
-
-                    # Use compute_area
-                    area_under, area_from_100, avg = compute_area(edge_counts, faithfulness)
+
+                # Handle empty lists case
+                if not metrics or not metrics["edge_counts"] or not metrics["faithfulness"]:
+                    data_dict[col_name] = '-'
+                    continue
+
+                faithfulness = metrics["faithfulness"]
+                if isinstance(faithfulness[0], list):
+                    faithfulness = faithfulness[0]
+
+                result = compute_area(metrics["edge_counts"], faithfulness)
+                if result is None or result[0] is None:
+                    data_dict[col_name] = '-'
+                else:
+                    area_under, _, _ = result
                    score = area_under * 100
                    data_dict[col_name] = round(score, 2)
                    all_scores.append(score)
                    required_entries[col_name] = True
-                else:
-                    data_dict[col_name] = '-'
-
+
        # Only show average if all six required entries are present
        if all(required_entries.values()):
            data_dict["Average"] = round(np.mean(all_scores), 2)
@@ -225,6 +216,56 @@ class EvalResult_MIB:



+
+
+
+def get_raw_eval_results_mib(results_path: str, requests_path: str) -> List[EvalResult_MIB]:
+    """From the path of the results folder root, extract all needed info for MIB results"""
+    model_result_filepaths = []
+
+    print(f"results_path is {results_path}")
+
+    for root, dirnames, files in os.walk(results_path):
+        print(f"root is {root}, dirnames is {dirnames}, files is {files}")
+        # We should only have json files in model results
+        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
+            continue
+
+        # Sort the files by date - keeping original sorting logic
+        try:
+            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
+        except dateutil.parser._parser.ParserError:
+            files = [files[-1]]
+
+        for file in files:
+            model_result_filepaths.append(os.path.join(root, file))
+
+    print(f"model_result_filepaths is {model_result_filepaths}")
+
+    eval_results = []
+    for model_result_filepath in model_result_filepaths:
+        try:
+            eval_result = EvalResult_MIB("", "", {})  # Create empty instance
+            result = eval_result.init_from_json_file(model_result_filepath)
+            print(f"eval_result.init_from_json_file(model_result_filepath) is {result}")
+            # Verify the result can be converted to dict format
+            result.to_dict()
+            eval_results.append(result)
+        except Exception as e:
+            print(f"Error processing {model_result_filepath}: {e}")
+            continue
+
+    return eval_results
+
+
+
+
+
+
+
+
+
+
 @dataclass
 class EvalResult:
     """Represents one full evaluation. Built from a combination of the result and request file for a given run.
@@ -418,190 +459,3 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:



-
-
-
-# def get_raw_eval_results_mib(results_path: str) -> List[EvalResult_MIB]:
-#     """Extract all evaluation results from the results folder"""
-#     model_result_filepaths = []
-
-#     print(f"results_path is {results_path}")
-
-#     for root, dirnames, files in os.walk(results_path):
-#         print(f"root is {root}, dirnames is {dirnames}, files is {files}")
-#         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
-#             continue
-
-#         files.sort()
-#         for file in files:
-#             model_result_filepaths.append(os.path.join(root, file))
-
-#     print(f"model_result_filepaths is {model_result_filepaths}")
-
-#     eval_results = []
-#     for model_result_filepath in model_result_filepaths:
-#         try:
-#             eval_result = EvalResult_MIB("", "", {})  # Create empty instance
-#             result = eval_result.init_from_json_file(model_result_filepath)
-#             # Verify the result can be converted to dict format
-#             result.to_dict()
-#             eval_results.append(result)
-#         except Exception as e:
-#             print(f"Error processing {model_result_filepath}: {e}")
-#             continue
-
-#     return eval_results
-
-def get_raw_eval_results_mib(results_path: str, requests_path: str) -> List[EvalResult_MIB]:
-    """From the path of the results folder root, extract all needed info for MIB results"""
-    model_result_filepaths = []
-
-    print(f"results_path is {results_path}")
-
-    for root, dirnames, files in os.walk(results_path):
-        print(f"root is {root}, dirnames is {dirnames}, files is {files}")
-        # We should only have json files in model results
-        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
-            continue
-
-        # Sort the files by date - keeping original sorting logic
-        try:
-            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-        except dateutil.parser._parser.ParserError:
-            files = [files[-1]]
-
-        for file in files:
-            model_result_filepaths.append(os.path.join(root, file))
-
-    print(f"model_result_filepaths is {model_result_filepaths}")
-
-    eval_results = []
-    for model_result_filepath in model_result_filepaths:
-        try:
-            eval_result = EvalResult_MIB("", "", {})  # Create empty instance
-            result = eval_result.init_from_json_file(model_result_filepath)
-            print(f"eval_result.init_from_json_file(model_result_filepath) is {result}")
-            # Verify the result can be converted to dict format
-            result.to_dict()
-            eval_results.append(result)
-        except Exception as e:
-            print(f"Error processing {model_result_filepath}: {e}")
-            continue
-
-    return eval_results
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-# from dataclasses import dataclass
-# from enum import Enum
-# from typing import Dict, List, Any
-
-# @dataclass
-# class Task:
-#     benchmark: str
-#     metrics: list[str]
-#     col_name: str
-
-#     def get_model_ids(self, results: Dict) -> List[str]:
-#         """Extract model IDs from results"""
-#         try:
-#             return [result["model_id"] for result in results["results"]]
-#         except (KeyError, TypeError):
-#             return []
-
-# class TasksMIB(Enum):
-#     task0 = Task("ioi", ["edge_counts", "faithfulness"], "Indirect Object Identification")
-#     task1 = Task("mcqa", ["edge_counts", "faithfulness"], "Multiple Choice QA")
-
-#     @classmethod
-#     def get_models(cls, results: Dict) -> List[str]:
-#         """Class method to get model IDs using any task"""
-#         # Since model IDs are common across tasks, we can use any task to extract them
-#         return cls.task0.value.get_model_ids(results)
-
-# # Example usage:
-# results = {
-#     "method_name": "EAP-IG (mean)",
-#     "results": [
-#         {"model_id": "meta-llama/Llama-3.1-8B", "scores": {}},
-#         {"model_id": "Qwen/Qwen2-1.5B", "scores": {}}
-#     ]
-# }
-
-# # Get models using TasksMIB
-# model_ids = TasksMIB.get_models(results)
-# print(model_ids)  # ['meta-llama/Llama-3.1-8B', 'Qwen/Qwen2-1.5B']
-
-
-from dataclasses import dataclass
-from enum import Enum
-from typing import Dict, List, Tuple
-
-@dataclass
-class Task:
-    benchmark: str
-    metrics: list[str]
-    col_name: str
-
-    def get_method_results(self, results: Dict) -> List[Tuple[str, str, Dict]]:
-        """
-        Extract (method_name, model_id, scores) tuples from results
-
-        Args:
-            results (Dict): Results dictionary containing method_name and results
-
-        Returns:
-            List[Tuple[str, str, Dict]]: List of (method_name, model_id, scores) tuples
-        """
-        method_name = results.get("method_name", "unknown")
-        try:
-            return [
-                (method_name, result["model_id"], result["scores"])
-                for result in results["results"]
-            ]
-        except (KeyError, TypeError):
-            return []
-
-class TasksMIB(Enum):
-    task0 = Task("ioi", ["edge_counts", "faithfulness"], "Indirect Object Identification")
-    task1 = Task("mcqa", ["edge_counts", "faithfulness"], "Multiple Choice QA")
-
-    @classmethod
-    def get_method_model_pairs(cls, results: Dict) -> List[Tuple[str, str]]:
-        """Get all (method_name, model_id) pairs from results"""
-        return [(pair[0], pair[1]) for pair in cls.task0.value.get_method_results(results)]
-
-# Example usage:
-results = {
-    "method_name": "EAP-IG (mean)",
-    "results": [
-        {"model_id": "meta-llama/Llama-3.1-8B", "scores": {}},
-        {"model_id": "Qwen/Qwen2-1.5B", "scores": {}}
-    ]
-}
-
-# Get method-model pairs
-method_model_pairs = TasksMIB.get_method_model_pairs(results)
-print(method_model_pairs)
-# [('EAP-IG (mean)', 'meta-llama/Llama-3.1-8B'), ('EAP-IG (mean)', 'Qwen/Qwen2-1.5B')]
-
-# Get full results including scores
-full_results = TasksMIB.task0.value.get_method_results(results)
-for method_name, model_id, scores in full_results:
-    print(f"Method: {method_name}, Model: {model_id}")
-    print(f"Scores: {scores}")