shunshao committed (verified)
Commit: 19554a3 · Parent: a5e49c6

Update src/leaderboard/read_evals.py

Files changed (1): src/leaderboard/read_evals.py (+26 -28)
src/leaderboard/read_evals.py CHANGED

@@ -161,6 +161,7 @@ class EvalResult_MIB:
         # return data_dict
 
 
+
     def to_dict(self):
         """Converts the Eval Result to a dict for dataframe display"""
         data_dict = {
@@ -168,48 +169,45 @@ class EvalResult_MIB:
             "Method": self.method_name,
         }
 
+        # Initialize all possible columns with '-'
+        expected_models = ["meta_llama", "qwen", "gpt2"]
+        expected_tasks = ["ioi", "mcqa"]
+        for task in expected_tasks:
+            for model in expected_models:
+                data_dict[f"{task}_{model}"] = '-'
+
         all_scores = []
-        required_entries = {
-            'ioi_meta_llama': False,
-            'ioi_qwen': False,
-            'ioi_gpt2': False,
-            'mcqa_meta_llama': False,
-            'mcqa_qwen': False,
-            'mcqa_gpt2': False
-        }
-
         for task, task_results in self.results.items():
-            models = task_results.keys()
-
-            for model in models:
+            for model, metrics in task_results.items():
                 col_name = f"{task}_{model}"
-                metrics = task_results[model]
 
-                # Handle empty lists case
                 if not metrics or not metrics["edge_counts"] or not metrics["faithfulness"]:
-                    data_dict[col_name] = '-'
                     continue
 
                 faithfulness = metrics["faithfulness"]
                 if isinstance(faithfulness[0], list):
                     faithfulness = faithfulness[0]
-
+
                 result = compute_area(metrics["edge_counts"], faithfulness)
                 if result is None or result[0] is None:
-                    data_dict[col_name] = '-'
-                else:
-                    area_under, _, _ = result
-                    score = area_under * 100
-                    data_dict[col_name] = round(score, 2)
-                    all_scores.append(score)
-                    required_entries[col_name] = True
+                    continue
+
+                area_under, _, _ = result
+                score = area_under * 100
+                data_dict[col_name] = round(score, 2)
+                all_scores.append(score)
 
-        # Only show average if all six required entries are present
-        if all(required_entries.values()):
-            data_dict["Average"] = round(np.mean(all_scores), 2)
-        else:
-            data_dict["Average"] = '-'
+        # All entries must be present for average
+        required_entries = [
+            data_dict['ioi_meta_llama'] != '-',
+            data_dict['ioi_qwen'] != '-',
+            data_dict['ioi_gpt2'] != '-',
+            data_dict['mcqa_meta_llama'] != '-',
+            data_dict['mcqa_qwen'] != '-',
+            data_dict['mcqa_gpt2'] != '-'
+        ]
 
+        data_dict["Average"] = round(np.mean(all_scores), 2) if all(required_entries) else '-'
         return data_dict
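For context, here is a minimal self-contained sketch of the refactored logic. It is not the repository's code: `to_dict` is lifted off `EvalResult_MIB` into a plain function, `compute_area` is a stub with a placeholder trapezoidal integral (its real return shape is only assumed from the `area_under, _, _ = result` unpacking in the diff), and the `partial` payload and "EAP" method name are hypothetical.

```python
import numpy as np

def compute_area(edge_counts, faithfulness):
    """Stub for the module's compute_area (assumption: returns a 3-tuple
    whose first element is the area under the faithfulness curve)."""
    area = float(np.trapz(faithfulness, edge_counts))  # placeholder integral
    return area, None, None

def to_dict(results, method_name):
    """Standalone version of EvalResult_MIB.to_dict after this commit."""
    data_dict = {"Method": method_name}

    # Initialize all possible columns with '-' so missing (task, model)
    # pairs still render as empty cells instead of absent columns.
    expected_models = ["meta_llama", "qwen", "gpt2"]
    expected_tasks = ["ioi", "mcqa"]
    for task in expected_tasks:
        for model in expected_models:
            data_dict[f"{task}_{model}"] = '-'

    all_scores = []
    for task, task_results in results.items():
        for model, metrics in task_results.items():
            col_name = f"{task}_{model}"
            if not metrics or not metrics["edge_counts"] or not metrics["faithfulness"]:
                continue
            faithfulness = metrics["faithfulness"]
            if isinstance(faithfulness[0], list):
                faithfulness = faithfulness[0]  # unwrap nested list
            result = compute_area(metrics["edge_counts"], faithfulness)
            if result is None or result[0] is None:
                continue
            area_under, _, _ = result
            score = area_under * 100
            data_dict[col_name] = round(score, 2)
            all_scores.append(score)

    # Average only when all six task/model cells were filled.
    filled = [data_dict[f"{t}_{m}"] != '-' for t in expected_tasks for m in expected_models]
    data_dict["Average"] = round(np.mean(all_scores), 2) if all(filled) else '-'
    return data_dict

# One cell reported, five missing: that cell gets a score, the rest stay '-',
# and the Average is withheld.
partial = {"ioi": {"gpt2": {"edge_counts": [10, 100], "faithfulness": [0.4, 0.9]}}}
row = to_dict(partial, "EAP")
print(row["ioi_gpt2"], row["ioi_qwen"], row["Average"])  # -> 5850.0 - -
```

Because every expected column is pre-initialized to '-', each row exposes the same six columns regardless of which results are present, and the average gate now reads the rendered cells directly instead of maintaining the parallel `required_entries` bookkeeping dict the old code used.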