jasonshaoshun committed on
Commit
79683c9
·
1 Parent(s): 7ed00c5
Files changed (1) hide show
  1. src/leaderboard/read_evals.py +51 -21
src/leaderboard/read_evals.py CHANGED
@@ -188,7 +188,7 @@ class EvalResult_MIB_CAUSALGRAPH:
188
  eval_name: str
189
  method_name: str
190
  results: Dict
191
-
192
  def init_from_json_file(self, json_filepath):
193
  """Inits results from the method result file"""
194
  with open(json_filepath) as fp:
@@ -229,30 +229,60 @@ class EvalResult_MIB_CAUSALGRAPH:
229
  results=results
230
  )
231
 
232
- def to_dict(self):
233
- """Converts the Eval Result to a dict for dataframe display"""
234
- data_dict = {
235
- "eval_name": self.eval_name,
236
- "Method": self.method_name,
237
- }
238
 
239
- # Process each model's results
240
- for model_id, model_results in self.results.items():
241
- for task, task_scores in model_results.items():
242
- # Calculate layer-averaged scores for each intervention
243
- intervention_scores = defaultdict(list)
244
 
245
- for layer_data in task_scores:
246
- for score_data in layer_data['scores']:
247
- intervention = score_data['intervention']
248
- intervention_scores[intervention].append(score_data['score'])
249
 
250
- # Average across layers for each intervention
251
- for intervention, scores in intervention_scores.items():
252
- col_name = f"{model_id}_{task}_{intervention}".lower()
253
- data_dict[col_name] = round(np.mean(scores), 3)
254
 
255
- return data_dict
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
 
257
 
258
  # def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
 
188
  eval_name: str
189
  method_name: str
190
  results: Dict
191
+
192
  def init_from_json_file(self, json_filepath):
193
  """Inits results from the method result file"""
194
  with open(json_filepath) as fp:
 
229
  results=results
230
  )
231
 
232
+ # def to_dict(self):
233
+ # """Converts the Eval Result to a dict for dataframe display"""
234
+ # data_dict = {
235
+ # "eval_name": self.eval_name,
236
+ # "Method": self.method_name,
237
+ # }
238
 
239
+ # # Process each model's results
240
+ # for model_id, model_results in self.results.items():
241
+ # for task, task_scores in model_results.items():
242
+ # # Calculate layer-averaged scores for each intervention
243
+ # intervention_scores = defaultdict(list)
244
 
245
+ # for layer_data in task_scores:
246
+ # for score_data in layer_data['scores']:
247
+ # intervention = score_data['intervention']
248
+ # intervention_scores[intervention].append(score_data['score'])
249
 
250
+ # # Average across layers for each intervention
251
+ # for intervention, scores in intervention_scores.items():
252
+ # col_name = f"{model_id}_{task}_{intervention}".lower()
253
+ # data_dict[col_name] = round(np.mean(scores), 3)
254
 
255
+ # return data_dict
256
def to_dict(self):
    """Convert the eval result into a flat dict for dataframe display.

    The returned dict always holds ``"eval_name"`` and ``"Method"``. For every
    model in ``self.results`` it then adds one entry per
    (layer, intervention, counterfactual) combination found under that
    model's ``"MCQA"`` key, named
    ``{model}_layer{layer}_{intervention}_{counterfactual}`` (lowercased) and
    holding the raw ``"score"`` value.

    Schema read from ``self.results`` (as accessed by this method)::

        {model_id: {"MCQA": [{"layer": ...,
                              "layer_scores": [{"intervention": [...],
                                                "counterfactual_scores": [
                                                    {"counterfactual": [...],
                                                     "score": ...}]}]}]}}

    Returns:
        dict: the flattened row described above.

    Raises:
        KeyError: if an intervention entry lacks ``"intervention"`` or
            ``"counterfactual_scores"``, or a counterfactual entry lacks
            ``"counterfactual"`` or ``"score"``.
    """

    def _label(value):
        # The labels are indexed with [0], so they are presumably stored as
        # single-element lists (TODO confirm against the result files).
        # A bare string is accepted as-is; indexing it would silently keep
        # only its first character and produce colliding column names.
        if isinstance(value, str):
            return value
        return value[0]

    data_dict = {
        "eval_name": self.eval_name,
        "Method": self.method_name,
    }

    # `or {}` / `or []` guard against explicit nulls: dict.get() only falls
    # back to its default when the key is absent, not when it maps to None.
    for model_id, task_scores in (self.results or {}).items():
        # NOTE(review): only the "MCQA" task is read; the column name has no
        # task component, so other tasks are ignored here.
        for layer_data in task_scores.get("MCQA") or []:
            layer = layer_data.get("layer")

            for intervention_data in layer_data.get("layer_scores") or []:
                intervention = _label(intervention_data["intervention"])

                for cf_score in intervention_data["counterfactual_scores"]:
                    counterfactual = _label(cf_score["counterfactual"])

                    # Per the original comment, this naming is meant to match
                    # the columns defined in utils.py (not visible here).
                    # Lowercasing the whole name also covers the model id.
                    col_name = (
                        f"{model_id}_layer{layer}_{intervention}_{counterfactual}"
                    ).lower()
                    data_dict[col_name] = cf_score["score"]

    return data_dict
286
 
287
 
288
  # def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]: