Spaces:
Running
Running
jasonshaoshun
committed on
Commit
·
aea751c
1
Parent(s):
79683c9
debug
Browse files- src/leaderboard/read_evals.py +25 -25
src/leaderboard/read_evals.py
CHANGED
@@ -254,35 +254,35 @@ class EvalResult_MIB_CAUSALGRAPH:
|
|
254 |
|
255 |
# return data_dict
|
256 |
def to_dict(self):
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
# Each task_scores contains layer data
|
268 |
-
for layer_data in task_scores.get("MCQA", []):
|
269 |
-
layer = layer_data.get("layer")
|
270 |
-
layer_scores = layer_data.get("layer_scores", [])
|
271 |
|
272 |
-
#
|
273 |
-
for
|
274 |
-
|
275 |
-
|
276 |
|
277 |
-
|
278 |
-
|
279 |
-
|
|
|
280 |
|
281 |
-
|
282 |
-
|
283 |
-
|
|
|
|
|
|
|
|
|
284 |
|
285 |
-
|
286 |
|
287 |
|
288 |
# def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
|
|
|
254 |
|
255 |
# return data_dict
|
256 |
def to_dict(self):
    """Flatten this eval result into a single dict for dataframe display.

    The returned dict always contains ``"eval_name"`` and ``"Method"``.
    For every model in ``self.results`` it also gets one entry per
    (layer, intervention, counterfactual) combination found under that
    model's ``"MCQA"`` task. Each entry is keyed
    ``{model}_layer{layer}_{intervention}_{counterfactual}`` (lowercased)
    and holds the matching score.

    Returns:
        dict: Column name -> value mapping for one row of the table.

    Raises:
        KeyError: If an entry lacks ``"intervention"``,
            ``"counterfactual_scores"``, ``"counterfactual"`` or
            ``"score"``.
        IndexError: If an ``"intervention"`` or ``"counterfactual"``
            label sequence is empty.
    """

    def first_label(value):
        # Labels are read as element 0 of a sequence. A bare string is
        # used whole: indexing it would keep only its first character
        # and make distinct columns collide.
        return value if isinstance(value, str) else value[0]

    data_dict = {
        "eval_name": self.eval_name,
        "Method": self.method_name,
    }

    for model_id, task_scores in self.results.items():
        # `or []` covers both a missing key and a key explicitly set to None.
        for layer_data in task_scores.get("MCQA") or []:
            layer = layer_data.get("layer")

            for intervention_data in layer_data.get("layer_scores") or []:
                intervention = first_label(intervention_data["intervention"])

                for cf_score in intervention_data["counterfactual_scores"]:
                    counterfactual = first_label(cf_score["counterfactual"])

                    # The whole name is lowercased once, which also
                    # normalises the model id.
                    # NOTE(review): the original comment says this naming
                    # matches columns defined in utils.py -- confirm.
                    col_name = f"{model_id}_layer{layer}_{intervention}_{counterfactual}".lower()
                    data_dict[col_name] = cf_score["score"]

    return data_dict
|
286 |
|
287 |
|
288 |
# def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
|