Spaces:
Running
Running
jasonshaoshun
committed on
Commit
·
79683c9
1
Parent(s):
7ed00c5
debug
Browse files - src/leaderboard/read_evals.py +51 -21
src/leaderboard/read_evals.py
CHANGED
@@ -188,7 +188,7 @@ class EvalResult_MIB_CAUSALGRAPH:
|
|
188 |
eval_name: str
|
189 |
method_name: str
|
190 |
results: Dict
|
191 |
-
|
192 |
def init_from_json_file(self, json_filepath):
|
193 |
"""Inits results from the method result file"""
|
194 |
with open(json_filepath) as fp:
|
@@ -229,30 +229,60 @@ class EvalResult_MIB_CAUSALGRAPH:
|
|
229 |
results=results
|
230 |
)
|
231 |
|
232 |
-
def to_dict(self):
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
|
255 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
256 |
|
257 |
|
258 |
# def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
|
|
|
188 |
eval_name: str
|
189 |
method_name: str
|
190 |
results: Dict
|
191 |
+
|
192 |
def init_from_json_file(self, json_filepath):
|
193 |
"""Inits results from the method result file"""
|
194 |
with open(json_filepath) as fp:
|
|
|
229 |
results=results
|
230 |
)
|
231 |
|
232 |
+
# def to_dict(self):
|
233 |
+
# """Converts the Eval Result to a dict for dataframe display"""
|
234 |
+
# data_dict = {
|
235 |
+
# "eval_name": self.eval_name,
|
236 |
+
# "Method": self.method_name,
|
237 |
+
# }
|
238 |
|
239 |
+
# # Process each model's results
|
240 |
+
# for model_id, model_results in self.results.items():
|
241 |
+
# for task, task_scores in model_results.items():
|
242 |
+
# # Calculate layer-averaged scores for each intervention
|
243 |
+
# intervention_scores = defaultdict(list)
|
244 |
|
245 |
+
# for layer_data in task_scores:
|
246 |
+
# for score_data in layer_data['scores']:
|
247 |
+
# intervention = score_data['intervention']
|
248 |
+
# intervention_scores[intervention].append(score_data['score'])
|
249 |
|
250 |
+
# # Average across layers for each intervention
|
251 |
+
# for intervention, scores in intervention_scores.items():
|
252 |
+
# col_name = f"{model_id}_{task}_{intervention}".lower()
|
253 |
+
# data_dict[col_name] = round(np.mean(scores), 3)
|
254 |
|
255 |
+
# return data_dict
|
256 |
+
def to_dict(self):
|
257 |
+
"""Converts the Eval Result to a dict for dataframe display"""
|
258 |
+
data_dict = {
|
259 |
+
"eval_name": self.eval_name,
|
260 |
+
"Method": self.method_name,
|
261 |
+
}
|
262 |
+
|
263 |
+
# Process each model's results
|
264 |
+
for model_id, task_scores in self.results.items():
|
265 |
+
model_name = model_id.lower() # Lowercase for consistency
|
266 |
+
|
267 |
+
# Each task_scores contains layer data
|
268 |
+
for layer_data in task_scores.get("MCQA", []):
|
269 |
+
layer = layer_data.get("layer")
|
270 |
+
layer_scores = layer_data.get("layer_scores", [])
|
271 |
+
|
272 |
+
# Process each intervention and counterfactual
|
273 |
+
for intervention_data in layer_scores:
|
274 |
+
intervention = intervention_data["intervention"][0]
|
275 |
+
counterfactual_scores = intervention_data["counterfactual_scores"]
|
276 |
+
|
277 |
+
for cf_score in counterfactual_scores:
|
278 |
+
counterfactual = cf_score["counterfactual"][0]
|
279 |
+
score = cf_score["score"]
|
280 |
+
|
281 |
+
# Column name matches what we defined in utils.py
|
282 |
+
col_name = f"{model_name}_layer{layer}_{intervention}_{counterfactual}".lower()
|
283 |
+
data_dict[col_name] = score
|
284 |
+
|
285 |
+
return data_dict
|
286 |
|
287 |
|
288 |
# def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
|