Commit 4d9550b · committed by jasonshaoshun · 1 parent: da0827e

    caulsal-track debug

Files changed:
- src/leaderboard/read_evals.py (+173 -157)
src/leaderboard/read_evals.py
CHANGED
@@ -171,172 +171,201 @@ def get_raw_eval_results_mib_subgraph(results_path: str, requests_path: str) ->
-# @dataclass
-# class EvalResult_MIB_CAUSALGRAPH:
-#     eval_name: str
-#     method_name: str
-#     results: Dict
-
-#     def init_from_json_file(self, json_filepath):
-#         """Inits results from the method result file"""
-#         with open(json_filepath) as fp:
-#             data = json.load(fp)
-
-#         unique_method_name = f"{method_name}_{method_counters[method_name]}"
-
-#             for intervention, layer_scores in intervention_scores.items():
-#                 column = f"{model}_{task}_{intervention}"
-#                 avg_score = np.mean(layer_scores) if layer_scores else 0.0
-#                 method_scores.append((column, f"{avg_score:.3f}"))
-
-#         # Sort by column names for consistency
-#         method_scores.sort(key=lambda x: x[0])
-#         data[unique_method_name] = {
-#             col: score for col, score in method_scores
-#         }
-
-#         return pd.DataFrame.from_dict(data, orient='index')
-
-#     def to_dict(self):
-#         """Converts the Eval Result to a dict for dataframe display"""
-#         data_dict = {
-#             "eval_name": self.eval_name,
-#             "Method": self.method_name,
-#         }
-
-#         # Add all results directly
-#         data_dict.update(self.results)
-
-#         return data_dict
+
+
+
+# def process_single_json(json_file: Dict[str, Any], method_counter: int) -> pd.DataFrame:
+#     """
+#     Process a single JSON file and convert it to a DataFrame.
+
+#     Args:
+#         json_file: Dictionary containing the analysis results
+#         method_counter: Counter for handling duplicate method names
+
+#     Returns:
+#         pd.DataFrame: DataFrame for single method with MODEL_TASK_INTERVENTION as columns
+#     """
+#     method_name = json_file['method_name']
+#     unique_method_name = f"{method_name}_{method_counter}"
+#     method_scores = []
+
+#     for result in json_file['results']:
+#         model = result['model_id']
+
+#         for task, scores in result['task_scores'].items():
+#             # Process each layer's data
+#             intervention_scores = defaultdict(list)
+
+#             for layer_data in scores:
+#                 for intervention_data in layer_data['layer_scores']:
+#                     # Calculate average score for counterfactuals
+#                     avg_cf_score = np.mean([
+#                         cf['score']
+#                         for cf in intervention_data['counterfactual_scores']
+#                     ])
+
+#                     if np.isnan(avg_cf_score):
+#                         avg_cf_score = 0.0
+
+#                     # Group scores by intervention
+#                     intervention_key = '_'.join(intervention_data['intervention'])
+#                     intervention_scores[intervention_key].append(avg_cf_score)
+
+#             # Average across layers for each intervention
+#             for intervention, layer_scores in intervention_scores.items():
+#                 column = f"{model}_{task}_{intervention}"
+#                 avg_score = np.mean(layer_scores) if layer_scores else 0.0
+#                 method_scores.append((column, f"{avg_score:.3f}"))
+
+#     # Sort by column names for consistency
+#     method_scores.sort(key=lambda x: x[0])
+#     data = {
+#         unique_method_name: {
+#             col: score for col, score in method_scores
+#         }
+#     }
+
+#     return pd.DataFrame.from_dict(data, orient='index')

+# def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
+#     model_result_filepaths = []
+
+#     # print(f"Scanning directory: {results_path}")
+#     for root, dirnames, files in os.walk(results_path):
+#         # print(f"Current directory: {root}")
+#         # print(f"Found files: {files}")
+#         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
+#             continue
+
+#         try:
+#             files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
+#         except dateutil.parser._parser.ParserError:
+#             files = [files[-1]]
+
+#         for file in files:
+#             model_result_filepaths.append(os.path.join(root, file))
+
+#     # print(f"Found json files: {model_result_filepaths}")
+
+#     method_counters = defaultdict(int)
+#     dataframes = []
+
+#     for json_file in model_result_filepaths:
+#         try:
+#             with open(filepath, 'r') as f:
+#                 json_data = json.load(f)
+#             method_name = json_data['method_name']
+#             method_counters[method_name] += 1
+
+#             # Process single JSON file
+#             df = process_single_json(json_data, method_counters[method_name])
+#             dataframes.append(df)
+#         except Exception as e:
+#             print(f"Error processing {json_file}: {e}")
+#             continue
+
+#     # # Concatenate all DataFrames
+#     # if dataframes:
+#     #     final_df = pd.concat(dataframes, axis=0)
+#     #     return final_df
+#     # else:
+#     #     return pd.DataFrame()
+#     return dataframes

+from dataclasses import dataclass
+import json
+import numpy as np
+from typing import Dict, List, Any
+import os
+from datetime import datetime
+import dateutil

+@dataclass
+class EvalResult_MIB_CAUSALGRAPH:
+    """Represents one full evaluation for a method across all models in MIB for causal graph track."""
+    eval_name: str      # method name as identifier
+    method_name: str    # name of the interpretation method
+    results: Dict       # nested dict of results for each model and task
+
+    def init_from_json_file(self, json_filepath: str):
+        """Inits results from the method result file"""
+        with open(json_filepath) as fp:
+            data = json.load(fp)
+
+        method_name = data.get("method_name")
+
+        # Initialize results dictionary
+        results = {}
+        for task in ["ioi", "mcqa", "arithmetic_addition", "arithmetic_subtraction", "arc_easy", "arc_challenge"]:
+            results[task] = {}
+
+        # Process each model's results
+        for result in data.get("results", []):
+            model_id = result.get("model_id", "")
+            model_name = model_id.replace(".", "_")
+
+            for task, scores in result.get("task_scores", {}).items():
+                intervention_scores = defaultdict(list)
+
+                for layer_data in scores:
+                    for intervention_data in layer_data['layer_scores']:
+                        # Calculate average score for counterfactuals
+                        avg_cf_score = np.mean([
+                            cf['score']
+                            for cf in intervention_data['counterfactual_scores']
+                        ])
+
+                        if np.isnan(avg_cf_score):
+                            avg_cf_score = 0.0
+
+                        intervention_key = '_'.join(intervention_data['intervention'])
+                        intervention_scores[intervention_key].append(avg_cf_score)
+
+                # Average across layers for each intervention
+                results[task][model_name] = {
+                    interv: np.mean(scores) if scores else 0.0
+                    for interv, scores in intervention_scores.items()
+                }
+
+        return EvalResult_MIB_CAUSALGRAPH(
+            eval_name=method_name,
+            method_name=method_name,
+            results=results
+        )
+
+    def to_dict(self, metric_type="average"):
+        """Converts the Eval Result to a dict for dataframe display"""
+        data_dict = {
+            "eval_name": self.eval_name,
+            "Method": self.method_name,
+        }
+
+        # Initialize columns for all task-model combinations
+        all_scores = []
+        for task, task_results in self.results.items():
+            for model, intervention_scores in task_results.items():
+                if not intervention_scores:
+                    continue
+
+                col_name = f"{task}_{model}"
+                scores = list(intervention_scores.values())
+                if not scores:
+                    data_dict[col_name] = '-'
+                    continue
+
+                avg_score = np.mean(scores)
+                data_dict[col_name] = round(avg_score, 3)
+                all_scores.append(avg_score)
+
+        data_dict["Average"] = round(np.mean(all_scores), 3) if all_scores else '-'
+        return data_dict
+
 def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
+    """From the path of the results folder root, extract all needed info for MIB causal graph results"""
     model_result_filepaths = []

-    # print(f"Scanning directory: {results_path}")
     for root, dirnames, files in os.walk(results_path):
-        # print(f"Current directory: {root}")
-        # print(f"Found files: {files}")
         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
             continue
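A minimal sketch of the per-method result file that init_from_json_file consumes. The key names (method_name, results, model_id, task_scores, layer_scores, counterfactual_scores, intervention) come from the code above; the method name, model id, intervention labels, and score values are illustrative placeholders, not data from this repository:

    {
        "method_name": "example_method",                 # illustrative
        "results": [
            {
                "model_id": "example-model-1.0",         # illustrative
                "task_scores": {
                    "ioi": [                             # one entry per layer
                        {
                            "layer_scores": [
                                {
                                    "intervention": ["example_intervention"],  # joined with '_' as the key
                                    "counterfactual_scores": [
                                        {"score": 0.62},                       # illustrative scores
                                        {"score": 0.58}
                                    ]
                                }
                            ]
                        }
                    ]
                }
            }
        ]
    }

Per layer, the counterfactual scores are averaged (NaN falls back to 0.0), grouped by the joined intervention key, and then averaged across layers into results[task][model_name].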
@@ -348,32 +377,19 @@ def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str)
         for file in files:
             model_result_filepaths.append(os.path.join(root, file))

-    method_counters = defaultdict(int)
-    dataframes = []
-
-    for json_file in model_result_filepaths:
+    eval_results = []
+    for model_result_filepath in model_result_filepaths:
         try:
-            with open(filepath, 'r') as f:
-                json_data = json.load(f)
-            method_name = json_data['method_name']
-            method_counters[method_name] += 1
-
-            # Process single JSON file
-            df = process_single_json(json_data, method_counters[method_name])
-            dataframes.append(df)
+            eval_result = EvalResult_MIB_CAUSALGRAPH("", "", {})  # Create empty instance
+            result = eval_result.init_from_json_file(model_result_filepath)
+            # Verify the result can be converted to dict format
+            result.to_dict()
+            eval_results.append(result)
         except Exception as e:
-            print(f"Error processing {json_file}: {e}")
+            print(f"Error processing {model_result_filepath}: {e}")
             continue

-    # if dataframes:
-    #     final_df = pd.concat(dataframes, axis=0)
-    #     return final_df
-    # else:
-    #     return pd.DataFrame()
-    return dataframes
+
+    return eval_results
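A minimal usage sketch for the new loader, assuming pandas is available and using placeholder folder paths (the Space's real results and requests paths may differ):

    import pandas as pd

    # Placeholder paths, not taken from this repository.
    results_path = "eval-results/mib-causalgraph"
    requests_path = "eval-queue"

    eval_results = get_raw_eval_results_mib_causalgraph(results_path, requests_path)

    # Each EvalResult_MIB_CAUSALGRAPH flattens into one leaderboard row via to_dict().
    rows = [result.to_dict() for result in eval_results]
    leaderboard_df = pd.DataFrame(rows)
    print(leaderboard_df.head())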