jasonshaoshun committed on
Commit 4d9550b · 1 Parent(s): da0827e

causal-track debug

Files changed (1)
  1. src/leaderboard/read_evals.py +173 -157
src/leaderboard/read_evals.py CHANGED
@@ -171,172 +171,201 @@ def get_raw_eval_results_mib_subgraph(results_path: str, requests_path: str) ->
- def process_single_json(json_file: Dict[str, Any], method_counter: int) -> pd.DataFrame:
-     """
-     Process a single JSON file and convert it to a DataFrame.
-
-     Args:
-         json_file: Dictionary containing the analysis results
-         method_counter: Counter for handling duplicate method names
-
-     Returns:
-         pd.DataFrame: DataFrame for single method with MODEL_TASK_INTERVENTION as columns
-     """
-     method_name = json_file['method_name']
-     unique_method_name = f"{method_name}_{method_counter}"
-     method_scores = []
-
-     for result in json_file['results']:
-         model = result['model_id']
-
-         for task, scores in result['task_scores'].items():
-             # Process each layer's data
-             intervention_scores = defaultdict(list)
-
-             for layer_data in scores:
-                 for intervention_data in layer_data['layer_scores']:
-                     # Calculate average score for counterfactuals
-                     avg_cf_score = np.mean([
-                         cf['score']
-                         for cf in intervention_data['counterfactual_scores']
-                     ])
-
-                     if np.isnan(avg_cf_score):
-                         avg_cf_score = 0.0
-
-                     # Group scores by intervention
-                     intervention_key = '_'.join(intervention_data['intervention'])
-                     intervention_scores[intervention_key].append(avg_cf_score)
-
-             # Average across layers for each intervention
-             for intervention, layer_scores in intervention_scores.items():
-                 column = f"{model}_{task}_{intervention}"
-                 avg_score = np.mean(layer_scores) if layer_scores else 0.0
-                 method_scores.append((column, f"{avg_score:.3f}"))
-
-     # Sort by column names for consistency
-     method_scores.sort(key=lambda x: x[0])
-     data = {
-         unique_method_name: {
-             col: score for col, score in method_scores
-         }
-     }
-
-     return pd.DataFrame.from_dict(data, orient='index')
-
-
- # @dataclass
- # class EvalResult_MIB_CAUSALGRAPH:
- #     eval_name: str
- #     method_name: str
- #     results: Dict
-
- #     def init_from_json_file(self, json_filepath):
- #         """Inits results from the method result file"""
- #         with open(json_filepath) as fp:
- #             data = json.load(fp)
-
- #         method_name = data.get("method_name")
- #         results = {}
-
- #         # Process each model's results
- #         for model_result in data.get("results", []):
- #             model_id = model_result.get("model_id", "")
-
- #             task_scores = model_result.get("task_scores", {})
-
- #             # Process MCQA scores
- #             for layer_data in task_scores.get("MCQA", []):
- #                 layer = layer_data.get("layer")
- #                 for score_data in layer_data.get("layer_scores", []):
- #                     intervention = score_data["intervention"][0]
- #                     for cf_score in score_data["counterfactual_scores"]:
- #                         counterfactual = cf_score["counterfactual"][0]
- #                         score = cf_score["score"]
-
- #                         # Create key matching the expected column format
- #                         key = f"{model_id}_layer{layer}_{intervention}_{counterfactual}"
- #                         results[key] = score
-
- #         return EvalResult_MIB_CAUSALGRAPH(
- #             eval_name=method_name,
- #             method_name=method_name,
- #             results=results
- #         )
-
- #     data = {}
- #     method_counters = defaultdict(int)
-
- #     for json_file in json_files:
- #         # Handle method name and duplicates
- #         method_name = json_file['method_name']
- #         method_counters[method_name] += 1
- #         unique_method_name = f"{method_name}_{method_counters[method_name]}"
-
- #         method_scores = []
-
- #         for result in json_file['results']:
- #             model = result['model_id']
-
- #             for task, scores in result['task_scores'].items():
- #                 # Process each layer's data
- #                 intervention_scores = defaultdict(list)
-
- #                 for layer_data in scores:
- #                     for intervention_data in layer_data['layer_scores']:
- #                         # Calculate average score for counterfactuals
- #                         avg_cf_score = np.mean([
- #                             cf['score']
- #                             for cf in intervention_data['counterfactual_scores']
- #                         ])
-
- #                         if np.isnan(avg_cf_score):
- #                             avg_cf_score = 0.0
-
- #                         # Group scores by intervention
- #                         intervention_key = '_'.join(intervention_data['intervention'])
- #                         intervention_scores[intervention_key].append(avg_cf_score)
-
- #                 # Average across layers for each intervention
- #                 for intervention, layer_scores in intervention_scores.items():
- #                     column = f"{model}_{task}_{intervention}"
- #                     avg_score = np.mean(layer_scores) if layer_scores else 0.0
- #                     method_scores.append((column, f"{avg_score:.3f}"))
-
- #         # Sort by column names for consistency
- #         method_scores.sort(key=lambda x: x[0])
- #         data[unique_method_name] = {
- #             col: score for col, score in method_scores
- #         }
-
- #     return pd.DataFrame.from_dict(data, orient='index')
-
- #     def to_dict(self):
- #         """Converts the Eval Result to a dict for dataframe display"""
- #         data_dict = {
- #             "eval_name": self.eval_name,
- #             "Method": self.method_name,
- #         }
-
- #         # Add all results directly
- #         data_dict.update(self.results)
-
- #         return data_dict
-

def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
    model_result_filepaths = []

-     # print(f"Scanning directory: {results_path}")
    for root, dirnames, files in os.walk(results_path):
-         # print(f"Current directory: {root}")
-         # print(f"Found files: {files}")
        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
            continue
 
@@ -348,32 +377,19 @@ def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str)
        for file in files:
            model_result_filepaths.append(os.path.join(root, file))

-     # print(f"Found json files: {model_result_filepaths}")
-
-     method_counters = defaultdict(int)
-     dataframes = []
-
-     for json_file in model_result_filepaths:
        try:
-             with open(filepath, 'r') as f:
-                 json_data = json.load(f)
-             method_name = json_data['method_name']
-             method_counters[method_name] += 1
-
-             # Process single JSON file
-             df = process_single_json(json_data, method_counters[method_name])
-             dataframes.append(df)
        except Exception as e:
-             print(f"Error processing {json_file}: {e}")
            continue
-
-     # # Concatenate all DataFrames
-     # if dataframes:
-     #     final_df = pd.concat(dataframes, axis=0)
-     #     return final_df
-     # else:
-     #     return pd.DataFrame()
-     return dataframes
 
+
+
+
+ # def process_single_json(json_file: Dict[str, Any], method_counter: int) -> pd.DataFrame:
+ #     """
+ #     Process a single JSON file and convert it to a DataFrame.
+
+ #     Args:
+ #         json_file: Dictionary containing the analysis results
+ #         method_counter: Counter for handling duplicate method names
+
+ #     Returns:
+ #         pd.DataFrame: DataFrame for single method with MODEL_TASK_INTERVENTION as columns
+ #     """
+ #     method_name = json_file['method_name']
+ #     unique_method_name = f"{method_name}_{method_counter}"
+ #     method_scores = []
+
+ #     for result in json_file['results']:
+ #         model = result['model_id']
+
+ #         for task, scores in result['task_scores'].items():
+ #             # Process each layer's data
+ #             intervention_scores = defaultdict(list)
+
+ #             for layer_data in scores:
+ #                 for intervention_data in layer_data['layer_scores']:
+ #                     # Calculate average score for counterfactuals
+ #                     avg_cf_score = np.mean([
+ #                         cf['score']
+ #                         for cf in intervention_data['counterfactual_scores']
+ #                     ])
+
+ #                     if np.isnan(avg_cf_score):
+ #                         avg_cf_score = 0.0
+
+ #                     # Group scores by intervention
+ #                     intervention_key = '_'.join(intervention_data['intervention'])
+ #                     intervention_scores[intervention_key].append(avg_cf_score)
+
+ #             # Average across layers for each intervention
+ #             for intervention, layer_scores in intervention_scores.items():
+ #                 column = f"{model}_{task}_{intervention}"
+ #                 avg_score = np.mean(layer_scores) if layer_scores else 0.0
+ #                 method_scores.append((column, f"{avg_score:.3f}"))
+
+ #     # Sort by column names for consistency
+ #     method_scores.sort(key=lambda x: x[0])
+ #     data = {
+ #         unique_method_name: {
+ #             col: score for col, score in method_scores
+ #         }
+ #     }
+
+ #     return pd.DataFrame.from_dict(data, orient='index')
+
+
+ # def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
+ #     model_result_filepaths = []
+
+ #     # print(f"Scanning directory: {results_path}")
+ #     for root, dirnames, files in os.walk(results_path):
+ #         # print(f"Current directory: {root}")
+ #         # print(f"Found files: {files}")
+ #         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
+ #             continue
+
+ #         try:
+ #             files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
+ #         except dateutil.parser._parser.ParserError:
+ #             files = [files[-1]]
+
+ #         for file in files:
+ #             model_result_filepaths.append(os.path.join(root, file))
+
+ #     # print(f"Found json files: {model_result_filepaths}")
+
+ #     method_counters = defaultdict(int)
+ #     dataframes = []
+
+ #     for json_file in model_result_filepaths:
+ #         try:
+ #             with open(filepath, 'r') as f:
+ #                 json_data = json.load(f)
+ #             method_name = json_data['method_name']
+ #             method_counters[method_name] += 1
+
+ #             # Process single JSON file
+ #             df = process_single_json(json_data, method_counters[method_name])
+ #             dataframes.append(df)
+ #         except Exception as e:
+ #             print(f"Error processing {json_file}: {e}")
+ #             continue
+
+ #     # # Concatenate all DataFrames
+ #     # if dataframes:
+ #     #     final_df = pd.concat(dataframes, axis=0)
+ #     #     return final_df
+ #     # else:
+ #     #     return pd.DataFrame()
+ #     return dataframes
+
+ from dataclasses import dataclass
+ import json
+ import numpy as np
+ from typing import Dict, List, Any
+ import os
+ from datetime import datetime
+ import dateutil
+
+ @dataclass
+ class EvalResult_MIB_CAUSALGRAPH:
+     """Represents one full evaluation for a method across all models in MIB for causal graph track."""
+     eval_name: str    # method name as identifier
+     method_name: str  # name of the interpretation method
+     results: Dict     # nested dict of results for each model and task
+
+     def init_from_json_file(self, json_filepath: str):
+         """Inits results from the method result file"""
+         with open(json_filepath) as fp:
+             data = json.load(fp)
+
+         method_name = data.get("method_name")
+
+         # Initialize results dictionary
+         results = {}
+         for task in ["ioi", "mcqa", "arithmetic_addition", "arithmetic_subtraction", "arc_easy", "arc_challenge"]:
+             results[task] = {}
+
+         # Process each model's results
+         for result in data.get("results", []):
+             model_id = result.get("model_id", "")
+             model_name = model_id.replace(".", "_")
+
+             for task, scores in result.get("task_scores", {}).items():
+                 intervention_scores = defaultdict(list)
+
+                 for layer_data in scores:
+                     for intervention_data in layer_data['layer_scores']:
+                         # Calculate average score for counterfactuals
+                         avg_cf_score = np.mean([
+                             cf['score']
+                             for cf in intervention_data['counterfactual_scores']
+                         ])
+
+                         if np.isnan(avg_cf_score):
+                             avg_cf_score = 0.0
+
+                         intervention_key = '_'.join(intervention_data['intervention'])
+                         intervention_scores[intervention_key].append(avg_cf_score)
+
+                 # Average across layers for each intervention
+                 results[task][model_name] = {
+                     interv: np.mean(scores) if scores else 0.0
+                     for interv, scores in intervention_scores.items()
+                 }
+
+         return EvalResult_MIB_CAUSALGRAPH(
+             eval_name=method_name,
+             method_name=method_name,
+             results=results
+         )
+
+     def to_dict(self, metric_type="average"):
+         """Converts the Eval Result to a dict for dataframe display"""
+         data_dict = {
+             "eval_name": self.eval_name,
+             "Method": self.method_name,
+         }
+
+         # Initialize columns for all task-model combinations
+         all_scores = []
+         for task, task_results in self.results.items():
+             for model, intervention_scores in task_results.items():
+                 if not intervention_scores:
+                     continue
+
+                 col_name = f"{task}_{model}"
+                 scores = list(intervention_scores.values())
+                 if not scores:
+                     data_dict[col_name] = '-'
+                     continue
+
+                 avg_score = np.mean(scores)
+                 data_dict[col_name] = round(avg_score, 3)
+                 all_scores.append(avg_score)
+
+         data_dict["Average"] = round(np.mean(all_scores), 3) if all_scores else '-'
+         return data_dict


def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
+     """From the path of the results folder root, extract all needed info for MIB causal graph results"""
    model_result_filepaths = []

    for root, dirnames, files in os.walk(results_path):
        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
            continue

        for file in files:
            model_result_filepaths.append(os.path.join(root, file))

+     eval_results = []
+     for model_result_filepath in model_result_filepaths:
        try:
+             eval_result = EvalResult_MIB_CAUSALGRAPH("", "", {})  # Create empty instance
+             result = eval_result.init_from_json_file(model_result_filepath)
+             # Verify the result can be converted to dict format
+             result.to_dict()
+             eval_results.append(result)
        except Exception as e:
+             print(f"Error processing {model_result_filepath}: {e}")
            continue
+
+     return eval_results
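
For reference, here is a minimal usage sketch of the new causal-graph loader added in this commit. It is not part of the diff: the import path, directory names, method/model identifiers, and score values are hypothetical; the result-file layout is inferred from the parsing in init_from_json_file; and it assumes the module also has pandas and collections.defaultdict available (the added code calls defaultdict, but this hunk does not add that import).

# Usage sketch (assumptions labeled above; nothing here is part of the commit).
import pandas as pd

# Assumed import path for the edited module within the leaderboard repo.
from src.leaderboard.read_evals import get_raw_eval_results_mib_causalgraph

# Shape of one result file as inferred from init_from_json_file (hypothetical values):
# {
#   "method_name": "DAS",
#   "results": [
#     {
#       "model_id": "qwen2.5",
#       "task_scores": {
#         "ioi": [
#           {"layer": 0,
#            "layer_scores": [
#              {"intervention": ["output_token"],
#               "counterfactual_scores": [
#                 {"counterfactual": ["symbol_counterfactual"], "score": 0.42}
#               ]}
#            ]}
#         ]
#       }
#     }
#   ]
# }

# Hypothetical local directories for results and requests.
raw_results = get_raw_eval_results_mib_causalgraph(
    "eval-results-mib-causalgraph/", "eval-queue-mib-causalgraph/"
)

# Each EvalResult_MIB_CAUSALGRAPH flattens to one leaderboard row via to_dict():
# per-task/model averages in f"{task}_{model}" columns plus an overall "Average".
df = pd.DataFrame([r.to_dict() for r in raw_results])
print(df.head())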