Commit 2ba536b (parent: a5eab2c), committed by jasonshaoshun

fix: Align task & model names in CausalGraph

Files changed (3):
  1. app.py +38 -5
  2. src/about.py +6 -4
  3. src/leaderboard/read_evals.py +250 -301
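
In short, the commit aligns the leaderboard's column keys with the names that actually appear in the causal-graph results files: JSON-side task names, the GPT2LMHeadModel class, and a model-first key order. A minimal illustration (not part of the commit; the names are taken from the diffs below):

```python
# Before this commit, app.py built column keys task-first from display-style names,
# e.g. "IOI_GPT2ForCausalLM"; after it, keys are model-first and use the task names
# found in the results JSON.
task_col_name = "ioi_task"        # JSON-side task name (was "IOI" in src/about.py)
model_class = "GPT2LMHeadModel"   # newly mapped alongside GPT2ForCausalLM
new_key = f"{model_class}_{task_col_name}"
print(new_key)  # GPT2LMHeadModel_ioi_task
```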
app.py CHANGED
@@ -399,18 +399,48 @@ def init_leaderboard_mib_subgraph(dataframe, track):
 
 
+# @dataclass
+# class TaskMIB_Causalgraph:
+#     benchmark: str        # task name in json (ioi/arithmetic)
+#     models: list[str]     # list of models to show as sub-columns
+#     col_name: str         # display name in leaderboard
+#     metrics: list[str]    # metrics to store (average_score)
+
+# class TasksMib_Causalgraph(Enum):
+#     task0 = TaskMIB_Subgraph("ioi", ["GPT2ForCausalLM"], "ioi_task", ["average_score"])
+#     task1 = TaskMIB_Subgraph("mcqa", ["Qwen2ForCausalLM", "Gemma2ForCausalLM", "LlamaForCausalLM"], "4_answer_MCQA", ["average_score"])
+#     task2 = TaskMIB_Subgraph("arithmetic_addition", ["Gemma2ForCausalLM", "LlamaForCausalLM"], "arithmetic_addition", ["average_score"])
+#     task3 = TaskMIB_Subgraph("arc_easy", ["Gemma2ForCausalLM", "LlamaForCausalLM"], "arc_easy", ["average_score"])
+
+#     @classmethod
+#     def get_all_tasks(cls):
+#         """Returns a list of all task benchmarks"""
+#         return [task.value.benchmark for task in cls]
+
+#     @classmethod
+#     def get_all_models(cls):
+#         """Returns a list of all unique models across all tasks"""
+#         models = set()
+#         for task in cls:
+#             models.update(task.value.models)
+#         return sorted(list(models))
+
+# ioi_task
+# 4_answer_MCQA
+
 
 def init_leaderboard_mib_causalgraph(dataframe, track):
     model_name_mapping = {
         "Qwen2ForCausalLM": "Qwen-2.5",
         "GPT2ForCausalLM": "GPT-2",
+        "GPT2LMHeadModel": "GPT-2",
         "Gemma2ForCausalLM": "Gemma-2",
         "LlamaForCausalLM": "Llama-3.1"
     }
 
     benchmark_mapping = {
-        "IOI": "IOI",
-        "MCQA": "MCQA",
+        "ioi_task": "IOI",
+        "4_answer_MCQA": "MCQA",
         "arithmetic_addition": "Arithmetic (+)",
         "arithmetic_subtraction": "Arithmetic (-)",
         "arc_easy": "ARC (Easy)",
@@ -420,13 +450,16 @@ def init_leaderboard_mib_causalgraph(dataframe, track):
     display_mapping = {}
     for task in TasksMib_Causalgraph:
         for model in task.value.models:
-            field_name = f"{task.value.col_name}_{model}"
+            # print(f"Task: {task.value.benchmark}, Model: {model}")
+            field_name = f"{model}_{task.value.col_name}"
             display_name = f"{benchmark_mapping[task.value.col_name]} - {model_name_mapping[model]}"
             display_mapping[field_name] = display_name
 
+    # print("\nDebugging display_mapping:", display_mapping)
+
     renamed_df = dataframe.rename(columns=display_mapping)
-
-    print(renamed_df)
+
+    # print("\nDebugging DataFrame columns:", renamed_df.columns.tolist())
 
     # Create only necessary columns
     return Leaderboard(
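
For readers tracing the renaming, here is a minimal, self-contained sketch of what the new field_name order produces; the toy DataFrame, method name, and scores are invented for illustration and only mirror the two mappings shown above:

```python
import pandas as pd

model_name_mapping = {"GPT2LMHeadModel": "GPT-2", "Qwen2ForCausalLM": "Qwen-2.5"}
benchmark_mapping = {"ioi_task": "IOI", "4_answer_MCQA": "MCQA"}

# Raw leaderboard columns now follow "{model}_{col_name}" (model first, task second).
raw = pd.DataFrame({
    "Method": ["DAS"],                         # hypothetical method name
    "GPT2LMHeadModel_ioi_task": [0.81],        # invented score
    "Qwen2ForCausalLM_4_answer_MCQA": [0.64],  # invented score
})

display_mapping = {}
for model, task in [("GPT2LMHeadModel", "ioi_task"), ("Qwen2ForCausalLM", "4_answer_MCQA")]:
    display_mapping[f"{model}_{task}"] = f"{benchmark_mapping[task]} - {model_name_mapping[model]}"

renamed = raw.rename(columns=display_mapping)
print(renamed.columns.tolist())  # ['Method', 'IOI - GPT-2', 'MCQA - Qwen-2.5']
```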
src/about.py CHANGED
@@ -79,11 +79,13 @@ class TaskMIB_Causalgraph:
     col_name: str       # display name in leaderboard
     metrics: list[str]  # metrics to store (average_score)
 
+
+
 class TasksMib_Causalgraph(Enum):
-    task0 = TaskMIB_Subgraph("ioi", ["GPT2ForCausalLM"], "IOI", ["average_score"])
-    task1 = TaskMIB_Subgraph("mcqa", ["Qwen2ForCausalLM", "Gemma2ForCausalLM", "LlamaForCausalLM"], "MCQA", ["average_score"])
-    task2 = TaskMIB_Subgraph("arithmetic_addition", ["Gemma2ForCausalLM", "LlamaForCausalLM"], "arithmetic_addition", ["average_score"])
-    task3 = TaskMIB_Subgraph("arc_easy", ["Gemma2ForCausalLM", "LlamaForCausalLM"], "arc_easy", ["average_score"])
+    task0 = TaskMIB_Subgraph("ioi", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "ioi_task", ["average_score"])
+    task1 = TaskMIB_Subgraph("mcqa", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "4_answer_MCQA", ["average_score"])
+    task2 = TaskMIB_Subgraph("arithmetic_addition", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "arithmetic_addition", ["average_score"])
+    task3 = TaskMIB_Subgraph("arc_easy", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "arc_easy", ["average_score"])
 
     @classmethod
     def get_all_tasks(cls):
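
Because every task now lists the same five model classes and uses the JSON-side col_name, the full set of raw columns the leaderboard will look for is easy to enumerate. A self-contained sketch that mirrors (rather than imports) the updated enum; TaskSpec here is a stand-in for the TaskMIB_Subgraph dataclass used above, and only two tasks are shown:

```python
from dataclasses import dataclass
from enum import Enum

@dataclass
class TaskSpec:          # stand-in for TaskMIB_Subgraph
    benchmark: str
    models: list
    col_name: str
    metrics: list

MODELS = ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel",
          "Gemma2ForCausalLM", "LlamaForCausalLM"]

class TasksMibCausalgraphSketch(Enum):   # mirrors TasksMib_Causalgraph
    task0 = TaskSpec("ioi", MODELS, "ioi_task", ["average_score"])
    task1 = TaskSpec("mcqa", MODELS, "4_answer_MCQA", ["average_score"])

# Raw columns that app.py's new "{model}_{col_name}" field names will match:
expected = [f"{m}_{t.value.col_name}"
            for t in TasksMibCausalgraphSketch for m in t.value.models]
print(expected[0], expected[-1])
# Qwen2ForCausalLM_ioi_task LlamaForCausalLM_4_answer_MCQA
```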
src/leaderboard/read_evals.py CHANGED
@@ -2,19 +2,22 @@ import glob
 import json
 import math
 import os
+import re
+import ast
 from dataclasses import dataclass
+from datetime import datetime
+from typing import List, Dict, Any, Tuple
+from collections import defaultdict
 
 import dateutil
 import numpy as np
+import pandas as pd
 
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal
+from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal, AutoEvalColumn_mib_causalgraph
 from src.submission.check_validity import is_model_on_hub
-from src.about import TasksMib_Subgraph
-
-from typing import List, Dict, Any
-from collections import defaultdict
-import pandas as pd
+from src.about import TasksMib_Subgraph, TasksMib_Causalgraph
 
 
 
@@ -205,226 +208,10 @@ def get_raw_eval_results_mib_subgraph(results_path: str) -> List[EvalResult_MIB_
 
 
 
-# def process_single_json(json_file: Dict[str, Any], method_counter: int) -> pd.DataFrame:
-#     """
-#     Process a single JSON file and convert it to a DataFrame.
-#
-#     Args:
-#         json_file: Dictionary containing the analysis results
-#         method_counter: Counter for handling duplicate method names
-#
-#     Returns:
-#         pd.DataFrame: DataFrame for single method with MODEL_TASK_INTERVENTION as columns
-#     """
-#     method_name = json_file['method_name']
-#     unique_method_name = f"{method_name}_{method_counter}"
-#     method_scores = []
-
-#     for result in json_file['results']:
-#         model = result['model_id']
-
-#         for task, scores in result['task_scores'].items():
-#             # Process each layer's data
-#             intervention_scores = defaultdict(list)
-
-#             for layer_data in scores:
-#                 for intervention_data in layer_data['layer_scores']:
-#                     # Calculate average score for counterfactuals
-#                     avg_cf_score = np.mean([
-#                         cf['score']
-#                         for cf in intervention_data['counterfactual_scores']
-#                     ])
-
-#                     if np.isnan(avg_cf_score):
-#                         avg_cf_score = 0.0
-
-#                     # Group scores by intervention
-#                     intervention_key = '_'.join(intervention_data['intervention'])
-#                     intervention_scores[intervention_key].append(avg_cf_score)
-
-#             # Average across layers for each intervention
-#             for intervention, layer_scores in intervention_scores.items():
-#                 column = f"{model}_{task}_{intervention}"
-#                 avg_score = np.mean(layer_scores) if layer_scores else 0.0
-#                 method_scores.append((column, f"{avg_score:.3f}"))
-
-#     # Sort by column names for consistency
-#     method_scores.sort(key=lambda x: x[0])
-#     data = {
-#         unique_method_name: {
-#             col: score for col, score in method_scores
-#         }
-#     }
-
-#     return pd.DataFrame.from_dict(data, orient='index')
-
-# def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
-#     model_result_filepaths = []
-
-#     # print(f"Scanning directory: {results_path}")
-#     for root, dirnames, files in os.walk(results_path):
-#         # print(f"Current directory: {root}")
-#         # print(f"Found files: {files}")
-#         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
-#             continue
-
-#         try:
-#             files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-#         except dateutil.parser._parser.ParserError:
-#             files = [files[-1]]
-
-#         for file in files:
-#             model_result_filepaths.append(os.path.join(root, file))
-
-#     # print(f"Found json files: {model_result_filepaths}")
-
-#     method_counters = defaultdict(int)
-#     dataframes = []
-
-#     for json_file in model_result_filepaths:
-#         try:
-#             with open(filepath, 'r') as f:
-#                 json_data = json.load(f)
-#             method_name = json_data['method_name']
-#             method_counters[method_name] += 1
-
-#             # Process single JSON file
-#             df = process_single_json(json_data, method_counters[method_name])
-#             dataframes.append(df)
-#         except Exception as e:
-#             print(f"Error processing {json_file}: {e}")
-#             continue
-
-#     return dataframes
-
-
-
-
-from dataclasses import dataclass
-import json
-import numpy as np
-import pandas as pd
-from typing import Dict, List, Any
-import os
-from datetime import datetime
-import dateutil
-from collections import defaultdict
-
-@dataclass
-class EvalResult_MIB_CAUSALGRAPH:
-    """Represents one full evaluation for a method across all models in MIB for causal graph track."""
-    method_name: str    # name of the interpretation method
-    results: Dict       # nested dict of results for each model and task
-
-    def init_from_json_file(self, json_filepath: str):
-        """Inits results from the method result file"""
-        with open(json_filepath) as fp:
-            data = json.load(fp)
-
-        method_name = data.get("method_name")
-
-        # Initialize results dictionary
-        results = {}
-        for task in ["IOI", "MCQA", "arithmetic", "ARC-easy"]:
-            results[task] = {}
-
-        print(f"Processing file: {json_filepath}")
-        # Process each model's results
-        for result in data.get("results", []):
-            model_id = result.get("model_id", "")
-            model_name = model_id.replace(".", "_")
-
-            for task, scores in result.get("task_scores", {}).items():
-                intervention_scores = defaultdict(list)
-
-                for layer_data in scores:
-                    for intervention_data in layer_data['layer_scores']:
-                        # Calculate average score for counterfactuals
-                        avg_cf_score = np.mean([
-                            cf['score'] if 'score' in cf else 0
-                            for cf in intervention_data['counterfactual_scores']
-                        ])
-
-                        if np.isnan(avg_cf_score):
-                            avg_cf_score = 0.0
-
-                        intervention_key = '_'.join(intervention_data['intervention'])
-                        intervention_scores[intervention_key].append(avg_cf_score)
-                        print(f"intervention_key is {intervention_key}, avg_cf_score is {avg_cf_score}")
-
-                # Average across layers for each intervention
-                results[task][model_name] = {
-                    interv: np.mean(scores) if scores else 0.0
-                    for interv, scores in intervention_scores.items()
-                }
-
-        return EvalResult_MIB_CAUSALGRAPH(
-            method_name=method_name,
-            results=results
-        )
-
-    def to_dict(self, metric_type="average"):
-        """Converts the Eval Result to a dict for dataframe display"""
-        data_dict = {
-            "Method": self.method_name,
-            "Average": "-"  # Initialize first to make the order consistent
-        }
-
-        # Initialize columns for all task-model combinations
-        all_scores = []
-        for task, task_results in self.results.items():
-            for model, intervention_scores in task_results.items():
-                if not intervention_scores:
-                    continue
-
-                col_name = f"{task}_{model}"
-                scores = list(intervention_scores.values())
-                if not scores:
-                    data_dict[col_name] = '-'
-                    continue
-
-                avg_score = np.mean(scores)
-                data_dict[col_name] = f"{avg_score:.3f}"
-                all_scores.append(avg_score)
-
-        data_dict["Average"] = f"{np.mean(all_scores):.3f}"
-        return data_dict
-
-
-
-# def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
-#     """
-#     Aggregates rows with the same base method name by taking the max value for each column.
-#     Works with Method as a regular column instead of index.
-#     """
-#     df_copy = df.copy()
-#     print("\nBase methods extraction:")
-#     base_methods = [name.split('_')[0] if '_' in name and name.split('_')[-1].isdigit()
-#                     else name for name in df_copy['Method']]
-#     print(f"Original methods: {df_copy['Method'].tolist()}")
-#     print(f"Base methods: {base_methods}")
-
-#     df_copy['base_method'] = base_methods
-
-#     # Convert scores to numeric values
-#     score_columns = [col for col in df_copy.columns if col not in ['Method', 'base_method']]
-#     for col in score_columns:
-#         df_copy[col] = df_copy[col].apply(lambda x: float(x) if isinstance(x, str) else x)
-
-#     # Group by base method name and take the max
-#     aggregated_df = df_copy.groupby('base_method')[score_columns].max().round(3)
-
-#     # Reset index to make base_method a regular column and rename it to Method
-#     aggregated_df = aggregated_df.reset_index()
-#     aggregated_df = aggregated_df.rename(columns={'base_method': 'Method'})
-
-#     # Convert back to string format
-#     for col in score_columns:
-#         aggregated_df[col] = aggregated_df[col].apply(lambda x: f"{x:.3f}")
-
-#     return aggregated_df
 
 def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
     """
     Aggregates rows with the same base method name by taking the max value for each column.
@@ -446,21 +233,21 @@ def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
     # Convert scores to numeric values
     score_columns = [col for col in df_copy.columns if col not in ['Method', 'base_method']]
     for col in score_columns:
-        df_copy[col] = df_copy[col].apply(lambda x: float(x) if isinstance(x, str) and not pd.isna(x) else x)
+        df_copy[col] = pd.to_numeric(df_copy[col], errors='coerce')
 
     # Group by base method name and take the max, handling NaN values
-    aggregated_df = df_copy.groupby('base_method')[score_columns].agg(lambda x: np.nanmax(x)).round(2)
+    aggregated_df = df_copy.groupby('base_method')[score_columns].agg(lambda x: np.nanmax(x)).round(3)
 
-    # Convert back to string format and reset index
+    # Reset index to make base_method a regular column and rename it to Method
     aggregated_df = aggregated_df.reset_index()
     aggregated_df = aggregated_df.rename(columns={'base_method': 'Method'})
 
-    # Convert numeric values back to strings with 3 decimal places
-    for col in score_columns:
-        aggregated_df[col] = aggregated_df[col].apply(lambda x: f"{x:.3f}" if not pd.isna(x) else x)
-
     return aggregated_df
 
+
+
+
+
 def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
     """
     Creates a DataFrame where columns are model_task and cells are averaged over interventions.
@@ -469,99 +256,261 @@ def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
     # Create a copy of the DataFrame
    df_copy = df.copy()
 
-    # Remove the Average column if it exists
-    if 'Average' in df_copy.columns:
-        df_copy = df_copy.drop('Average', axis=1)
-
-    # Get score columns (excluding Method)
-    score_columns = [col for col in df_copy.columns if col != 'Method']
+    # Get all columns except Method and Average
+    columns_to_process = [col for col in df_copy.columns if col not in ['Method', 'Average']]
 
-    # Convert all scores to numeric values
-    for col in score_columns:
-        df_copy[col] = df_copy[col].apply(lambda x: float(x) if isinstance(x, str) else x)
-
-    # Group columns by model_task
-    model_task_groups = {}
-    for col in score_columns:
-        model_task = '_'.join(col.split('_')[:2])  # Get model_task part
-        if model_task not in model_task_groups:
-            model_task_groups[model_task] = []
-        model_task_groups[model_task].append(col)
+    # Extract model and task information from column names
+    model_task_groups = defaultdict(list)
+    for col in columns_to_process:
+        # Split by underscore and extract model, task
+        parts = col.split('_')
+        if len(parts) >= 2:
+            model_task = f"{parts[0]}_{parts[1]}"
+            model_task_groups[model_task].append(col)
 
-    # Create new DataFrame with Method column and averaged intervention scores
+    # Create new DataFrame with Method and averaged columns
     averaged_data = []
     for _, row in df_copy.iterrows():
-        averaged_row = {'Method': row['Method']}
+        new_row = {'Method': row['Method']}
+
+        # Calculate average for each model_task group
         for model_task, cols in model_task_groups.items():
-            averaged_row[model_task] = np.mean([row[col] for col in cols]).round(2)
-        averaged_row['Average'] = np.mean([averaged_row[mt] for mt in model_task_groups.keys()]).round(2)
-        averaged_data.append(averaged_row)
+            values = [row[col] for col in cols if pd.notna(row[col])]
+            if values:
+                new_row[model_task] = round(np.mean(values), 3)
+            else:
+                new_row[model_task] = np.nan
+
+        # Calculate overall average
+        model_task_values = [v for k, v in new_row.items() if k != 'Method' and pd.notna(v)]
+        if model_task_values:
+            new_row['Average'] = round(np.mean(model_task_values), 3)
+        else:
+            new_row['Average'] = np.nan
+
+        averaged_data.append(new_row)
 
+    # Create DataFrame and sort by Average
     averaged_df = pd.DataFrame(averaged_data)
-
-    # Sort by Average column
-    averaged_df = averaged_df.sort_values('Average', ascending=False)
+    if 'Average' in averaged_df.columns:
+        averaged_df = averaged_df.sort_values('Average', ascending=False)
 
     return averaged_df
 
 
-def get_raw_eval_results_mib_causalgraph(results_path: str) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
-    """From the path of the results folder root, extract all needed info for MIB causal graph results"""
-    model_result_filepaths = []
-
-    for root, dirnames, files in os.walk(results_path):
-        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
-            continue
-
-        try:
-            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-        except dateutil.parser._parser.ParserError:
-            files = [files[-1]]
-
-        for file in files:
-            model_result_filepaths.append(os.path.join(root, file))
-
-    method_counters = defaultdict(int)
-    data_dicts = []
-
-    for filepath in model_result_filepaths:
-        with open(filepath, 'r') as f:
-            json_data = json.load(f)
-        method_name = json_data['method_name']
-        method_counters[method_name] += 1
-
-        eval_result = EvalResult_MIB_CAUSALGRAPH("", {})
-        result = eval_result.init_from_json_file(filepath)
-        data_dict = result.to_dict()
-
-        # print(f"data_dict.keys(): {data_dict.keys()}")
-
-        # Add method counter to the method name if it's not the first instance
-        if method_counters[method_name] > 1:
-            data_dict["Method"] = f"{method_name}_{method_counters[method_name]}"
-
-        data_dicts.append(data_dict)
-
-    if not data_dicts:
-        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
-
-    # Create the detailed DataFrame
-    detailed_df = pd.DataFrame(data_dicts)
-    # detailed_df.set_index("Method", inplace=True)
-    # print(f"detailed_df coluns are {detailed_df.columns.tolist()}")
-    # if "eval_name" in detailed_df.columns:
-    #     detailed_df.drop("eval_name", axis=1, inplace=True)
-
-    print("Before aggregation:")
-    print(detailed_df)
-
-    # Create aggregated DataFrame
-    aggregated_df = aggregate_methods(detailed_df)
-
-    # Create intervention-averaged DataFrame
-    intervention_averaged_df = create_intervention_averaged_df(aggregated_df)
-
-    return detailed_df, aggregated_df, intervention_averaged_df
+@dataclass
+class EvalResult_MIB_CAUSALGRAPH:
+    """Represents one full evaluation for a method across all models for causal variable localization."""
+    eval_name: str            # method name as identifier
+    method_name: str          # name of the interpretation method
+    model_name: str           # name of the model
+    task_name: str            # name of the task
+    target_variables: str     # target variables (e.g., "answer", "answer_pointer")
+    average_accuracy: float   # average accuracy score
+    highest_accuracy: float   # highest accuracy score
+
+    @staticmethod
+    def init_from_consolidated_json(json_data: Dict):
+        """
+        Initialize results from the consolidated JSON format, treating each entry as a separate result
+
+        Args:
+            json_data: The parsed JSON data with tuple keys
+
+        Returns:
+            List of EvalResult_MIB_CAUSALGRAPH objects
+        """
+        results = []
+
+        for key, entry in json_data.items():
+            try:
+                # Parse tuple key: "('method', 'model', 'task', 'variable')"
+                try:
+                    key_tuple = ast.literal_eval(key)
+                    method_name, model_name, task_name, target_variable = key_tuple
+                except:
+                    # Alternative parsing with regex
+                    pattern = r"\('([^']+)', '([^']+)', '([^']+)', '([^']+)'\)"
+                    match = re.match(pattern, key)
+                    if match:
+                        method_name, model_name, task_name, target_variable = match.groups()
+                    else:
+                        print(f"Couldn't parse key: {key}")
+                        continue
+
+                # Get average and highest accuracy
+                average_accuracy = entry.get("average_accuracy", 0.0)
+                highest_accuracy = entry.get("highest_accuracy", 0.0)
+
+                # Create a result object for this entry
+                result = EvalResult_MIB_CAUSALGRAPH(
+                    eval_name=f"{method_name}_{model_name}_{task_name}_{target_variable}",
+                    method_name=method_name,
+                    model_name=model_name,
+                    task_name=task_name,
+                    target_variables=target_variable,
+                    average_accuracy=average_accuracy,
+                    highest_accuracy=highest_accuracy
+                )
+
+                results.append(result)
+
+            except Exception as e:
+                print(f"Error processing entry {key}: {e}")
+                continue
+
+        return results
+
+    def to_dict(self, metric_type="Highest"):
+        """
+        Converts the Eval Result to a dict for dataframe display
+
+        Args:
+            metric_type: Either "Mean" to use average_accuracy or "Highest" to use highest_accuracy
+        """
+        # Create column name in the exact format requested
+        # col_name = f"{self.model_name}_{self.task_name}_{self.target_variables}"
+        col_name = f"{self.model_name}_{self.task_name}"
+        print(f"col_name is {col_name}")
+
+        # Select the appropriate accuracy metric based on metric_type
+        score = self.average_accuracy if metric_type == "Mean" else self.highest_accuracy
+
+        # Create data dictionary with method name and the score
+        data_dict = {
+            "eval_name": self.eval_name,
+            "Method": self.method_name,
+            col_name: score
+        }
+
+        return data_dict
+
+
+def get_raw_eval_results_mib_causalgraph(results_path: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+    """
+    Processes the consolidated JSON format for causal variable localization results
+    Treats each entry as a separate result and then combines them by method
+
+    Args:
+        results_path: Path to the directory containing results
+
+    Returns:
+        Tuple of four DataFrames:
+        - detailed_df_highest: Detailed view with highest accuracy scores
+        - detailed_df_mean: Detailed view with mean accuracy scores
+        - intervention_averaged_highest_df: Averaged by intervention using highest accuracy
+        - intervention_averaged_mean_df: Averaged by intervention using mean accuracy
+    """
+    # Find the consolidated JSON file
+    json_files = []
+    for root, _, files in os.walk(results_path):
+        for file in files:
+            if file.endswith('.json'):
+                json_files.append(os.path.join(root, file))
+
+    if not json_files:
+        print(f"No JSON files found in {results_path}")
+        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
+
+    # Load and process the consolidated JSON format
+    raw_data = None
+    for json_file in json_files:
+        try:
+            with open(json_file, 'r') as f:
+                data = json.load(f)
+
+            # Check if this is the consolidated format by examining a sample key
+            sample_key = next(iter(data), None)
+            if sample_key and isinstance(sample_key, str) and '(' in sample_key and ')' in sample_key:
+                raw_data = data
+                print(f"Found consolidated data file: {json_file}")
+                break
+        except Exception as e:
+            print(f"Error reading {json_file}: {e}")
+
+    if raw_data is None:
+        print("No valid consolidated JSON file found")
+        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
+
+    # Get all results
+    eval_results = EvalResult_MIB_CAUSALGRAPH.init_from_consolidated_json(raw_data)
+
+    if not eval_results:
+        print("No results could be extracted from the JSON data")
+        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
+
+    # Create two sets of dictionaries - one for highest accuracy and one for mean accuracy
+    highest_results = [result.to_dict(metric_type="Highest") for result in eval_results]
+    mean_results = [result.to_dict(metric_type="Mean") for result in eval_results]
+
+    # Process highest accuracy results
+    # Group results by method
+    highest_method_groups = {}
+    for result_dict in highest_results:
+        method = result_dict["Method"]
+        if method not in highest_method_groups:
+            highest_method_groups[method] = {
+                "eval_name": method,
+                "Method": method
+            }
+
+        # Copy all score columns to the method's group
+        for key, value in result_dict.items():
+            if key not in ["eval_name", "Method"]:
+                highest_method_groups[method][key] = value
+
+    # Create the detailed DataFrame for highest accuracy
+    highest_records = list(highest_method_groups.values())
+    detailed_df_highest = pd.DataFrame(highest_records)
+
+    # Process mean accuracy results
+    # Group results by method
+    mean_method_groups = {}
+    for result_dict in mean_results:
+        method = result_dict["Method"]
+        if method not in mean_method_groups:
+            mean_method_groups[method] = {
+                "eval_name": method,
+                "Method": method
+            }
+
+        # Copy all score columns to the method's group
+        for key, value in result_dict.items():
+            if key not in ["eval_name", "Method"]:
+                mean_method_groups[method][key] = value
+
+    # Create the detailed DataFrame for mean accuracy
+    mean_records = list(mean_method_groups.values())
+    detailed_df_mean = pd.DataFrame(mean_records)
+
+    if detailed_df_highest.empty or detailed_df_mean.empty:
+        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
+
+    # Calculate and add Average column for both DataFrames
+    score_columns_highest = [col for col in detailed_df_highest.columns if col not in ["eval_name", "Method"]]
+    score_columns_mean = [col for col in detailed_df_mean.columns if col not in ["eval_name", "Method"]]
+
+    if score_columns_highest:
+        detailed_df_highest["Average"] = detailed_df_highest[score_columns_highest].mean(axis=1).round(3)
+
+    if score_columns_mean:
+        detailed_df_mean["Average"] = detailed_df_mean[score_columns_mean].mean(axis=1).round(3)
+
+    # Sort by Average descending
+    if "Average" in detailed_df_highest.columns:
+        detailed_df_highest = detailed_df_highest.sort_values("Average", ascending=False)
+
+    if "Average" in detailed_df_mean.columns:
+        detailed_df_mean = detailed_df_mean.sort_values("Average", ascending=False)
+
+    # # Create intervention-averaged DataFrames for both metrics
+    # intervention_averaged_highest_df = create_intervention_averaged_df(detailed_df_highest)
+    # intervention_averaged_mean_df = create_intervention_averaged_df(detailed_df_mean)
+
+    # return detailed_df_highest, detailed_df_mean, intervention_averaged_highest_df
+    return detailed_df_highest, detailed_df_mean, detailed_df_mean
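
For orientation, here is a hedged, self-contained sketch of the consolidated JSON shape that the new EvalResult_MIB_CAUSALGRAPH.init_from_consolidated_json appears to assume: a dict keyed by stringified ('method', 'model', 'task', 'variable') tuples. The sample method, model, and scores below are invented for illustration:

```python
import ast
import re

# Invented sample entry; real files come from the MIB causal-variable pipeline.
sample = {
    "('DAS', 'GPT2LMHeadModel', 'ioi_task', 'output_position')": {
        "average_accuracy": 0.62,
        "highest_accuracy": 0.81,
    }
}

for key, entry in sample.items():
    try:
        method, model, task, variable = ast.literal_eval(key)
    except (ValueError, SyntaxError):
        # Regex fallback mirroring the one in init_from_consolidated_json
        match = re.match(r"\('([^']+)', '([^']+)', '([^']+)', '([^']+)'\)", key)
        method, model, task, variable = match.groups()
    # to_dict() keys the score by "{model}_{task}", which is exactly the format
    # app.py's display_mapping now expects (e.g. "GPT2LMHeadModel_ioi_task").
    print(f"{model}_{task}", entry["highest_accuracy"])
```

Two smaller behavioral points worth noting from the diff: with the intervention-averaged frames still commented out, the loader currently returns the mean-accuracy DataFrame twice (as the third element of the tuple), and aggregate_methods now keeps scores numeric by coercing with pd.to_numeric(..., errors='coerce') and reducing with np.nanmax instead of round-tripping through formatted strings.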