jasonshaoshun committed
Commit 4780a48 · Parent: 1d8e193

caulsal-track debug
Files changed (6)
  1. app.py +1 -277
  2. caulsal_metric.py +5 -0
  3. src/about.py +38 -70
  4. src/display/utils.py +23 -200
  5. src/leaderboard/read_evals.py +141 -250
  6. src/populate.py +21 -140
app.py CHANGED
@@ -23,10 +23,8 @@ from src.display.utils import (
    BENCHMARK_COLS,
    BENCHMARK_COLS_MULTIMODAL,
    BENCHMARK_COLS_MIB_SUBGRAPH,
-   BENCHMARK_COLS_MIB_CAUSALGRAPH,
    COLS,
    COLS_MIB_SUBGRAPH,
-   COLS_MIB_CAUSALGRAPH,
    COLS_MULTIMODAL,
    EVAL_COLS,
    EVAL_TYPES,
@@ -281,9 +279,7 @@ LEADERBOARD_DF_MIB_SUBGRAPH_FEQ = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_M
# In app.py, modify the LEADERBOARD initialization
LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AGGREGATED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AVERAGED = get_leaderboard_df_mib_causalgraph(
    EVAL_RESULTS_MIB_CAUSALGRAPH_PATH,
-   EVAL_REQUESTS_PATH,
-   COLS_MIB_CAUSALGRAPH,
-   BENCHMARK_COLS_MIB_CAUSALGRAPH
+   EVAL_REQUESTS_PATH
)


@@ -300,95 +296,6 @@ LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AGGREGAT



- # def init_leaderboard_mib_subgraph(dataframe, track):
- #     # print(f"init_leaderboard_mib: dataframe head before loc is {dataframe.head()}\n")
-
- #     if dataframe is None or dataframe.empty:
- #         raise ValueError("Leaderboard DataFrame is empty or None.")
-
- #     # filter for correct track
- #     # dataframe = dataframe.loc[dataframe["Track"] == track]
-
- #     # print(f"init_leaderboard_mib: dataframe head after loc is {dataframe.head()}\n")
-
- #     return Leaderboard(
- #         value=dataframe,
- #         datatype=[c.type for c in fields(AutoEvalColumn_mib_subgraph)],
- #         select_columns=SelectColumns(
- #             default_selection=[c.name for c in fields(AutoEvalColumn_mib_subgraph) if c.displayed_by_default],
- #             cant_deselect=[c.name for c in fields(AutoEvalColumn_mib_subgraph) if c.never_hidden],
- #             label="Select Columns to Display:",
- #         ),
- #         search_columns=["Method"],  # Changed from AutoEvalColumn_mib_subgraph.model.name to "Method"
- #         hide_columns=[c.name for c in fields(AutoEvalColumn_mib_subgraph) if c.hidden],
- #         bool_checkboxgroup_label="Hide models",
- #         interactive=False,
- #     )
-
-
-
-
-
- # def init_leaderboard_mib_subgraph(dataframe, track):
- #     """Initialize the subgraph leaderboard with grouped column selection by benchmark."""
- #     if dataframe is None or dataframe.empty:
- #         raise ValueError("Leaderboard DataFrame is empty or None.")
-
- #     print("\nDebugging DataFrame columns:", dataframe.columns.tolist())
-
- #     # Create groups of columns by benchmark
- #     benchmark_groups = []
-
- #     # For each benchmark in our TasksMib_Subgraph enum...
- #     for task in TasksMib_Subgraph:
- #         benchmark = task.value.benchmark
- #         # Get all valid columns for this benchmark's models
- #         benchmark_cols = [
- #             f"{benchmark}_{model}"
- #             for model in task.value.models
- #             if f"{benchmark}_{model}" in dataframe.columns
- #         ]
- #         if benchmark_cols:  # Only add if we have valid columns
- #             benchmark_groups.append(benchmark_cols)
- #             print(f"\nBenchmark group for {benchmark}:", benchmark_cols)
-
- #     # Create model groups as well
- #     model_groups = []
- #     all_models = list(set(model for task in TasksMib_Subgraph for model in task.value.models))
-
- #     # For each unique model...
- #     for model in all_models:
- #         # Get all valid columns for this model across benchmarks
- #         model_cols = [
- #             f"{task.value.benchmark}_{model}"
- #             for task in TasksMib_Subgraph
- #             if model in task.value.models
- #             and f"{task.value.benchmark}_{model}" in dataframe.columns
- #         ]
- #         if model_cols:  # Only add if we have valid columns
- #             model_groups.append(model_cols)
- #             print(f"\nModel group for {model}:", model_cols)
-
- #     # Combine all groups
- #     all_groups = benchmark_groups + model_groups
-
- #     # Flatten groups for default selection (show everything initially)
- #     all_columns = [col for group in all_groups for col in group]
- #     print("\nAll available columns:", all_columns)
-
- #     return Leaderboard(
- #         value=dataframe,
- #         datatype=[c.type for c in fields(AutoEvalColumn_mib_subgraph)],
- #         select_columns=SelectColumns(
- #             default_selection=all_columns,  # Show all columns initially
- #             label="Select Results:"
- #         ),
- #         search_columns=["Method"],
- #         hide_columns=[],
- #         interactive=False,
- #     )
-
-
def init_leaderboard_mib_subgraph(dataframe, track):
    """Initialize the subgraph leaderboard with display names for better readability."""
    if dataframe is None or dataframe.empty:
@@ -478,189 +385,6 @@ def init_leaderboard_mib_subgraph(dataframe, track):



- # # Complete column groups for both benchmarks and models
- # # Define keywords for filtering
- # benchmark_keywords = ["ioi", "mcqa", "arithmetic_addition", "arithmetic_subtraction", "arc_easy", "arc_challenge"]
- # model_keywords = ["qwen2_5", "gpt2", "gemma2", "llama3"]
-
- # # Optional: Define display names
- # mappings = {
- #     "ioi_llama3": "IOI (LLaMA-3)",
- #     "ioi_qwen2_5": "IOI (Qwen-2.5)",
- #     "ioi_gpt2": "IOI (GPT-2)",
- #     "ioi_gemma2": "IOI (Gemma-2)",
- #     "mcqa_llama3": "MCQA (LLaMA-3)",
- #     "mcqa_qwen2_5": "MCQA (Qwen-2.5)",
- #     "mcqa_gemma2": "MCQA (Gemma-2)",
- #     "arithmetic_addition_llama3": "Arithmetic Addition (LLaMA-3)",
- #     "arithmetic_subtraction_llama3": "Arithmetic Subtraction (LLaMA-3)",
- #     "arc_easy_llama3": "ARC Easy (LLaMA-3)",
- #     "arc_easy_gemma2": "ARC Easy (Gemma-2)",
- #     "arc_challenge_llama3": "ARC Challenge (LLaMA-3)",
- #     "eval_name": "Evaluation Name",
- #     "Method": "Method",
- #     "Average": "Average Score"
- # }
- # # mappings = {}
-
- # # Create SmartSelectColumns instance
- # smart_columns = SmartSelectColumns(
- #     benchmark_keywords=benchmark_keywords,
- #     model_keywords=model_keywords,
- #     column_mapping=mappings,
- #     initial_selected=["Method", "Average"]
- # )
-
- # print("\nDebugging DataFrame columns:", renamed_df.columns.tolist())
-
- # # Create Leaderboard
- # leaderboard = Leaderboard(
- #     value=renamed_df,
- #     datatype=[c.type for c in fields(AutoEvalColumn_mib_subgraph)],
- #     select_columns=smart_columns,
- #     search_columns=["Method"],
- #     hide_columns=[],
- #     interactive=False
- # )
- # print(f"Successfully created leaderboard.")
- # return leaderboard
-
- # print("\nDebugging DataFrame columns:", dataframe.columns.tolist())
-
- # # Define simple keywords for filtering
- # benchmark_keywords = ["ioi", "mcqa", "arithmetic", "arc"]
- # model_keywords = ["qwen2_5", "gpt2", "gemma2", "llama3"]
-
- # # Create SmartSelectColumns instance with exact same parameters as working version
- # smart_columns = SmartSelectColumns(
- #     benchmark_keywords=benchmark_keywords,
- #     model_keywords=model_keywords,
- #     initial_selected=["Method", "Average"],
- #     allow=True,
- #     label=None,
- #     show_label=True,
- #     info=None
- # )
-
- # try:
- #     print("\nCreating leaderboard...")
- #     # Get groups before creating leaderboard
- #     smart_columns.get_filtered_groups(dataframe.columns)
-
- #     leaderboard = Leaderboard(
- #         value=dataframe,
- #         datatype=[c.type for c in fields(AutoEvalColumn_mib_subgraph)],
- #         select_columns=smart_columns,
- #         search_columns=["Method"],
- #         hide_columns=[],
- #         interactive=False
- #     )
- #     print("Leaderboard created successfully")
- #     return leaderboard
-
- # except Exception as e:
- #     print("Error creating leaderboard:", str(e))
- #     raise
-
-
-
-
-
-
- # def init_leaderboard_mib_subgraph(dataframe, track):
- #     """Initialize the subgraph leaderboard with group-based column selection."""
- #     if dataframe is None or dataframe.empty:
- #         raise ValueError("Leaderboard DataFrame is empty or None.")
-
- #     print("\nDebugging DataFrame columns:", dataframe.columns.tolist())
-
- #     # Create selection mapping for benchmark groups
- #     selection_mapping = {}
-
- #     # Create benchmark groups with descriptive names
- #     for task in TasksMib_Subgraph:
- #         benchmark = task.value.benchmark
- #         # Get all columns for this benchmark's models
- #         benchmark_cols = [
- #             f"{benchmark}_{model}"
- #             for model in task.value.models
- #             if f"{benchmark}_{model}" in dataframe.columns
- #         ]
- #         if benchmark_cols:
- #             # Use a descriptive group name as the key
- #             group_name = f"Benchmark: {benchmark.upper()}"
- #             selection_mapping[group_name] = benchmark_cols
- #             print(f"\n{group_name} maps to:", benchmark_cols)
-
- #     # Create model groups with descriptive names
- #     all_models = list(set(model for task in TasksMib_Subgraph for model in task.value.models))
- #     for model in all_models:
- #         # Get all columns for this model across benchmarks
- #         model_cols = [
- #             f"{task.value.benchmark}_{model}"
- #             for task in TasksMib_Subgraph
- #             if model in task.value.models
- #             and f"{task.value.benchmark}_{model}" in dataframe.columns
- #         ]
- #         if model_cols:
- #             # Use a descriptive group name as the key
- #             group_name = f"Model: {model}"
- #             selection_mapping[group_name] = model_cols
- #             print(f"\n{group_name} maps to:", model_cols)
-
- #     # The selection options are the group names
- #     selection_options = list(selection_mapping.keys())
- #     print("\nSelection options:", selection_options)
-
- #     return Leaderboard(
- #         value=dataframe,
- #         datatype=[c.type for c in fields(AutoEvalColumn_mib_subgraph)],
- #         select_columns=SelectColumns(
- #             default_selection=selection_options,  # Show all groups by default
- #             label="Select Benchmark or Model Groups:"
- #         ),
- #         search_columns=["Method"],
- #         hide_columns=[],
- #         interactive=False,
- #     )
-
-
-
-
-
-
-
-
- # def init_leaderboard_mib_causalgraph(dataframe, track):
- #     # print("Debugging column issues:")
- #     # print("\nActual DataFrame columns:")
- #     # print(dataframe.columns.tolist())
-
- #     # print("\nExpected columns for Leaderboard:")
- #     expected_cols = [c.name for c in fields(AutoEvalColumn_mib_causalgraph)]
- #     # print(expected_cols)
-
- #     # print("\nMissing columns:")
- #     missing_cols = [col for col in expected_cols if col not in dataframe.columns]
- #     # print(missing_cols)
-
- #     # print("\nSample of DataFrame content:")
- #     # print(dataframe.head().to_string())
-
- #     return Leaderboard(
- #         value=dataframe,
- #         datatype=[c.type for c in fields(AutoEvalColumn_mib_causalgraph)],
- #         select_columns=SelectColumns(
- #             default_selection=[c.name for c in fields(AutoEvalColumn_mib_causalgraph) if c.displayed_by_default],
- #             cant_deselect=[c.name for c in fields(AutoEvalColumn_mib_causalgraph) if c.never_hidden],
- #             label="Select Columns to Display:",
- #         ),
- #         search_columns=["Method"],
- #         hide_columns=[c.name for c in fields(AutoEvalColumn_mib_causalgraph) if c.hidden],
- #         bool_checkboxgroup_label="Hide models",
- #         interactive=False,
- #     )
-
def init_leaderboard_mib_causalgraph(dataframe, track):
    # print("Debugging column issues:")
    # print("\nActual DataFrame columns:")
caulsal_metric.py CHANGED
@@ -135,6 +135,11 @@ def create_summary_dataframe(json_files: List[Dict[str, Any]]) -> pd.DataFrame:

    return df

+
+ # averaged_cf = average_counterfactuals(json_files)
+ # layer_averaged = find_layer_averages(averaged_cf)
+ # detailed_df = create_summary_dataframe(layer_averaged)
+
def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
    """
    Aggregates rows with the same base method name by taking the max value for each column.
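The lines added above only sketch the intended flow in comments. For reference, a minimal driver under the assumption that average_counterfactuals and find_layer_averages are the helpers this file defines for those steps and that json_files is the list of loaded result dicts (hypothetical wiring, not part of the commit):

    # Hypothetical driver; helper names are taken from the commented-out lines above.
    averaged_cf = average_counterfactuals(json_files)        # per-layer averages over counterfactual scores
    layer_averaged = find_layer_averages(averaged_cf)        # collapse layers to one score per intervention
    detailed_df = create_summary_dataframe(layer_averaged)   # one row per method, one column per model/task/intervention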
src/about.py CHANGED
@@ -61,33 +61,27 @@ class TasksMib_Subgraph(Enum):
        return sorted(list(models))


+
+
# @dataclass
# class TaskMIB_Causalgraph:
- #     benchmark: str              # MCQA
- #     models: list[str]           # List of all models
- #     layers: list[str]           # 0-31
- #     col_name: str               # display name in leaderboard
- #     interventions: list[str]    # output_token, output_location
- #     counterfactuals: list[str]  # symbol_counterfactual, etc.
- #     metrics: list[str]          # score
+ #     benchmark: str
+ #     models: list[str]
+ #     layers: dict[str, list[str]]  # Different layers for each model
+ #     col_name: str
+ #     interventions: list[str]
+ #     counterfactuals: list[str]
+ #     metrics: list[str]

- # class TasksMib_Causalgraph(Enum):
- #     task0 = TaskMIB_Causalgraph(
- #         "MCQA",
- #         ["LlamaForCausalLM", "Qwen2ForCausalLM", "Gemma2ForCausalLM"],  # Updated model list
- #         [str(i) for i in range(32)],  # 0-31 layers
- #         "mcqa",
- #         ["output_token", "output_location"],
- #         ["symbol_counterfactual", "randomLetter_counterfactual",
- #          "answerPosition_counterfactual", "answerPosition_symbol_counterfactual"],
- #         ["score"]
- #     )

# class TasksMib_Causalgraph(Enum):
- #     task0 = TaskMIB_Causalgraph(
- #         "MCQA",
- #         ["Qwen2ForCausalLM", "Gemma2ForCausalLM", "LlamaForCausalLM"],  # Match exact model names with correct casing
- #         [str(i) for i in range(32)],
+ #     task0 = TaskMIB_Causalgraph("MCQA",
+ #         ["qwen2forcausallm", "gemma2forcausallm", "llamaforcausallm"],
+ #         {
+ #             "qwen2forcausallm": [str(i) for i in range(24)],   # 0-23
+ #             "gemma2forcausallm": [str(i) for i in range(26)],  # 0-25
+ #             "llamaforcausallm": [str(i) for i in range(32)]    # 0-31
+ #         },
#         "mcqa",
#         ["output_token", "output_location"],
#         ["randomLetter_counterfactual", "answerPosition_counterfactual",
@@ -95,58 +89,32 @@ class TasksMib_Subgraph(Enum):
#         ["score"]
#     )

- # class TasksMib_Causalgraph(Enum):
- #     task0 = TaskMIB_Causalgraph(
- #         "MCQA",
- #         ["qwen2forcausallm", "gemma2forcausallm", "llamaforcausallm"],  # Use lowercase consistently
- #         [str(i) for i in range(32)],
- #         "mcqa",
- #         ["output_token", "output_location"],
- #         ["randomLetter_counterfactual", "answerPosition_counterfactual",
- #          "answerPosition_randomLetter_counterfactual"],
- #         ["score"]
- #     )

- @dataclass
+ @dataclass
class TaskMIB_Causalgraph:
-     benchmark: str
-     models: list[str]
-     layers: dict[str, list[str]]  # Different layers for each model
-     col_name: str
-     interventions: list[str]
-     counterfactuals: list[str]
-     metrics: list[str]
+     benchmark: str      # task name in json (ioi/arithmetic)
+     models: list[str]   # list of models to show as sub-columns
+     col_name: str       # display name in leaderboard
+     metrics: list[str]  # metrics to store (average_score)

- # class TasksMib_Causalgraph(Enum):
- #     task0 = TaskMIB_Causalgraph(
- #         "MCQA",
- #         ["Qwen2ForCausalLM", "Gemma2ForCausalLM", "LlamaForCausalLM"],
- #         {
- #             "Qwen2ForCausalLM": [str(i) for i in range(24)],   # 0-23
- #             "Gemma2ForCausalLM": [str(i) for i in range(26)],  # 0-25
- #             "LlamaForCausalLM": [str(i) for i in range(32)]    # 0-31
- #         },
- #         "mcqa",
- #         ["output_token", "output_location"],
- #         ["randomLetter_counterfactual", "answerPosition_counterfactual",
- #          "answerPosition_randomLetter_counterfactual"],
- #         ["score"]
- #     )
class TasksMib_Causalgraph(Enum):
-     task0 = TaskMIB_Causalgraph(
-         "MCQA",
-         ["qwen2forcausallm", "gemma2forcausallm", "llamaforcausallm"],  # Use lowercase names to match actual columns
-         {
-             "qwen2forcausallm": [str(i) for i in range(24)],   # 0-23
-             "gemma2forcausallm": [str(i) for i in range(26)],  # 0-25
-             "llamaforcausallm": [str(i) for i in range(32)]    # 0-31
-         },
-         "mcqa",
-         ["output_token", "output_location"],
-         ["randomLetter_counterfactual", "answerPosition_counterfactual",
-          "answerPosition_randomLetter_counterfactual"],
-         ["score"]
-     )
+     task0 = TaskMIB_Subgraph("mcqa", ["Qwen2ForCausalLM", "Gemma2ForCausalLM", "LlamaForCausalLM"], "MCQA", ["average_score"])
+
+     @classmethod
+     def get_all_tasks(cls):
+         """Returns a list of all task benchmarks"""
+         return [task.value.benchmark for task in cls]
+
+     @classmethod
+     def get_all_models(cls):
+         """Returns a list of all unique models across all tasks"""
+         models = set()
+         for task in cls:
+             models.update(task.value.models)
+         return sorted(list(models))
+
+
+


NUM_FEWSHOT = 0  # Change with your few shot
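For reference, a short usage sketch of the enum added above (not part of the commit), assuming the classmethods behave as written:

    from src.about import TasksMib_Causalgraph

    print(TasksMib_Causalgraph.get_all_tasks())   # ['mcqa']
    print(TasksMib_Causalgraph.get_all_models())  # ['Gemma2ForCausalLM', 'LlamaForCausalLM', 'Qwen2ForCausalLM']

    # Column names that the display layer derives from the enum, e.g. "mcqa_Qwen2ForCausalLM"
    for task in TasksMib_Causalgraph:
        for model in task.value.models:
            print(f"{task.value.benchmark}_{model}")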
src/display/utils.py CHANGED
@@ -58,64 +58,6 @@ AutoEvalColumnMultimodal = make_dataclass("AutoEvalColumnMultimodal", auto_eval_



- ##############################################################################################################
- # Version 1
-
- # auto_eval_column_dict_mib_subgraph = []
-
- # # Method name column
- # auto_eval_column_dict_mib_subgraph.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)])
-
- # # For each task and model combination
- # for task in TasksMib_Subgraph:
- #     for model in task.value.models:
- #         col_name = f"{task.value.benchmark}_{model}"  # ioi_gpt2, mcqa_qwen2.5, etc.
- #         auto_eval_column_dict_mib_subgraph.append([
- #             col_name,
- #             ColumnContent,
- #             ColumnContent(col_name, "number", True)
- #         ])
-
- # # Average column
- # auto_eval_column_dict_mib_subgraph.append(["average", ColumnContent, ColumnContent("Average", "number", True)])
-
-
- # ##############################################################################################################
- # # Version 2
- # auto_eval_column_dict_mib_subgraph = []
-
- # # Method name column
- # auto_eval_column_dict_mib_subgraph.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)])
-
- # # Add task filter column
- # task_values = list(set(task.value.benchmark for task in TasksMib_Subgraph))
- # auto_eval_column_dict_mib_subgraph.append(
- #     ["task_filter", ColumnContent, ColumnContent("Task", "str", True, never_hidden=True)]
- # )
-
- # # Add model filter column
- # model_values = list(set(
- #     model
- #     for task in TasksMib_Subgraph
- #     for model in task.value.models
- # ))
- # auto_eval_column_dict_mib_subgraph.append(
- #     ["model_filter", ColumnContent, ColumnContent("Model", "str", True, never_hidden=True)]
- # )
-
- # # For each task and model combination
- # for task in TasksMib_Subgraph:
- #     for model in task.value.models:
- #         col_name = f"{task.value.benchmark}_{model}"
- #         auto_eval_column_dict_mib_subgraph.append([
- #             col_name,
- #             ColumnContent,
- #             ColumnContent(col_name, "number", True)
- #         ])
-
- # # Average column
- # auto_eval_column_dict_mib_subgraph.append(["average", ColumnContent, ColumnContent("Average", "number", True)])
-

##############################################################################################################
# Version 3
@@ -185,133 +127,6 @@ BENCHMARK_COLS_MIB_CAUSALGRAPH = []



-
- # # Initialize the MIB causal graph columns
- # auto_eval_column_dict_mib_causalgraph = []
-
- # # Method name column
- # auto_eval_column_dict_mib_causalgraph.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)])
-
- # # For each model-task-intervention combination
- # for task in TasksMib_Causalgraph:
- #     for model in task.value.models:
- #         for intervention in task.value.interventions:
- #             col_name = f"{model}_{task.value.benchmark}_{intervention}".lower()
- #             auto_eval_column_dict_mib_causalgraph.append([
- #                 col_name,
- #                 ColumnContent,
- #                 ColumnContent(col_name, "number", True)
- #             ])
-
- # auto_eval_column_dict_mib_causalgraph = []
-
- # # Method name column
- # auto_eval_column_dict_mib_causalgraph.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)])
-
- # # For each model-task-intervention combination
- # for task in TasksMib_Causalgraph:
- #     for model in task.value.models:
- #         model_name = model.lower()  # Convert model name to lowercase
- #         for layer in task.value.layers:
- #             for intervention in task.value.interventions:
- #                 for counterfactual in task.value.counterfactuals:
- #                     # Include model name in the column name
- #                     col_name = f"{model_name}_layer{layer}_{intervention}_{counterfactual}"
- #                     field_name = col_name.lower()
- #                     auto_eval_column_dict_mib_causalgraph.append([
- #                         field_name,
- #                         ColumnContent,
- #                         ColumnContent(col_name, "number", True)
- #                     ])
-
- # # In utils.py, modify auto_eval_column_dict_mib_causalgraph:
- # auto_eval_column_dict_mib_causalgraph = []
-
- # # Method name column
- # auto_eval_column_dict_mib_causalgraph.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)])
-
- # # For each model-task-intervention-counterfactual combination
- # for task in TasksMib_Causalgraph:
- #     for model in ["qwen2forcausallm", "gemma2forcausallm", "llamaforcausallm"]:  # exact model names
- #         for layer in task.value.layers:
- #             for intervention in task.value.interventions:
- #                 for counterfactual in task.value.counterfactuals:
- #                     # Match the exact format from the data
- #                     col_name = f"{model}_layer{layer}_{intervention}_{counterfactual}".lower()
- #                     auto_eval_column_dict_mib_causalgraph.append([
- #                         col_name,
- #                         ColumnContent,
- #                         ColumnContent(col_name, "number", True)
- #                     ])
-
-
-
-
- # auto_eval_column_dict_mib_causalgraph = []
-
- # # Method name column
- # auto_eval_column_dict_mib_causalgraph.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)])
-
- # # Add eval_name column
- # auto_eval_column_dict_mib_causalgraph.append(["eval_name", ColumnContent, ColumnContent("eval_name", "str", True)])
-
- # # For each model-task-intervention-counterfactual combination
- # for task in TasksMib_Causalgraph:
- #     for model in task.value.models:  # Use exact model names with correct casing
- #         model_name = model  # Don't convert to lowercase
- #         for layer in task.value.layers:
- #             for intervention in task.value.interventions:
- #                 for counterfactual in task.value.counterfactuals:
- #                     # Match exact format from the actual data
- #                     col_name = f"{model_name}_layer{layer}_{intervention}_{counterfactual}"
- #                     # Use the exact column name as both the field name and display name
- #                     auto_eval_column_dict_mib_causalgraph.append([
- #                         col_name,
- #                         ColumnContent,
- #                         ColumnContent(col_name, "number", True)
- #                     ])
-
-
- # auto_eval_column_dict_mib_causalgraph = []
-
- # # Method name column
- # auto_eval_column_dict_mib_causalgraph.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)])
- # auto_eval_column_dict_mib_causalgraph.append(["eval_name", ColumnContent, ColumnContent("eval_name", "str", True)])
-
- # # For each model-task-intervention-counterfactual combination
- # for task in TasksMib_Causalgraph:
- #     for model in task.value.models:
- #         for layer in task.value.layers[model]:  # Use model-specific layers
- #             for intervention in task.value.interventions:
- #                 for counterfactual in task.value.counterfactuals:
- #                     col_name = f"{model}_layer{layer}_{intervention}_{counterfactual}"
- #                     auto_eval_column_dict_mib_causalgraph.append([
- #                         col_name,
- #                         ColumnContent,
- #                         ColumnContent(col_name, "number", True)
- #                     ])
-
- # auto_eval_column_dict_mib_causalgraph = []
-
- # # Method name column
- # auto_eval_column_dict_mib_causalgraph.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)])
- # auto_eval_column_dict_mib_causalgraph.append(["eval_name", ColumnContent, ColumnContent("eval_name", "str", True)])
-
- # # For each model-task-intervention-counterfactual combination
- # for task in TasksMib_Causalgraph:
- #     for model in task.value.models:  # model will already be lowercase
- #         for layer in task.value.layers[model]:
- #             for intervention in task.value.interventions:
- #                 for counterfactual in task.value.counterfactuals:
- #                     # Use exactly the same format as in DataFrame
- #                     col_name = f"{model}_layer{layer}_{intervention}_{counterfactual}"
- #                     auto_eval_column_dict_mib_causalgraph.append([
- #                         col_name,
- #                         ColumnContent,
- #                         ColumnContent(col_name, "number", True)
- #                     ])
-
auto_eval_column_dict_mib_causalgraph = []

# Only include Method column as required
@@ -320,15 +135,17 @@ auto_eval_column_dict_mib_causalgraph.append(["method", ColumnContent, ColumnCon
# For each model-task-intervention-counterfactual combination
for task in TasksMib_Causalgraph:
    for model in task.value.models:  # model will be lowercase
-         for layer in task.value.layers[model]:
-             for intervention in task.value.interventions:
-                 for counterfactual in task.value.counterfactuals:
-                     col_name = f"{model}_layer{layer}_{intervention}_{counterfactual}"
-                     auto_eval_column_dict_mib_causalgraph.append([
-                         col_name,
-                         ColumnContent,
-                         ColumnContent(col_name, "number", True)
-                     ])
+         col_name = f"{task.value.benchmark}_{model}"
+         auto_eval_column_dict_mib_causalgraph.append([
+             col_name,
+             ColumnContent,
+             ColumnContent(col_name, "number", True)
+         ])
+
+ # Add the Average column
+ auto_eval_column_dict_mib_causalgraph.append(
+     ["average_score", ColumnContent, ColumnContent("Average", "number", True)]
+ )

# Create the dataclass
AutoEvalColumn_mib_causalgraph = make_dataclass(
@@ -337,14 +154,20 @@ AutoEvalColumn_mib_causalgraph = make_dataclass(
    frozen=True
)

- # Column selection for display
- COLS_MIB_CAUSALGRAPH = [c.name for c in fields(AutoEvalColumn_mib_causalgraph) if not c.hidden]


- BENCHMARK_COLS_MIB_CAUSALGRAPH = [f"{model}_{task.value.benchmark}_{intervention}".lower()
-                                   for task in TasksMib_Causalgraph
-                                   for model in task.value.models
-                                   for intervention in task.value.interventions]
+
+
+
+
+ # # Column selection for display
+ # COLS_MIB_CAUSALGRAPH = [c.name for c in fields(AutoEvalColumn_mib_causalgraph) if not c.hidden]
+
+
+ # BENCHMARK_COLS_MIB_CAUSALGRAPH = [f"{model}_{task.value.benchmark}_{intervention}".lower()
+ #                                   for task in TasksMib_Causalgraph
+ #                                   for model in task.value.models
+ #                                   for intervention in task.value.interventions]


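For reference, a short sketch (not part of the commit) of how the rebuilt causal-graph schema can be enumerated, mirroring how app.py already iterates these column dataclasses; it assumes `fields` is the same helper app.py applies to AutoEvalColumn_mib_subgraph:

    cols_mib_causalgraph = [c.name for c in fields(AutoEvalColumn_mib_causalgraph)]
    # Expected entries: the Method column, one f"{task.value.benchmark}_{model}" column
    # per model in TasksMib_Causalgraph, and the trailing Average column.
    print(cols_mib_causalgraph)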
@@ -12,8 +12,10 @@ from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal, Tasks, T
12
  from src.submission.check_validity import is_model_on_hub
13
  from src.about import TasksMib_Subgraph
14
 
15
- from typing import List, Dict
16
  from collections import defaultdict
 
 
17
 
18
 
19
  def compute_area(edge_counts, faithfulnesses, log_scale=True):
@@ -65,21 +67,8 @@ class EvalResult_MIB_SUBGRAPH:
65
  for model_result in data.get("results", []):
66
  model_id = model_result.get("model_id", "")
67
 
68
- # if "/" in model_id:
69
- # org = model_id.split("/")[0]
70
- # if org == "meta-llama":
71
- # model_name = "llama3"
72
- # elif org == "Qwen":
73
- # model_name = "qwen2_5"
74
- # elif "gpt" in model_id.lower():
75
- # model_name = "gpt2"
76
- # elif org == "google":
77
- # model_name = "gemma2"
78
- # else:
79
- # model_name = model_id.replace(".", "_")
80
  model_name = model_id.replace(".", "_")
81
 
82
-
83
  # Keep exact scores structure from JSON
84
  scores = model_result.get("scores", {})
85
 
@@ -108,16 +97,7 @@ class EvalResult_MIB_SUBGRAPH:
108
 
109
  # Initialize all possible columns with '-'
110
  expected_models = TasksMib_Subgraph.get_all_models()
111
- expected_tasks = TasksMib_Subgraph.get_all_tasks()
112
- # for task in expected_tasks:
113
- # for model in task.value.models:
114
- # # if model == "gpt2" and task != "ioi":
115
- # # continue
116
- # # if model == "qwen2_5" and task.startswith(("arithmetic", "arc")):
117
- # # continue
118
- # # if model == "gemma2" and (task.startswith("arithmetic") or task == "arc_challenge"):
119
- # # continue
120
- # data_dict[f"{task}_{model}"] = '-'
121
 
122
  for task in TasksMib_Subgraph:
123
  for model in task.value.models:
@@ -145,23 +125,6 @@ class EvalResult_MIB_SUBGRAPH:
145
  data_dict[col_name] = round(score, 2)
146
  all_scores.append(score)
147
 
148
- # All entries must be present for average
149
- # required_entries = [
150
- # data_dict['ioi_llama3'] != '-',
151
- # data_dict['ioi_qwen2_5'] != '-',
152
- # data_dict['ioi_gpt2'] != '-',
153
- # data_dict['ioi_gemma2'] != '-',
154
- # data_dict['mcqa_llama3'] != '-',
155
- # data_dict['mcqa_qwen2_5'] != '-',
156
- # data_dict['mcqa_gemma2'] != '-',
157
- # data_dict['arithmetic_addition_llama3'] != '-',
158
- # data_dict['arithmetic_subtraction_llama3'] != '-',
159
- # data_dict['arc_easy_gemma2'] != '-',
160
- # data_dict['arc_easy_llama3'] != '-',
161
- # data_dict['arc_challenge_llama3'] != '-'
162
- # ]
163
-
164
- # data_dict["Average"] = round(np.mean(all_scores), 2) if all(required_entries) else '-'
165
  data_dict["Average"] = round(np.mean(all_scores), 2) if '-' not in data_dict.values() else '-'
166
  return data_dict
167
 
@@ -207,9 +170,63 @@ def get_raw_eval_results_mib_subgraph(results_path: str, requests_path: str) ->
207
 
208
 
209
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  # @dataclass
211
  # class EvalResult_MIB_CAUSALGRAPH:
212
- # """Represents one full evaluation for a method in MIB causalgraph."""
213
  # eval_name: str
214
  # method_name: str
215
  # results: Dict
@@ -222,31 +239,26 @@ def get_raw_eval_results_mib_subgraph(results_path: str, requests_path: str) ->
222
  # method_name = data.get("method_name")
223
  # results = {}
224
 
225
- # # Get results for each model
226
  # for model_result in data.get("results", []):
227
- # model_id = model_result.get("model_id", "") # Will be one of the three models
 
 
 
228
  # task_scores = model_result.get("task_scores", {})
229
 
230
- # # Process MCQA task scores
231
- # mcqa_scores = {}
232
  # for layer_data in task_scores.get("MCQA", []):
233
  # layer = layer_data.get("layer")
234
- # layer_scores = layer_data.get("layer_scores", [])
235
-
236
- # # Store scores for each intervention and counterfactual
237
- # for intervention_data in layer_scores:
238
- # intervention = intervention_data["intervention"][0]
239
- # counterfactual_scores = intervention_data["counterfactual_scores"]
240
-
241
- # for cf_score in counterfactual_scores:
242
  # counterfactual = cf_score["counterfactual"][0]
243
  # score = cf_score["score"]
244
 
245
- # # Create key for this combination
246
- # key = f"layer{layer}_{intervention}_{counterfactual}"
247
- # mcqa_scores[key] = score
248
-
249
- # results[model_id] = mcqa_scores
250
 
251
  # return EvalResult_MIB_CAUSALGRAPH(
252
  # eval_name=method_name,
@@ -254,198 +266,70 @@ def get_raw_eval_results_mib_subgraph(results_path: str, requests_path: str) ->
254
  # results=results
255
  # )
256
 
257
- # def to_dict(self):
258
- # """Converts the Eval Result to a dict for dataframe display"""
259
- # data_dict = {
260
- # "eval_name": self.eval_name,
261
- # "Method": self.method_name,
262
- # }
263
-
264
- # # Process each model's results
265
- # for model_id, model_results in self.results.items():
266
- # for task, task_scores in model_results.items():
267
- # # Calculate layer-averaged scores for each intervention
268
- # intervention_scores = defaultdict(list)
269
-
270
- # for layer_data in task_scores:
271
- # for score_data in layer_data['scores']:
272
- # intervention = score_data['intervention']
273
- # intervention_scores[intervention].append(score_data['score'])
274
-
275
- # # Average across layers for each intervention
276
- # for intervention, scores in intervention_scores.items():
277
- # col_name = f"{model_id}_{task}_{intervention}".lower()
278
- # data_dict[col_name] = round(np.mean(scores), 3)
279
-
280
- # return data_dict
281
-
282
 
283
- # def to_dict(self):
284
- # """Converts the Eval Result to a dict for dataframe display"""
285
- # data_dict = {
286
- # "eval_name": self.eval_name,
287
- # "Method": self.method_name,
288
- # }
 
 
289
 
290
- # # Process each model's results
291
- # for model_id, task_scores in self.results.items():
292
- # model_name = model_id.lower() # Lowercase for consistency
293
 
294
- # # Each task_scores contains layer data
295
- # for layer_data in task_scores.get("MCQA", []):
296
- # layer = layer_data.get("layer")
297
- # layer_scores = layer_data.get("layer_scores", [])
298
 
299
- # # Process each intervention and counterfactual
300
- # for intervention_data in layer_scores:
301
- # intervention = intervention_data["intervention"][0]
302
- # counterfactual_scores = intervention_data["counterfactual_scores"]
303
 
304
- # for cf_score in counterfactual_scores:
305
- # counterfactual = cf_score["counterfactual"][0]
306
- # score = cf_score["score"]
307
-
308
- # # Column name matches what we defined in utils.py
309
- # col_name = f"{model_name}_layer{layer}_{intervention}_{counterfactual}".lower()
310
- # data_dict[col_name] = score
311
-
312
- # return data_dict
313
-
314
- # def to_dict(self):
315
- # """Converts the Eval Result to a dict for dataframe display"""
316
- # print(f"Results in to_dict: {self.results}") # Debug print
317
-
318
- # data_dict = {
319
- # "eval_name": self.eval_name,
320
- # "Method": self.method_name,
321
- # }
322
-
323
- # # Process each model's results
324
- # for model_id, scores in self.results.items():
325
- # model_name = model_id.lower()
326
- # for task, layer_scores in scores.items():
327
- # for layer_data in layer_scores:
328
- # layer = layer_data.get("layer")
329
- # intervention_scores = layer_data.get("scores", [])
330
 
331
- # for intervention_data in intervention_scores:
332
- # col_name = f"{model_name}_layer{layer}_{intervention_data['intervention']}".lower()
333
- # data_dict[col_name] = intervention_data['score']
334
-
335
- # return data_dict
336
-
337
-
338
- # def to_dict(self):
339
- # """Converts the Eval Result to a dict for dataframe display"""
340
- # data_dict = {
341
- # "eval_name": self.eval_name,
342
- # "Method": self.method_name,
343
- # }
344
 
345
- # # Process each model's results
346
- # for model_id, scores in self.results.items():
347
- # model_name = model_id.lower()
348
- # # The scores are already in the format we want
349
- # for key, value in scores.items():
350
- # col_name = f"{model_name}_{key}"
351
- # data_dict[col_name] = value
352
-
353
- # return data_dict
354
-
355
-
356
-
357
-
358
-
359
-
360
-
361
-
362
- @dataclass
363
- class EvalResult_MIB_CAUSALGRAPH:
364
- eval_name: str
365
- method_name: str
366
- results: Dict
367
-
368
- def init_from_json_file(self, json_filepath):
369
- """Inits results from the method result file"""
370
- with open(json_filepath) as fp:
371
- data = json.load(fp)
372
-
373
- method_name = data.get("method_name")
374
- results = {}
375
 
376
- # Process each model's results
377
- for model_result in data.get("results", []):
378
- model_id = model_result.get("model_id", "")
379
- task_scores = model_result.get("task_scores", {})
380
-
381
- # Process MCQA scores
382
- for layer_data in task_scores.get("MCQA", []):
383
- layer = layer_data.get("layer")
384
- for score_data in layer_data.get("layer_scores", []):
385
- intervention = score_data["intervention"][0]
386
- for cf_score in score_data["counterfactual_scores"]:
387
- counterfactual = cf_score["counterfactual"][0]
388
- score = cf_score["score"]
389
-
390
- # Create key matching the expected column format
391
- key = f"{model_id}_layer{layer}_{intervention}_{counterfactual}"
392
- results[key] = score
393
-
394
- return EvalResult_MIB_CAUSALGRAPH(
395
- eval_name=method_name,
396
- method_name=method_name,
397
- results=results
398
- )
399
-
400
- def to_dict(self):
401
- """Converts the Eval Result to a dict for dataframe display"""
402
- data_dict = {
403
- "eval_name": self.eval_name,
404
- "Method": self.method_name,
405
- }
406
 
407
- # Add all results directly
408
- data_dict.update(self.results)
409
 
410
- return data_dict
411
-
412
 
413
 
414
 
415
 
416
 
417
-
418
-
419
-
420
- # def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
421
- # """Extract evaluation results for MIB causalgraph"""
422
- # model_result_filepaths = []
423
-
424
- # for root, dirnames, files in os.walk(results_path):
425
- # if len(files) == 0 or any([not f.endswith(".json") for f in files]):
426
- # continue
427
-
428
- # try:
429
- # files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
430
- # except dateutil.parser._parser.ParserError:
431
- # files = [files[-1]]
432
-
433
- # for file in files:
434
- # model_result_filepaths.append(os.path.join(root, file))
435
-
436
- # eval_results = []
437
- # for filepath in model_result_filepaths:
438
- # try:
439
- # eval_result = EvalResult_MIB_CAUSALGRAPH("", "", {})
440
- # result = eval_result.init_from_json_file(filepath)
441
- # result.to_dict() # Verify conversion works
442
- # eval_results.append(result)
443
- # except Exception as e:
444
- # print(f"Error processing {filepath}: {e}")
445
- # continue
446
-
447
- # return eval_results
448
-
449
  def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
450
  model_result_filepaths = []
451
 
@@ -466,23 +350,30 @@ def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str)
466
 
467
  # print(f"Found json files: {model_result_filepaths}")
468
 
469
- eval_results = []
470
- for filepath in model_result_filepaths:
 
 
471
  try:
472
- eval_result = EvalResult_MIB_CAUSALGRAPH("", "", {})
473
- result = eval_result.init_from_json_file(filepath)
474
- # print(f"Processed file {filepath}")
475
- # print(f"Got result: {result}")
476
- eval_results.append(result)
 
 
 
477
  except Exception as e:
478
- print(f"Error processing {filepath}: {e}")
479
  continue
480
-
481
- # print(f"Total results processed: {len(eval_results)}")
482
- return eval_results
483
-
484
-
485
-
 
 
486
 
487
 
488
 
 
12
  from src.submission.check_validity import is_model_on_hub
13
  from src.about import TasksMib_Subgraph
14
 
15
+ from typing import List, Dict, Any
16
  from collections import defaultdict
17
+ import pandas as pd
18
+
19
 
20
 
21
  def compute_area(edge_counts, faithfulnesses, log_scale=True):
 
67
  for model_result in data.get("results", []):
68
  model_id = model_result.get("model_id", "")
69
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  model_name = model_id.replace(".", "_")
71
 
 
72
  # Keep exact scores structure from JSON
73
  scores = model_result.get("scores", {})
74
 
 
97
 
98
  # Initialize all possible columns with '-'
99
  expected_models = TasksMib_Subgraph.get_all_models()
100
+ # expected_tasks = TasksMib_Subgraph.get_all_tasks()
 
 
 
 
 
 
 
 
 
101
 
102
  for task in TasksMib_Subgraph:
103
  for model in task.value.models:
 
125
  data_dict[col_name] = round(score, 2)
126
  all_scores.append(score)
127
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  data_dict["Average"] = round(np.mean(all_scores), 2) if '-' not in data_dict.values() else '-'
129
  return data_dict
130
 
 
170
 
171
 
172
 
173
+
174
+ def process_single_json(json_file: Dict[str, Any], method_counter: int) -> pd.DataFrame:
175
+ """
176
+ Process a single JSON file and convert it to a DataFrame.
177
+
178
+ Args:
179
+ json_file: Dictionary containing the analysis results
180
+ method_counter: Counter for handling duplicate method names
181
+
182
+ Returns:
183
+ pd.DataFrame: DataFrame for single method with MODEL_TASK_INTERVENTION as columns
184
+ """
185
+ method_name = json_file['method_name']
186
+ unique_method_name = f"{method_name}_{method_counter}"
187
+ method_scores = []
188
+
189
+ for result in json_file['results']:
190
+ model = result['model_id']
191
+
192
+ for task, scores in result['task_scores'].items():
193
+ # Process each layer's data
194
+ intervention_scores = defaultdict(list)
195
+
196
+ for layer_data in scores:
197
+ for intervention_data in layer_data['layer_scores']:
198
+ # Calculate average score for counterfactuals
199
+ avg_cf_score = np.mean([
200
+ cf['score']
201
+ for cf in intervention_data['counterfactual_scores']
202
+ ])
203
+
204
+ if np.isnan(avg_cf_score):
205
+ avg_cf_score = 0.0
206
+
207
+ # Group scores by intervention
208
+ intervention_key = '_'.join(intervention_data['intervention'])
209
+ intervention_scores[intervention_key].append(avg_cf_score)
210
+
211
+ # Average across layers for each intervention
212
+ for intervention, layer_scores in intervention_scores.items():
213
+ column = f"{model}_{task}_{intervention}"
214
+ avg_score = np.mean(layer_scores) if layer_scores else 0.0
215
+ method_scores.append((column, f"{avg_score:.3f}"))
216
+
217
+ # Sort by column names for consistency
218
+ method_scores.sort(key=lambda x: x[0])
219
+ data = {
220
+ unique_method_name: {
221
+ col: score for col, score in method_scores
222
+ }
223
+ }
224
+
225
+ return pd.DataFrame.from_dict(data, orient='index')
226
+
227
+
228
  # @dataclass
229
  # class EvalResult_MIB_CAUSALGRAPH:
 
230
  # eval_name: str
231
  # method_name: str
232
  # results: Dict
 
239
  # method_name = data.get("method_name")
240
  # results = {}
241
 
242
+ # # Process each model's results
243
  # for model_result in data.get("results", []):
244
+ # model_id = model_result.get("model_id", "")
245
+
246
+
247
+
248
  # task_scores = model_result.get("task_scores", {})
249
 
250
+ # # Process MCQA scores
 
251
  # for layer_data in task_scores.get("MCQA", []):
252
  # layer = layer_data.get("layer")
253
+ # for score_data in layer_data.get("layer_scores", []):
254
+ # intervention = score_data["intervention"][0]
255
+ # for cf_score in score_data["counterfactual_scores"]:
 
 
 
 
 
256
  # counterfactual = cf_score["counterfactual"][0]
257
  # score = cf_score["score"]
258
 
259
+ # # Create key matching the expected column format
260
+ # key = f"{model_id}_layer{layer}_{intervention}_{counterfactual}"
261
+ # results[key] = score
 
 
262
 
263
  # return EvalResult_MIB_CAUSALGRAPH(
264
  # eval_name=method_name,
 
266
  # results=results
267
  # )
268
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
 
270
+ # data = {}
271
+ # method_counters = defaultdict(int)
272
+
273
+ # for json_file in json_files:
274
+ # # Handle method name and duplicates
275
+ # method_name = json_file['method_name']
276
+ # method_counters[method_name] += 1
277
+ # unique_method_name = f"{method_name}_{method_counters[method_name]}"
278
 
279
+ # method_scores = []
 
 
280
 
281
+ # for result in json_file['results']:
282
+ # model = result['model_id']
 
 
283
 
284
+ # for task, scores in result['task_scores'].items():
285
+ # # Process each layer's data
286
+ # intervention_scores = defaultdict(list)
 
287
 
288
+ # for layer_data in scores:
289
+ # for intervention_data in layer_data['layer_scores']:
290
+ # # Calculate average score for counterfactuals
291
+ # avg_cf_score = np.mean([
292
+ # cf['score']
293
+ # for cf in intervention_data['counterfactual_scores']
294
+ # ])
295
+
296
+ # if np.isnan(avg_cf_score):
297
+ # avg_cf_score = 0.0
298
+
299
+ # # Group scores by intervention
300
+ # intervention_key = '_'.join(intervention_data['intervention'])
301
+ # intervention_scores[intervention_key].append(avg_cf_score)
 
 
 
 
 
 
 
 
 
 
 
 
302
 
303
+ # # Average across layers for each intervention
304
+ # for intervention, layer_scores in intervention_scores.items():
305
+ # column = f"{model}_{task}_{intervention}"
306
+ # avg_score = np.mean(layer_scores) if layer_scores else 0.0
307
+ # method_scores.append((column, f"{avg_score:.3f}"))
 
 
 
 
 
 
 
 
308
 
309
+ # # Sort by column names for consistency
310
+ # method_scores.sort(key=lambda x: x[0])
311
+ # data[unique_method_name] = {
312
+ # col: score for col, score in method_scores
313
+ # }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
 
315
+ # return pd.DataFrame.from_dict(data, orient='index')
316
+
317
+ # def to_dict(self):
318
+ # """Converts the Eval Result to a dict for dataframe display"""
319
+ # data_dict = {
320
+ # "eval_name": self.eval_name,
321
+ # "Method": self.method_name,
322
+ # }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
323
 
324
+ # # Add all results directly
325
+ # data_dict.update(self.results)
326
 
327
+ # return data_dict
 
328
 
329
 
330
 
331
 
332
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
333
  def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
334
  model_result_filepaths = []
335
 
 
350
 
351
  # print(f"Found json files: {model_result_filepaths}")
352
 
353
+ method_counters = defaultdict(int)
354
+ dataframes = []
355
+
356
+ for json_file in model_result_filepaths:
357
  try:
358
+ with open(filepath, 'r') as f:
359
+ json_data = json.load(f)
360
+ method_name = json_data['method_name']
361
+ method_counters[method_name] += 1
362
+
363
+ # Process single JSON file
364
+ df = process_single_json(json_data, method_counters[method_name])
365
+ dataframes.append(df)
366
  except Exception as e:
367
+ print(f"Error processing {json_file}: {e}")
368
  continue
369
+
370
+ # # Concatenate all DataFrames
371
+ # if dataframes:
372
+ # final_df = pd.concat(dataframes, axis=0)
373
+ # return final_df
374
+ # else:
375
+ # return pd.DataFrame()
376
+ return dataframes
377
 
378
 
379
 
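For reference, a toy call to the new process_single_json; the input below is made up, but shaped like the fields the function reads:

    from src.leaderboard.read_evals import process_single_json

    toy = {
        "method_name": "DAS",  # hypothetical method name
        "results": [{
            "model_id": "Qwen2ForCausalLM",
            "task_scores": {
                "MCQA": [{
                    "layer": 0,
                    "layer_scores": [{
                        "intervention": ["output_token"],
                        "counterfactual_scores": [
                            {"counterfactual": ["randomLetter_counterfactual"], "score": 0.5}
                        ],
                    }],
                }],
            },
        }],
    }
    df = process_single_json(toy, 1)
    # One row "DAS_1" with a single column "Qwen2ForCausalLM_MCQA_output_token" holding "0.500".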
src/populate.py CHANGED
@@ -29,14 +29,6 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm

    df = pd.DataFrame.from_records(all_data_json)
    df = df.sort_values(by=[AutoEvalColumn.text_average.name], ascending=False)
-     # df = df.sort_values(by=[Tasks.task0.value.col_name], ascending=False)
-     # df = df.sort_values(by=[AutoEvalColumn.track.name], ascending=False)
-
-     # print(f"df is {df}")
-
-     # df = df[cols].round(decimals=1)
-
-     # filter out if any of the benchmarks have not been produced
    df = df[has_no_nan_values(df, benchmark_cols)]
    return df

@@ -69,23 +61,6 @@ def get_leaderboard_df_mib_subgraph(results_path: str, requests_path: str, cols:



- # def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
- #     """Aggregates rows with the same base method name by taking the max value for each column"""
- #     df_copy = df.copy()
-
- #     # Extract base method names (remove _2, _3, etc. suffixes)
- #     base_methods = [name.split('_')[0] if '_' in name and name.split('_')[-1].isdigit()
- #                     else name for name in df_copy.index]
- #     df_copy.index = base_methods
-
- #     # Convert scores to numeric values
- #     numeric_df = df_copy.select_dtypes(include=['float64', 'int64'])
-
- #     # Group by base method name and take the max
- #     aggregated_df = numeric_df.groupby(level=0).max().round(3)
-
- #     return aggregated_df
-
def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregates rows with the same base method name by taking the max value for each column"""
    df_copy = df.copy()
@@ -111,63 +86,6 @@ def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:

    return aggregated_df

- # def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
- #     """Creates a DataFrame where columns are model_task and cells are averaged over interventions"""
- #     df_copy = df.copy()
-
- #     # Remove the Method column and eval_name if present
- #     columns_to_drop = ['Method', 'eval_name']
- #     df_copy = df_copy.drop(columns=[col for col in columns_to_drop if col in df_copy.columns])
-
- #     # Group columns by model_task
- #     model_task_groups = {}
- #     for col in df_copy.columns:
- #         model_task = '_'.join(col.split('_')[:2])  # Get model_task part
- #         if model_task not in model_task_groups:
- #             model_task_groups[model_task] = []
- #         model_task_groups[model_task].append(col)
-
- #     # Create new DataFrame with averaged intervention scores
- #     averaged_df = pd.DataFrame({
- #         model_task: df_copy[cols].mean(axis=1).round(3)
- #         for model_task, cols in model_task_groups.items()
- #     })
-
- #     return averaged_df
-
- # def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
- #     """Creates a DataFrame where columns are model_task and cells are averaged over interventions"""
- #     df_copy = df.copy()
-
- #     # Store Method column if it exists
- #     method_col = None
- #     if 'Method' in df_copy.columns:
- #         method_col = df_copy['Method']
- #         df_copy = df_copy.drop('Method', axis=1)
-
- #     # Remove eval_name if present
- #     if 'eval_name' in df_copy.columns:
- #         df_copy = df_copy.drop('eval_name', axis=1)
-
- #     # Group columns by model_task
- #     model_task_groups = {}
- #     for col in df_copy.columns:
- #         model_task = '_'.join(col.split('_')[:2])  # Get model_task part
- #         if model_task not in model_task_groups:
- #             model_task_groups[model_task] = []
- #         model_task_groups[model_task].append(col)
-
- #     # Create new DataFrame with averaged intervention scores
- #     averaged_df = pd.DataFrame({
- #         model_task: df_copy[cols].mean(axis=1).round(3)
- #         for model_task, cols in model_task_groups.items()
- #     })
-
- #     # Add Method column back
- #     if method_col is not None:
- #         averaged_df.insert(0, 'Method', method_col)
-
- #     return averaged_df

def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
    """Creates a DataFrame where columns are model_task and cells are averaged over interventions"""
@@ -203,69 +121,32 @@ def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:

    return averaged_df

- # def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
- #     """Creates a dataframe from all the MIB causal graph experiment results"""
- #     print(f"results_path is {results_path}, requests_path is {requests_path}")
- #     raw_data = get_raw_eval_results_mib_causalgraph(results_path, requests_path)
- #     print(f"raw_data is {raw_data}")
-
- #     # Convert each result to dict format for detailed df
- #     all_data_json = [v.to_dict() for v in raw_data]
- #     detailed_df = pd.DataFrame.from_records(all_data_json)
- #     print(f"detailed_df is: {detailed_df}")
-
- #     # Create and print other views for debugging/reference
- #     aggregated_df = aggregate_methods(detailed_df)
- #     print(f"aggregated_df is: {aggregated_df}")
-
- #     intervention_averaged_df = create_intervention_averaged_df(aggregated_df)
- #     print(f"intervention_averaged_df is: {intervention_averaged_df}")
-
- #     # Only return detailed_df for display
- #     return detailed_df

- # def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
- #     print(f"results_path is {results_path}, requests_path is {requests_path}")
- #     raw_data = get_raw_eval_results_mib_causalgraph(results_path, requests_path)
-
- #     # Convert each result to dict format for detailed df
- #     all_data_json = [v.to_dict() for v in raw_data]
- #     detailed_df = pd.DataFrame.from_records(all_data_json)
- #     print("Columns in detailed_df:", detailed_df.columns.tolist())  # Print actual columns
-
- #     # Create aggregated df
- #     aggregated_df = aggregate_methods(detailed_df)
- #     print("Columns in aggregated_df:", aggregated_df.columns.tolist())
-
- #     # Create intervention-averaged df
- #     intervention_averaged_df = create_intervention_averaged_df(aggregated_df)
- #     print("Columns in intervention_averaged_df:", intervention_averaged_df.columns.tolist())
-
- #     return detailed_df, aggregated_df, intervention_averaged_df

- def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+ def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    # print(f"results_path is {results_path}, requests_path is {requests_path}")
-     raw_data = get_raw_eval_results_mib_causalgraph(results_path, requests_path)
-
-     # Convert each result to dict format for detailed df
-     all_data_json = [v.to_dict() for v in raw_data]
-     detailed_df = pd.DataFrame.from_records(all_data_json)
+
+     # raw_data = get_raw_eval_results_mib_causalgraph(results_path, requests_path)
+     # all_data_json = [v.to_dict() for v in raw_data]
+     # detailed_df = pd.DataFrame.from_records(all_data_json)
+
+     detailed_df = get_raw_eval_results_mib_causalgraph(results_path, requests_path)

    # Print the actual columns for debugging
-     # print("Original columns:", detailed_df.columns.tolist())
-
-     # Rename columns to match schema
-     column_mapping = {}
-     for col in detailed_df.columns:
-         if col in ['eval_name', 'Method']:
-             continue
-         # Ensure consistent casing for the column names
-         new_col = col.replace('Qwen2ForCausalLM', 'qwen2forcausallm') \
-                      .replace('Gemma2ForCausalLM', 'gemma2forcausallm') \
-                      .replace('LlamaForCausalLM', 'llamaforcausallm')
-         column_mapping[col] = new_col
-
-     detailed_df = detailed_df.rename(columns=column_mapping)
+     print("Original columns:", detailed_df.columns.tolist())
+
+     # # Rename columns to match schema
+     # column_mapping = {}
+     # for col in detailed_df.columns:
+     #     if col in ['eval_name', 'Method']:
+     #         continue
+     #     # Ensure consistent casing for the column names
+     #     new_col = col.replace('Qwen2ForCausalLM', 'qwen2forcausallm') \
+     #                  .replace('Gemma2ForCausalLM', 'gemma2forcausallm') \
+     #                  .replace('LlamaForCausalLM', 'llamaforcausallm')
+     #     column_mapping[col] = new_col
+
+     # detailed_df = detailed_df.rename(columns=column_mapping)

    # Create aggregated df
    aggregated_df = aggregate_methods(detailed_df)
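For reference, the call shape after this commit, as app.py wires it up earlier in this diff; the COLS_MIB_CAUSALGRAPH and BENCHMARK_COLS_MIB_CAUSALGRAPH arguments (and their definitions in src/display/utils.py) are gone:

    detailed, aggregated, averaged = get_leaderboard_df_mib_causalgraph(
        EVAL_RESULTS_MIB_CAUSALGRAPH_PATH,
        EVAL_REQUESTS_PATH
    )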