Aaron Mueller committed on
Commit
f59c752
·
2 Parent(s): 33ddef9 475701c

Merge branch 'main' of https://huggingface.co/spaces/mech-interp-bench/leaderboard

Browse files
Files changed (3) hide show
  1. app.py +90 -18
  2. src/about.py +7 -4
  3. src/leaderboard/read_evals.py +251 -297
app.py CHANGED
@@ -399,34 +399,76 @@ def init_leaderboard_mib_subgraph(dataframe, track):
399
 
400
 
401
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
402
 
403
  def init_leaderboard_mib_causalgraph(dataframe, track):
404
  model_name_mapping = {
405
  "Qwen2ForCausalLM": "Qwen-2.5",
406
  "GPT2ForCausalLM": "GPT-2",
 
407
  "Gemma2ForCausalLM": "Gemma-2",
408
  "LlamaForCausalLM": "Llama-3.1"
409
  }
410
 
411
  benchmark_mapping = {
412
- "IOI": "IOI",
413
- "MCQA": "MCQA",
414
  "arithmetic_addition": "Arithmetic (+)",
415
  "arithmetic_subtraction": "Arithmetic (-)",
416
- "arc_easy": "ARC (Easy)",
417
- "arc_challenge": "ARC (Challenge)"
 
 
 
 
 
 
 
 
 
 
 
418
  }
419
 
420
  display_mapping = {}
421
  for task in TasksMib_Causalgraph:
422
  for model in task.value.models:
423
- field_name = f"{task.value.col_name}_{model}"
424
- display_name = f"{benchmark_mapping[task.value.col_name]} - {model_name_mapping[model]}"
425
- display_mapping[field_name] = display_name
 
 
426
 
427
  renamed_df = dataframe.rename(columns=display_mapping)
428
-
429
- print(renamed_df)
430
 
431
  # Create only necessary columns
432
  return Leaderboard(
@@ -488,8 +530,10 @@ def get_hf_username(hf_repo):
488
  # Define the preset substrings for filtering
489
  PRESET_SUBSTRINGS = ["IOI", "MCQA", "Arithmetic", "ARC", "GPT-2", "Qwen-2.5", "Gemma-2", "Llama-3.1"]
490
  TASK_SUBSTRINGS = ["IOI", "MCQA", "Arithmetic", "ARC"]
 
491
  MODEL_SUBSTRINGS = ["GPT-2", "Qwen-2.5", "Gemma-2", "Llama-3.1"]
492
 
 
493
  def filter_columns_by_substrings(dataframe: pd.DataFrame, selected_task_substrings: List[str],
494
  selected_model_substrings: List[str]) -> pd.DataFrame:
495
  """
@@ -648,21 +692,21 @@ with demo:
648
  # Then modify the Causal Graph tab section
649
  with gr.TabItem("Causal Variable Localization", elem_id="causalgraph", id=1):
650
  with gr.Tabs() as causalgraph_tabs:
651
- with gr.TabItem("Detailed View", id=0):
652
- leaderboard_detailed, data = init_leaderboard_mib_causalgraph(
653
- LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED,
654
- "Causal Graph"
655
- )
656
- with gr.TabItem("Aggregated View", id=1):
657
  gr.Markdown("""
658
  ### Filtering Options
659
  Use the dropdown menus below to filter results by specific tasks or models.
660
  You can combine filters to see specific task-model combinations.
661
  """)
662
  task_substring_checkbox = gr.CheckboxGroup(
663
- choices=TASK_SUBSTRINGS,
664
  label="View tasks:",
665
- value=TASK_SUBSTRINGS, # Default to all substrings selected
666
  )
667
  model_substring_checkbox = gr.CheckboxGroup(
668
  choices = MODEL_SUBSTRINGS,
@@ -685,11 +729,39 @@ with demo:
685
  inputs=[original_leaderboard, task_substring_checkbox, model_substring_checkbox],
686
  outputs=leaderboard_aggregated
687
  )
688
- with gr.TabItem("Intervention Averaged", id=2):
 
 
 
 
 
 
 
 
 
 
 
 
689
  leaderboard_averaged, data = init_leaderboard_mib_causalgraph(
690
  LEADERBOARD_DF_MIB_CAUSALGRAPH_AVERAGED,
691
  "Causal Graph"
692
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
693
 
694
  with gr.TabItem("Submit", elem_id="llm-benchmark-tab-table", id=2):
695
  # Track selection
 
399
 
400
 
401
 
402
+ # @dataclass
403
+ # class TaskMIB_Causalgraph:
404
+ # benchmark: str # task name in json (ioi/arithmetic)
405
+ # models: list[str] # list of models to show as sub-columns
406
+ # col_name: str # display name in leaderboard
407
+ # metrics: list[str] # metrics to store (average_score)
408
+
409
+ # class TasksMib_Causalgraph(Enum):
410
+ # task0 = TaskMIB_Subgraph("ioi", ["GPT2ForCausalLM"], "ioi_task", ["average_score"])
411
+ # task1 = TaskMIB_Subgraph("mcqa", ["Qwen2ForCausalLM", "Gemma2ForCausalLM", "LlamaForCausalLM"], "4_answer_MCQA", ["average_score"])
412
+ # task2 = TaskMIB_Subgraph("arithmetic_addition", ["Gemma2ForCausalLM", "LlamaForCausalLM"], "arithmetic_addition", ["average_score"])
413
+ # task3 = TaskMIB_Subgraph("arc_easy", ["Gemma2ForCausalLM", "LlamaForCausalLM"], "arc_easy", ["average_score"])
414
+
415
+ # @classmethod
416
+ # def get_all_tasks(cls):
417
+ # """Returns a list of all task benchmarks"""
418
+ # return [task.value.benchmark for task in cls]
419
+
420
+ # @classmethod
421
+ # def get_all_models(cls):
422
+ # """Returns a list of all unique models across all tasks"""
423
+ # models = set()
424
+ # for task in cls:
425
+ # models.update(task.value.models)
426
+ # return sorted(list(models))
427
+
428
+ # ioi_task
429
+ # 4_answer_MCQA
430
+
431
 
432
  def init_leaderboard_mib_causalgraph(dataframe, track):
433
  model_name_mapping = {
434
  "Qwen2ForCausalLM": "Qwen-2.5",
435
  "GPT2ForCausalLM": "GPT-2",
436
+ "GPT2LMHeadModel": "GPT-2",
437
  "Gemma2ForCausalLM": "Gemma-2",
438
  "LlamaForCausalLM": "Llama-3.1"
439
  }
440
 
441
  benchmark_mapping = {
442
+ "ioi_task": "IOI",
443
+ "4_answer_MCQA": "MCQA",
444
  "arithmetic_addition": "Arithmetic (+)",
445
  "arithmetic_subtraction": "Arithmetic (-)",
446
+ "ARC_easy": "ARC (Easy)",
447
+ "RAVEL_task": "RAVEL"
448
+ }
449
+
450
+ target_variables_mapping = {
451
+ "output_token": "Output Token",
452
+ "output_position": "Output Position",
453
+ "answer_pointer": "Answer Pointer",
454
+ "answer": "Answer",
455
+ "Continent": "Continent",
456
+ "Language": "Language",
457
+ "Country": "Country",
458
+ "Language": "Language"
459
  }
460
 
461
  display_mapping = {}
462
  for task in TasksMib_Causalgraph:
463
  for model in task.value.models:
464
+ for target_variables in task.value.target_variables:
465
+ field_name = f"{model}_{task.value.col_name}_{target_variables}"
466
+ display_name = f"{benchmark_mapping[task.value.col_name]} - {model_name_mapping[model]} - {target_variables_mapping[target_variables]}"
467
+ display_mapping[field_name] = display_name
468
+
469
 
470
  renamed_df = dataframe.rename(columns=display_mapping)
471
+
 
472
 
473
  # Create only necessary columns
474
  return Leaderboard(
 
530
  # Define the preset substrings for filtering
531
  PRESET_SUBSTRINGS = ["IOI", "MCQA", "Arithmetic", "ARC", "GPT-2", "Qwen-2.5", "Gemma-2", "Llama-3.1"]
532
  TASK_SUBSTRINGS = ["IOI", "MCQA", "Arithmetic", "ARC"]
533
+ TASK_CAUSAL_SUBSTRINGS = ["IOI", "MCQA", "ARC (Easy)", "RAVEL"]
534
  MODEL_SUBSTRINGS = ["GPT-2", "Qwen-2.5", "Gemma-2", "Llama-3.1"]
535
 
536
+
537
  def filter_columns_by_substrings(dataframe: pd.DataFrame, selected_task_substrings: List[str],
538
  selected_model_substrings: List[str]) -> pd.DataFrame:
539
  """
 
692
  # Then modify the Causal Graph tab section
693
  with gr.TabItem("Causal Variable Localization", elem_id="causalgraph", id=1):
694
  with gr.Tabs() as causalgraph_tabs:
695
+ # with gr.TabItem("Detailed View", id=0):
696
+ # leaderboard_detailed, data = init_leaderboard_mib_causalgraph(
697
+ # LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED,
698
+ # "Causal Graph"
699
+ # )
700
+ with gr.TabItem("Highest View", id=0):
701
  gr.Markdown("""
702
  ### Filtering Options
703
  Use the dropdown menus below to filter results by specific tasks or models.
704
  You can combine filters to see specific task-model combinations.
705
  """)
706
  task_substring_checkbox = gr.CheckboxGroup(
707
+ choices=TASK_CAUSAL_SUBSTRINGS,
708
  label="View tasks:",
709
+ value=TASK_CAUSAL_SUBSTRINGS, # Default to all substrings selected
710
  )
711
  model_substring_checkbox = gr.CheckboxGroup(
712
  choices = MODEL_SUBSTRINGS,
 
729
  inputs=[original_leaderboard, task_substring_checkbox, model_substring_checkbox],
730
  outputs=leaderboard_aggregated
731
  )
732
+ with gr.TabItem("Averaged View", id=1):
733
+
734
+ task_substring_checkbox = gr.CheckboxGroup(
735
+ choices=TASK_CAUSAL_SUBSTRINGS,
736
+ label="View tasks:",
737
+ value=TASK_CAUSAL_SUBSTRINGS, # Default to all substrings selected
738
+ )
739
+ model_substring_checkbox = gr.CheckboxGroup(
740
+ choices = MODEL_SUBSTRINGS,
741
+ label = "View models:",
742
+ value = MODEL_SUBSTRINGS
743
+ )
744
+
745
  leaderboard_averaged, data = init_leaderboard_mib_causalgraph(
746
  LEADERBOARD_DF_MIB_CAUSALGRAPH_AVERAGED,
747
  "Causal Graph"
748
  )
749
+ original_leaderboard = gr.State(value=data)
750
+ task_substring_checkbox.change(
751
+ fn=update_leaderboard,
752
+ inputs=[original_leaderboard, task_substring_checkbox, model_substring_checkbox],
753
+ outputs=leaderboard_averaged
754
+ )
755
+ model_substring_checkbox.change(
756
+ fn=update_leaderboard,
757
+ inputs=[original_leaderboard, task_substring_checkbox, model_substring_checkbox],
758
+ outputs=leaderboard_averaged
759
+ )
760
+
761
+ # leaderboard_averaged, data = init_leaderboard_mib_causalgraph(
762
+ # LEADERBOARD_DF_MIB_CAUSALGRAPH_AVERAGED,
763
+ # "Causal Graph"
764
+ # )
765
 
766
  with gr.TabItem("Submit", elem_id="llm-benchmark-tab-table", id=2):
767
  # Track selection
src/about.py CHANGED
@@ -78,12 +78,15 @@ class TaskMIB_Causalgraph:
78
  models: list[str] # list of models to show as sub-columns
79
  col_name: str # display name in leaderboard
80
  metrics: list[str] # metrics to store (average_score)
 
 
 
81
 
82
  class TasksMib_Causalgraph(Enum):
83
- task0 = TaskMIB_Subgraph("ioi", ["GPT2ForCausalLM"], "IOI", ["average_score"])
84
- task1 = TaskMIB_Subgraph("mcqa", ["Qwen2ForCausalLM", "Gemma2ForCausalLM", "LlamaForCausalLM"], "MCQA", ["average_score"])
85
- task2 = TaskMIB_Subgraph("arithmetic_addition", ["Gemma2ForCausalLM", "LlamaForCausalLM"], "arithmetic_addition", ["average_score"])
86
- task3 = TaskMIB_Subgraph("arc_easy", ["Gemma2ForCausalLM", "LlamaForCausalLM"], "arc_easy", ["average_score"])
87
 
88
  @classmethod
89
  def get_all_tasks(cls):
 
78
  models: list[str] # list of models to show as sub-columns
79
  col_name: str # display name in leaderboard
80
  metrics: list[str] # metrics to store (average_score)
81
+ target_variables: list[str]
82
+
83
+
84
 
85
  class TasksMib_Causalgraph(Enum):
86
+ task0 = TaskMIB_Causalgraph("ioi", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "ioi_task", ["average_score"], ["output_token", "output_position"])
87
+ task1 = TaskMIB_Causalgraph("mcqa", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "4_answer_MCQA", ["average_score"], ["answer_pointer", "answer"])
88
+ task2 = TaskMIB_Causalgraph("ravel", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "RAVEL_task", ["average_score"], ["Continent", "Language", "Country", "Language"])
89
+ task3 = TaskMIB_Causalgraph("arc_easy", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "ARC_easy", ["average_score"], ["answer_pointer", "answer"])
90
 
91
  @classmethod
92
  def get_all_tasks(cls):
src/leaderboard/read_evals.py CHANGED
@@ -2,19 +2,22 @@ import glob
2
  import json
3
  import math
4
  import os
 
 
5
  from dataclasses import dataclass
 
 
 
6
 
7
  import dateutil
8
  import numpy as np
 
9
 
10
  from src.display.formatting import make_clickable_model
11
- from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal
12
  from src.submission.check_validity import is_model_on_hub
13
- from src.about import TasksMib_Subgraph
14
 
15
- from typing import List, Dict, Any
16
- from collections import defaultdict
17
- import pandas as pd
18
 
19
 
20
 
@@ -205,224 +208,10 @@ def get_raw_eval_results_mib_subgraph(results_path: str) -> List[EvalResult_MIB_
205
 
206
 
207
 
208
- # def process_single_json(json_file: Dict[str, Any], method_counter: int) -> pd.DataFrame:
209
- # """
210
- # Process a single JSON file and convert it to a DataFrame.
211
-
212
- # Args:
213
- # json_file: Dictionary containing the analysis results
214
- # method_counter: Counter for handling duplicate method names
215
-
216
- # Returns:
217
- # pd.DataFrame: DataFrame for single method with MODEL_TASK_INTERVENTION as columns
218
- # """
219
- # method_name = json_file['method_name']
220
- # unique_method_name = f"{method_name}_{method_counter}"
221
- # method_scores = []
222
-
223
- # for result in json_file['results']:
224
- # model = result['model_id']
225
-
226
- # for task, scores in result['task_scores'].items():
227
- # # Process each layer's data
228
- # intervention_scores = defaultdict(list)
229
-
230
- # for layer_data in scores:
231
- # for intervention_data in layer_data['layer_scores']:
232
- # # Calculate average score for counterfactuals
233
- # avg_cf_score = np.mean([
234
- # cf['score']
235
- # for cf in intervention_data['counterfactual_scores']
236
- # ])
237
-
238
- # if np.isnan(avg_cf_score):
239
- # avg_cf_score = 0.0
240
-
241
- # # Group scores by intervention
242
- # intervention_key = '_'.join(intervention_data['intervention'])
243
- # intervention_scores[intervention_key].append(avg_cf_score)
244
-
245
- # # Average across layers for each intervention
246
- # for intervention, layer_scores in intervention_scores.items():
247
- # column = f"{model}_{task}_{intervention}"
248
- # avg_score = np.mean(layer_scores) if layer_scores else 0.0
249
- # method_scores.append((column, f"{avg_score:.3f}"))
250
-
251
- # # Sort by column names for consistency
252
- # method_scores.sort(key=lambda x: x[0])
253
- # data = {
254
- # unique_method_name: {
255
- # col: score for col, score in method_scores
256
- # }
257
- # }
258
-
259
- # return pd.DataFrame.from_dict(data, orient='index')
260
-
261
- # def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
262
- # model_result_filepaths = []
263
-
264
- # # print(f"Scanning directory: {results_path}")
265
- # for root, dirnames, files in os.walk(results_path):
266
- # # print(f"Current directory: {root}")
267
- # # print(f"Found files: {files}")
268
- # if len(files) == 0 or any([not f.endswith(".json") for f in files]):
269
- # continue
270
-
271
- # try:
272
- # files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
273
- # except dateutil.parser._parser.ParserError:
274
- # files = [files[-1]]
275
-
276
- # for file in files:
277
- # model_result_filepaths.append(os.path.join(root, file))
278
-
279
- # # print(f"Found json files: {model_result_filepaths}")
280
-
281
- # method_counters = defaultdict(int)
282
- # dataframes = []
283
-
284
- # for json_file in model_result_filepaths:
285
- # try:
286
- # with open(filepath, 'r') as f:
287
- # json_data = json.load(f)
288
- # method_name = json_data['method_name']
289
- # method_counters[method_name] += 1
290
-
291
- # # Process single JSON file
292
- # df = process_single_json(json_data, method_counters[method_name])
293
- # dataframes.append(df)
294
- # except Exception as e:
295
- # print(f"Error processing {json_file}: {e}")
296
- # continue
297
-
298
- # return dataframes
299
-
300
-
301
-
302
-
303
- from dataclasses import dataclass
304
- import json
305
- import numpy as np
306
- import pandas as pd
307
- from typing import Dict, List, Any
308
- import os
309
- from datetime import datetime
310
- import dateutil
311
- from collections import defaultdict
312
-
313
- @dataclass
314
- class EvalResult_MIB_CAUSALGRAPH:
315
- """Represents one full evaluation for a method across all models in MIB for causal graph track."""
316
- method_name: str # name of the interpretation method
317
- results: Dict # nested dict of results for each model and task
318
-
319
- def init_from_json_file(self, json_filepath: str):
320
- """Inits results from the method result file"""
321
- with open(json_filepath) as fp:
322
- data = json.load(fp)
323
-
324
- method_name = data.get("method_name")
325
-
326
- # Initialize results dictionary
327
- results = {}
328
- for task in ["IOI", "MCQA", "arithmetic", "ARC-easy"]:
329
- results[task] = {}
330
-
331
- # Process each model's results
332
- for result in data.get("results", []):
333
- model_id = result.get("model_id", "")
334
- model_name = model_id.replace(".", "_")
335
-
336
- for task, scores in result.get("task_scores", {}).items():
337
- intervention_scores = defaultdict(list)
338
-
339
- for layer_data in scores:
340
- for intervention_data in layer_data['layer_scores']:
341
- # Calculate average score for counterfactuals
342
- avg_cf_score = np.mean([
343
- cf['score'] if 'score' in cf else 0
344
- for cf in intervention_data['counterfactual_scores']
345
- ])
346
-
347
- if np.isnan(avg_cf_score):
348
- avg_cf_score = 0.0
349
-
350
- intervention_key = '_'.join(intervention_data['intervention'])
351
- intervention_scores[intervention_key].append(avg_cf_score)
352
-
353
- # Average across layers for each intervention
354
- results[task][model_name] = {
355
- interv: np.mean(scores) if scores else 0.0
356
- for interv, scores in intervention_scores.items()
357
- }
358
-
359
- return EvalResult_MIB_CAUSALGRAPH(
360
- method_name=method_name,
361
- results=results
362
- )
363
-
364
- def to_dict(self, metric_type="average"):
365
- """Converts the Eval Result to a dict for dataframe display"""
366
- data_dict = {
367
- "Method": self.method_name,
368
- "Average": "-" # Initialize first to make the order consistent
369
- }
370
-
371
- # Initialize columns for all task-model combinations
372
- all_scores = []
373
- for task, task_results in self.results.items():
374
- for model, intervention_scores in task_results.items():
375
- if not intervention_scores:
376
- continue
377
-
378
- col_name = f"{task}_{model}"
379
- scores = list(intervention_scores.values())
380
- if not scores:
381
- data_dict[col_name] = '-'
382
- continue
383
-
384
- avg_score = np.mean(scores)
385
- data_dict[col_name] = f"{avg_score:.3f}"
386
- all_scores.append(avg_score)
387
-
388
- data_dict["Average"] = f"{np.mean(all_scores):.3f}"
389
- return data_dict
390
-
391
-
392
 
393
 
394
- # def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
395
- # """
396
- # Aggregates rows with the same base method name by taking the max value for each column.
397
- # Works with Method as a regular column instead of index.
398
- # """
399
- # df_copy = df.copy()
400
- # print("\nBase methods extraction:")
401
- # base_methods = [name.split('_')[0] if '_' in name and name.split('_')[-1].isdigit()
402
- # else name for name in df_copy['Method']]
403
- # print(f"Original methods: {df_copy['Method'].tolist()}")
404
- # print(f"Base methods: {base_methods}")
405
 
406
 
407
- # df_copy['base_method'] = base_methods
408
-
409
- # # Convert scores to numeric values
410
- # score_columns = [col for col in df_copy.columns if col not in ['Method', 'base_method']]
411
- # for col in score_columns:
412
- # df_copy[col] = df_copy[col].apply(lambda x: float(x) if isinstance(x, str) else x)
413
-
414
- # # Group by base method name and take the max
415
- # aggregated_df = df_copy.groupby('base_method')[score_columns].max().round(3)
416
-
417
- # # Reset index to make base_method a regular column and rename it to Method
418
- # aggregated_df = aggregated_df.reset_index()
419
- # aggregated_df = aggregated_df.rename(columns={'base_method': 'Method'})
420
-
421
- # # Convert back to string format
422
- # for col in score_columns:
423
- # aggregated_df[col] = aggregated_df[col].apply(lambda x: f"{x:.3f}")
424
-
425
- # return aggregated_df
426
  def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
427
  """
428
  Aggregates rows with the same base method name by taking the max value for each column.
@@ -444,21 +233,21 @@ def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
444
  # Convert scores to numeric values
445
  score_columns = [col for col in df_copy.columns if col not in ['Method', 'base_method']]
446
  for col in score_columns:
447
- df_copy[col] = df_copy[col].apply(lambda x: float(x) if isinstance(x, str) and not pd.isna(x) else x)
448
 
449
  # Group by base method name and take the max, handling NaN values
450
- aggregated_df = df_copy.groupby('base_method')[score_columns].agg(lambda x: np.nanmax(x)).round(2)
451
 
452
- # Convert back to string format and reset index
453
  aggregated_df = aggregated_df.reset_index()
454
  aggregated_df = aggregated_df.rename(columns={'base_method': 'Method'})
455
 
456
- # Convert numeric values back to strings with 3 decimal places
457
- for col in score_columns:
458
- aggregated_df[col] = aggregated_df[col].apply(lambda x: f"{x:.3f}" if not pd.isna(x) else x)
459
-
460
  return aggregated_df
461
 
 
 
 
 
462
  def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
463
  """
464
  Creates a DataFrame where columns are model_task and cells are averaged over interventions.
@@ -467,99 +256,264 @@ def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
467
  # Create a copy of the DataFrame
468
  df_copy = df.copy()
469
 
470
- # Remove the Average column if it exists
471
- if 'Average' in df_copy.columns:
472
- df_copy = df_copy.drop('Average', axis=1)
473
-
474
- # Get score columns (excluding Method)
475
- score_columns = [col for col in df_copy.columns if col != 'Method']
476
 
477
- # Convert all scores to numeric values
478
- for col in score_columns:
479
- df_copy[col] = df_copy[col].apply(lambda x: float(x) if isinstance(x, str) else x)
480
-
481
- # Group columns by model_task
482
- model_task_groups = {}
483
- for col in score_columns:
484
- model_task = '_'.join(col.split('_')[:2]) # Get model_task part
485
- if model_task not in model_task_groups:
486
- model_task_groups[model_task] = []
487
- model_task_groups[model_task].append(col)
488
 
489
- # Create new DataFrame with Method column and averaged intervention scores
490
  averaged_data = []
491
  for _, row in df_copy.iterrows():
492
- averaged_row = {'Method': row['Method']}
 
 
493
  for model_task, cols in model_task_groups.items():
494
- averaged_row[model_task] = np.mean([row[col] for col in cols]).round(2)
495
- averaged_row['Average'] = np.mean([averaged_row[mt] for mt in model_task_groups.keys()]).round(2)
496
- averaged_data.append(averaged_row)
 
 
 
 
 
 
 
 
 
 
 
497
 
 
498
  averaged_df = pd.DataFrame(averaged_data)
499
-
500
- # Sort by Average column
501
- averaged_df = averaged_df.sort_values('Average', ascending=False)
502
 
503
  return averaged_df
504
 
505
 
506
- def get_raw_eval_results_mib_causalgraph(results_path: str) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
507
- """From the path of the results folder root, extract all needed info for MIB causal graph results"""
508
- model_result_filepaths = []
509
-
510
- for root, dirnames, files in os.walk(results_path):
511
- if len(files) == 0 or any([not f.endswith(".json") for f in files]):
512
- continue
513
-
514
- try:
515
- files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
516
- except dateutil.parser._parser.ParserError:
517
- files = [files[-1]]
518
-
519
- for file in files:
520
- model_result_filepaths.append(os.path.join(root, file))
521
 
522
- method_counters = defaultdict(int)
523
- data_dicts = []
524
-
525
- for filepath in model_result_filepaths:
526
- with open(filepath, 'r') as f:
527
- json_data = json.load(f)
528
- method_name = json_data['method_name']
529
- method_counters[method_name] += 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
530
 
531
- eval_result = EvalResult_MIB_CAUSALGRAPH("", {})
532
- result = eval_result.init_from_json_file(filepath)
533
- data_dict = result.to_dict()
534
 
535
- # print(f"data_dict.keys(): {data_dict.keys()}")
 
 
 
 
 
 
536
 
537
- # Add method counter to the method name if it's not the first instance
538
- if method_counters[method_name] > 1:
539
- data_dict["Method"] = f"{method_name}_{method_counters[method_name]}"
540
 
541
- data_dicts.append(data_dict)
 
 
 
 
 
 
 
542
 
543
- if not data_dicts:
544
- return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
545
 
546
- # Create the detailed DataFrame
547
- detailed_df = pd.DataFrame(data_dicts)
548
- # detailed_df.set_index("Method", inplace=True)
549
- # print(f"detailed_df coluns are {detailed_df.columns.tolist()}")
550
- # if "eval_name" in detailed_df.columns:
551
- # detailed_df.drop("eval_name", axis=1, inplace=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
552
 
553
- print("Before aggregation:")
554
- print(detailed_df)
555
 
556
- # Create aggregated DataFrame
557
- aggregated_df = aggregate_methods(detailed_df)
558
 
559
- # Create intervention-averaged DataFrame
560
- intervention_averaged_df = create_intervention_averaged_df(aggregated_df)
561
 
562
- return detailed_df, aggregated_df, intervention_averaged_df
563
 
564
 
565
 
 
2
  import json
3
  import math
4
  import os
5
+ import re
6
+ import ast
7
  from dataclasses import dataclass
8
+ from datetime import datetime
9
+ from typing import List, Dict, Any, Tuple
10
+ from collections import defaultdict
11
 
12
  import dateutil
13
  import numpy as np
14
+ import pandas as pd
15
 
16
  from src.display.formatting import make_clickable_model
17
+ from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal, AutoEvalColumn_mib_causalgraph
18
  from src.submission.check_validity import is_model_on_hub
19
+ from src.about import TasksMib_Subgraph, TasksMib_Causalgraph
20
 
 
 
 
21
 
22
 
23
 
 
208
 
209
 
210
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
 
212
 
 
 
 
 
 
 
 
 
 
 
 
213
 
214
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
  def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
216
  """
217
  Aggregates rows with the same base method name by taking the max value for each column.
 
233
  # Convert scores to numeric values
234
  score_columns = [col for col in df_copy.columns if col not in ['Method', 'base_method']]
235
  for col in score_columns:
236
+ df_copy[col] = pd.to_numeric(df_copy[col], errors='coerce')
237
 
238
  # Group by base method name and take the max, handling NaN values
239
+ aggregated_df = df_copy.groupby('base_method')[score_columns].agg(lambda x: np.nanmax(x)).round(3)
240
 
241
+ # Reset index to make base_method a regular column and rename it to Method
242
  aggregated_df = aggregated_df.reset_index()
243
  aggregated_df = aggregated_df.rename(columns={'base_method': 'Method'})
244
 
 
 
 
 
245
  return aggregated_df
246
 
247
+
248
+
249
+
250
+
251
  def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
252
  """
253
  Creates a DataFrame where columns are model_task and cells are averaged over interventions.
 
256
  # Create a copy of the DataFrame
257
  df_copy = df.copy()
258
 
259
+ # Get all columns except Method and Average
260
+ columns_to_process = [col for col in df_copy.columns if col not in ['Method', 'Average']]
 
 
 
 
261
 
262
+ # Extract model and task information from column names
263
+ model_task_groups = defaultdict(list)
264
+ for col in columns_to_process:
265
+ # Split by underscore and extract model, task
266
+ parts = col.split('_')
267
+ if len(parts) >= 2:
268
+ model_task = f"{parts[0]}_{parts[1]}"
269
+ model_task_groups[model_task].append(col)
 
 
 
270
 
271
+ # Create new DataFrame with Method and averaged columns
272
  averaged_data = []
273
  for _, row in df_copy.iterrows():
274
+ new_row = {'Method': row['Method']}
275
+
276
+ # Calculate average for each model_task group
277
  for model_task, cols in model_task_groups.items():
278
+ values = [row[col] for col in cols if pd.notna(row[col])]
279
+ if values:
280
+ new_row[model_task] = round(np.mean(values), 3)
281
+ else:
282
+ new_row[model_task] = np.nan
283
+
284
+ # Calculate overall average
285
+ model_task_values = [v for k, v in new_row.items() if k != 'Method' and pd.notna(v)]
286
+ if model_task_values:
287
+ new_row['Average'] = round(np.mean(model_task_values), 3)
288
+ else:
289
+ new_row['Average'] = np.nan
290
+
291
+ averaged_data.append(new_row)
292
 
293
+ # Create DataFrame and sort by Average
294
  averaged_df = pd.DataFrame(averaged_data)
295
+ if 'Average' in averaged_df.columns:
296
+ averaged_df = averaged_df.sort_values('Average', ascending=False)
 
297
 
298
  return averaged_df
299
 
300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
301
 
302
@dataclass
class EvalResult_MIB_CAUSALGRAPH:
    """One causal-variable-localization result for a single
    (method, model, task, target variable) combination.

    Instances are normally built in bulk from the consolidated JSON format via
    :meth:`init_from_consolidated_json`, and turned into a one-column row for
    DataFrame display via :meth:`to_dict`.
    """
    eval_name: str  # identifier built as "<method>_<model>_<task>_<variable>"
    method_name: str  # name of the interpretation method
    model_name: str  # name of the model
    task_name: str  # name of the task
    target_variables: str  # target variable (e.g., "answer", "answer_pointer")
    average_accuracy: float  # average accuracy score
    highest_accuracy: float  # highest accuracy score

    @staticmethod
    def _parse_key(key):
        """Split a consolidated-JSON key into its four components.

        The key is expected to look like the repr of a 4-tuple:
        "('method', 'model', 'task', 'variable')".

        Returns:
            A 4-tuple (method_name, model_name, task_name, target_variable),
            or None when the key matches neither the literal nor the regex form.
        """
        try:
            # literal_eval only evaluates Python literals, so it is safe on
            # untrusted strings; the unpacking enforces exactly four parts.
            method_name, model_name, task_name, target_variable = ast.literal_eval(key)
            return method_name, model_name, task_name, target_variable
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a clean 4-element literal: fall through to the regex form.
            # Only the errors literal_eval/unpacking can raise are caught, so
            # KeyboardInterrupt/SystemExit are no longer swallowed here.
            pass

        pattern = r"\('([^']+)', '([^']+)', '([^']+)', '([^']+)'\)"
        match = re.match(pattern, key)
        if match:
            return match.groups()
        return None

    @staticmethod
    def init_from_consolidated_json(json_data: Dict) -> List["EvalResult_MIB_CAUSALGRAPH"]:
        """
        Initialize results from the consolidated JSON format, treating each entry as a separate result

        Args:
            json_data: The parsed JSON data with tuple keys; each value is a
                mapping that may carry "average_accuracy" and
                "highest_accuracy" (both default to 0.0 when absent)

        Returns:
            List of EvalResult_MIB_CAUSALGRAPH objects. Entries whose key
            cannot be parsed, or whose processing raises, are reported on
            stdout and skipped.
        """
        results = []

        for key, entry in json_data.items():
            try:
                parsed = EvalResult_MIB_CAUSALGRAPH._parse_key(key)
                if parsed is None:
                    print(f"Couldn't parse key: {key}")
                    continue
                method_name, model_name, task_name, target_variable = parsed

                # Get average and highest accuracy
                average_accuracy = entry.get("average_accuracy", 0.0)
                highest_accuracy = entry.get("highest_accuracy", 0.0)

                # Create a result object for this entry
                results.append(
                    EvalResult_MIB_CAUSALGRAPH(
                        eval_name=f"{method_name}_{model_name}_{task_name}_{target_variable}",
                        method_name=method_name,
                        model_name=model_name,
                        task_name=task_name,
                        target_variables=target_variable,
                        average_accuracy=average_accuracy,
                        highest_accuracy=highest_accuracy,
                    )
                )
            except Exception as e:
                # Best-effort load: one malformed entry must not abort the rest.
                print(f"Error processing entry {key}: {e}")
                continue

        return results

    def to_dict(self, metric_type="Highest"):
        """
        Converts the Eval Result to a dict for dataframe display

        Args:
            metric_type: "Mean" selects average_accuracy; any other value
                (including the default "Highest") selects highest_accuracy

        Returns:
            Dict with "eval_name", "Method", and a single score column named
            "<model_name>_<task_name>_<target_variables>".
        """
        col_name = f"{self.model_name}_{self.task_name}_{self.target_variables}"

        # Select the appropriate accuracy metric based on metric_type
        score = self.average_accuracy if metric_type == "Mean" else self.highest_accuracy

        return {
            "eval_name": self.eval_name,
            "Method": self.method_name,
            col_name: score,
        }
388
 
 
 
389
 
390
def _empty_causalgraph_frames() -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Return the empty result used on every failure path (same arity as success)."""
    return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()


def _load_consolidated_json(results_path: str):
    """Return the first consolidated-format JSON payload under results_path, or None."""
    json_files = [
        os.path.join(root, file)
        for root, _, files in os.walk(results_path)
        for file in files
        if file.endswith('.json')
    ]

    if not json_files:
        print(f"No JSON files found in {results_path}")
        return None

    for json_file in json_files:
        # Best-effort scan: an unreadable or malformed file is reported and
        # skipped so that one bad file does not hide a valid one.
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # The consolidated format is recognised by its keys, which are
            # stringified tuples and therefore contain parentheses.
            sample_key = next(iter(data), None)
            if sample_key and isinstance(sample_key, str) and '(' in sample_key and ')' in sample_key:
                print(f"Found consolidated data file: {json_file}")
                return data
        except Exception as e:
            print(f"Error reading {json_file}: {e}")

    print("No valid consolidated JSON file found")
    return None


def _build_method_frame(result_dicts) -> pd.DataFrame:
    """Merge per-entry result dicts into one row per method and return them as a DataFrame."""
    method_groups = {}
    for result_dict in result_dicts:
        method = result_dict["Method"]
        group = method_groups.setdefault(method, {"eval_name": method, "Method": method})

        # Copy every score column onto the method's row; a repeated column
        # name for the same method is overwritten by the later entry.
        for key, value in result_dict.items():
            if key not in ("eval_name", "Method"):
                group[key] = value

    return pd.DataFrame(list(method_groups.values()))


def _add_average_and_sort(frame: pd.DataFrame) -> pd.DataFrame:
    """Add a rounded row-wise "Average" over the score columns and sort by it, descending."""
    score_columns = [col for col in frame.columns if col not in ("eval_name", "Method")]
    if not score_columns:
        return frame

    frame["Average"] = frame[score_columns].mean(axis=1).round(3)
    return frame.sort_values("Average", ascending=False)


def get_raw_eval_results_mib_causalgraph(results_path: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Processes the consolidated JSON format for causal variable localization results
    Treats each entry as a separate result and then combines them by method

    Args:
        results_path: Path to the directory containing results

    Returns:
        Tuple of three DataFrames (always three, on success and on failure):
            - detailed view with highest accuracy scores
            - the same highest-accuracy DataFrame again
            - detailed view with mean accuracy scores
        All three are empty when no consolidated JSON file is found or no
        results can be extracted from it.
    """
    raw_data = _load_consolidated_json(results_path)
    if raw_data is None:
        return _empty_causalgraph_frames()

    # Get all results
    eval_results = EvalResult_MIB_CAUSALGRAPH.init_from_consolidated_json(raw_data)

    if not eval_results:
        print("No results could be extracted from the JSON data")
        return _empty_causalgraph_frames()

    # One set of dictionaries per metric: highest accuracy and mean accuracy
    highest_results = [result.to_dict(metric_type="Highest") for result in eval_results]
    mean_results = [result.to_dict(metric_type="Mean") for result in eval_results]

    detailed_df_highest = _build_method_frame(highest_results)
    detailed_df_mean = _build_method_frame(mean_results)

    if detailed_df_highest.empty or detailed_df_mean.empty:
        return _empty_causalgraph_frames()

    detailed_df_highest = _add_average_and_sort(detailed_df_highest)
    detailed_df_mean = _add_average_and_sort(detailed_df_mean)

    # NOTE(review): the highest-accuracy frame is intentionally returned in
    # the first two positions, as the original success path did; confirm
    # against callers before changing what the second slot holds.
    return detailed_df_highest, detailed_df_highest, detailed_df_mean
513
 
 
 
514
 
 
 
515
 
 
 
516
 
 
517
 
518
 
519