Aaron Mueller committed on
Commit
1d8e193
·
1 Parent(s): aaed88c

updated filtering, add F= tab

Browse files
Files changed (42) hide show
  1. .gradio/certificate.pem +31 -0
  2. app.py +144 -42
  3. app.pyi +867 -0
  4. custom-select-columns.py +1 -1
  5. eval-queue/.gitattributes +59 -0
  6. eval-queue/README.md +3 -0
  7. eval-results-mib-causalgraph/.gitattributes +59 -0
  8. eval-results-mib-causalgraph/README.md +0 -0
  9. eval-results-mib-causalgraph/submissions/MCQA_results_Qwen_correct_choice_period_token.json +1332 -0
  10. eval-results-mib-causalgraph/submissions/MCQA_results_Qwen_last_correct_choice_token.json +1332 -0
  11. eval-results-mib-causalgraph/submissions/MCQA_results_Qwen_last_token.json +1332 -0
  12. eval-results-mib-causalgraph/submissions/MCQA_results_Qwen_second_to_last_token.json +1332 -0
  13. eval-results-mib-causalgraph/submissions/MCQA_results_google_correct_choice_period_token.json +1442 -0
  14. eval-results-mib-causalgraph/submissions/MCQA_results_google_correct_choice_token.json +1442 -0
  15. eval-results-mib-causalgraph/submissions/MCQA_results_google_last_token.json +1442 -0
  16. eval-results-mib-causalgraph/submissions/MCQA_results_google_second_to_last_token.json +1442 -0
  17. eval-results-mib-causalgraph/submissions/MCQA_results_meta-llama_correct_choice_period_token.json +1772 -0
  18. eval-results-mib-causalgraph/submissions/MCQA_results_meta-llama_correct_choice_token.json +1772 -0
  19. eval-results-mib-causalgraph/submissions/MCQA_results_meta-llama_last_token.json +1772 -0
  20. eval-results-mib-causalgraph/submissions/MCQA_results_meta-llama_second_to_last_token.json +1772 -0
  21. eval-results-mib-subgraph/.gitattributes +59 -0
  22. eval-results-mib-subgraph/README.md +3 -0
  23. eval-results-mib-subgraph/baselines/EAP-IG-activations_CF.json +337 -0
  24. eval-results-mib-subgraph/baselines/EAP-IG-inputs_CF.json +311 -0
  25. eval-results-mib-subgraph/baselines/EAP_CF.json +285 -0
  26. eval-results-mib-subgraph/baselines/EAP_OA.json +93 -0
  27. eval-results-mib-subgraph/baselines/EAP_mean.json +311 -0
  28. eval-results-mib-subgraph/baselines/IFR.json +285 -0
  29. eval-results-mib-subgraph/baselines/NAP-IG-inputs_CF.json +337 -0
  30. eval-results-mib-subgraph/baselines/NAP_CF.json +337 -0
  31. eval-results-mib-subgraph/submissions/results_2024-10-2T13-36-121.json +19 -0
  32. src/__pycache__/about.cpython-310.pyc +0 -0
  33. src/__pycache__/envs.cpython-310.pyc +0 -0
  34. src/__pycache__/populate.cpython-310.pyc +0 -0
  35. src/display/__pycache__/css_html_js.cpython-310.pyc +0 -0
  36. src/display/__pycache__/formatting.cpython-310.pyc +0 -0
  37. src/display/__pycache__/utils.cpython-310.pyc +0 -0
  38. src/leaderboard/__pycache__/read_evals.cpython-310.pyc +0 -0
  39. src/leaderboard/read_evals.py +3 -3
  40. src/populate.py +3 -2
  41. src/submission/__pycache__/check_validity.cpython-310.pyc +0 -0
  42. src/submission/__pycache__/submit.cpython-310.pyc +0 -0
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
app.py CHANGED
@@ -6,6 +6,9 @@ import pandas as pd
6
  from apscheduler.schedulers.background import BackgroundScheduler
7
  from huggingface_hub import snapshot_download
8
  from io import StringIO
 
 
 
9
 
10
  from src.about import (
11
  CITATION_BUTTON_LABEL,
@@ -39,20 +42,6 @@ from src.submission.submit import add_new_eval
39
 
40
  from src.about import TasksMib_Subgraph
41
 
42
-
43
-
44
-
45
-
46
-
47
-
48
-
49
-
50
-
51
- from gradio_leaderboard import SelectColumns, Leaderboard
52
- import pandas as pd
53
- from typing import List, Dict, Union, Optional, Any
54
- from dataclasses import fields
55
-
56
  # class SmartSelectColumns(SelectColumns):
57
  # """
58
  # Enhanced SelectColumns component with basic filtering functionality.
@@ -130,9 +119,6 @@ from dataclasses import fields
130
 
131
  # return super().update(value)
132
 
133
-
134
-
135
-
136
  from gradio_leaderboard import SelectColumns, Leaderboard
137
  import pandas as pd
138
  from typing import List, Dict, Optional
@@ -194,10 +180,60 @@ class SmartSelectColumns(SelectColumns):
194
  return filtered_groups
195
 
196
 
197
-
198
-
199
-
200
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
 
202
 
203
 
@@ -237,7 +273,9 @@ except Exception:
237
 
238
 
239
 
240
- LEADERBOARD_DF_MIB_SUBGRAPH = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_SUBGRAPH, BENCHMARK_COLS_MIB_SUBGRAPH)
 
 
241
 
242
  # LEADERBOARD_DF_MIB_CAUSALGRAPH = get_leaderboard_df_mib_causalgraph(EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_CAUSALGRAPH, BENCHMARK_COLS_MIB_CAUSALGRAPH)
243
  # In app.py, modify the LEADERBOARD initialization
@@ -364,23 +402,23 @@ def init_leaderboard_mib_subgraph(dataframe, track):
364
  "qwen2_5": "Qwen-2.5",
365
  "gpt2": "GPT-2",
366
  "gemma2": "Gemma-2",
367
- "llama3": "LLaMA-3"
368
  }
369
 
370
  benchmark_mapping = {
371
  "ioi": "IOI",
372
  "mcqa": "MCQA",
373
- "arithmetic_addition": "Arithmetic Addition",
374
- "arithmetic_subtraction": "Arithmetic Subtraction",
375
- "arc_easy": "ARC Easy",
376
- "arc_challenge": "ARC Challenge"
377
  }
378
 
379
  display_mapping = {}
380
  for task in TasksMib_Subgraph:
381
  for model in task.value.models:
382
  field_name = f"{task.value.benchmark}_{model}"
383
- display_name = f"{benchmark_mapping[task.value.benchmark]}( {model_name_mapping[model]} )"
384
  display_mapping[field_name] = display_name
385
 
386
 
@@ -423,18 +461,19 @@ def init_leaderboard_mib_subgraph(dataframe, track):
423
  # all_columns = [c.name for c in fields(AutoEvalColumn_mib_subgraph)]
424
  all_columns = renamed_df.columns.tolist()
425
 
 
426
  # Original code
427
  return Leaderboard(
428
  value=renamed_df, # Use DataFrame with display names
429
  datatype=[c.type for c in fields(AutoEvalColumn_mib_subgraph)],
430
- select_columns=SelectColumns(
431
- default_selection=all_columns, # Now contains display names
432
- label="Select Results:"
433
- ),
434
  search_columns=["Method"],
435
- hide_columns=[],
436
  interactive=False,
437
- )
438
 
439
 
440
 
@@ -685,6 +724,32 @@ def process_json(temp_file):
685
  return data
686
 
687
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
688
  demo = gr.Blocks(css=custom_css)
689
  with demo:
690
  gr.HTML(TITLE)
@@ -709,14 +774,51 @@ with demo:
709
  # with gr.TabItem("Subgraph", elem_id="subgraph", id=0):
710
  # leaderboard = init_leaderboard_mib_subgraph(LEADERBOARD_DF_MIB_SUBGRAPH, "Subgraph")
711
  with gr.TabItem("Subgraph", elem_id="subgraph", id=0):
712
- # Add description for filters
713
- gr.Markdown("""
714
- ### Filtering Options
715
- Use the dropdown menus below to filter results by specific tasks or models.
716
- You can combine filters to see specific task-model combinations.
717
- """)
718
- leaderboard = init_leaderboard_mib_subgraph(LEADERBOARD_DF_MIB_SUBGRAPH, "Subgraph")
719
- print(f"Leaderboard is {leaderboard}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
720
 
721
  # Then modify the Causal Graph tab section
722
  with gr.TabItem("Causal Graph", elem_id="causalgraph", id=1):
 
6
  from apscheduler.schedulers.background import BackgroundScheduler
7
  from huggingface_hub import snapshot_download
8
  from io import StringIO
9
+ from typing import Dict, List, Optional
10
+ from dataclasses import dataclass, field
11
+ from copy import deepcopy
12
 
13
  from src.about import (
14
  CITATION_BUTTON_LABEL,
 
42
 
43
  from src.about import TasksMib_Subgraph
44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  # class SmartSelectColumns(SelectColumns):
46
  # """
47
  # Enhanced SelectColumns component with basic filtering functionality.
 
119
 
120
  # return super().update(value)
121
 
 
 
 
122
  from gradio_leaderboard import SelectColumns, Leaderboard
123
  import pandas as pd
124
  from typing import List, Dict, Optional
 
180
  return filtered_groups
181
 
182
 
183
+ import re
184
+ @dataclass
185
+ class SubstringSelectColumns(SelectColumns):
186
+ """
187
+ Extends SelectColumns to support filtering columns by predefined substrings.
188
+ When a substring is selected, all columns containing that substring will be selected.
189
+ """
190
+ substring_groups: Dict[str, List[str]] = field(default_factory=dict)
191
+ selected_substrings: List[str] = field(default_factory=list)
192
+
193
+ def __post_init__(self):
194
+ # Ensure default_selection is a list
195
+ if self.default_selection is None:
196
+ self.default_selection = []
197
+
198
+ # Build reverse mapping of column to substrings
199
+ self.column_to_substrings = {}
200
+ for substring, patterns in self.substring_groups.items():
201
+ for pattern in patterns:
202
+ # Convert glob-style patterns to regex
203
+ regex = re.compile(pattern.replace('*', '.*'))
204
+ # Find matching columns in default_selection
205
+ for col in self.default_selection:
206
+ if regex.search(col):
207
+ if col not in self.column_to_substrings:
208
+ self.column_to_substrings[col] = []
209
+ self.column_to_substrings[col].append(substring)
210
+
211
+ # Apply initial substring selections
212
+ if self.selected_substrings:
213
+ self.update_selection_from_substrings()
214
+
215
+ def update_selection_from_substrings(self) -> List[str]:
216
+ """
217
+ Updates the column selection based on selected substrings.
218
+ Returns the new list of selected columns.
219
+ """
220
+ selected_columns = self.cant_deselect.copy()
221
+
222
+ # If no substrings selected, show all columns
223
+ if not self.selected_substrings:
224
+ selected_columns.extend([
225
+ col for col in self.default_selection
226
+ if col not in self.cant_deselect
227
+ ])
228
+ return selected_columns
229
+
230
+ # Add columns that match any selected substring
231
+ for col, substrings in self.column_to_substrings.items():
232
+ if any(s in self.selected_substrings for s in substrings):
233
+ if col not in selected_columns:
234
+ selected_columns.append(col)
235
+
236
+ return selected_columns
237
 
238
 
239
 
 
273
 
274
 
275
 
276
+ LEADERBOARD_DF_MIB_SUBGRAPH_FPL = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_SUBGRAPH, BENCHMARK_COLS_MIB_SUBGRAPH)
277
+ LEADERBOARD_DF_MIB_SUBGRAPH_FEQ = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_SUBGRAPH, BENCHMARK_COLS_MIB_SUBGRAPH,
278
+ metric_type="F=")
279
 
280
  # LEADERBOARD_DF_MIB_CAUSALGRAPH = get_leaderboard_df_mib_causalgraph(EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_CAUSALGRAPH, BENCHMARK_COLS_MIB_CAUSALGRAPH)
281
  # In app.py, modify the LEADERBOARD initialization
 
402
  "qwen2_5": "Qwen-2.5",
403
  "gpt2": "GPT-2",
404
  "gemma2": "Gemma-2",
405
+ "llama3": "Llama-3.1"
406
  }
407
 
408
  benchmark_mapping = {
409
  "ioi": "IOI",
410
  "mcqa": "MCQA",
411
+ "arithmetic_addition": "Arithmetic (+)",
412
+ "arithmetic_subtraction": "Arithmetic (-)",
413
+ "arc_easy": "ARC (Easy)",
414
+ "arc_challenge": "ARC (Challenge)"
415
  }
416
 
417
  display_mapping = {}
418
  for task in TasksMib_Subgraph:
419
  for model in task.value.models:
420
  field_name = f"{task.value.benchmark}_{model}"
421
+ display_name = f"{benchmark_mapping[task.value.benchmark]} - {model_name_mapping[model]}"
422
  display_mapping[field_name] = display_name
423
 
424
 
 
461
  # all_columns = [c.name for c in fields(AutoEvalColumn_mib_subgraph)]
462
  all_columns = renamed_df.columns.tolist()
463
 
464
+
465
  # Original code
466
  return Leaderboard(
467
  value=renamed_df, # Use DataFrame with display names
468
  datatype=[c.type for c in fields(AutoEvalColumn_mib_subgraph)],
469
+ # select_columns=SelectColumns(
470
+ # default_selection=all_columns, # Now contains display names
471
+ # label="Filter Results:",
472
+ # ),
473
  search_columns=["Method"],
474
+ hide_columns=["eval_name"],
475
  interactive=False,
476
+ ), renamed_df
477
 
478
 
479
 
 
724
  return data
725
 
726
 
727
+ # Define the preset substrings for filtering
728
+ PRESET_SUBSTRINGS = ["IOI", "MCQA", "Arithmetic", "ARC", "GPT-2", "Qwen-2.5", "Gemma-2", "Llama-3.1"]
729
+
730
+ def filter_columns_by_substrings(dataframe: pd.DataFrame, selected_substrings: List[str]) -> pd.DataFrame:
731
+ """
732
+ Filter columns based on the selected substrings.
733
+ """
734
+ original_dataframe = deepcopy(dataframe)
735
+ if not selected_substrings:
736
+ return dataframe # No filtering if no substrings are selected
737
+
738
+ # Filter columns that contain any of the selected substrings
739
+ filtered_columns = [
740
+ col for col in dataframe.columns
741
+ if any(sub.lower() in col.lower() for sub in selected_substrings)
742
+ or col == "Method"
743
+ ]
744
+ return dataframe[filtered_columns]
745
+
746
+ def update_leaderboard(dataframe: pd.DataFrame, selected_substrings: List[str]):
747
+ """
748
+ Update the leaderboard based on the selected substrings.
749
+ """
750
+ filtered_dataframe = filter_columns_by_substrings(dataframe, selected_substrings)
751
+ return filtered_dataframe
752
+
753
  demo = gr.Blocks(css=custom_css)
754
  with demo:
755
  gr.HTML(TITLE)
 
774
  # with gr.TabItem("Subgraph", elem_id="subgraph", id=0):
775
  # leaderboard = init_leaderboard_mib_subgraph(LEADERBOARD_DF_MIB_SUBGRAPH, "Subgraph")
776
  with gr.TabItem("Subgraph", elem_id="subgraph", id=0):
777
+ with gr.Tabs() as subgraph_tabs:
778
+ with gr.TabItem("F+", id=0):
779
+ # Add description for filters
780
+ gr.Markdown("""
781
+ ### Filtering Options
782
+ Use the dropdown menus below to filter results by specific tasks or models.
783
+ You can combine filters to see specific task-model combinations.
784
+ """)
785
+ # CheckboxGroup for selecting substrings
786
+ substring_checkbox = gr.CheckboxGroup(
787
+ choices=PRESET_SUBSTRINGS,
788
+ label="Filter results:",
789
+ value=PRESET_SUBSTRINGS, # Default to all substrings selected
790
+ )
791
+ leaderboard, data = init_leaderboard_mib_subgraph(LEADERBOARD_DF_MIB_SUBGRAPH_FPL, "Subgraph")
792
+ original_leaderboard = gr.State(value=data)
793
+ # Update the leaderboard when the user selects/deselects substrings
794
+ substring_checkbox.change(
795
+ fn=update_leaderboard,
796
+ inputs=[original_leaderboard, substring_checkbox],
797
+ outputs=leaderboard
798
+ )
799
+ print(f"Leaderboard is {leaderboard}")
800
+ with gr.TabItem("F=", id=1):
801
+ # Add description for filters
802
+ gr.Markdown("""
803
+ ### Filtering Options
804
+ Use the dropdown menus below to filter results by specific tasks or models.
805
+ You can combine filters to see specific task-model combinations.
806
+ """)
807
+ # CheckboxGroup for selecting substrings
808
+ substring_checkbox = gr.CheckboxGroup(
809
+ choices=PRESET_SUBSTRINGS,
810
+ label="Filter results:",
811
+ value=PRESET_SUBSTRINGS, # Default to all substrings selected
812
+ )
813
+ leaderboard, data = init_leaderboard_mib_subgraph(LEADERBOARD_DF_MIB_SUBGRAPH_FEQ, "Subgraph")
814
+ original_leaderboard = gr.State(value=data)
815
+ # Update the leaderboard when the user selects/deselects substrings
816
+ substring_checkbox.change(
817
+ fn=update_leaderboard,
818
+ inputs=[original_leaderboard, substring_checkbox],
819
+ outputs=leaderboard
820
+ )
821
+ print(f"Leaderboard is {leaderboard}")
822
 
823
  # Then modify the Causal Graph tab section
824
  with gr.TabItem("Causal Graph", elem_id="causalgraph", id=1):
app.pyi ADDED
@@ -0,0 +1,867 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import gzip
3
+ import gradio as gr
4
+ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
5
+ import pandas as pd
6
+ from apscheduler.schedulers.background import BackgroundScheduler
7
+ from huggingface_hub import snapshot_download
8
+ from io import StringIO
9
+ from dataclasses import dataclass, field
10
+
11
+ from src.about import (
12
+ CITATION_BUTTON_LABEL,
13
+ CITATION_BUTTON_TEXT,
14
+ EVALUATION_QUEUE_TEXT,
15
+ INTRODUCTION_TEXT,
16
+ LLM_BENCHMARKS_TEXT,
17
+ TITLE,
18
+ )
19
+ from src.display.css_html_js import custom_css
20
+ from src.display.utils import (
21
+ BENCHMARK_COLS,
22
+ BENCHMARK_COLS_MULTIMODAL,
23
+ BENCHMARK_COLS_MIB_SUBGRAPH,
24
+ BENCHMARK_COLS_MIB_CAUSALGRAPH,
25
+ COLS,
26
+ COLS_MIB_SUBGRAPH,
27
+ COLS_MIB_CAUSALGRAPH,
28
+ COLS_MULTIMODAL,
29
+ EVAL_COLS,
30
+ EVAL_TYPES,
31
+ AutoEvalColumn,
32
+ AutoEvalColumn_mib_subgraph,
33
+ AutoEvalColumn_mib_causalgraph,
34
+ fields,
35
+ )
36
+ from src.envs import API, EVAL_REQUESTS_PATH, QUEUE_REPO, REPO_ID, TOKEN, RESULTS_REPO_MIB_SUBGRAPH, EVAL_RESULTS_MIB_SUBGRAPH_PATH, RESULTS_REPO_MIB_CAUSALGRAPH, EVAL_RESULTS_MIB_CAUSALGRAPH_PATH
37
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df, get_leaderboard_df_mib_subgraph, get_leaderboard_df_mib_causalgraph
38
+ from src.submission.submit import add_new_eval
39
+
40
+
41
+ from src.about import TasksMib_Subgraph
42
+
43
+ # class SmartSelectColumns(SelectColumns):
44
+ # """
45
+ # Enhanced SelectColumns component with basic filtering functionality.
46
+ # """
47
+ # def __init__(
48
+ # self,
49
+ # benchmark_keywords: Optional[List[str]] = None,
50
+ # model_keywords: Optional[List[str]] = None,
51
+ # initial_selected: Optional[List[str]] = None,
52
+ # **kwargs
53
+ # ):
54
+ # """
55
+ # Initialize SmartSelectColumns with minimal configuration.
56
+
57
+ # Args:
58
+ # benchmark_keywords: List of benchmark names to filter by
59
+ # model_keywords: List of model names to filter by
60
+ # initial_selected: List of columns to show initially
61
+ # """
62
+ # super().__init__(**kwargs)
63
+ # self.benchmark_keywords = benchmark_keywords or []
64
+ # self.model_keywords = model_keywords or []
65
+ # self.initial_selected = initial_selected or []
66
+
67
+ # def get_filtered_groups(self, df: pd.DataFrame) -> Dict[str, List[str]]:
68
+ # """
69
+ # Create column groups based on simple substring matching.
70
+ # """
71
+ # filtered_groups = {}
72
+
73
+ # # Create benchmark groups
74
+ # for benchmark in self.benchmark_keywords:
75
+ # matching_cols = [
76
+ # col for col in df.columns
77
+ # if benchmark in col.lower()
78
+ # ]
79
+ # if matching_cols:
80
+ # group_name = f"Benchmark group for {benchmark}"
81
+ # filtered_groups[group_name] = matching_cols
82
+
83
+ # # Create model groups
84
+ # for model in self.model_keywords:
85
+ # matching_cols = [
86
+ # col for col in df.columns
87
+ # if model in col.lower()
88
+ # ]
89
+ # if matching_cols:
90
+ # group_name = f"Model group for {model}"
91
+ # filtered_groups[group_name] = matching_cols
92
+
93
+ # return filtered_groups
94
+
95
+ # def update(
96
+ # self,
97
+ # value: Union[pd.DataFrame, Dict[str, List[str]], Any]
98
+ # ) -> Dict:
99
+ # """Update component with new values."""
100
+ # if isinstance(value, pd.DataFrame):
101
+ # choices = list(value.columns)
102
+ # selected = self.initial_selected if self.initial_selected else choices
103
+ # filtered_cols = self.get_filtered_groups(value)
104
+
105
+ # return {
106
+ # "choices": choices,
107
+ # "value": selected,
108
+ # "filtered_cols": filtered_cols
109
+ # }
110
+
111
+ # if hasattr(value, '__dataclass_fields__'):
112
+ # field_names = [field.name for field in fields(value)]
113
+ # return {
114
+ # "choices": field_names,
115
+ # "value": self.initial_selected if self.initial_selected else field_names
116
+ # }
117
+
118
+ # return super().update(value)
119
+
120
+ from gradio.events import Dependency
121
+
122
+ class ModifiedLeaderboard(Leaderboard):
123
+ """Extends Leaderboard to support substring-based column filtering"""
124
+
125
+ def __init__(self, *args, **kwargs):
126
+ super().__init__(*args, **kwargs)
127
+
128
+ # Process substring groups if they exist
129
+ if (isinstance(self.select_columns_config, SelectColumns) and
130
+ self.select_columns_config.substring_groups):
131
+
132
+ self.process_substring_groups()
133
+
134
+ def process_substring_groups(self):
135
+ """Processes substring groups to add them to the selectable columns"""
136
+ groups = self.select_columns_config.substring_groups
137
+ if not groups:
138
+ return
139
+
140
+ # Create a mapping of group name to matching columns
141
+ group_to_columns = {}
142
+ for group_name, patterns in groups.groups.items():
143
+ matching_cols = set()
144
+ for pattern in patterns:
145
+ regex = re.compile(pattern.replace('*', '.*'))
146
+ matching_cols.update(
147
+ col for col in self.headers
148
+ if regex.search(col)
149
+ )
150
+ if matching_cols:
151
+ group_to_columns[group_name] = list(matching_cols)
152
+
153
+ # Add groups to the headers and update column selection logic
154
+ self.group_to_columns = group_to_columns
155
+ self.original_headers = self.headers.copy()
156
+
157
+ # Add group names to the start of headers
158
+ self.headers = list(group_to_columns.keys()) + self.original_headers
159
+
160
+ # Update default selection to include groups
161
+ if self.select_columns_config.default_selection:
162
+ self.select_columns_config.default_selection = (
163
+ list(group_to_columns.keys()) +
164
+ self.select_columns_config.default_selection
165
+ )
166
+
167
+ def preprocess(self, payload):
168
+ """Override preprocess to handle group selection"""
169
+ df = super().preprocess(payload)
170
+
171
+ # If we don't have substring groups, return normally
172
+ if not hasattr(self, 'group_to_columns'):
173
+ return df
174
+
175
+ # Process group selections
176
+ selected_columns = set()
177
+ for column in payload.headers:
178
+ if column in self.group_to_columns:
179
+ # If a group is selected, add all its columns
180
+ selected_columns.update(self.group_to_columns[column])
181
+ elif column in self.original_headers:
182
+ # Add individually selected columns
183
+ selected_columns.add(column)
184
+
185
+ # Return DataFrame with only selected columns
186
+ return df[list(selected_columns)]
187
+ from typing import Callable, Literal, Sequence, Any, TYPE_CHECKING
188
+ from gradio.blocks import Block
189
+ if TYPE_CHECKING:
190
+ from gradio.components import Timer
191
+
192
+ from gradio_leaderboard import SelectColumns, Leaderboard
193
+ import pandas as pd
194
+ from typing import List, Dict, Optional
195
+ from dataclasses import fields
196
+
197
+ class SmartSelectColumns(SelectColumns):
198
+ """
199
+ Enhanced SelectColumns component matching exact original parameters.
200
+ """
201
+ def __init__(
202
+ self,
203
+ benchmark_keywords: Optional[List[str]] = None,
204
+ model_keywords: Optional[List[str]] = None,
205
+ initial_selected: Optional[List[str]] = None,
206
+ label: Optional[str] = None,
207
+ show_label: bool = True,
208
+ info: Optional[str] = None,
209
+ allow: bool = True
210
+ ):
211
+ # Match exact parameters from working SelectColumns
212
+ super().__init__(
213
+ default_selection=initial_selected or [],
214
+ cant_deselect=[],
215
+ allow=allow,
216
+ label=label,
217
+ show_label=show_label,
218
+ info=info
219
+ )
220
+
221
+ self.benchmark_keywords = benchmark_keywords or []
222
+ self.model_keywords = model_keywords or []
223
+
224
+ # Store groups for later use
225
+ self._groups = {}
226
+
227
+ def get_filtered_groups(self, columns: List[str]) -> Dict[str, List[str]]:
228
+ """Get column groups based on keywords."""
229
+ filtered_groups = {}
230
+
231
+ # Add benchmark groups
232
+ for benchmark in self.benchmark_keywords:
233
+ matching_cols = [
234
+ col for col in columns
235
+ if benchmark in col.lower()
236
+ ]
237
+ if matching_cols:
238
+ filtered_groups[f"Benchmark group for {benchmark}"] = matching_cols
239
+
240
+ # Add model groups
241
+ for model in self.model_keywords:
242
+ matching_cols = [
243
+ col for col in columns
244
+ if model in col.lower()
245
+ ]
246
+ if matching_cols:
247
+ filtered_groups[f"Model group for {model}"] = matching_cols
248
+
249
+ self._groups = filtered_groups
250
+ return filtered_groups
251
+
252
+
253
+ import re
254
+ @dataclass
255
+ class SubstringSelectColumns(SelectColumns):
256
+ """
257
+ Extends SelectColumns to support filtering columns by predefined substrings.
258
+ When a substring is selected, all columns containing that substring will be selected.
259
+ """
260
+ substring_groups: Dict[str, List[str]] = field(default_factory=dict)
261
+ selected_substrings: List[str] = field(default_factory=list)
262
+
263
+ def __post_init__(self):
264
+ # Ensure default_selection is a list
265
+ if self.default_selection is None:
266
+ self.default_selection = []
267
+
268
+ # Build reverse mapping of column to substrings
269
+ self.column_to_substrings = {}
270
+ for substring, patterns in self.substring_groups.items():
271
+ for pattern in patterns:
272
+ # Convert glob-style patterns to regex
273
+ regex = re.compile(pattern.replace('*', '.*'))
274
+ # Find matching columns in default_selection
275
+ for col in self.default_selection:
276
+ if regex.search(col):
277
+ if col not in self.column_to_substrings:
278
+ self.column_to_substrings[col] = []
279
+ self.column_to_substrings[col].append(substring)
280
+
281
+ # Apply initial substring selections
282
+ if self.selected_substrings:
283
+ self.update_selection_from_substrings()
284
+
285
+ def update_selection_from_substrings(self) -> List[str]:
286
+ """
287
+ Updates the column selection based on selected substrings.
288
+ Returns the new list of selected columns.
289
+ """
290
+ selected_columns = self.cant_deselect.copy()
291
+
292
+ # If no substrings selected, show all columns
293
+ if not self.selected_substrings:
294
+ selected_columns.extend([
295
+ col for col in self.default_selection
296
+ if col not in self.cant_deselect
297
+ ])
298
+ return selected_columns
299
+
300
+ # Add columns that match any selected substring
301
+ for col, substrings in self.column_to_substrings.items():
302
+ if any(s in self.selected_substrings for s in substrings):
303
+ if col not in selected_columns:
304
+ selected_columns.append(col)
305
+
306
+ return selected_columns
307
+
308
+
309
+
310
+
311
+
312
+ def restart_space():
313
+ API.restart_space(repo_id=REPO_ID)
314
+
315
+
316
+
317
+ ### Space initialisation
318
+ try:
319
+ # print(EVAL_REQUESTS_PATH)
320
+ snapshot_download(
321
+ repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
322
+ )
323
+ except Exception:
324
+ restart_space()
325
+
326
+
327
+ try:
328
+ # print(RESULTS_REPO_MIB_SUBGRAPH)
329
+ snapshot_download(
330
+ repo_id=RESULTS_REPO_MIB_SUBGRAPH, local_dir=EVAL_RESULTS_MIB_SUBGRAPH_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
331
+ )
332
+ except Exception:
333
+ restart_space()
334
+
335
+
336
+ try:
337
+ # print(RESULTS_REPO_MIB_CAUSALGRAPH)
338
+ snapshot_download(
339
+ repo_id=RESULTS_REPO_MIB_CAUSALGRAPH, local_dir=EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
340
+ )
341
+ except Exception:
342
+ restart_space()
343
+
344
+
345
+
346
+ LEADERBOARD_DF_MIB_SUBGRAPH = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_SUBGRAPH, BENCHMARK_COLS_MIB_SUBGRAPH)
347
+
348
+ # LEADERBOARD_DF_MIB_CAUSALGRAPH = get_leaderboard_df_mib_causalgraph(EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_CAUSALGRAPH, BENCHMARK_COLS_MIB_CAUSALGRAPH)
349
+ # In app.py, modify the LEADERBOARD initialization
350
+ LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AGGREGATED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AVERAGED = get_leaderboard_df_mib_causalgraph(
351
+ EVAL_RESULTS_MIB_CAUSALGRAPH_PATH,
352
+ EVAL_REQUESTS_PATH,
353
+ COLS_MIB_CAUSALGRAPH,
354
+ BENCHMARK_COLS_MIB_CAUSALGRAPH
355
+ )
356
+
357
+
358
+ # LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
359
+ # LEADERBOARD_DF_MULTIMODAL = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS_MULTIMODAL, BENCHMARK_COLS_MULTIMODAL)
360
+
361
+ (
362
+ finished_eval_queue_df,
363
+ running_eval_queue_df,
364
+ pending_eval_queue_df,
365
+ ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
366
+
367
+
368
+
369
+
370
+
371
+ # def init_leaderboard_mib_subgraph(dataframe, track):
372
+ # # print(f"init_leaderboard_mib: dataframe head before loc is {dataframe.head()}\n")
373
+
374
+ # if dataframe is None or dataframe.empty:
375
+ # raise ValueError("Leaderboard DataFrame is empty or None.")
376
+
377
+ # # filter for correct track
378
+ # # dataframe = dataframe.loc[dataframe["Track"] == track]
379
+
380
+ # # print(f"init_leaderboard_mib: dataframe head after loc is {dataframe.head()}\n")
381
+
382
+ # return Leaderboard(
383
+ # value=dataframe,
384
+ # datatype=[c.type for c in fields(AutoEvalColumn_mib_subgraph)],
385
+ # select_columns=SelectColumns(
386
+ # default_selection=[c.name for c in fields(AutoEvalColumn_mib_subgraph) if c.displayed_by_default],
387
+ # cant_deselect=[c.name for c in fields(AutoEvalColumn_mib_subgraph) if c.never_hidden],
388
+ # label="Select Columns to Display:",
389
+ # ),
390
+ # search_columns=["Method"], # Changed from AutoEvalColumn_mib_subgraph.model.name to "Method"
391
+ # hide_columns=[c.name for c in fields(AutoEvalColumn_mib_subgraph) if c.hidden],
392
+ # bool_checkboxgroup_label="Hide models",
393
+ # interactive=False,
394
+ # )
395
+
396
+
397
+
398
+
399
+
400
+ # def init_leaderboard_mib_subgraph(dataframe, track):
401
+ # """Initialize the subgraph leaderboard with grouped column selection by benchmark."""
402
+ # if dataframe is None or dataframe.empty:
403
+ # raise ValueError("Leaderboard DataFrame is empty or None.")
404
+
405
+ # print("\nDebugging DataFrame columns:", dataframe.columns.tolist())
406
+
407
+ # # Create groups of columns by benchmark
408
+ # benchmark_groups = []
409
+
410
+ # # For each benchmark in our TasksMib_Subgraph enum...
411
+ # for task in TasksMib_Subgraph:
412
+ # benchmark = task.value.benchmark
413
+ # # Get all valid columns for this benchmark's models
414
+ # benchmark_cols = [
415
+ # f"{benchmark}_{model}"
416
+ # for model in task.value.models
417
+ # if f"{benchmark}_{model}" in dataframe.columns
418
+ # ]
419
+ # if benchmark_cols: # Only add if we have valid columns
420
+ # benchmark_groups.append(benchmark_cols)
421
+ # print(f"\nBenchmark group for {benchmark}:", benchmark_cols)
422
+
423
+ # # Create model groups as well
424
+ # model_groups = []
425
+ # all_models = list(set(model for task in TasksMib_Subgraph for model in task.value.models))
426
+
427
+ # # For each unique model...
428
+ # for model in all_models:
429
+ # # Get all valid columns for this model across benchmarks
430
+ # model_cols = [
431
+ # f"{task.value.benchmark}_{model}"
432
+ # for task in TasksMib_Subgraph
433
+ # if model in task.value.models
434
+ # and f"{task.value.benchmark}_{model}" in dataframe.columns
435
+ # ]
436
+ # if model_cols: # Only add if we have valid columns
437
+ # model_groups.append(model_cols)
438
+ # print(f"\nModel group for {model}:", model_cols)
439
+
440
+ # # Combine all groups
441
+ # all_groups = benchmark_groups + model_groups
442
+
443
+ # # Flatten groups for default selection (show everything initially)
444
+ # all_columns = [col for group in all_groups for col in group]
445
+ # print("\nAll available columns:", all_columns)
446
+
447
+ # return Leaderboard(
448
+ # value=dataframe,
449
+ # datatype=[c.type for c in fields(AutoEvalColumn_mib_subgraph)],
450
+ # select_columns=SelectColumns(
451
+ # default_selection=all_columns, # Show all columns initially
452
+ # label="Select Results:"
453
+ # ),
454
+ # search_columns=["Method"],
455
+ # hide_columns=[],
456
+ # interactive=False,
457
+ # )
458
+
459
+
460
def init_leaderboard_mib_subgraph(dataframe, track):
    """Build the subgraph leaderboard with human-readable column names.

    Columns named ``"<benchmark>_<model>"`` (one per benchmark/model pair
    listed in ``TasksMib_Subgraph``) are renamed to
    ``"<Benchmark label> - <Model label>"`` before display.

    Args:
        dataframe: Results table to display.
        track: Not used by this function; kept so the signature matches the
            other ``init_leaderboard*`` helpers.

    Returns:
        A ``ModifiedLeaderboard`` over the renamed table, with every column
        selected by default and substring filters for IOI and Llama columns.

    Raises:
        ValueError: If ``dataframe`` is ``None`` or empty.
    """
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    # Internal identifier -> label shown in the UI.
    model_name_mapping = {
        "qwen2_5": "Qwen-2.5",
        "gpt2": "GPT-2",
        "gemma2": "Gemma-2",
        "llama3": "Llama-3.1",
    }
    benchmark_mapping = {
        "ioi": "IOI",
        "mcqa": "MCQA",
        "arithmetic_addition": "Arithmetic (+)",
        "arithmetic_subtraction": "Arithmetic (-)",
        "arc_easy": "ARC (Easy)",
        "arc_challenge": "ARC (Challenge)",
    }

    display_mapping = {}
    for task in TasksMib_Subgraph:
        benchmark = task.value.benchmark
        # Fall back to the raw identifier so a benchmark or model that has no
        # label yet does not abort the whole page with a KeyError.
        benchmark_label = benchmark_mapping.get(benchmark, benchmark)
        for model in task.value.models:
            model_label = model_name_mapping.get(model, model)
            display_mapping[f"{benchmark}_{model}"] = f"{benchmark_label} - {model_label}"

    # rename() ignores mapping keys that are not columns of the table.
    renamed_df = dataframe.rename(columns=display_mapping)

    filter_groups = {
        "ioi": "*IOI*",
        "llama": "*Llama*",
    }

    return ModifiedLeaderboard(
        value=renamed_df,
        datatype=[c.type for c in fields(AutoEvalColumn_mib_subgraph)],
        select_columns=SubstringSelectColumns(
            substring_groups=filter_groups,
            default_selection=renamed_df.columns.tolist(),
            label="Filter Results:",
            allow=True,
        ),
        search_columns=["Method"],
        hide_columns=[],
        interactive=False,
    )
552
+
553
+
554
+
555
+
556
+ # # Complete column groups for both benchmarks and models
557
+ # # Define keywords for filtering
558
+ # benchmark_keywords = ["ioi", "mcqa", "arithmetic_addition", "arithmetic_subtraction", "arc_easy", "arc_challenge"]
559
+ # model_keywords = ["qwen2_5", "gpt2", "gemma2", "llama3"]
560
+
561
+ # # Optional: Define display names
562
+ # mappings = {
563
+ # "ioi_llama3": "IOI (LLaMA-3)",
564
+ # "ioi_qwen2_5": "IOI (Qwen-2.5)",
565
+ # "ioi_gpt2": "IOI (GPT-2)",
566
+ # "ioi_gemma2": "IOI (Gemma-2)",
567
+ # "mcqa_llama3": "MCQA (LLaMA-3)",
568
+ # "mcqa_qwen2_5": "MCQA (Qwen-2.5)",
569
+ # "mcqa_gemma2": "MCQA (Gemma-2)",
570
+ # "arithmetic_addition_llama3": "Arithmetic Addition (LLaMA-3)",
571
+ # "arithmetic_subtraction_llama3": "Arithmetic Subtraction (LLaMA-3)",
572
+ # "arc_easy_llama3": "ARC Easy (LLaMA-3)",
573
+ # "arc_easy_gemma2": "ARC Easy (Gemma-2)",
574
+ # "arc_challenge_llama3": "ARC Challenge (LLaMA-3)",
575
+ # "eval_name": "Evaluation Name",
576
+ # "Method": "Method",
577
+ # "Average": "Average Score"
578
+ # }
579
+ # # mappings = {}
580
+
581
+ # # Create SmartSelectColumns instance
582
+ # smart_columns = SmartSelectColumns(
583
+ # benchmark_keywords=benchmark_keywords,
584
+ # model_keywords=model_keywords,
585
+ # column_mapping=mappings,
586
+ # initial_selected=["Method", "Average"]
587
+ # )
588
+
589
+ # print("\nDebugging DataFrame columns:", renamed_df.columns.tolist())
590
+
591
+ # # Create Leaderboard
592
+ # leaderboard = Leaderboard(
593
+ # value=renamed_df,
594
+ # datatype=[c.type for c in fields(AutoEvalColumn_mib_subgraph)],
595
+ # select_columns=smart_columns,
596
+ # search_columns=["Method"],
597
+ # hide_columns=[],
598
+ # interactive=False
599
+ # )
600
+ # print(f"Successfully created leaderboard.")
601
+ # return leaderboard
602
+
603
+ # print("\nDebugging DataFrame columns:", dataframe.columns.tolist())
604
+
605
+ # # Define simple keywords for filtering
606
+ # benchmark_keywords = ["ioi", "mcqa", "arithmetic", "arc"]
607
+ # model_keywords = ["qwen2_5", "gpt2", "gemma2", "llama3"]
608
+
609
+ # # Create SmartSelectColumns instance with exact same parameters as working version
610
+ # smart_columns = SmartSelectColumns(
611
+ # benchmark_keywords=benchmark_keywords,
612
+ # model_keywords=model_keywords,
613
+ # initial_selected=["Method", "Average"],
614
+ # allow=True,
615
+ # label=None,
616
+ # show_label=True,
617
+ # info=None
618
+ # )
619
+
620
+ # try:
621
+ # print("\nCreating leaderboard...")
622
+ # # Get groups before creating leaderboard
623
+ # smart_columns.get_filtered_groups(dataframe.columns)
624
+
625
+ # leaderboard = Leaderboard(
626
+ # value=dataframe,
627
+ # datatype=[c.type for c in fields(AutoEvalColumn_mib_subgraph)],
628
+ # select_columns=smart_columns,
629
+ # search_columns=["Method"],
630
+ # hide_columns=[],
631
+ # interactive=False
632
+ # )
633
+ # print("Leaderboard created successfully")
634
+ # return leaderboard
635
+
636
+ # except Exception as e:
637
+ # print("Error creating leaderboard:", str(e))
638
+ # raise
639
+
640
+
641
+
642
+
643
+
644
+
645
+ # def init_leaderboard_mib_subgraph(dataframe, track):
646
+ # """Initialize the subgraph leaderboard with group-based column selection."""
647
+ # if dataframe is None or dataframe.empty:
648
+ # raise ValueError("Leaderboard DataFrame is empty or None.")
649
+
650
+ # print("\nDebugging DataFrame columns:", dataframe.columns.tolist())
651
+
652
+ # # Create selection mapping for benchmark groups
653
+ # selection_mapping = {}
654
+
655
+ # # Create benchmark groups with descriptive names
656
+ # for task in TasksMib_Subgraph:
657
+ # benchmark = task.value.benchmark
658
+ # # Get all columns for this benchmark's models
659
+ # benchmark_cols = [
660
+ # f"{benchmark}_{model}"
661
+ # for model in task.value.models
662
+ # if f"{benchmark}_{model}" in dataframe.columns
663
+ # ]
664
+ # if benchmark_cols:
665
+ # # Use a descriptive group name as the key
666
+ # group_name = f"Benchmark: {benchmark.upper()}"
667
+ # selection_mapping[group_name] = benchmark_cols
668
+ # print(f"\n{group_name} maps to:", benchmark_cols)
669
+
670
+ # # Create model groups with descriptive names
671
+ # all_models = list(set(model for task in TasksMib_Subgraph for model in task.value.models))
672
+ # for model in all_models:
673
+ # # Get all columns for this model across benchmarks
674
+ # model_cols = [
675
+ # f"{task.value.benchmark}_{model}"
676
+ # for task in TasksMib_Subgraph
677
+ # if model in task.value.models
678
+ # and f"{task.value.benchmark}_{model}" in dataframe.columns
679
+ # ]
680
+ # if model_cols:
681
+ # # Use a descriptive group name as the key
682
+ # group_name = f"Model: {model}"
683
+ # selection_mapping[group_name] = model_cols
684
+ # print(f"\n{group_name} maps to:", model_cols)
685
+
686
+ # # The selection options are the group names
687
+ # selection_options = list(selection_mapping.keys())
688
+ # print("\nSelection options:", selection_options)
689
+
690
+ # return Leaderboard(
691
+ # value=dataframe,
692
+ # datatype=[c.type for c in fields(AutoEvalColumn_mib_subgraph)],
693
+ # select_columns=SelectColumns(
694
+ # default_selection=selection_options, # Show all groups by default
695
+ # label="Select Benchmark or Model Groups:"
696
+ # ),
697
+ # search_columns=["Method"],
698
+ # hide_columns=[],
699
+ # interactive=False,
700
+ # )
701
+
702
+
703
+
704
+
705
+
706
+
707
+
708
+
709
+ # def init_leaderboard_mib_causalgraph(dataframe, track):
710
+ # # print("Debugging column issues:")
711
+ # # print("\nActual DataFrame columns:")
712
+ # # print(dataframe.columns.tolist())
713
+
714
+ # # print("\nExpected columns for Leaderboard:")
715
+ # expected_cols = [c.name for c in fields(AutoEvalColumn_mib_causalgraph)]
716
+ # # print(expected_cols)
717
+
718
+ # # print("\nMissing columns:")
719
+ # missing_cols = [col for col in expected_cols if col not in dataframe.columns]
720
+ # # print(missing_cols)
721
+
722
+ # # print("\nSample of DataFrame content:")
723
+ # # print(dataframe.head().to_string())
724
+
725
+ # return Leaderboard(
726
+ # value=dataframe,
727
+ # datatype=[c.type for c in fields(AutoEvalColumn_mib_causalgraph)],
728
+ # select_columns=SelectColumns(
729
+ # default_selection=[c.name for c in fields(AutoEvalColumn_mib_causalgraph) if c.displayed_by_default],
730
+ # cant_deselect=[c.name for c in fields(AutoEvalColumn_mib_causalgraph) if c.never_hidden],
731
+ # label="Select Columns to Display:",
732
+ # ),
733
+ # search_columns=["Method"],
734
+ # hide_columns=[c.name for c in fields(AutoEvalColumn_mib_causalgraph) if c.hidden],
735
+ # bool_checkboxgroup_label="Hide models",
736
+ # interactive=False,
737
+ # )
738
+
739
def init_leaderboard_mib_causalgraph(dataframe, track):
    """Create a causal-graph leaderboard for ``dataframe``.

    Only the "Method" column is selected by default, and it cannot be
    deselected. ``track`` is accepted but not used.
    """
    column_types = [column.type for column in fields(AutoEvalColumn_mib_causalgraph)]
    column_picker = SelectColumns(
        default_selection=["Method"],
        cant_deselect=["Method"],
        label="Select Columns to Display:",
    )
    return Leaderboard(
        value=dataframe,
        datatype=column_types,
        select_columns=column_picker,
        search_columns=["Method"],
        hide_columns=[],
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )
758
+
759
+
760
def init_leaderboard(dataframe, track):
    """Create a leaderboard restricted to the rows of one track.

    Args:
        dataframe: Results table containing a "Track" column.
        track: Value of the "Track" column to keep.

    Raises:
        ValueError: If ``dataframe`` is ``None`` or empty.
    """
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    # Keep only the rows that belong to the requested track.
    track_rows = dataframe.loc[dataframe["Track"] == track]

    columns = list(fields(AutoEvalColumn))
    shown_by_default = [col.name for col in columns if col.displayed_by_default]
    always_shown = [col.name for col in columns if col.never_hidden]
    hidden = [col.name for col in columns if col.hidden]

    column_picker = SelectColumns(
        default_selection=shown_by_default,
        cant_deselect=always_shown,
        label="Select Columns to Display:",
    )
    return Leaderboard(
        value=track_rows,
        datatype=[col.type for col in columns],
        select_columns=column_picker,
        search_columns=[AutoEvalColumn.model.name],
        hide_columns=hidden,
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )
781
+
782
def process_json(temp_file):
    """Load an uploaded JSON file, transparently handling ``.gz`` uploads.

    Args:
        temp_file: Upload handle whose ``.name`` is the path of the file on
            disk, or ``None`` when nothing was uploaded.

    Returns:
        The parsed JSON content, or ``{}`` when ``temp_file`` is ``None``.

    Raises:
        gr.Error: If the file cannot be read or parsed; the original
            exception is chained as the cause.
    """
    if temp_file is None:
        return {}

    try:
        file_path = temp_file.name
        # Paths ending in ".gz" are read through gzip; both openers run in
        # text mode with an explicit UTF-8 encoding so the result does not
        # depend on the host locale.
        opener = gzip.open if file_path.endswith('.gz') else open
        with opener(file_path, 'rt', encoding='utf-8') as f:
            data = json.load(f)
    except Exception as e:
        # Broad on purpose: any read/parse failure is reported to the user
        # as a Gradio error instead of an unhandled traceback.
        raise gr.Error(f"Error processing file: {str(e)}") from e

    # A component constructed inside a callback is never rendered; gr.Info
    # is the call that actually shows the success message.
    gr.Info("Upload successful!")
    return data
800
+
801
+
802
# Assemble the leaderboard UI: one top-level tab per MIB track.
with gr.Blocks(css=custom_css) as demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("Subgraph", elem_id="subgraph", id=0):
            # Short explanation shown above the filter controls.
            gr.Markdown("""
            ### Filtering Options
            Use the dropdown menus below to filter results by specific tasks or models.
            You can combine filters to see specific task-model combinations.
            """)
            leaderboard = init_leaderboard_mib_subgraph(LEADERBOARD_DF_MIB_SUBGRAPH, "Subgraph")
            print(f"Leaderboard is {leaderboard}")

        with gr.TabItem("Causal Graph", elem_id="causalgraph", id=1):
            # Nested tabs, one per view of the causal-graph results.
            with gr.Tabs() as causalgraph_tabs:
                with gr.TabItem("Detailed View", id=0):
                    leaderboard_detailed = init_leaderboard_mib_causalgraph(
                        LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED, "Causal Graph"
                    )
                with gr.TabItem("Aggregated View", id=1):
                    leaderboard_aggregated = init_leaderboard_mib_causalgraph(
                        LEADERBOARD_DF_MIB_CAUSALGRAPH_AGGREGATED, "Causal Graph"
                    )
                with gr.TabItem("Intervention Averaged", id=2):
                    leaderboard_averaged = init_leaderboard_mib_causalgraph(
                        LEADERBOARD_DF_MIB_CAUSALGRAPH_AVERAGED, "Causal Graph"
                    )

# Schedule restart_space to run every 1800 seconds, then serve the app.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()
demo.launch(share=True, ssr_mode=False)
custom-select-columns.py CHANGED
@@ -96,7 +96,7 @@ if __name__ == "__main__":
96
 
97
  # Define filters and mappings
98
  column_filters = {
99
- "IOI Metrics": ["ioi"],
100
  "Performance Metrics": ["performance"]
101
  }
102
 
 
96
 
97
  # Define filters and mappings
98
  column_filters = {
99
+ "IOI": ["ioi"],
100
  "Performance Metrics": ["performance"]
101
  }
102
 
eval-queue/.gitattributes ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.lz4 filter=lfs diff=lfs merge=lfs -text
12
+ *.mds filter=lfs diff=lfs merge=lfs -text
13
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
14
+ *.model filter=lfs diff=lfs merge=lfs -text
15
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
16
+ *.npy filter=lfs diff=lfs merge=lfs -text
17
+ *.npz filter=lfs diff=lfs merge=lfs -text
18
+ *.onnx filter=lfs diff=lfs merge=lfs -text
19
+ *.ot filter=lfs diff=lfs merge=lfs -text
20
+ *.parquet filter=lfs diff=lfs merge=lfs -text
21
+ *.pb filter=lfs diff=lfs merge=lfs -text
22
+ *.pickle filter=lfs diff=lfs merge=lfs -text
23
+ *.pkl filter=lfs diff=lfs merge=lfs -text
24
+ *.pt filter=lfs diff=lfs merge=lfs -text
25
+ *.pth filter=lfs diff=lfs merge=lfs -text
26
+ *.rar filter=lfs diff=lfs merge=lfs -text
27
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
28
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
29
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
30
+ *.tar filter=lfs diff=lfs merge=lfs -text
31
+ *.tflite filter=lfs diff=lfs merge=lfs -text
32
+ *.tgz filter=lfs diff=lfs merge=lfs -text
33
+ *.wasm filter=lfs diff=lfs merge=lfs -text
34
+ *.xz filter=lfs diff=lfs merge=lfs -text
35
+ *.zip filter=lfs diff=lfs merge=lfs -text
36
+ *.zst filter=lfs diff=lfs merge=lfs -text
37
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
38
+ # Audio files - uncompressed
39
+ *.pcm filter=lfs diff=lfs merge=lfs -text
40
+ *.sam filter=lfs diff=lfs merge=lfs -text
41
+ *.raw filter=lfs diff=lfs merge=lfs -text
42
+ # Audio files - compressed
43
+ *.aac filter=lfs diff=lfs merge=lfs -text
44
+ *.flac filter=lfs diff=lfs merge=lfs -text
45
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
46
+ *.ogg filter=lfs diff=lfs merge=lfs -text
47
+ *.wav filter=lfs diff=lfs merge=lfs -text
48
+ # Image files - uncompressed
49
+ *.bmp filter=lfs diff=lfs merge=lfs -text
50
+ *.gif filter=lfs diff=lfs merge=lfs -text
51
+ *.png filter=lfs diff=lfs merge=lfs -text
52
+ *.tiff filter=lfs diff=lfs merge=lfs -text
53
+ # Image files - compressed
54
+ *.jpg filter=lfs diff=lfs merge=lfs -text
55
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
56
+ *.webp filter=lfs diff=lfs merge=lfs -text
57
+ # Video files - compressed
58
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
59
+ *.webm filter=lfs diff=lfs merge=lfs -text
eval-queue/README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ ---
eval-results-mib-causalgraph/.gitattributes ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.lz4 filter=lfs diff=lfs merge=lfs -text
12
+ *.mds filter=lfs diff=lfs merge=lfs -text
13
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
14
+ *.model filter=lfs diff=lfs merge=lfs -text
15
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
16
+ *.npy filter=lfs diff=lfs merge=lfs -text
17
+ *.npz filter=lfs diff=lfs merge=lfs -text
18
+ *.onnx filter=lfs diff=lfs merge=lfs -text
19
+ *.ot filter=lfs diff=lfs merge=lfs -text
20
+ *.parquet filter=lfs diff=lfs merge=lfs -text
21
+ *.pb filter=lfs diff=lfs merge=lfs -text
22
+ *.pickle filter=lfs diff=lfs merge=lfs -text
23
+ *.pkl filter=lfs diff=lfs merge=lfs -text
24
+ *.pt filter=lfs diff=lfs merge=lfs -text
25
+ *.pth filter=lfs diff=lfs merge=lfs -text
26
+ *.rar filter=lfs diff=lfs merge=lfs -text
27
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
28
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
29
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
30
+ *.tar filter=lfs diff=lfs merge=lfs -text
31
+ *.tflite filter=lfs diff=lfs merge=lfs -text
32
+ *.tgz filter=lfs diff=lfs merge=lfs -text
33
+ *.wasm filter=lfs diff=lfs merge=lfs -text
34
+ *.xz filter=lfs diff=lfs merge=lfs -text
35
+ *.zip filter=lfs diff=lfs merge=lfs -text
36
+ *.zst filter=lfs diff=lfs merge=lfs -text
37
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
38
+ # Audio files - uncompressed
39
+ *.pcm filter=lfs diff=lfs merge=lfs -text
40
+ *.sam filter=lfs diff=lfs merge=lfs -text
41
+ *.raw filter=lfs diff=lfs merge=lfs -text
42
+ # Audio files - compressed
43
+ *.aac filter=lfs diff=lfs merge=lfs -text
44
+ *.flac filter=lfs diff=lfs merge=lfs -text
45
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
46
+ *.ogg filter=lfs diff=lfs merge=lfs -text
47
+ *.wav filter=lfs diff=lfs merge=lfs -text
48
+ # Image files - uncompressed
49
+ *.bmp filter=lfs diff=lfs merge=lfs -text
50
+ *.gif filter=lfs diff=lfs merge=lfs -text
51
+ *.png filter=lfs diff=lfs merge=lfs -text
52
+ *.tiff filter=lfs diff=lfs merge=lfs -text
53
+ # Image files - compressed
54
+ *.jpg filter=lfs diff=lfs merge=lfs -text
55
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
56
+ *.webp filter=lfs diff=lfs merge=lfs -text
57
+ # Video files - compressed
58
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
59
+ *.webm filter=lfs diff=lfs merge=lfs -text
eval-results-mib-causalgraph/README.md ADDED
File without changes
eval-results-mib-causalgraph/submissions/MCQA_results_Qwen_correct_choice_period_token.json ADDED
@@ -0,0 +1,1332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "method_name": "full_vector",
3
+ "results": [
4
+ {
5
+ "model_id": "Qwen2ForCausalLM",
6
+ "task_scores": {
7
+ "MCQA": [
8
+ {
9
+ "layer": "0",
10
+ "layer_scores": [
11
+ {
12
+ "intervention": [
13
+ "output_token"
14
+ ],
15
+ "counterfactual_scores": [
16
+ {
17
+ "counterfactual": [
18
+ "randomLetter_counterfactual"
19
+ ],
20
+ "score": 0.21428571428571427
21
+ },
22
+ {
23
+ "counterfactual": [
24
+ "answerPosition_counterfactual"
25
+ ],
26
+ "score": 0.42857142857142855
27
+ },
28
+ {
29
+ "counterfactual": [
30
+ "answerPosition_randomLetter_counterfactual"
31
+ ],
32
+ "score": 0.35714285714285715
33
+ }
34
+ ]
35
+ },
36
+ {
37
+ "intervention": [
38
+ "output_location"
39
+ ],
40
+ "counterfactual_scores": [
41
+ {
42
+ "counterfactual": [
43
+ "randomLetter_counterfactual"
44
+ ],
45
+ "score": 0.6428571428571429
46
+ },
47
+ {
48
+ "counterfactual": [
49
+ "answerPosition_counterfactual"
50
+ ],
51
+ "score": 0.42857142857142855
52
+ },
53
+ {
54
+ "counterfactual": [
55
+ "answerPosition_randomLetter_counterfactual"
56
+ ],
57
+ "score": 0.14285714285714285
58
+ }
59
+ ]
60
+ }
61
+ ]
62
+ },
63
+ {
64
+ "layer": "1",
65
+ "layer_scores": [
66
+ {
67
+ "intervention": [
68
+ "output_token"
69
+ ],
70
+ "counterfactual_scores": [
71
+ {
72
+ "counterfactual": [
73
+ "randomLetter_counterfactual"
74
+ ],
75
+ "score": 0.21428571428571427
76
+ },
77
+ {
78
+ "counterfactual": [
79
+ "answerPosition_counterfactual"
80
+ ],
81
+ "score": 0.42857142857142855
82
+ },
83
+ {
84
+ "counterfactual": [
85
+ "answerPosition_randomLetter_counterfactual"
86
+ ],
87
+ "score": 0.2857142857142857
88
+ }
89
+ ]
90
+ },
91
+ {
92
+ "intervention": [
93
+ "output_location"
94
+ ],
95
+ "counterfactual_scores": [
96
+ {
97
+ "counterfactual": [
98
+ "randomLetter_counterfactual"
99
+ ],
100
+ "score": 0.6428571428571429
101
+ },
102
+ {
103
+ "counterfactual": [
104
+ "answerPosition_counterfactual"
105
+ ],
106
+ "score": 0.42857142857142855
107
+ },
108
+ {
109
+ "counterfactual": [
110
+ "answerPosition_randomLetter_counterfactual"
111
+ ],
112
+ "score": 0.14285714285714285
113
+ }
114
+ ]
115
+ }
116
+ ]
117
+ },
118
+ {
119
+ "layer": "2",
120
+ "layer_scores": [
121
+ {
122
+ "intervention": [
123
+ "output_token"
124
+ ],
125
+ "counterfactual_scores": [
126
+ {
127
+ "counterfactual": [
128
+ "randomLetter_counterfactual"
129
+ ],
130
+ "score": 0.14285714285714285
131
+ },
132
+ {
133
+ "counterfactual": [
134
+ "answerPosition_counterfactual"
135
+ ],
136
+ "score": 0.42857142857142855
137
+ },
138
+ {
139
+ "counterfactual": [
140
+ "answerPosition_randomLetter_counterfactual"
141
+ ],
142
+ "score": 0.14285714285714285
143
+ }
144
+ ]
145
+ },
146
+ {
147
+ "intervention": [
148
+ "output_location"
149
+ ],
150
+ "counterfactual_scores": [
151
+ {
152
+ "counterfactual": [
153
+ "randomLetter_counterfactual"
154
+ ],
155
+ "score": 0.7857142857142857
156
+ },
157
+ {
158
+ "counterfactual": [
159
+ "answerPosition_counterfactual"
160
+ ],
161
+ "score": 0.42857142857142855
162
+ },
163
+ {
164
+ "counterfactual": [
165
+ "answerPosition_randomLetter_counterfactual"
166
+ ],
167
+ "score": 0.07142857142857142
168
+ }
169
+ ]
170
+ }
171
+ ]
172
+ },
173
+ {
174
+ "layer": "3",
175
+ "layer_scores": [
176
+ {
177
+ "intervention": [
178
+ "output_token"
179
+ ],
180
+ "counterfactual_scores": [
181
+ {
182
+ "counterfactual": [
183
+ "randomLetter_counterfactual"
184
+ ],
185
+ "score": 0.14285714285714285
186
+ },
187
+ {
188
+ "counterfactual": [
189
+ "answerPosition_counterfactual"
190
+ ],
191
+ "score": 0.42857142857142855
192
+ },
193
+ {
194
+ "counterfactual": [
195
+ "answerPosition_randomLetter_counterfactual"
196
+ ],
197
+ "score": 0.14285714285714285
198
+ }
199
+ ]
200
+ },
201
+ {
202
+ "intervention": [
203
+ "output_location"
204
+ ],
205
+ "counterfactual_scores": [
206
+ {
207
+ "counterfactual": [
208
+ "randomLetter_counterfactual"
209
+ ],
210
+ "score": 0.8571428571428571
211
+ },
212
+ {
213
+ "counterfactual": [
214
+ "answerPosition_counterfactual"
215
+ ],
216
+ "score": 0.42857142857142855
217
+ },
218
+ {
219
+ "counterfactual": [
220
+ "answerPosition_randomLetter_counterfactual"
221
+ ],
222
+ "score": 0.07142857142857142
223
+ }
224
+ ]
225
+ }
226
+ ]
227
+ },
228
+ {
229
+ "layer": "4",
230
+ "layer_scores": [
231
+ {
232
+ "intervention": [
233
+ "output_token"
234
+ ],
235
+ "counterfactual_scores": [
236
+ {
237
+ "counterfactual": [
238
+ "randomLetter_counterfactual"
239
+ ],
240
+ "score": 0.14285714285714285
241
+ },
242
+ {
243
+ "counterfactual": [
244
+ "answerPosition_counterfactual"
245
+ ],
246
+ "score": 0.2857142857142857
247
+ },
248
+ {
249
+ "counterfactual": [
250
+ "answerPosition_randomLetter_counterfactual"
251
+ ],
252
+ "score": 0.21428571428571427
253
+ }
254
+ ]
255
+ },
256
+ {
257
+ "intervention": [
258
+ "output_location"
259
+ ],
260
+ "counterfactual_scores": [
261
+ {
262
+ "counterfactual": [
263
+ "randomLetter_counterfactual"
264
+ ],
265
+ "score": 0.7857142857142857
266
+ },
267
+ {
268
+ "counterfactual": [
269
+ "answerPosition_counterfactual"
270
+ ],
271
+ "score": 0.2857142857142857
272
+ },
273
+ {
274
+ "counterfactual": [
275
+ "answerPosition_randomLetter_counterfactual"
276
+ ],
277
+ "score": 0.07142857142857142
278
+ }
279
+ ]
280
+ }
281
+ ]
282
+ },
283
+ {
284
+ "layer": "5",
285
+ "layer_scores": [
286
+ {
287
+ "intervention": [
288
+ "output_token"
289
+ ],
290
+ "counterfactual_scores": [
291
+ {
292
+ "counterfactual": [
293
+ "randomLetter_counterfactual"
294
+ ],
295
+ "score": 0.14285714285714285
296
+ },
297
+ {
298
+ "counterfactual": [
299
+ "answerPosition_counterfactual"
300
+ ],
301
+ "score": 0.2857142857142857
302
+ },
303
+ {
304
+ "counterfactual": [
305
+ "answerPosition_randomLetter_counterfactual"
306
+ ],
307
+ "score": 0.21428571428571427
308
+ }
309
+ ]
310
+ },
311
+ {
312
+ "intervention": [
313
+ "output_location"
314
+ ],
315
+ "counterfactual_scores": [
316
+ {
317
+ "counterfactual": [
318
+ "randomLetter_counterfactual"
319
+ ],
320
+ "score": 0.8571428571428571
321
+ },
322
+ {
323
+ "counterfactual": [
324
+ "answerPosition_counterfactual"
325
+ ],
326
+ "score": 0.2857142857142857
327
+ },
328
+ {
329
+ "counterfactual": [
330
+ "answerPosition_randomLetter_counterfactual"
331
+ ],
332
+ "score": 0.07142857142857142
333
+ }
334
+ ]
335
+ }
336
+ ]
337
+ },
338
+ {
339
+ "layer": "6",
340
+ "layer_scores": [
341
+ {
342
+ "intervention": [
343
+ "output_token"
344
+ ],
345
+ "counterfactual_scores": [
346
+ {
347
+ "counterfactual": [
348
+ "randomLetter_counterfactual"
349
+ ],
350
+ "score": 0.14285714285714285
351
+ },
352
+ {
353
+ "counterfactual": [
354
+ "answerPosition_counterfactual"
355
+ ],
356
+ "score": 0.2857142857142857
357
+ },
358
+ {
359
+ "counterfactual": [
360
+ "answerPosition_randomLetter_counterfactual"
361
+ ],
362
+ "score": 0.14285714285714285
363
+ }
364
+ ]
365
+ },
366
+ {
367
+ "intervention": [
368
+ "output_location"
369
+ ],
370
+ "counterfactual_scores": [
371
+ {
372
+ "counterfactual": [
373
+ "randomLetter_counterfactual"
374
+ ],
375
+ "score": 0.7857142857142857
376
+ },
377
+ {
378
+ "counterfactual": [
379
+ "answerPosition_counterfactual"
380
+ ],
381
+ "score": 0.2857142857142857
382
+ },
383
+ {
384
+ "counterfactual": [
385
+ "answerPosition_randomLetter_counterfactual"
386
+ ],
387
+ "score": 0.07142857142857142
388
+ }
389
+ ]
390
+ }
391
+ ]
392
+ },
393
+ {
394
+ "layer": "7",
395
+ "layer_scores": [
396
+ {
397
+ "intervention": [
398
+ "output_token"
399
+ ],
400
+ "counterfactual_scores": [
401
+ {
402
+ "counterfactual": [
403
+ "randomLetter_counterfactual"
404
+ ],
405
+ "score": 0.14285714285714285
406
+ },
407
+ {
408
+ "counterfactual": [
409
+ "answerPosition_counterfactual"
410
+ ],
411
+ "score": 0.2857142857142857
412
+ },
413
+ {
414
+ "counterfactual": [
415
+ "answerPosition_randomLetter_counterfactual"
416
+ ],
417
+ "score": 0.14285714285714285
418
+ }
419
+ ]
420
+ },
421
+ {
422
+ "intervention": [
423
+ "output_location"
424
+ ],
425
+ "counterfactual_scores": [
426
+ {
427
+ "counterfactual": [
428
+ "randomLetter_counterfactual"
429
+ ],
430
+ "score": 0.8571428571428571
431
+ },
432
+ {
433
+ "counterfactual": [
434
+ "answerPosition_counterfactual"
435
+ ],
436
+ "score": 0.2857142857142857
437
+ },
438
+ {
439
+ "counterfactual": [
440
+ "answerPosition_randomLetter_counterfactual"
441
+ ],
442
+ "score": 0.07142857142857142
443
+ }
444
+ ]
445
+ }
446
+ ]
447
+ },
448
+ {
449
+ "layer": "8",
450
+ "layer_scores": [
451
+ {
452
+ "intervention": [
453
+ "output_token"
454
+ ],
455
+ "counterfactual_scores": [
456
+ {
457
+ "counterfactual": [
458
+ "randomLetter_counterfactual"
459
+ ],
460
+ "score": 0.07142857142857142
461
+ },
462
+ {
463
+ "counterfactual": [
464
+ "answerPosition_counterfactual"
465
+ ],
466
+ "score": 0.2857142857142857
467
+ },
468
+ {
469
+ "counterfactual": [
470
+ "answerPosition_randomLetter_counterfactual"
471
+ ],
472
+ "score": 0.14285714285714285
473
+ }
474
+ ]
475
+ },
476
+ {
477
+ "intervention": [
478
+ "output_location"
479
+ ],
480
+ "counterfactual_scores": [
481
+ {
482
+ "counterfactual": [
483
+ "randomLetter_counterfactual"
484
+ ],
485
+ "score": 0.7857142857142857
486
+ },
487
+ {
488
+ "counterfactual": [
489
+ "answerPosition_counterfactual"
490
+ ],
491
+ "score": 0.2857142857142857
492
+ },
493
+ {
494
+ "counterfactual": [
495
+ "answerPosition_randomLetter_counterfactual"
496
+ ],
497
+ "score": 0.07142857142857142
498
+ }
499
+ ]
500
+ }
501
+ ]
502
+ },
503
+ {
504
+ "layer": "9",
505
+ "layer_scores": [
506
+ {
507
+ "intervention": [
508
+ "output_token"
509
+ ],
510
+ "counterfactual_scores": [
511
+ {
512
+ "counterfactual": [
513
+ "randomLetter_counterfactual"
514
+ ],
515
+ "score": 0.21428571428571427
516
+ },
517
+ {
518
+ "counterfactual": [
519
+ "answerPosition_counterfactual"
520
+ ],
521
+ "score": 0.35714285714285715
522
+ },
523
+ {
524
+ "counterfactual": [
525
+ "answerPosition_randomLetter_counterfactual"
526
+ ],
527
+ "score": 0.14285714285714285
528
+ }
529
+ ]
530
+ },
531
+ {
532
+ "intervention": [
533
+ "output_location"
534
+ ],
535
+ "counterfactual_scores": [
536
+ {
537
+ "counterfactual": [
538
+ "randomLetter_counterfactual"
539
+ ],
540
+ "score": 0.6428571428571429
541
+ },
542
+ {
543
+ "counterfactual": [
544
+ "answerPosition_counterfactual"
545
+ ],
546
+ "score": 0.35714285714285715
547
+ },
548
+ {
549
+ "counterfactual": [
550
+ "answerPosition_randomLetter_counterfactual"
551
+ ],
552
+ "score": 0.07142857142857142
553
+ }
554
+ ]
555
+ }
556
+ ]
557
+ },
558
+ {
559
+ "layer": "10",
560
+ "layer_scores": [
561
+ {
562
+ "intervention": [
563
+ "output_token"
564
+ ],
565
+ "counterfactual_scores": [
566
+ {
567
+ "counterfactual": [
568
+ "randomLetter_counterfactual"
569
+ ],
570
+ "score": 0.0
571
+ },
572
+ {
573
+ "counterfactual": [
574
+ "answerPosition_counterfactual"
575
+ ],
576
+ "score": 0.2857142857142857
577
+ },
578
+ {
579
+ "counterfactual": [
580
+ "answerPosition_randomLetter_counterfactual"
581
+ ],
582
+ "score": 0.14285714285714285
583
+ }
584
+ ]
585
+ },
586
+ {
587
+ "intervention": [
588
+ "output_location"
589
+ ],
590
+ "counterfactual_scores": [
591
+ {
592
+ "counterfactual": [
593
+ "randomLetter_counterfactual"
594
+ ],
595
+ "score": 0.7142857142857143
596
+ },
597
+ {
598
+ "counterfactual": [
599
+ "answerPosition_counterfactual"
600
+ ],
601
+ "score": 0.2857142857142857
602
+ },
603
+ {
604
+ "counterfactual": [
605
+ "answerPosition_randomLetter_counterfactual"
606
+ ],
607
+ "score": 0.07142857142857142
608
+ }
609
+ ]
610
+ }
611
+ ]
612
+ },
613
+ {
614
+ "layer": "11",
615
+ "layer_scores": [
616
+ {
617
+ "intervention": [
618
+ "output_token"
619
+ ],
620
+ "counterfactual_scores": [
621
+ {
622
+ "counterfactual": [
623
+ "randomLetter_counterfactual"
624
+ ],
625
+ "score": 0.0
626
+ },
627
+ {
628
+ "counterfactual": [
629
+ "answerPosition_counterfactual"
630
+ ],
631
+ "score": 0.21428571428571427
632
+ },
633
+ {
634
+ "counterfactual": [
635
+ "answerPosition_randomLetter_counterfactual"
636
+ ],
637
+ "score": 0.07142857142857142
638
+ }
639
+ ]
640
+ },
641
+ {
642
+ "intervention": [
643
+ "output_location"
644
+ ],
645
+ "counterfactual_scores": [
646
+ {
647
+ "counterfactual": [
648
+ "randomLetter_counterfactual"
649
+ ],
650
+ "score": 0.7142857142857143
651
+ },
652
+ {
653
+ "counterfactual": [
654
+ "answerPosition_counterfactual"
655
+ ],
656
+ "score": 0.21428571428571427
657
+ },
658
+ {
659
+ "counterfactual": [
660
+ "answerPosition_randomLetter_counterfactual"
661
+ ],
662
+ "score": 0.0
663
+ }
664
+ ]
665
+ }
666
+ ]
667
+ },
668
+ {
669
+ "layer": "12",
670
+ "layer_scores": [
671
+ {
672
+ "intervention": [
673
+ "output_token"
674
+ ],
675
+ "counterfactual_scores": [
676
+ {
677
+ "counterfactual": [
678
+ "randomLetter_counterfactual"
679
+ ],
680
+ "score": 0.0
681
+ },
682
+ {
683
+ "counterfactual": [
684
+ "answerPosition_counterfactual"
685
+ ],
686
+ "score": 0.14285714285714285
687
+ },
688
+ {
689
+ "counterfactual": [
690
+ "answerPosition_randomLetter_counterfactual"
691
+ ],
692
+ "score": 0.07142857142857142
693
+ }
694
+ ]
695
+ },
696
+ {
697
+ "intervention": [
698
+ "output_location"
699
+ ],
700
+ "counterfactual_scores": [
701
+ {
702
+ "counterfactual": [
703
+ "randomLetter_counterfactual"
704
+ ],
705
+ "score": 0.8571428571428571
706
+ },
707
+ {
708
+ "counterfactual": [
709
+ "answerPosition_counterfactual"
710
+ ],
711
+ "score": 0.14285714285714285
712
+ },
713
+ {
714
+ "counterfactual": [
715
+ "answerPosition_randomLetter_counterfactual"
716
+ ],
717
+ "score": 0.0
718
+ }
719
+ ]
720
+ }
721
+ ]
722
+ },
723
+ {
724
+ "layer": "13",
725
+ "layer_scores": [
726
+ {
727
+ "intervention": [
728
+ "output_token"
729
+ ],
730
+ "counterfactual_scores": [
731
+ {
732
+ "counterfactual": [
733
+ "randomLetter_counterfactual"
734
+ ],
735
+ "score": 0.07142857142857142
736
+ },
737
+ {
738
+ "counterfactual": [
739
+ "answerPosition_counterfactual"
740
+ ],
741
+ "score": 0.14285714285714285
742
+ },
743
+ {
744
+ "counterfactual": [
745
+ "answerPosition_randomLetter_counterfactual"
746
+ ],
747
+ "score": 0.07142857142857142
748
+ }
749
+ ]
750
+ },
751
+ {
752
+ "intervention": [
753
+ "output_location"
754
+ ],
755
+ "counterfactual_scores": [
756
+ {
757
+ "counterfactual": [
758
+ "randomLetter_counterfactual"
759
+ ],
760
+ "score": 0.7857142857142857
761
+ },
762
+ {
763
+ "counterfactual": [
764
+ "answerPosition_counterfactual"
765
+ ],
766
+ "score": 0.14285714285714285
767
+ },
768
+ {
769
+ "counterfactual": [
770
+ "answerPosition_randomLetter_counterfactual"
771
+ ],
772
+ "score": 0.0
773
+ }
774
+ ]
775
+ }
776
+ ]
777
+ },
778
+ {
779
+ "layer": "14",
780
+ "layer_scores": [
781
+ {
782
+ "intervention": [
783
+ "output_token"
784
+ ],
785
+ "counterfactual_scores": [
786
+ {
787
+ "counterfactual": [
788
+ "randomLetter_counterfactual"
789
+ ],
790
+ "score": 0.14285714285714285
791
+ },
792
+ {
793
+ "counterfactual": [
794
+ "answerPosition_counterfactual"
795
+ ],
796
+ "score": 0.14285714285714285
797
+ },
798
+ {
799
+ "counterfactual": [
800
+ "answerPosition_randomLetter_counterfactual"
801
+ ],
802
+ "score": 0.07142857142857142
803
+ }
804
+ ]
805
+ },
806
+ {
807
+ "intervention": [
808
+ "output_location"
809
+ ],
810
+ "counterfactual_scores": [
811
+ {
812
+ "counterfactual": [
813
+ "randomLetter_counterfactual"
814
+ ],
815
+ "score": 0.7857142857142857
816
+ },
817
+ {
818
+ "counterfactual": [
819
+ "answerPosition_counterfactual"
820
+ ],
821
+ "score": 0.14285714285714285
822
+ },
823
+ {
824
+ "counterfactual": [
825
+ "answerPosition_randomLetter_counterfactual"
826
+ ],
827
+ "score": 0.0
828
+ }
829
+ ]
830
+ }
831
+ ]
832
+ },
833
+ {
834
+ "layer": "15",
835
+ "layer_scores": [
836
+ {
837
+ "intervention": [
838
+ "output_token"
839
+ ],
840
+ "counterfactual_scores": [
841
+ {
842
+ "counterfactual": [
843
+ "randomLetter_counterfactual"
844
+ ],
845
+ "score": 0.07142857142857142
846
+ },
847
+ {
848
+ "counterfactual": [
849
+ "answerPosition_counterfactual"
850
+ ],
851
+ "score": 0.0
852
+ },
853
+ {
854
+ "counterfactual": [
855
+ "answerPosition_randomLetter_counterfactual"
856
+ ],
857
+ "score": 0.07142857142857142
858
+ }
859
+ ]
860
+ },
861
+ {
862
+ "intervention": [
863
+ "output_location"
864
+ ],
865
+ "counterfactual_scores": [
866
+ {
867
+ "counterfactual": [
868
+ "randomLetter_counterfactual"
869
+ ],
870
+ "score": 0.8571428571428571
871
+ },
872
+ {
873
+ "counterfactual": [
874
+ "answerPosition_counterfactual"
875
+ ],
876
+ "score": 0.0
877
+ },
878
+ {
879
+ "counterfactual": [
880
+ "answerPosition_randomLetter_counterfactual"
881
+ ],
882
+ "score": 0.0
883
+ }
884
+ ]
885
+ }
886
+ ]
887
+ },
888
+ {
889
+ "layer": "16",
890
+ "layer_scores": [
891
+ {
892
+ "intervention": [
893
+ "output_token"
894
+ ],
895
+ "counterfactual_scores": [
896
+ {
897
+ "counterfactual": [
898
+ "randomLetter_counterfactual"
899
+ ],
900
+ "score": 0.0
901
+ },
902
+ {
903
+ "counterfactual": [
904
+ "answerPosition_counterfactual"
905
+ ],
906
+ "score": 0.0
907
+ },
908
+ {
909
+ "counterfactual": [
910
+ "answerPosition_randomLetter_counterfactual"
911
+ ],
912
+ "score": 0.0
913
+ }
914
+ ]
915
+ },
916
+ {
917
+ "intervention": [
918
+ "output_location"
919
+ ],
920
+ "counterfactual_scores": [
921
+ {
922
+ "counterfactual": [
923
+ "randomLetter_counterfactual"
924
+ ],
925
+ "score": 1.0
926
+ },
927
+ {
928
+ "counterfactual": [
929
+ "answerPosition_counterfactual"
930
+ ],
931
+ "score": 0.0
932
+ },
933
+ {
934
+ "counterfactual": [
935
+ "answerPosition_randomLetter_counterfactual"
936
+ ],
937
+ "score": 0.0
938
+ }
939
+ ]
940
+ }
941
+ ]
942
+ },
943
+ {
944
+ "layer": "17",
945
+ "layer_scores": [
946
+ {
947
+ "intervention": [
948
+ "output_token"
949
+ ],
950
+ "counterfactual_scores": [
951
+ {
952
+ "counterfactual": [
953
+ "randomLetter_counterfactual"
954
+ ],
955
+ "score": 0.0
956
+ },
957
+ {
958
+ "counterfactual": [
959
+ "answerPosition_counterfactual"
960
+ ],
961
+ "score": 0.0
962
+ },
963
+ {
964
+ "counterfactual": [
965
+ "answerPosition_randomLetter_counterfactual"
966
+ ],
967
+ "score": 0.0
968
+ }
969
+ ]
970
+ },
971
+ {
972
+ "intervention": [
973
+ "output_location"
974
+ ],
975
+ "counterfactual_scores": [
976
+ {
977
+ "counterfactual": [
978
+ "randomLetter_counterfactual"
979
+ ],
980
+ "score": 1.0
981
+ },
982
+ {
983
+ "counterfactual": [
984
+ "answerPosition_counterfactual"
985
+ ],
986
+ "score": 0.0
987
+ },
988
+ {
989
+ "counterfactual": [
990
+ "answerPosition_randomLetter_counterfactual"
991
+ ],
992
+ "score": 0.0
993
+ }
994
+ ]
995
+ }
996
+ ]
997
+ },
998
+ {
999
+ "layer": "18",
1000
+ "layer_scores": [
1001
+ {
1002
+ "intervention": [
1003
+ "output_token"
1004
+ ],
1005
+ "counterfactual_scores": [
1006
+ {
1007
+ "counterfactual": [
1008
+ "randomLetter_counterfactual"
1009
+ ],
1010
+ "score": 0.0
1011
+ },
1012
+ {
1013
+ "counterfactual": [
1014
+ "answerPosition_counterfactual"
1015
+ ],
1016
+ "score": 0.0
1017
+ },
1018
+ {
1019
+ "counterfactual": [
1020
+ "answerPosition_randomLetter_counterfactual"
1021
+ ],
1022
+ "score": 0.0
1023
+ }
1024
+ ]
1025
+ },
1026
+ {
1027
+ "intervention": [
1028
+ "output_location"
1029
+ ],
1030
+ "counterfactual_scores": [
1031
+ {
1032
+ "counterfactual": [
1033
+ "randomLetter_counterfactual"
1034
+ ],
1035
+ "score": 1.0
1036
+ },
1037
+ {
1038
+ "counterfactual": [
1039
+ "answerPosition_counterfactual"
1040
+ ],
1041
+ "score": 0.0
1042
+ },
1043
+ {
1044
+ "counterfactual": [
1045
+ "answerPosition_randomLetter_counterfactual"
1046
+ ],
1047
+ "score": 0.0
1048
+ }
1049
+ ]
1050
+ }
1051
+ ]
1052
+ },
1053
+ {
1054
+ "layer": "19",
1055
+ "layer_scores": [
1056
+ {
1057
+ "intervention": [
1058
+ "output_token"
1059
+ ],
1060
+ "counterfactual_scores": [
1061
+ {
1062
+ "counterfactual": [
1063
+ "randomLetter_counterfactual"
1064
+ ],
1065
+ "score": 0.0
1066
+ },
1067
+ {
1068
+ "counterfactual": [
1069
+ "answerPosition_counterfactual"
1070
+ ],
1071
+ "score": 0.0
1072
+ },
1073
+ {
1074
+ "counterfactual": [
1075
+ "answerPosition_randomLetter_counterfactual"
1076
+ ],
1077
+ "score": 0.0
1078
+ }
1079
+ ]
1080
+ },
1081
+ {
1082
+ "intervention": [
1083
+ "output_location"
1084
+ ],
1085
+ "counterfactual_scores": [
1086
+ {
1087
+ "counterfactual": [
1088
+ "randomLetter_counterfactual"
1089
+ ],
1090
+ "score": 1.0
1091
+ },
1092
+ {
1093
+ "counterfactual": [
1094
+ "answerPosition_counterfactual"
1095
+ ],
1096
+ "score": 0.0
1097
+ },
1098
+ {
1099
+ "counterfactual": [
1100
+ "answerPosition_randomLetter_counterfactual"
1101
+ ],
1102
+ "score": 0.0
1103
+ }
1104
+ ]
1105
+ }
1106
+ ]
1107
+ },
1108
+ {
1109
+ "layer": "20",
1110
+ "layer_scores": [
1111
+ {
1112
+ "intervention": [
1113
+ "output_token"
1114
+ ],
1115
+ "counterfactual_scores": [
1116
+ {
1117
+ "counterfactual": [
1118
+ "randomLetter_counterfactual"
1119
+ ],
1120
+ "score": 0.0
1121
+ },
1122
+ {
1123
+ "counterfactual": [
1124
+ "answerPosition_counterfactual"
1125
+ ],
1126
+ "score": 0.0
1127
+ },
1128
+ {
1129
+ "counterfactual": [
1130
+ "answerPosition_randomLetter_counterfactual"
1131
+ ],
1132
+ "score": 0.0
1133
+ }
1134
+ ]
1135
+ },
1136
+ {
1137
+ "intervention": [
1138
+ "output_location"
1139
+ ],
1140
+ "counterfactual_scores": [
1141
+ {
1142
+ "counterfactual": [
1143
+ "randomLetter_counterfactual"
1144
+ ],
1145
+ "score": 1.0
1146
+ },
1147
+ {
1148
+ "counterfactual": [
1149
+ "answerPosition_counterfactual"
1150
+ ],
1151
+ "score": 0.0
1152
+ },
1153
+ {
1154
+ "counterfactual": [
1155
+ "answerPosition_randomLetter_counterfactual"
1156
+ ],
1157
+ "score": 0.0
1158
+ }
1159
+ ]
1160
+ }
1161
+ ]
1162
+ },
1163
+ {
1164
+ "layer": "21",
1165
+ "layer_scores": [
1166
+ {
1167
+ "intervention": [
1168
+ "output_token"
1169
+ ],
1170
+ "counterfactual_scores": [
1171
+ {
1172
+ "counterfactual": [
1173
+ "randomLetter_counterfactual"
1174
+ ],
1175
+ "score": 0.0
1176
+ },
1177
+ {
1178
+ "counterfactual": [
1179
+ "answerPosition_counterfactual"
1180
+ ],
1181
+ "score": 0.0
1182
+ },
1183
+ {
1184
+ "counterfactual": [
1185
+ "answerPosition_randomLetter_counterfactual"
1186
+ ],
1187
+ "score": 0.0
1188
+ }
1189
+ ]
1190
+ },
1191
+ {
1192
+ "intervention": [
1193
+ "output_location"
1194
+ ],
1195
+ "counterfactual_scores": [
1196
+ {
1197
+ "counterfactual": [
1198
+ "randomLetter_counterfactual"
1199
+ ],
1200
+ "score": 1.0
1201
+ },
1202
+ {
1203
+ "counterfactual": [
1204
+ "answerPosition_counterfactual"
1205
+ ],
1206
+ "score": 0.0
1207
+ },
1208
+ {
1209
+ "counterfactual": [
1210
+ "answerPosition_randomLetter_counterfactual"
1211
+ ],
1212
+ "score": 0.0
1213
+ }
1214
+ ]
1215
+ }
1216
+ ]
1217
+ },
1218
+ {
1219
+ "layer": "22",
1220
+ "layer_scores": [
1221
+ {
1222
+ "intervention": [
1223
+ "output_token"
1224
+ ],
1225
+ "counterfactual_scores": [
1226
+ {
1227
+ "counterfactual": [
1228
+ "randomLetter_counterfactual"
1229
+ ],
1230
+ "score": 0.0
1231
+ },
1232
+ {
1233
+ "counterfactual": [
1234
+ "answerPosition_counterfactual"
1235
+ ],
1236
+ "score": 0.0
1237
+ },
1238
+ {
1239
+ "counterfactual": [
1240
+ "answerPosition_randomLetter_counterfactual"
1241
+ ],
1242
+ "score": 0.0
1243
+ }
1244
+ ]
1245
+ },
1246
+ {
1247
+ "intervention": [
1248
+ "output_location"
1249
+ ],
1250
+ "counterfactual_scores": [
1251
+ {
1252
+ "counterfactual": [
1253
+ "randomLetter_counterfactual"
1254
+ ],
1255
+ "score": 1.0
1256
+ },
1257
+ {
1258
+ "counterfactual": [
1259
+ "answerPosition_counterfactual"
1260
+ ],
1261
+ "score": 0.0
1262
+ },
1263
+ {
1264
+ "counterfactual": [
1265
+ "answerPosition_randomLetter_counterfactual"
1266
+ ],
1267
+ "score": 0.0
1268
+ }
1269
+ ]
1270
+ }
1271
+ ]
1272
+ },
1273
+ {
1274
+ "layer": "23",
1275
+ "layer_scores": [
1276
+ {
1277
+ "intervention": [
1278
+ "output_token"
1279
+ ],
1280
+ "counterfactual_scores": [
1281
+ {
1282
+ "counterfactual": [
1283
+ "randomLetter_counterfactual"
1284
+ ],
1285
+ "score": 0.0
1286
+ },
1287
+ {
1288
+ "counterfactual": [
1289
+ "answerPosition_counterfactual"
1290
+ ],
1291
+ "score": 0.0
1292
+ },
1293
+ {
1294
+ "counterfactual": [
1295
+ "answerPosition_randomLetter_counterfactual"
1296
+ ],
1297
+ "score": 0.0
1298
+ }
1299
+ ]
1300
+ },
1301
+ {
1302
+ "intervention": [
1303
+ "output_location"
1304
+ ],
1305
+ "counterfactual_scores": [
1306
+ {
1307
+ "counterfactual": [
1308
+ "randomLetter_counterfactual"
1309
+ ],
1310
+ "score": 1.0
1311
+ },
1312
+ {
1313
+ "counterfactual": [
1314
+ "answerPosition_counterfactual"
1315
+ ],
1316
+ "score": 0.0
1317
+ },
1318
+ {
1319
+ "counterfactual": [
1320
+ "answerPosition_randomLetter_counterfactual"
1321
+ ],
1322
+ "score": 0.0
1323
+ }
1324
+ ]
1325
+ }
1326
+ ]
1327
+ }
1328
+ ]
1329
+ }
1330
+ }
1331
+ ]
1332
+ }
eval-results-mib-causalgraph/submissions/MCQA_results_Qwen_last_correct_choice_token.json ADDED
@@ -0,0 +1,1332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "method_name": "full_vector",
3
+ "results": [
4
+ {
5
+ "model_id": "Qwen2ForCausalLM",
6
+ "task_scores": {
7
+ "MCQA": [
8
+ {
9
+ "layer": "0",
10
+ "layer_scores": [
11
+ {
12
+ "intervention": [
13
+ "output_token"
14
+ ],
15
+ "counterfactual_scores": [
16
+ {
17
+ "counterfactual": [
18
+ "randomLetter_counterfactual"
19
+ ],
20
+ "score": 0.0
21
+ },
22
+ {
23
+ "counterfactual": [
24
+ "answerPosition_counterfactual"
25
+ ],
26
+ "score": 0.0
27
+ },
28
+ {
29
+ "counterfactual": [
30
+ "answerPosition_randomLetter_counterfactual"
31
+ ],
32
+ "score": 0.0
33
+ }
34
+ ]
35
+ },
36
+ {
37
+ "intervention": [
38
+ "output_location"
39
+ ],
40
+ "counterfactual_scores": [
41
+ {
42
+ "counterfactual": [
43
+ "randomLetter_counterfactual"
44
+ ],
45
+ "score": 1.0
46
+ },
47
+ {
48
+ "counterfactual": [
49
+ "answerPosition_counterfactual"
50
+ ],
51
+ "score": 0.0
52
+ },
53
+ {
54
+ "counterfactual": [
55
+ "answerPosition_randomLetter_counterfactual"
56
+ ],
57
+ "score": 0.0
58
+ }
59
+ ]
60
+ }
61
+ ]
62
+ },
63
+ {
64
+ "layer": "1",
65
+ "layer_scores": [
66
+ {
67
+ "intervention": [
68
+ "output_token"
69
+ ],
70
+ "counterfactual_scores": [
71
+ {
72
+ "counterfactual": [
73
+ "randomLetter_counterfactual"
74
+ ],
75
+ "score": 0.0
76
+ },
77
+ {
78
+ "counterfactual": [
79
+ "answerPosition_counterfactual"
80
+ ],
81
+ "score": 0.0
82
+ },
83
+ {
84
+ "counterfactual": [
85
+ "answerPosition_randomLetter_counterfactual"
86
+ ],
87
+ "score": 0.0
88
+ }
89
+ ]
90
+ },
91
+ {
92
+ "intervention": [
93
+ "output_location"
94
+ ],
95
+ "counterfactual_scores": [
96
+ {
97
+ "counterfactual": [
98
+ "randomLetter_counterfactual"
99
+ ],
100
+ "score": 1.0
101
+ },
102
+ {
103
+ "counterfactual": [
104
+ "answerPosition_counterfactual"
105
+ ],
106
+ "score": 0.0
107
+ },
108
+ {
109
+ "counterfactual": [
110
+ "answerPosition_randomLetter_counterfactual"
111
+ ],
112
+ "score": 0.0
113
+ }
114
+ ]
115
+ }
116
+ ]
117
+ },
118
+ {
119
+ "layer": "2",
120
+ "layer_scores": [
121
+ {
122
+ "intervention": [
123
+ "output_token"
124
+ ],
125
+ "counterfactual_scores": [
126
+ {
127
+ "counterfactual": [
128
+ "randomLetter_counterfactual"
129
+ ],
130
+ "score": 0.0
131
+ },
132
+ {
133
+ "counterfactual": [
134
+ "answerPosition_counterfactual"
135
+ ],
136
+ "score": 0.0
137
+ },
138
+ {
139
+ "counterfactual": [
140
+ "answerPosition_randomLetter_counterfactual"
141
+ ],
142
+ "score": 0.0
143
+ }
144
+ ]
145
+ },
146
+ {
147
+ "intervention": [
148
+ "output_location"
149
+ ],
150
+ "counterfactual_scores": [
151
+ {
152
+ "counterfactual": [
153
+ "randomLetter_counterfactual"
154
+ ],
155
+ "score": 1.0
156
+ },
157
+ {
158
+ "counterfactual": [
159
+ "answerPosition_counterfactual"
160
+ ],
161
+ "score": 0.0
162
+ },
163
+ {
164
+ "counterfactual": [
165
+ "answerPosition_randomLetter_counterfactual"
166
+ ],
167
+ "score": 0.0
168
+ }
169
+ ]
170
+ }
171
+ ]
172
+ },
173
+ {
174
+ "layer": "3",
175
+ "layer_scores": [
176
+ {
177
+ "intervention": [
178
+ "output_token"
179
+ ],
180
+ "counterfactual_scores": [
181
+ {
182
+ "counterfactual": [
183
+ "randomLetter_counterfactual"
184
+ ],
185
+ "score": 0.0
186
+ },
187
+ {
188
+ "counterfactual": [
189
+ "answerPosition_counterfactual"
190
+ ],
191
+ "score": 0.0
192
+ },
193
+ {
194
+ "counterfactual": [
195
+ "answerPosition_randomLetter_counterfactual"
196
+ ],
197
+ "score": 0.0
198
+ }
199
+ ]
200
+ },
201
+ {
202
+ "intervention": [
203
+ "output_location"
204
+ ],
205
+ "counterfactual_scores": [
206
+ {
207
+ "counterfactual": [
208
+ "randomLetter_counterfactual"
209
+ ],
210
+ "score": 1.0
211
+ },
212
+ {
213
+ "counterfactual": [
214
+ "answerPosition_counterfactual"
215
+ ],
216
+ "score": 0.0
217
+ },
218
+ {
219
+ "counterfactual": [
220
+ "answerPosition_randomLetter_counterfactual"
221
+ ],
222
+ "score": 0.0
223
+ }
224
+ ]
225
+ }
226
+ ]
227
+ },
228
+ {
229
+ "layer": "4",
230
+ "layer_scores": [
231
+ {
232
+ "intervention": [
233
+ "output_token"
234
+ ],
235
+ "counterfactual_scores": [
236
+ {
237
+ "counterfactual": [
238
+ "randomLetter_counterfactual"
239
+ ],
240
+ "score": 0.0
241
+ },
242
+ {
243
+ "counterfactual": [
244
+ "answerPosition_counterfactual"
245
+ ],
246
+ "score": 0.0
247
+ },
248
+ {
249
+ "counterfactual": [
250
+ "answerPosition_randomLetter_counterfactual"
251
+ ],
252
+ "score": 0.0
253
+ }
254
+ ]
255
+ },
256
+ {
257
+ "intervention": [
258
+ "output_location"
259
+ ],
260
+ "counterfactual_scores": [
261
+ {
262
+ "counterfactual": [
263
+ "randomLetter_counterfactual"
264
+ ],
265
+ "score": 1.0
266
+ },
267
+ {
268
+ "counterfactual": [
269
+ "answerPosition_counterfactual"
270
+ ],
271
+ "score": 0.0
272
+ },
273
+ {
274
+ "counterfactual": [
275
+ "answerPosition_randomLetter_counterfactual"
276
+ ],
277
+ "score": 0.0
278
+ }
279
+ ]
280
+ }
281
+ ]
282
+ },
283
+ {
284
+ "layer": "5",
285
+ "layer_scores": [
286
+ {
287
+ "intervention": [
288
+ "output_token"
289
+ ],
290
+ "counterfactual_scores": [
291
+ {
292
+ "counterfactual": [
293
+ "randomLetter_counterfactual"
294
+ ],
295
+ "score": 0.0
296
+ },
297
+ {
298
+ "counterfactual": [
299
+ "answerPosition_counterfactual"
300
+ ],
301
+ "score": 0.0
302
+ },
303
+ {
304
+ "counterfactual": [
305
+ "answerPosition_randomLetter_counterfactual"
306
+ ],
307
+ "score": 0.0
308
+ }
309
+ ]
310
+ },
311
+ {
312
+ "intervention": [
313
+ "output_location"
314
+ ],
315
+ "counterfactual_scores": [
316
+ {
317
+ "counterfactual": [
318
+ "randomLetter_counterfactual"
319
+ ],
320
+ "score": 1.0
321
+ },
322
+ {
323
+ "counterfactual": [
324
+ "answerPosition_counterfactual"
325
+ ],
326
+ "score": 0.0
327
+ },
328
+ {
329
+ "counterfactual": [
330
+ "answerPosition_randomLetter_counterfactual"
331
+ ],
332
+ "score": 0.0
333
+ }
334
+ ]
335
+ }
336
+ ]
337
+ },
338
+ {
339
+ "layer": "6",
340
+ "layer_scores": [
341
+ {
342
+ "intervention": [
343
+ "output_token"
344
+ ],
345
+ "counterfactual_scores": [
346
+ {
347
+ "counterfactual": [
348
+ "randomLetter_counterfactual"
349
+ ],
350
+ "score": 0.0
351
+ },
352
+ {
353
+ "counterfactual": [
354
+ "answerPosition_counterfactual"
355
+ ],
356
+ "score": 0.0
357
+ },
358
+ {
359
+ "counterfactual": [
360
+ "answerPosition_randomLetter_counterfactual"
361
+ ],
362
+ "score": 0.0
363
+ }
364
+ ]
365
+ },
366
+ {
367
+ "intervention": [
368
+ "output_location"
369
+ ],
370
+ "counterfactual_scores": [
371
+ {
372
+ "counterfactual": [
373
+ "randomLetter_counterfactual"
374
+ ],
375
+ "score": 1.0
376
+ },
377
+ {
378
+ "counterfactual": [
379
+ "answerPosition_counterfactual"
380
+ ],
381
+ "score": 0.0
382
+ },
383
+ {
384
+ "counterfactual": [
385
+ "answerPosition_randomLetter_counterfactual"
386
+ ],
387
+ "score": 0.0
388
+ }
389
+ ]
390
+ }
391
+ ]
392
+ },
393
+ {
394
+ "layer": "7",
395
+ "layer_scores": [
396
+ {
397
+ "intervention": [
398
+ "output_token"
399
+ ],
400
+ "counterfactual_scores": [
401
+ {
402
+ "counterfactual": [
403
+ "randomLetter_counterfactual"
404
+ ],
405
+ "score": 0.0
406
+ },
407
+ {
408
+ "counterfactual": [
409
+ "answerPosition_counterfactual"
410
+ ],
411
+ "score": 0.0
412
+ },
413
+ {
414
+ "counterfactual": [
415
+ "answerPosition_randomLetter_counterfactual"
416
+ ],
417
+ "score": 0.0
418
+ }
419
+ ]
420
+ },
421
+ {
422
+ "intervention": [
423
+ "output_location"
424
+ ],
425
+ "counterfactual_scores": [
426
+ {
427
+ "counterfactual": [
428
+ "randomLetter_counterfactual"
429
+ ],
430
+ "score": 1.0
431
+ },
432
+ {
433
+ "counterfactual": [
434
+ "answerPosition_counterfactual"
435
+ ],
436
+ "score": 0.0
437
+ },
438
+ {
439
+ "counterfactual": [
440
+ "answerPosition_randomLetter_counterfactual"
441
+ ],
442
+ "score": 0.0
443
+ }
444
+ ]
445
+ }
446
+ ]
447
+ },
448
+ {
449
+ "layer": "8",
450
+ "layer_scores": [
451
+ {
452
+ "intervention": [
453
+ "output_token"
454
+ ],
455
+ "counterfactual_scores": [
456
+ {
457
+ "counterfactual": [
458
+ "randomLetter_counterfactual"
459
+ ],
460
+ "score": 0.0
461
+ },
462
+ {
463
+ "counterfactual": [
464
+ "answerPosition_counterfactual"
465
+ ],
466
+ "score": 0.0
467
+ },
468
+ {
469
+ "counterfactual": [
470
+ "answerPosition_randomLetter_counterfactual"
471
+ ],
472
+ "score": 0.0
473
+ }
474
+ ]
475
+ },
476
+ {
477
+ "intervention": [
478
+ "output_location"
479
+ ],
480
+ "counterfactual_scores": [
481
+ {
482
+ "counterfactual": [
483
+ "randomLetter_counterfactual"
484
+ ],
485
+ "score": 1.0
486
+ },
487
+ {
488
+ "counterfactual": [
489
+ "answerPosition_counterfactual"
490
+ ],
491
+ "score": 0.0
492
+ },
493
+ {
494
+ "counterfactual": [
495
+ "answerPosition_randomLetter_counterfactual"
496
+ ],
497
+ "score": 0.0
498
+ }
499
+ ]
500
+ }
501
+ ]
502
+ },
503
+ {
504
+ "layer": "9",
505
+ "layer_scores": [
506
+ {
507
+ "intervention": [
508
+ "output_token"
509
+ ],
510
+ "counterfactual_scores": [
511
+ {
512
+ "counterfactual": [
513
+ "randomLetter_counterfactual"
514
+ ],
515
+ "score": 0.0
516
+ },
517
+ {
518
+ "counterfactual": [
519
+ "answerPosition_counterfactual"
520
+ ],
521
+ "score": 0.0
522
+ },
523
+ {
524
+ "counterfactual": [
525
+ "answerPosition_randomLetter_counterfactual"
526
+ ],
527
+ "score": 0.0
528
+ }
529
+ ]
530
+ },
531
+ {
532
+ "intervention": [
533
+ "output_location"
534
+ ],
535
+ "counterfactual_scores": [
536
+ {
537
+ "counterfactual": [
538
+ "randomLetter_counterfactual"
539
+ ],
540
+ "score": 1.0
541
+ },
542
+ {
543
+ "counterfactual": [
544
+ "answerPosition_counterfactual"
545
+ ],
546
+ "score": 0.0
547
+ },
548
+ {
549
+ "counterfactual": [
550
+ "answerPosition_randomLetter_counterfactual"
551
+ ],
552
+ "score": 0.0
553
+ }
554
+ ]
555
+ }
556
+ ]
557
+ },
558
+ {
559
+ "layer": "10",
560
+ "layer_scores": [
561
+ {
562
+ "intervention": [
563
+ "output_token"
564
+ ],
565
+ "counterfactual_scores": [
566
+ {
567
+ "counterfactual": [
568
+ "randomLetter_counterfactual"
569
+ ],
570
+ "score": 0.0
571
+ },
572
+ {
573
+ "counterfactual": [
574
+ "answerPosition_counterfactual"
575
+ ],
576
+ "score": 0.0
577
+ },
578
+ {
579
+ "counterfactual": [
580
+ "answerPosition_randomLetter_counterfactual"
581
+ ],
582
+ "score": 0.0
583
+ }
584
+ ]
585
+ },
586
+ {
587
+ "intervention": [
588
+ "output_location"
589
+ ],
590
+ "counterfactual_scores": [
591
+ {
592
+ "counterfactual": [
593
+ "randomLetter_counterfactual"
594
+ ],
595
+ "score": 1.0
596
+ },
597
+ {
598
+ "counterfactual": [
599
+ "answerPosition_counterfactual"
600
+ ],
601
+ "score": 0.0
602
+ },
603
+ {
604
+ "counterfactual": [
605
+ "answerPosition_randomLetter_counterfactual"
606
+ ],
607
+ "score": 0.0
608
+ }
609
+ ]
610
+ }
611
+ ]
612
+ },
613
+ {
614
+ "layer": "11",
615
+ "layer_scores": [
616
+ {
617
+ "intervention": [
618
+ "output_token"
619
+ ],
620
+ "counterfactual_scores": [
621
+ {
622
+ "counterfactual": [
623
+ "randomLetter_counterfactual"
624
+ ],
625
+ "score": 0.0
626
+ },
627
+ {
628
+ "counterfactual": [
629
+ "answerPosition_counterfactual"
630
+ ],
631
+ "score": 0.0
632
+ },
633
+ {
634
+ "counterfactual": [
635
+ "answerPosition_randomLetter_counterfactual"
636
+ ],
637
+ "score": 0.0
638
+ }
639
+ ]
640
+ },
641
+ {
642
+ "intervention": [
643
+ "output_location"
644
+ ],
645
+ "counterfactual_scores": [
646
+ {
647
+ "counterfactual": [
648
+ "randomLetter_counterfactual"
649
+ ],
650
+ "score": 1.0
651
+ },
652
+ {
653
+ "counterfactual": [
654
+ "answerPosition_counterfactual"
655
+ ],
656
+ "score": 0.0
657
+ },
658
+ {
659
+ "counterfactual": [
660
+ "answerPosition_randomLetter_counterfactual"
661
+ ],
662
+ "score": 0.0
663
+ }
664
+ ]
665
+ }
666
+ ]
667
+ },
668
+ {
669
+ "layer": "12",
670
+ "layer_scores": [
671
+ {
672
+ "intervention": [
673
+ "output_token"
674
+ ],
675
+ "counterfactual_scores": [
676
+ {
677
+ "counterfactual": [
678
+ "randomLetter_counterfactual"
679
+ ],
680
+ "score": 0.0
681
+ },
682
+ {
683
+ "counterfactual": [
684
+ "answerPosition_counterfactual"
685
+ ],
686
+ "score": 0.0
687
+ },
688
+ {
689
+ "counterfactual": [
690
+ "answerPosition_randomLetter_counterfactual"
691
+ ],
692
+ "score": 0.0
693
+ }
694
+ ]
695
+ },
696
+ {
697
+ "intervention": [
698
+ "output_location"
699
+ ],
700
+ "counterfactual_scores": [
701
+ {
702
+ "counterfactual": [
703
+ "randomLetter_counterfactual"
704
+ ],
705
+ "score": 1.0
706
+ },
707
+ {
708
+ "counterfactual": [
709
+ "answerPosition_counterfactual"
710
+ ],
711
+ "score": 0.0
712
+ },
713
+ {
714
+ "counterfactual": [
715
+ "answerPosition_randomLetter_counterfactual"
716
+ ],
717
+ "score": 0.0
718
+ }
719
+ ]
720
+ }
721
+ ]
722
+ },
723
+ {
724
+ "layer": "13",
725
+ "layer_scores": [
726
+ {
727
+ "intervention": [
728
+ "output_token"
729
+ ],
730
+ "counterfactual_scores": [
731
+ {
732
+ "counterfactual": [
733
+ "randomLetter_counterfactual"
734
+ ],
735
+ "score": 0.0
736
+ },
737
+ {
738
+ "counterfactual": [
739
+ "answerPosition_counterfactual"
740
+ ],
741
+ "score": 0.0
742
+ },
743
+ {
744
+ "counterfactual": [
745
+ "answerPosition_randomLetter_counterfactual"
746
+ ],
747
+ "score": 0.0
748
+ }
749
+ ]
750
+ },
751
+ {
752
+ "intervention": [
753
+ "output_location"
754
+ ],
755
+ "counterfactual_scores": [
756
+ {
757
+ "counterfactual": [
758
+ "randomLetter_counterfactual"
759
+ ],
760
+ "score": 1.0
761
+ },
762
+ {
763
+ "counterfactual": [
764
+ "answerPosition_counterfactual"
765
+ ],
766
+ "score": 0.0
767
+ },
768
+ {
769
+ "counterfactual": [
770
+ "answerPosition_randomLetter_counterfactual"
771
+ ],
772
+ "score": 0.0
773
+ }
774
+ ]
775
+ }
776
+ ]
777
+ },
778
+ {
779
+ "layer": "14",
780
+ "layer_scores": [
781
+ {
782
+ "intervention": [
783
+ "output_token"
784
+ ],
785
+ "counterfactual_scores": [
786
+ {
787
+ "counterfactual": [
788
+ "randomLetter_counterfactual"
789
+ ],
790
+ "score": 0.0
791
+ },
792
+ {
793
+ "counterfactual": [
794
+ "answerPosition_counterfactual"
795
+ ],
796
+ "score": 0.0
797
+ },
798
+ {
799
+ "counterfactual": [
800
+ "answerPosition_randomLetter_counterfactual"
801
+ ],
802
+ "score": 0.0
803
+ }
804
+ ]
805
+ },
806
+ {
807
+ "intervention": [
808
+ "output_location"
809
+ ],
810
+ "counterfactual_scores": [
811
+ {
812
+ "counterfactual": [
813
+ "randomLetter_counterfactual"
814
+ ],
815
+ "score": 1.0
816
+ },
817
+ {
818
+ "counterfactual": [
819
+ "answerPosition_counterfactual"
820
+ ],
821
+ "score": 0.0
822
+ },
823
+ {
824
+ "counterfactual": [
825
+ "answerPosition_randomLetter_counterfactual"
826
+ ],
827
+ "score": 0.0
828
+ }
829
+ ]
830
+ }
831
+ ]
832
+ },
833
+ {
834
+ "layer": "15",
835
+ "layer_scores": [
836
+ {
837
+ "intervention": [
838
+ "output_token"
839
+ ],
840
+ "counterfactual_scores": [
841
+ {
842
+ "counterfactual": [
843
+ "randomLetter_counterfactual"
844
+ ],
845
+ "score": 0.0
846
+ },
847
+ {
848
+ "counterfactual": [
849
+ "answerPosition_counterfactual"
850
+ ],
851
+ "score": 0.0
852
+ },
853
+ {
854
+ "counterfactual": [
855
+ "answerPosition_randomLetter_counterfactual"
856
+ ],
857
+ "score": 0.0
858
+ }
859
+ ]
860
+ },
861
+ {
862
+ "intervention": [
863
+ "output_location"
864
+ ],
865
+ "counterfactual_scores": [
866
+ {
867
+ "counterfactual": [
868
+ "randomLetter_counterfactual"
869
+ ],
870
+ "score": 1.0
871
+ },
872
+ {
873
+ "counterfactual": [
874
+ "answerPosition_counterfactual"
875
+ ],
876
+ "score": 0.0
877
+ },
878
+ {
879
+ "counterfactual": [
880
+ "answerPosition_randomLetter_counterfactual"
881
+ ],
882
+ "score": 0.0
883
+ }
884
+ ]
885
+ }
886
+ ]
887
+ },
888
+ {
889
+ "layer": "16",
890
+ "layer_scores": [
891
+ {
892
+ "intervention": [
893
+ "output_token"
894
+ ],
895
+ "counterfactual_scores": [
896
+ {
897
+ "counterfactual": [
898
+ "randomLetter_counterfactual"
899
+ ],
900
+ "score": 0.0
901
+ },
902
+ {
903
+ "counterfactual": [
904
+ "answerPosition_counterfactual"
905
+ ],
906
+ "score": 0.0
907
+ },
908
+ {
909
+ "counterfactual": [
910
+ "answerPosition_randomLetter_counterfactual"
911
+ ],
912
+ "score": 0.0
913
+ }
914
+ ]
915
+ },
916
+ {
917
+ "intervention": [
918
+ "output_location"
919
+ ],
920
+ "counterfactual_scores": [
921
+ {
922
+ "counterfactual": [
923
+ "randomLetter_counterfactual"
924
+ ],
925
+ "score": 1.0
926
+ },
927
+ {
928
+ "counterfactual": [
929
+ "answerPosition_counterfactual"
930
+ ],
931
+ "score": 0.0
932
+ },
933
+ {
934
+ "counterfactual": [
935
+ "answerPosition_randomLetter_counterfactual"
936
+ ],
937
+ "score": 0.0
938
+ }
939
+ ]
940
+ }
941
+ ]
942
+ },
943
+ {
944
+ "layer": "17",
945
+ "layer_scores": [
946
+ {
947
+ "intervention": [
948
+ "output_token"
949
+ ],
950
+ "counterfactual_scores": [
951
+ {
952
+ "counterfactual": [
953
+ "randomLetter_counterfactual"
954
+ ],
955
+ "score": 0.0
956
+ },
957
+ {
958
+ "counterfactual": [
959
+ "answerPosition_counterfactual"
960
+ ],
961
+ "score": 0.0
962
+ },
963
+ {
964
+ "counterfactual": [
965
+ "answerPosition_randomLetter_counterfactual"
966
+ ],
967
+ "score": 0.0
968
+ }
969
+ ]
970
+ },
971
+ {
972
+ "intervention": [
973
+ "output_location"
974
+ ],
975
+ "counterfactual_scores": [
976
+ {
977
+ "counterfactual": [
978
+ "randomLetter_counterfactual"
979
+ ],
980
+ "score": 1.0
981
+ },
982
+ {
983
+ "counterfactual": [
984
+ "answerPosition_counterfactual"
985
+ ],
986
+ "score": 0.0
987
+ },
988
+ {
989
+ "counterfactual": [
990
+ "answerPosition_randomLetter_counterfactual"
991
+ ],
992
+ "score": 0.0
993
+ }
994
+ ]
995
+ }
996
+ ]
997
+ },
998
+ {
999
+ "layer": "18",
1000
+ "layer_scores": [
1001
+ {
1002
+ "intervention": [
1003
+ "output_token"
1004
+ ],
1005
+ "counterfactual_scores": [
1006
+ {
1007
+ "counterfactual": [
1008
+ "randomLetter_counterfactual"
1009
+ ],
1010
+ "score": 0.0
1011
+ },
1012
+ {
1013
+ "counterfactual": [
1014
+ "answerPosition_counterfactual"
1015
+ ],
1016
+ "score": 0.0
1017
+ },
1018
+ {
1019
+ "counterfactual": [
1020
+ "answerPosition_randomLetter_counterfactual"
1021
+ ],
1022
+ "score": 0.0
1023
+ }
1024
+ ]
1025
+ },
1026
+ {
1027
+ "intervention": [
1028
+ "output_location"
1029
+ ],
1030
+ "counterfactual_scores": [
1031
+ {
1032
+ "counterfactual": [
1033
+ "randomLetter_counterfactual"
1034
+ ],
1035
+ "score": 1.0
1036
+ },
1037
+ {
1038
+ "counterfactual": [
1039
+ "answerPosition_counterfactual"
1040
+ ],
1041
+ "score": 0.0
1042
+ },
1043
+ {
1044
+ "counterfactual": [
1045
+ "answerPosition_randomLetter_counterfactual"
1046
+ ],
1047
+ "score": 0.0
1048
+ }
1049
+ ]
1050
+ }
1051
+ ]
1052
+ },
1053
+ {
1054
+ "layer": "19",
1055
+ "layer_scores": [
1056
+ {
1057
+ "intervention": [
1058
+ "output_token"
1059
+ ],
1060
+ "counterfactual_scores": [
1061
+ {
1062
+ "counterfactual": [
1063
+ "randomLetter_counterfactual"
1064
+ ],
1065
+ "score": 0.0
1066
+ },
1067
+ {
1068
+ "counterfactual": [
1069
+ "answerPosition_counterfactual"
1070
+ ],
1071
+ "score": 0.0
1072
+ },
1073
+ {
1074
+ "counterfactual": [
1075
+ "answerPosition_randomLetter_counterfactual"
1076
+ ],
1077
+ "score": 0.0
1078
+ }
1079
+ ]
1080
+ },
1081
+ {
1082
+ "intervention": [
1083
+ "output_location"
1084
+ ],
1085
+ "counterfactual_scores": [
1086
+ {
1087
+ "counterfactual": [
1088
+ "randomLetter_counterfactual"
1089
+ ],
1090
+ "score": 1.0
1091
+ },
1092
+ {
1093
+ "counterfactual": [
1094
+ "answerPosition_counterfactual"
1095
+ ],
1096
+ "score": 0.0
1097
+ },
1098
+ {
1099
+ "counterfactual": [
1100
+ "answerPosition_randomLetter_counterfactual"
1101
+ ],
1102
+ "score": 0.0
1103
+ }
1104
+ ]
1105
+ }
1106
+ ]
1107
+ },
1108
+ {
1109
+ "layer": "20",
1110
+ "layer_scores": [
1111
+ {
1112
+ "intervention": [
1113
+ "output_token"
1114
+ ],
1115
+ "counterfactual_scores": [
1116
+ {
1117
+ "counterfactual": [
1118
+ "randomLetter_counterfactual"
1119
+ ],
1120
+ "score": 0.0
1121
+ },
1122
+ {
1123
+ "counterfactual": [
1124
+ "answerPosition_counterfactual"
1125
+ ],
1126
+ "score": 0.0
1127
+ },
1128
+ {
1129
+ "counterfactual": [
1130
+ "answerPosition_randomLetter_counterfactual"
1131
+ ],
1132
+ "score": 0.0
1133
+ }
1134
+ ]
1135
+ },
1136
+ {
1137
+ "intervention": [
1138
+ "output_location"
1139
+ ],
1140
+ "counterfactual_scores": [
1141
+ {
1142
+ "counterfactual": [
1143
+ "randomLetter_counterfactual"
1144
+ ],
1145
+ "score": 1.0
1146
+ },
1147
+ {
1148
+ "counterfactual": [
1149
+ "answerPosition_counterfactual"
1150
+ ],
1151
+ "score": 0.0
1152
+ },
1153
+ {
1154
+ "counterfactual": [
1155
+ "answerPosition_randomLetter_counterfactual"
1156
+ ],
1157
+ "score": 0.0
1158
+ }
1159
+ ]
1160
+ }
1161
+ ]
1162
+ },
1163
+ {
1164
+ "layer": "21",
1165
+ "layer_scores": [
1166
+ {
1167
+ "intervention": [
1168
+ "output_token"
1169
+ ],
1170
+ "counterfactual_scores": [
1171
+ {
1172
+ "counterfactual": [
1173
+ "randomLetter_counterfactual"
1174
+ ],
1175
+ "score": 0.0
1176
+ },
1177
+ {
1178
+ "counterfactual": [
1179
+ "answerPosition_counterfactual"
1180
+ ],
1181
+ "score": 0.0
1182
+ },
1183
+ {
1184
+ "counterfactual": [
1185
+ "answerPosition_randomLetter_counterfactual"
1186
+ ],
1187
+ "score": 0.0
1188
+ }
1189
+ ]
1190
+ },
1191
+ {
1192
+ "intervention": [
1193
+ "output_location"
1194
+ ],
1195
+ "counterfactual_scores": [
1196
+ {
1197
+ "counterfactual": [
1198
+ "randomLetter_counterfactual"
1199
+ ],
1200
+ "score": 1.0
1201
+ },
1202
+ {
1203
+ "counterfactual": [
1204
+ "answerPosition_counterfactual"
1205
+ ],
1206
+ "score": 0.0
1207
+ },
1208
+ {
1209
+ "counterfactual": [
1210
+ "answerPosition_randomLetter_counterfactual"
1211
+ ],
1212
+ "score": 0.0
1213
+ }
1214
+ ]
1215
+ }
1216
+ ]
1217
+ },
1218
+ {
1219
+ "layer": "22",
1220
+ "layer_scores": [
1221
+ {
1222
+ "intervention": [
1223
+ "output_token"
1224
+ ],
1225
+ "counterfactual_scores": [
1226
+ {
1227
+ "counterfactual": [
1228
+ "randomLetter_counterfactual"
1229
+ ],
1230
+ "score": 0.0
1231
+ },
1232
+ {
1233
+ "counterfactual": [
1234
+ "answerPosition_counterfactual"
1235
+ ],
1236
+ "score": 0.0
1237
+ },
1238
+ {
1239
+ "counterfactual": [
1240
+ "answerPosition_randomLetter_counterfactual"
1241
+ ],
1242
+ "score": 0.0
1243
+ }
1244
+ ]
1245
+ },
1246
+ {
1247
+ "intervention": [
1248
+ "output_location"
1249
+ ],
1250
+ "counterfactual_scores": [
1251
+ {
1252
+ "counterfactual": [
1253
+ "randomLetter_counterfactual"
1254
+ ],
1255
+ "score": 1.0
1256
+ },
1257
+ {
1258
+ "counterfactual": [
1259
+ "answerPosition_counterfactual"
1260
+ ],
1261
+ "score": 0.0
1262
+ },
1263
+ {
1264
+ "counterfactual": [
1265
+ "answerPosition_randomLetter_counterfactual"
1266
+ ],
1267
+ "score": 0.0
1268
+ }
1269
+ ]
1270
+ }
1271
+ ]
1272
+ },
1273
+ {
1274
+ "layer": "23",
1275
+ "layer_scores": [
1276
+ {
1277
+ "intervention": [
1278
+ "output_token"
1279
+ ],
1280
+ "counterfactual_scores": [
1281
+ {
1282
+ "counterfactual": [
1283
+ "randomLetter_counterfactual"
1284
+ ],
1285
+ "score": 0.0
1286
+ },
1287
+ {
1288
+ "counterfactual": [
1289
+ "answerPosition_counterfactual"
1290
+ ],
1291
+ "score": 0.0
1292
+ },
1293
+ {
1294
+ "counterfactual": [
1295
+ "answerPosition_randomLetter_counterfactual"
1296
+ ],
1297
+ "score": 0.0
1298
+ }
1299
+ ]
1300
+ },
1301
+ {
1302
+ "intervention": [
1303
+ "output_location"
1304
+ ],
1305
+ "counterfactual_scores": [
1306
+ {
1307
+ "counterfactual": [
1308
+ "randomLetter_counterfactual"
1309
+ ],
1310
+ "score": 1.0
1311
+ },
1312
+ {
1313
+ "counterfactual": [
1314
+ "answerPosition_counterfactual"
1315
+ ],
1316
+ "score": 0.0
1317
+ },
1318
+ {
1319
+ "counterfactual": [
1320
+ "answerPosition_randomLetter_counterfactual"
1321
+ ],
1322
+ "score": 0.0
1323
+ }
1324
+ ]
1325
+ }
1326
+ ]
1327
+ }
1328
+ ]
1329
+ }
1330
+ }
1331
+ ]
1332
+ }
eval-results-mib-causalgraph/submissions/MCQA_results_Qwen_last_token.json ADDED
@@ -0,0 +1,1332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "method_name": "full_vector",
3
+ "results": [
4
+ {
5
+ "model_id": "Qwen2ForCausalLM",
6
+ "task_scores": {
7
+ "MCQA": [
8
+ {
9
+ "layer": "0",
10
+ "layer_scores": [
11
+ {
12
+ "intervention": [
13
+ "output_token"
14
+ ],
15
+ "counterfactual_scores": [
16
+ {
17
+ "counterfactual": [
18
+ "randomLetter_counterfactual"
19
+ ],
20
+ "score": 0.0
21
+ },
22
+ {
23
+ "counterfactual": [
24
+ "answerPosition_counterfactual"
25
+ ],
26
+ "score": 0.0
27
+ },
28
+ {
29
+ "counterfactual": [
30
+ "answerPosition_randomLetter_counterfactual"
31
+ ],
32
+ "score": 0.0
33
+ }
34
+ ]
35
+ },
36
+ {
37
+ "intervention": [
38
+ "output_location"
39
+ ],
40
+ "counterfactual_scores": [
41
+ {
42
+ "counterfactual": [
43
+ "randomLetter_counterfactual"
44
+ ],
45
+ "score": 1.0
46
+ },
47
+ {
48
+ "counterfactual": [
49
+ "answerPosition_counterfactual"
50
+ ],
51
+ "score": 0.0
52
+ },
53
+ {
54
+ "counterfactual": [
55
+ "answerPosition_randomLetter_counterfactual"
56
+ ],
57
+ "score": 0.0
58
+ }
59
+ ]
60
+ }
61
+ ]
62
+ },
63
+ {
64
+ "layer": "1",
65
+ "layer_scores": [
66
+ {
67
+ "intervention": [
68
+ "output_token"
69
+ ],
70
+ "counterfactual_scores": [
71
+ {
72
+ "counterfactual": [
73
+ "randomLetter_counterfactual"
74
+ ],
75
+ "score": 0.0
76
+ },
77
+ {
78
+ "counterfactual": [
79
+ "answerPosition_counterfactual"
80
+ ],
81
+ "score": 0.0
82
+ },
83
+ {
84
+ "counterfactual": [
85
+ "answerPosition_randomLetter_counterfactual"
86
+ ],
87
+ "score": 0.0
88
+ }
89
+ ]
90
+ },
91
+ {
92
+ "intervention": [
93
+ "output_location"
94
+ ],
95
+ "counterfactual_scores": [
96
+ {
97
+ "counterfactual": [
98
+ "randomLetter_counterfactual"
99
+ ],
100
+ "score": 1.0
101
+ },
102
+ {
103
+ "counterfactual": [
104
+ "answerPosition_counterfactual"
105
+ ],
106
+ "score": 0.0
107
+ },
108
+ {
109
+ "counterfactual": [
110
+ "answerPosition_randomLetter_counterfactual"
111
+ ],
112
+ "score": 0.0
113
+ }
114
+ ]
115
+ }
116
+ ]
117
+ },
118
+ {
119
+ "layer": "2",
120
+ "layer_scores": [
121
+ {
122
+ "intervention": [
123
+ "output_token"
124
+ ],
125
+ "counterfactual_scores": [
126
+ {
127
+ "counterfactual": [
128
+ "randomLetter_counterfactual"
129
+ ],
130
+ "score": 0.0
131
+ },
132
+ {
133
+ "counterfactual": [
134
+ "answerPosition_counterfactual"
135
+ ],
136
+ "score": 0.0
137
+ },
138
+ {
139
+ "counterfactual": [
140
+ "answerPosition_randomLetter_counterfactual"
141
+ ],
142
+ "score": 0.0
143
+ }
144
+ ]
145
+ },
146
+ {
147
+ "intervention": [
148
+ "output_location"
149
+ ],
150
+ "counterfactual_scores": [
151
+ {
152
+ "counterfactual": [
153
+ "randomLetter_counterfactual"
154
+ ],
155
+ "score": 1.0
156
+ },
157
+ {
158
+ "counterfactual": [
159
+ "answerPosition_counterfactual"
160
+ ],
161
+ "score": 0.0
162
+ },
163
+ {
164
+ "counterfactual": [
165
+ "answerPosition_randomLetter_counterfactual"
166
+ ],
167
+ "score": 0.0
168
+ }
169
+ ]
170
+ }
171
+ ]
172
+ },
173
+ {
174
+ "layer": "3",
175
+ "layer_scores": [
176
+ {
177
+ "intervention": [
178
+ "output_token"
179
+ ],
180
+ "counterfactual_scores": [
181
+ {
182
+ "counterfactual": [
183
+ "randomLetter_counterfactual"
184
+ ],
185
+ "score": 0.0
186
+ },
187
+ {
188
+ "counterfactual": [
189
+ "answerPosition_counterfactual"
190
+ ],
191
+ "score": 0.0
192
+ },
193
+ {
194
+ "counterfactual": [
195
+ "answerPosition_randomLetter_counterfactual"
196
+ ],
197
+ "score": 0.0
198
+ }
199
+ ]
200
+ },
201
+ {
202
+ "intervention": [
203
+ "output_location"
204
+ ],
205
+ "counterfactual_scores": [
206
+ {
207
+ "counterfactual": [
208
+ "randomLetter_counterfactual"
209
+ ],
210
+ "score": 1.0
211
+ },
212
+ {
213
+ "counterfactual": [
214
+ "answerPosition_counterfactual"
215
+ ],
216
+ "score": 0.0
217
+ },
218
+ {
219
+ "counterfactual": [
220
+ "answerPosition_randomLetter_counterfactual"
221
+ ],
222
+ "score": 0.0
223
+ }
224
+ ]
225
+ }
226
+ ]
227
+ },
228
+ {
229
+ "layer": "4",
230
+ "layer_scores": [
231
+ {
232
+ "intervention": [
233
+ "output_token"
234
+ ],
235
+ "counterfactual_scores": [
236
+ {
237
+ "counterfactual": [
238
+ "randomLetter_counterfactual"
239
+ ],
240
+ "score": 0.0
241
+ },
242
+ {
243
+ "counterfactual": [
244
+ "answerPosition_counterfactual"
245
+ ],
246
+ "score": 0.0
247
+ },
248
+ {
249
+ "counterfactual": [
250
+ "answerPosition_randomLetter_counterfactual"
251
+ ],
252
+ "score": 0.0
253
+ }
254
+ ]
255
+ },
256
+ {
257
+ "intervention": [
258
+ "output_location"
259
+ ],
260
+ "counterfactual_scores": [
261
+ {
262
+ "counterfactual": [
263
+ "randomLetter_counterfactual"
264
+ ],
265
+ "score": 1.0
266
+ },
267
+ {
268
+ "counterfactual": [
269
+ "answerPosition_counterfactual"
270
+ ],
271
+ "score": 0.0
272
+ },
273
+ {
274
+ "counterfactual": [
275
+ "answerPosition_randomLetter_counterfactual"
276
+ ],
277
+ "score": 0.0
278
+ }
279
+ ]
280
+ }
281
+ ]
282
+ },
283
+ {
284
+ "layer": "5",
285
+ "layer_scores": [
286
+ {
287
+ "intervention": [
288
+ "output_token"
289
+ ],
290
+ "counterfactual_scores": [
291
+ {
292
+ "counterfactual": [
293
+ "randomLetter_counterfactual"
294
+ ],
295
+ "score": 0.0
296
+ },
297
+ {
298
+ "counterfactual": [
299
+ "answerPosition_counterfactual"
300
+ ],
301
+ "score": 0.0
302
+ },
303
+ {
304
+ "counterfactual": [
305
+ "answerPosition_randomLetter_counterfactual"
306
+ ],
307
+ "score": 0.0
308
+ }
309
+ ]
310
+ },
311
+ {
312
+ "intervention": [
313
+ "output_location"
314
+ ],
315
+ "counterfactual_scores": [
316
+ {
317
+ "counterfactual": [
318
+ "randomLetter_counterfactual"
319
+ ],
320
+ "score": 1.0
321
+ },
322
+ {
323
+ "counterfactual": [
324
+ "answerPosition_counterfactual"
325
+ ],
326
+ "score": 0.0
327
+ },
328
+ {
329
+ "counterfactual": [
330
+ "answerPosition_randomLetter_counterfactual"
331
+ ],
332
+ "score": 0.0
333
+ }
334
+ ]
335
+ }
336
+ ]
337
+ },
338
+ {
339
+ "layer": "6",
340
+ "layer_scores": [
341
+ {
342
+ "intervention": [
343
+ "output_token"
344
+ ],
345
+ "counterfactual_scores": [
346
+ {
347
+ "counterfactual": [
348
+ "randomLetter_counterfactual"
349
+ ],
350
+ "score": 0.0
351
+ },
352
+ {
353
+ "counterfactual": [
354
+ "answerPosition_counterfactual"
355
+ ],
356
+ "score": 0.0
357
+ },
358
+ {
359
+ "counterfactual": [
360
+ "answerPosition_randomLetter_counterfactual"
361
+ ],
362
+ "score": 0.0
363
+ }
364
+ ]
365
+ },
366
+ {
367
+ "intervention": [
368
+ "output_location"
369
+ ],
370
+ "counterfactual_scores": [
371
+ {
372
+ "counterfactual": [
373
+ "randomLetter_counterfactual"
374
+ ],
375
+ "score": 1.0
376
+ },
377
+ {
378
+ "counterfactual": [
379
+ "answerPosition_counterfactual"
380
+ ],
381
+ "score": 0.0
382
+ },
383
+ {
384
+ "counterfactual": [
385
+ "answerPosition_randomLetter_counterfactual"
386
+ ],
387
+ "score": 0.0
388
+ }
389
+ ]
390
+ }
391
+ ]
392
+ },
393
+ {
394
+ "layer": "7",
395
+ "layer_scores": [
396
+ {
397
+ "intervention": [
398
+ "output_token"
399
+ ],
400
+ "counterfactual_scores": [
401
+ {
402
+ "counterfactual": [
403
+ "randomLetter_counterfactual"
404
+ ],
405
+ "score": 0.0
406
+ },
407
+ {
408
+ "counterfactual": [
409
+ "answerPosition_counterfactual"
410
+ ],
411
+ "score": 0.0
412
+ },
413
+ {
414
+ "counterfactual": [
415
+ "answerPosition_randomLetter_counterfactual"
416
+ ],
417
+ "score": 0.0
418
+ }
419
+ ]
420
+ },
421
+ {
422
+ "intervention": [
423
+ "output_location"
424
+ ],
425
+ "counterfactual_scores": [
426
+ {
427
+ "counterfactual": [
428
+ "randomLetter_counterfactual"
429
+ ],
430
+ "score": 1.0
431
+ },
432
+ {
433
+ "counterfactual": [
434
+ "answerPosition_counterfactual"
435
+ ],
436
+ "score": 0.0
437
+ },
438
+ {
439
+ "counterfactual": [
440
+ "answerPosition_randomLetter_counterfactual"
441
+ ],
442
+ "score": 0.0
443
+ }
444
+ ]
445
+ }
446
+ ]
447
+ },
448
+ {
449
+ "layer": "8",
450
+ "layer_scores": [
451
+ {
452
+ "intervention": [
453
+ "output_token"
454
+ ],
455
+ "counterfactual_scores": [
456
+ {
457
+ "counterfactual": [
458
+ "randomLetter_counterfactual"
459
+ ],
460
+ "score": 0.0
461
+ },
462
+ {
463
+ "counterfactual": [
464
+ "answerPosition_counterfactual"
465
+ ],
466
+ "score": 0.0
467
+ },
468
+ {
469
+ "counterfactual": [
470
+ "answerPosition_randomLetter_counterfactual"
471
+ ],
472
+ "score": 0.0
473
+ }
474
+ ]
475
+ },
476
+ {
477
+ "intervention": [
478
+ "output_location"
479
+ ],
480
+ "counterfactual_scores": [
481
+ {
482
+ "counterfactual": [
483
+ "randomLetter_counterfactual"
484
+ ],
485
+ "score": 1.0
486
+ },
487
+ {
488
+ "counterfactual": [
489
+ "answerPosition_counterfactual"
490
+ ],
491
+ "score": 0.0
492
+ },
493
+ {
494
+ "counterfactual": [
495
+ "answerPosition_randomLetter_counterfactual"
496
+ ],
497
+ "score": 0.0
498
+ }
499
+ ]
500
+ }
501
+ ]
502
+ },
503
+ {
504
+ "layer": "9",
505
+ "layer_scores": [
506
+ {
507
+ "intervention": [
508
+ "output_token"
509
+ ],
510
+ "counterfactual_scores": [
511
+ {
512
+ "counterfactual": [
513
+ "randomLetter_counterfactual"
514
+ ],
515
+ "score": 0.0
516
+ },
517
+ {
518
+ "counterfactual": [
519
+ "answerPosition_counterfactual"
520
+ ],
521
+ "score": 0.0
522
+ },
523
+ {
524
+ "counterfactual": [
525
+ "answerPosition_randomLetter_counterfactual"
526
+ ],
527
+ "score": 0.0
528
+ }
529
+ ]
530
+ },
531
+ {
532
+ "intervention": [
533
+ "output_location"
534
+ ],
535
+ "counterfactual_scores": [
536
+ {
537
+ "counterfactual": [
538
+ "randomLetter_counterfactual"
539
+ ],
540
+ "score": 1.0
541
+ },
542
+ {
543
+ "counterfactual": [
544
+ "answerPosition_counterfactual"
545
+ ],
546
+ "score": 0.0
547
+ },
548
+ {
549
+ "counterfactual": [
550
+ "answerPosition_randomLetter_counterfactual"
551
+ ],
552
+ "score": 0.0
553
+ }
554
+ ]
555
+ }
556
+ ]
557
+ },
558
+ {
559
+ "layer": "10",
560
+ "layer_scores": [
561
+ {
562
+ "intervention": [
563
+ "output_token"
564
+ ],
565
+ "counterfactual_scores": [
566
+ {
567
+ "counterfactual": [
568
+ "randomLetter_counterfactual"
569
+ ],
570
+ "score": 0.0
571
+ },
572
+ {
573
+ "counterfactual": [
574
+ "answerPosition_counterfactual"
575
+ ],
576
+ "score": 0.0
577
+ },
578
+ {
579
+ "counterfactual": [
580
+ "answerPosition_randomLetter_counterfactual"
581
+ ],
582
+ "score": 0.0
583
+ }
584
+ ]
585
+ },
586
+ {
587
+ "intervention": [
588
+ "output_location"
589
+ ],
590
+ "counterfactual_scores": [
591
+ {
592
+ "counterfactual": [
593
+ "randomLetter_counterfactual"
594
+ ],
595
+ "score": 1.0
596
+ },
597
+ {
598
+ "counterfactual": [
599
+ "answerPosition_counterfactual"
600
+ ],
601
+ "score": 0.0
602
+ },
603
+ {
604
+ "counterfactual": [
605
+ "answerPosition_randomLetter_counterfactual"
606
+ ],
607
+ "score": 0.0
608
+ }
609
+ ]
610
+ }
611
+ ]
612
+ },
613
+ {
614
+ "layer": "11",
615
+ "layer_scores": [
616
+ {
617
+ "intervention": [
618
+ "output_token"
619
+ ],
620
+ "counterfactual_scores": [
621
+ {
622
+ "counterfactual": [
623
+ "randomLetter_counterfactual"
624
+ ],
625
+ "score": 0.0
626
+ },
627
+ {
628
+ "counterfactual": [
629
+ "answerPosition_counterfactual"
630
+ ],
631
+ "score": 0.0
632
+ },
633
+ {
634
+ "counterfactual": [
635
+ "answerPosition_randomLetter_counterfactual"
636
+ ],
637
+ "score": 0.0
638
+ }
639
+ ]
640
+ },
641
+ {
642
+ "intervention": [
643
+ "output_location"
644
+ ],
645
+ "counterfactual_scores": [
646
+ {
647
+ "counterfactual": [
648
+ "randomLetter_counterfactual"
649
+ ],
650
+ "score": 1.0
651
+ },
652
+ {
653
+ "counterfactual": [
654
+ "answerPosition_counterfactual"
655
+ ],
656
+ "score": 0.0
657
+ },
658
+ {
659
+ "counterfactual": [
660
+ "answerPosition_randomLetter_counterfactual"
661
+ ],
662
+ "score": 0.0
663
+ }
664
+ ]
665
+ }
666
+ ]
667
+ },
668
+ {
669
+ "layer": "12",
670
+ "layer_scores": [
671
+ {
672
+ "intervention": [
673
+ "output_token"
674
+ ],
675
+ "counterfactual_scores": [
676
+ {
677
+ "counterfactual": [
678
+ "randomLetter_counterfactual"
679
+ ],
680
+ "score": 0.0
681
+ },
682
+ {
683
+ "counterfactual": [
684
+ "answerPosition_counterfactual"
685
+ ],
686
+ "score": 0.0
687
+ },
688
+ {
689
+ "counterfactual": [
690
+ "answerPosition_randomLetter_counterfactual"
691
+ ],
692
+ "score": 0.0
693
+ }
694
+ ]
695
+ },
696
+ {
697
+ "intervention": [
698
+ "output_location"
699
+ ],
700
+ "counterfactual_scores": [
701
+ {
702
+ "counterfactual": [
703
+ "randomLetter_counterfactual"
704
+ ],
705
+ "score": 1.0
706
+ },
707
+ {
708
+ "counterfactual": [
709
+ "answerPosition_counterfactual"
710
+ ],
711
+ "score": 0.0
712
+ },
713
+ {
714
+ "counterfactual": [
715
+ "answerPosition_randomLetter_counterfactual"
716
+ ],
717
+ "score": 0.0
718
+ }
719
+ ]
720
+ }
721
+ ]
722
+ },
723
+ {
724
+ "layer": "13",
725
+ "layer_scores": [
726
+ {
727
+ "intervention": [
728
+ "output_token"
729
+ ],
730
+ "counterfactual_scores": [
731
+ {
732
+ "counterfactual": [
733
+ "randomLetter_counterfactual"
734
+ ],
735
+ "score": 0.0
736
+ },
737
+ {
738
+ "counterfactual": [
739
+ "answerPosition_counterfactual"
740
+ ],
741
+ "score": 0.0
742
+ },
743
+ {
744
+ "counterfactual": [
745
+ "answerPosition_randomLetter_counterfactual"
746
+ ],
747
+ "score": 0.0
748
+ }
749
+ ]
750
+ },
751
+ {
752
+ "intervention": [
753
+ "output_location"
754
+ ],
755
+ "counterfactual_scores": [
756
+ {
757
+ "counterfactual": [
758
+ "randomLetter_counterfactual"
759
+ ],
760
+ "score": 1.0
761
+ },
762
+ {
763
+ "counterfactual": [
764
+ "answerPosition_counterfactual"
765
+ ],
766
+ "score": 0.0
767
+ },
768
+ {
769
+ "counterfactual": [
770
+ "answerPosition_randomLetter_counterfactual"
771
+ ],
772
+ "score": 0.0
773
+ }
774
+ ]
775
+ }
776
+ ]
777
+ },
778
+ {
779
+ "layer": "14",
780
+ "layer_scores": [
781
+ {
782
+ "intervention": [
783
+ "output_token"
784
+ ],
785
+ "counterfactual_scores": [
786
+ {
787
+ "counterfactual": [
788
+ "randomLetter_counterfactual"
789
+ ],
790
+ "score": 0.0
791
+ },
792
+ {
793
+ "counterfactual": [
794
+ "answerPosition_counterfactual"
795
+ ],
796
+ "score": 0.0
797
+ },
798
+ {
799
+ "counterfactual": [
800
+ "answerPosition_randomLetter_counterfactual"
801
+ ],
802
+ "score": 0.0
803
+ }
804
+ ]
805
+ },
806
+ {
807
+ "intervention": [
808
+ "output_location"
809
+ ],
810
+ "counterfactual_scores": [
811
+ {
812
+ "counterfactual": [
813
+ "randomLetter_counterfactual"
814
+ ],
815
+ "score": 1.0
816
+ },
817
+ {
818
+ "counterfactual": [
819
+ "answerPosition_counterfactual"
820
+ ],
821
+ "score": 0.0
822
+ },
823
+ {
824
+ "counterfactual": [
825
+ "answerPosition_randomLetter_counterfactual"
826
+ ],
827
+ "score": 0.0
828
+ }
829
+ ]
830
+ }
831
+ ]
832
+ },
833
+ {
834
+ "layer": "15",
835
+ "layer_scores": [
836
+ {
837
+ "intervention": [
838
+ "output_token"
839
+ ],
840
+ "counterfactual_scores": [
841
+ {
842
+ "counterfactual": [
843
+ "randomLetter_counterfactual"
844
+ ],
845
+ "score": 0.0
846
+ },
847
+ {
848
+ "counterfactual": [
849
+ "answerPosition_counterfactual"
850
+ ],
851
+ "score": 1.0
852
+ },
853
+ {
854
+ "counterfactual": [
855
+ "answerPosition_randomLetter_counterfactual"
856
+ ],
857
+ "score": 0.21428571428571427
858
+ }
859
+ ]
860
+ },
861
+ {
862
+ "intervention": [
863
+ "output_location"
864
+ ],
865
+ "counterfactual_scores": [
866
+ {
867
+ "counterfactual": [
868
+ "randomLetter_counterfactual"
869
+ ],
870
+ "score": 1.0
871
+ },
872
+ {
873
+ "counterfactual": [
874
+ "answerPosition_counterfactual"
875
+ ],
876
+ "score": 1.0
877
+ },
878
+ {
879
+ "counterfactual": [
880
+ "answerPosition_randomLetter_counterfactual"
881
+ ],
882
+ "score": 0.9285714285714286
883
+ }
884
+ ]
885
+ }
886
+ ]
887
+ },
888
+ {
889
+ "layer": "16",
890
+ "layer_scores": [
891
+ {
892
+ "intervention": [
893
+ "output_token"
894
+ ],
895
+ "counterfactual_scores": [
896
+ {
897
+ "counterfactual": [
898
+ "randomLetter_counterfactual"
899
+ ],
900
+ "score": 0.14285714285714285
901
+ },
902
+ {
903
+ "counterfactual": [
904
+ "answerPosition_counterfactual"
905
+ ],
906
+ "score": 1.0
907
+ },
908
+ {
909
+ "counterfactual": [
910
+ "answerPosition_randomLetter_counterfactual"
911
+ ],
912
+ "score": 0.35714285714285715
913
+ }
914
+ ]
915
+ },
916
+ {
917
+ "intervention": [
918
+ "output_location"
919
+ ],
920
+ "counterfactual_scores": [
921
+ {
922
+ "counterfactual": [
923
+ "randomLetter_counterfactual"
924
+ ],
925
+ "score": 0.7857142857142857
926
+ },
927
+ {
928
+ "counterfactual": [
929
+ "answerPosition_counterfactual"
930
+ ],
931
+ "score": 1.0
932
+ },
933
+ {
934
+ "counterfactual": [
935
+ "answerPosition_randomLetter_counterfactual"
936
+ ],
937
+ "score": 0.7857142857142857
938
+ }
939
+ ]
940
+ }
941
+ ]
942
+ },
943
+ {
944
+ "layer": "17",
945
+ "layer_scores": [
946
+ {
947
+ "intervention": [
948
+ "output_token"
949
+ ],
950
+ "counterfactual_scores": [
951
+ {
952
+ "counterfactual": [
953
+ "randomLetter_counterfactual"
954
+ ],
955
+ "score": 0.14285714285714285
956
+ },
957
+ {
958
+ "counterfactual": [
959
+ "answerPosition_counterfactual"
960
+ ],
961
+ "score": 1.0
962
+ },
963
+ {
964
+ "counterfactual": [
965
+ "answerPosition_randomLetter_counterfactual"
966
+ ],
967
+ "score": 0.35714285714285715
968
+ }
969
+ ]
970
+ },
971
+ {
972
+ "intervention": [
973
+ "output_location"
974
+ ],
975
+ "counterfactual_scores": [
976
+ {
977
+ "counterfactual": [
978
+ "randomLetter_counterfactual"
979
+ ],
980
+ "score": 0.7857142857142857
981
+ },
982
+ {
983
+ "counterfactual": [
984
+ "answerPosition_counterfactual"
985
+ ],
986
+ "score": 1.0
987
+ },
988
+ {
989
+ "counterfactual": [
990
+ "answerPosition_randomLetter_counterfactual"
991
+ ],
992
+ "score": 0.7857142857142857
993
+ }
994
+ ]
995
+ }
996
+ ]
997
+ },
998
+ {
999
+ "layer": "18",
1000
+ "layer_scores": [
1001
+ {
1002
+ "intervention": [
1003
+ "output_token"
1004
+ ],
1005
+ "counterfactual_scores": [
1006
+ {
1007
+ "counterfactual": [
1008
+ "randomLetter_counterfactual"
1009
+ ],
1010
+ "score": 0.14285714285714285
1011
+ },
1012
+ {
1013
+ "counterfactual": [
1014
+ "answerPosition_counterfactual"
1015
+ ],
1016
+ "score": 1.0
1017
+ },
1018
+ {
1019
+ "counterfactual": [
1020
+ "answerPosition_randomLetter_counterfactual"
1021
+ ],
1022
+ "score": 0.35714285714285715
1023
+ }
1024
+ ]
1025
+ },
1026
+ {
1027
+ "intervention": [
1028
+ "output_location"
1029
+ ],
1030
+ "counterfactual_scores": [
1031
+ {
1032
+ "counterfactual": [
1033
+ "randomLetter_counterfactual"
1034
+ ],
1035
+ "score": 0.7857142857142857
1036
+ },
1037
+ {
1038
+ "counterfactual": [
1039
+ "answerPosition_counterfactual"
1040
+ ],
1041
+ "score": 1.0
1042
+ },
1043
+ {
1044
+ "counterfactual": [
1045
+ "answerPosition_randomLetter_counterfactual"
1046
+ ],
1047
+ "score": 0.7857142857142857
1048
+ }
1049
+ ]
1050
+ }
1051
+ ]
1052
+ },
1053
+ {
1054
+ "layer": "19",
1055
+ "layer_scores": [
1056
+ {
1057
+ "intervention": [
1058
+ "output_token"
1059
+ ],
1060
+ "counterfactual_scores": [
1061
+ {
1062
+ "counterfactual": [
1063
+ "randomLetter_counterfactual"
1064
+ ],
1065
+ "score": 0.14285714285714285
1066
+ },
1067
+ {
1068
+ "counterfactual": [
1069
+ "answerPosition_counterfactual"
1070
+ ],
1071
+ "score": 1.0
1072
+ },
1073
+ {
1074
+ "counterfactual": [
1075
+ "answerPosition_randomLetter_counterfactual"
1076
+ ],
1077
+ "score": 0.35714285714285715
1078
+ }
1079
+ ]
1080
+ },
1081
+ {
1082
+ "intervention": [
1083
+ "output_location"
1084
+ ],
1085
+ "counterfactual_scores": [
1086
+ {
1087
+ "counterfactual": [
1088
+ "randomLetter_counterfactual"
1089
+ ],
1090
+ "score": 0.7857142857142857
1091
+ },
1092
+ {
1093
+ "counterfactual": [
1094
+ "answerPosition_counterfactual"
1095
+ ],
1096
+ "score": 1.0
1097
+ },
1098
+ {
1099
+ "counterfactual": [
1100
+ "answerPosition_randomLetter_counterfactual"
1101
+ ],
1102
+ "score": 0.7857142857142857
1103
+ }
1104
+ ]
1105
+ }
1106
+ ]
1107
+ },
1108
+ {
1109
+ "layer": "20",
1110
+ "layer_scores": [
1111
+ {
1112
+ "intervention": [
1113
+ "output_token"
1114
+ ],
1115
+ "counterfactual_scores": [
1116
+ {
1117
+ "counterfactual": [
1118
+ "randomLetter_counterfactual"
1119
+ ],
1120
+ "score": 0.6428571428571429
1121
+ },
1122
+ {
1123
+ "counterfactual": [
1124
+ "answerPosition_counterfactual"
1125
+ ],
1126
+ "score": 1.0
1127
+ },
1128
+ {
1129
+ "counterfactual": [
1130
+ "answerPosition_randomLetter_counterfactual"
1131
+ ],
1132
+ "score": 0.7857142857142857
1133
+ }
1134
+ ]
1135
+ },
1136
+ {
1137
+ "intervention": [
1138
+ "output_location"
1139
+ ],
1140
+ "counterfactual_scores": [
1141
+ {
1142
+ "counterfactual": [
1143
+ "randomLetter_counterfactual"
1144
+ ],
1145
+ "score": 0.21428571428571427
1146
+ },
1147
+ {
1148
+ "counterfactual": [
1149
+ "answerPosition_counterfactual"
1150
+ ],
1151
+ "score": 1.0
1152
+ },
1153
+ {
1154
+ "counterfactual": [
1155
+ "answerPosition_randomLetter_counterfactual"
1156
+ ],
1157
+ "score": 0.42857142857142855
1158
+ }
1159
+ ]
1160
+ }
1161
+ ]
1162
+ },
1163
+ {
1164
+ "layer": "21",
1165
+ "layer_scores": [
1166
+ {
1167
+ "intervention": [
1168
+ "output_token"
1169
+ ],
1170
+ "counterfactual_scores": [
1171
+ {
1172
+ "counterfactual": [
1173
+ "randomLetter_counterfactual"
1174
+ ],
1175
+ "score": 0.6428571428571429
1176
+ },
1177
+ {
1178
+ "counterfactual": [
1179
+ "answerPosition_counterfactual"
1180
+ ],
1181
+ "score": 1.0
1182
+ },
1183
+ {
1184
+ "counterfactual": [
1185
+ "answerPosition_randomLetter_counterfactual"
1186
+ ],
1187
+ "score": 0.7857142857142857
1188
+ }
1189
+ ]
1190
+ },
1191
+ {
1192
+ "intervention": [
1193
+ "output_location"
1194
+ ],
1195
+ "counterfactual_scores": [
1196
+ {
1197
+ "counterfactual": [
1198
+ "randomLetter_counterfactual"
1199
+ ],
1200
+ "score": 0.21428571428571427
1201
+ },
1202
+ {
1203
+ "counterfactual": [
1204
+ "answerPosition_counterfactual"
1205
+ ],
1206
+ "score": 1.0
1207
+ },
1208
+ {
1209
+ "counterfactual": [
1210
+ "answerPosition_randomLetter_counterfactual"
1211
+ ],
1212
+ "score": 0.42857142857142855
1213
+ }
1214
+ ]
1215
+ }
1216
+ ]
1217
+ },
1218
+ {
1219
+ "layer": "22",
1220
+ "layer_scores": [
1221
+ {
1222
+ "intervention": [
1223
+ "output_token"
1224
+ ],
1225
+ "counterfactual_scores": [
1226
+ {
1227
+ "counterfactual": [
1228
+ "randomLetter_counterfactual"
1229
+ ],
1230
+ "score": 0.9285714285714286
1231
+ },
1232
+ {
1233
+ "counterfactual": [
1234
+ "answerPosition_counterfactual"
1235
+ ],
1236
+ "score": 1.0
1237
+ },
1238
+ {
1239
+ "counterfactual": [
1240
+ "answerPosition_randomLetter_counterfactual"
1241
+ ],
1242
+ "score": 1.0
1243
+ }
1244
+ ]
1245
+ },
1246
+ {
1247
+ "intervention": [
1248
+ "output_location"
1249
+ ],
1250
+ "counterfactual_scores": [
1251
+ {
1252
+ "counterfactual": [
1253
+ "randomLetter_counterfactual"
1254
+ ],
1255
+ "score": 0.07142857142857142
1256
+ },
1257
+ {
1258
+ "counterfactual": [
1259
+ "answerPosition_counterfactual"
1260
+ ],
1261
+ "score": 1.0
1262
+ },
1263
+ {
1264
+ "counterfactual": [
1265
+ "answerPosition_randomLetter_counterfactual"
1266
+ ],
1267
+ "score": 0.21428571428571427
1268
+ }
1269
+ ]
1270
+ }
1271
+ ]
1272
+ },
1273
+ {
1274
+ "layer": "23",
1275
+ "layer_scores": [
1276
+ {
1277
+ "intervention": [
1278
+ "output_token"
1279
+ ],
1280
+ "counterfactual_scores": [
1281
+ {
1282
+ "counterfactual": [
1283
+ "randomLetter_counterfactual"
1284
+ ],
1285
+ "score": 1.0
1286
+ },
1287
+ {
1288
+ "counterfactual": [
1289
+ "answerPosition_counterfactual"
1290
+ ],
1291
+ "score": 1.0
1292
+ },
1293
+ {
1294
+ "counterfactual": [
1295
+ "answerPosition_randomLetter_counterfactual"
1296
+ ],
1297
+ "score": 1.0
1298
+ }
1299
+ ]
1300
+ },
1301
+ {
1302
+ "intervention": [
1303
+ "output_location"
1304
+ ],
1305
+ "counterfactual_scores": [
1306
+ {
1307
+ "counterfactual": [
1308
+ "randomLetter_counterfactual"
1309
+ ],
1310
+ "score": 0.0
1311
+ },
1312
+ {
1313
+ "counterfactual": [
1314
+ "answerPosition_counterfactual"
1315
+ ],
1316
+ "score": 1.0
1317
+ },
1318
+ {
1319
+ "counterfactual": [
1320
+ "answerPosition_randomLetter_counterfactual"
1321
+ ],
1322
+ "score": 0.21428571428571427
1323
+ }
1324
+ ]
1325
+ }
1326
+ ]
1327
+ }
1328
+ ]
1329
+ }
1330
+ }
1331
+ ]
1332
+ }
eval-results-mib-causalgraph/submissions/MCQA_results_Qwen_second_to_last_token.json ADDED
@@ -0,0 +1,1332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "method_name": "full_vector",
3
+ "results": [
4
+ {
5
+ "model_id": "Qwen2ForCausalLM",
6
+ "task_scores": {
7
+ "MCQA": [
8
+ {
9
+ "layer": "0",
10
+ "layer_scores": [
11
+ {
12
+ "intervention": [
13
+ "output_token"
14
+ ],
15
+ "counterfactual_scores": [
16
+ {
17
+ "counterfactual": [
18
+ "randomLetter_counterfactual"
19
+ ],
20
+ "score": 0.0
21
+ },
22
+ {
23
+ "counterfactual": [
24
+ "answerPosition_counterfactual"
25
+ ],
26
+ "score": 0.0
27
+ },
28
+ {
29
+ "counterfactual": [
30
+ "answerPosition_randomLetter_counterfactual"
31
+ ],
32
+ "score": 0.0
33
+ }
34
+ ]
35
+ },
36
+ {
37
+ "intervention": [
38
+ "output_location"
39
+ ],
40
+ "counterfactual_scores": [
41
+ {
42
+ "counterfactual": [
43
+ "randomLetter_counterfactual"
44
+ ],
45
+ "score": 1.0
46
+ },
47
+ {
48
+ "counterfactual": [
49
+ "answerPosition_counterfactual"
50
+ ],
51
+ "score": 0.0
52
+ },
53
+ {
54
+ "counterfactual": [
55
+ "answerPosition_randomLetter_counterfactual"
56
+ ],
57
+ "score": 0.0
58
+ }
59
+ ]
60
+ }
61
+ ]
62
+ },
63
+ {
64
+ "layer": "1",
65
+ "layer_scores": [
66
+ {
67
+ "intervention": [
68
+ "output_token"
69
+ ],
70
+ "counterfactual_scores": [
71
+ {
72
+ "counterfactual": [
73
+ "randomLetter_counterfactual"
74
+ ],
75
+ "score": 0.0
76
+ },
77
+ {
78
+ "counterfactual": [
79
+ "answerPosition_counterfactual"
80
+ ],
81
+ "score": 0.0
82
+ },
83
+ {
84
+ "counterfactual": [
85
+ "answerPosition_randomLetter_counterfactual"
86
+ ],
87
+ "score": 0.0
88
+ }
89
+ ]
90
+ },
91
+ {
92
+ "intervention": [
93
+ "output_location"
94
+ ],
95
+ "counterfactual_scores": [
96
+ {
97
+ "counterfactual": [
98
+ "randomLetter_counterfactual"
99
+ ],
100
+ "score": 1.0
101
+ },
102
+ {
103
+ "counterfactual": [
104
+ "answerPosition_counterfactual"
105
+ ],
106
+ "score": 0.0
107
+ },
108
+ {
109
+ "counterfactual": [
110
+ "answerPosition_randomLetter_counterfactual"
111
+ ],
112
+ "score": 0.0
113
+ }
114
+ ]
115
+ }
116
+ ]
117
+ },
118
+ {
119
+ "layer": "2",
120
+ "layer_scores": [
121
+ {
122
+ "intervention": [
123
+ "output_token"
124
+ ],
125
+ "counterfactual_scores": [
126
+ {
127
+ "counterfactual": [
128
+ "randomLetter_counterfactual"
129
+ ],
130
+ "score": 0.0
131
+ },
132
+ {
133
+ "counterfactual": [
134
+ "answerPosition_counterfactual"
135
+ ],
136
+ "score": 0.0
137
+ },
138
+ {
139
+ "counterfactual": [
140
+ "answerPosition_randomLetter_counterfactual"
141
+ ],
142
+ "score": 0.0
143
+ }
144
+ ]
145
+ },
146
+ {
147
+ "intervention": [
148
+ "output_location"
149
+ ],
150
+ "counterfactual_scores": [
151
+ {
152
+ "counterfactual": [
153
+ "randomLetter_counterfactual"
154
+ ],
155
+ "score": 1.0
156
+ },
157
+ {
158
+ "counterfactual": [
159
+ "answerPosition_counterfactual"
160
+ ],
161
+ "score": 0.0
162
+ },
163
+ {
164
+ "counterfactual": [
165
+ "answerPosition_randomLetter_counterfactual"
166
+ ],
167
+ "score": 0.0
168
+ }
169
+ ]
170
+ }
171
+ ]
172
+ },
173
+ {
174
+ "layer": "3",
175
+ "layer_scores": [
176
+ {
177
+ "intervention": [
178
+ "output_token"
179
+ ],
180
+ "counterfactual_scores": [
181
+ {
182
+ "counterfactual": [
183
+ "randomLetter_counterfactual"
184
+ ],
185
+ "score": 0.0
186
+ },
187
+ {
188
+ "counterfactual": [
189
+ "answerPosition_counterfactual"
190
+ ],
191
+ "score": 0.0
192
+ },
193
+ {
194
+ "counterfactual": [
195
+ "answerPosition_randomLetter_counterfactual"
196
+ ],
197
+ "score": 0.0
198
+ }
199
+ ]
200
+ },
201
+ {
202
+ "intervention": [
203
+ "output_location"
204
+ ],
205
+ "counterfactual_scores": [
206
+ {
207
+ "counterfactual": [
208
+ "randomLetter_counterfactual"
209
+ ],
210
+ "score": 1.0
211
+ },
212
+ {
213
+ "counterfactual": [
214
+ "answerPosition_counterfactual"
215
+ ],
216
+ "score": 0.0
217
+ },
218
+ {
219
+ "counterfactual": [
220
+ "answerPosition_randomLetter_counterfactual"
221
+ ],
222
+ "score": 0.0
223
+ }
224
+ ]
225
+ }
226
+ ]
227
+ },
228
+ {
229
+ "layer": "4",
230
+ "layer_scores": [
231
+ {
232
+ "intervention": [
233
+ "output_token"
234
+ ],
235
+ "counterfactual_scores": [
236
+ {
237
+ "counterfactual": [
238
+ "randomLetter_counterfactual"
239
+ ],
240
+ "score": 0.0
241
+ },
242
+ {
243
+ "counterfactual": [
244
+ "answerPosition_counterfactual"
245
+ ],
246
+ "score": 0.0
247
+ },
248
+ {
249
+ "counterfactual": [
250
+ "answerPosition_randomLetter_counterfactual"
251
+ ],
252
+ "score": 0.0
253
+ }
254
+ ]
255
+ },
256
+ {
257
+ "intervention": [
258
+ "output_location"
259
+ ],
260
+ "counterfactual_scores": [
261
+ {
262
+ "counterfactual": [
263
+ "randomLetter_counterfactual"
264
+ ],
265
+ "score": 1.0
266
+ },
267
+ {
268
+ "counterfactual": [
269
+ "answerPosition_counterfactual"
270
+ ],
271
+ "score": 0.0
272
+ },
273
+ {
274
+ "counterfactual": [
275
+ "answerPosition_randomLetter_counterfactual"
276
+ ],
277
+ "score": 0.0
278
+ }
279
+ ]
280
+ }
281
+ ]
282
+ },
283
+ {
284
+ "layer": "5",
285
+ "layer_scores": [
286
+ {
287
+ "intervention": [
288
+ "output_token"
289
+ ],
290
+ "counterfactual_scores": [
291
+ {
292
+ "counterfactual": [
293
+ "randomLetter_counterfactual"
294
+ ],
295
+ "score": 0.0
296
+ },
297
+ {
298
+ "counterfactual": [
299
+ "answerPosition_counterfactual"
300
+ ],
301
+ "score": 0.0
302
+ },
303
+ {
304
+ "counterfactual": [
305
+ "answerPosition_randomLetter_counterfactual"
306
+ ],
307
+ "score": 0.0
308
+ }
309
+ ]
310
+ },
311
+ {
312
+ "intervention": [
313
+ "output_location"
314
+ ],
315
+ "counterfactual_scores": [
316
+ {
317
+ "counterfactual": [
318
+ "randomLetter_counterfactual"
319
+ ],
320
+ "score": 1.0
321
+ },
322
+ {
323
+ "counterfactual": [
324
+ "answerPosition_counterfactual"
325
+ ],
326
+ "score": 0.0
327
+ },
328
+ {
329
+ "counterfactual": [
330
+ "answerPosition_randomLetter_counterfactual"
331
+ ],
332
+ "score": 0.0
333
+ }
334
+ ]
335
+ }
336
+ ]
337
+ },
338
+ {
339
+ "layer": "6",
340
+ "layer_scores": [
341
+ {
342
+ "intervention": [
343
+ "output_token"
344
+ ],
345
+ "counterfactual_scores": [
346
+ {
347
+ "counterfactual": [
348
+ "randomLetter_counterfactual"
349
+ ],
350
+ "score": 0.0
351
+ },
352
+ {
353
+ "counterfactual": [
354
+ "answerPosition_counterfactual"
355
+ ],
356
+ "score": 0.0
357
+ },
358
+ {
359
+ "counterfactual": [
360
+ "answerPosition_randomLetter_counterfactual"
361
+ ],
362
+ "score": 0.0
363
+ }
364
+ ]
365
+ },
366
+ {
367
+ "intervention": [
368
+ "output_location"
369
+ ],
370
+ "counterfactual_scores": [
371
+ {
372
+ "counterfactual": [
373
+ "randomLetter_counterfactual"
374
+ ],
375
+ "score": 1.0
376
+ },
377
+ {
378
+ "counterfactual": [
379
+ "answerPosition_counterfactual"
380
+ ],
381
+ "score": 0.0
382
+ },
383
+ {
384
+ "counterfactual": [
385
+ "answerPosition_randomLetter_counterfactual"
386
+ ],
387
+ "score": 0.0
388
+ }
389
+ ]
390
+ }
391
+ ]
392
+ },
393
+ {
394
+ "layer": "7",
395
+ "layer_scores": [
396
+ {
397
+ "intervention": [
398
+ "output_token"
399
+ ],
400
+ "counterfactual_scores": [
401
+ {
402
+ "counterfactual": [
403
+ "randomLetter_counterfactual"
404
+ ],
405
+ "score": 0.0
406
+ },
407
+ {
408
+ "counterfactual": [
409
+ "answerPosition_counterfactual"
410
+ ],
411
+ "score": 0.0
412
+ },
413
+ {
414
+ "counterfactual": [
415
+ "answerPosition_randomLetter_counterfactual"
416
+ ],
417
+ "score": 0.0
418
+ }
419
+ ]
420
+ },
421
+ {
422
+ "intervention": [
423
+ "output_location"
424
+ ],
425
+ "counterfactual_scores": [
426
+ {
427
+ "counterfactual": [
428
+ "randomLetter_counterfactual"
429
+ ],
430
+ "score": 1.0
431
+ },
432
+ {
433
+ "counterfactual": [
434
+ "answerPosition_counterfactual"
435
+ ],
436
+ "score": 0.0
437
+ },
438
+ {
439
+ "counterfactual": [
440
+ "answerPosition_randomLetter_counterfactual"
441
+ ],
442
+ "score": 0.0
443
+ }
444
+ ]
445
+ }
446
+ ]
447
+ },
448
+ {
449
+ "layer": "8",
450
+ "layer_scores": [
451
+ {
452
+ "intervention": [
453
+ "output_token"
454
+ ],
455
+ "counterfactual_scores": [
456
+ {
457
+ "counterfactual": [
458
+ "randomLetter_counterfactual"
459
+ ],
460
+ "score": 0.0
461
+ },
462
+ {
463
+ "counterfactual": [
464
+ "answerPosition_counterfactual"
465
+ ],
466
+ "score": 0.0
467
+ },
468
+ {
469
+ "counterfactual": [
470
+ "answerPosition_randomLetter_counterfactual"
471
+ ],
472
+ "score": 0.0
473
+ }
474
+ ]
475
+ },
476
+ {
477
+ "intervention": [
478
+ "output_location"
479
+ ],
480
+ "counterfactual_scores": [
481
+ {
482
+ "counterfactual": [
483
+ "randomLetter_counterfactual"
484
+ ],
485
+ "score": 1.0
486
+ },
487
+ {
488
+ "counterfactual": [
489
+ "answerPosition_counterfactual"
490
+ ],
491
+ "score": 0.0
492
+ },
493
+ {
494
+ "counterfactual": [
495
+ "answerPosition_randomLetter_counterfactual"
496
+ ],
497
+ "score": 0.0
498
+ }
499
+ ]
500
+ }
501
+ ]
502
+ },
503
+ {
504
+ "layer": "9",
505
+ "layer_scores": [
506
+ {
507
+ "intervention": [
508
+ "output_token"
509
+ ],
510
+ "counterfactual_scores": [
511
+ {
512
+ "counterfactual": [
513
+ "randomLetter_counterfactual"
514
+ ],
515
+ "score": 0.0
516
+ },
517
+ {
518
+ "counterfactual": [
519
+ "answerPosition_counterfactual"
520
+ ],
521
+ "score": 0.0
522
+ },
523
+ {
524
+ "counterfactual": [
525
+ "answerPosition_randomLetter_counterfactual"
526
+ ],
527
+ "score": 0.0
528
+ }
529
+ ]
530
+ },
531
+ {
532
+ "intervention": [
533
+ "output_location"
534
+ ],
535
+ "counterfactual_scores": [
536
+ {
537
+ "counterfactual": [
538
+ "randomLetter_counterfactual"
539
+ ],
540
+ "score": 1.0
541
+ },
542
+ {
543
+ "counterfactual": [
544
+ "answerPosition_counterfactual"
545
+ ],
546
+ "score": 0.0
547
+ },
548
+ {
549
+ "counterfactual": [
550
+ "answerPosition_randomLetter_counterfactual"
551
+ ],
552
+ "score": 0.0
553
+ }
554
+ ]
555
+ }
556
+ ]
557
+ },
558
+ {
559
+ "layer": "10",
560
+ "layer_scores": [
561
+ {
562
+ "intervention": [
563
+ "output_token"
564
+ ],
565
+ "counterfactual_scores": [
566
+ {
567
+ "counterfactual": [
568
+ "randomLetter_counterfactual"
569
+ ],
570
+ "score": 0.0
571
+ },
572
+ {
573
+ "counterfactual": [
574
+ "answerPosition_counterfactual"
575
+ ],
576
+ "score": 0.0
577
+ },
578
+ {
579
+ "counterfactual": [
580
+ "answerPosition_randomLetter_counterfactual"
581
+ ],
582
+ "score": 0.0
583
+ }
584
+ ]
585
+ },
586
+ {
587
+ "intervention": [
588
+ "output_location"
589
+ ],
590
+ "counterfactual_scores": [
591
+ {
592
+ "counterfactual": [
593
+ "randomLetter_counterfactual"
594
+ ],
595
+ "score": 1.0
596
+ },
597
+ {
598
+ "counterfactual": [
599
+ "answerPosition_counterfactual"
600
+ ],
601
+ "score": 0.0
602
+ },
603
+ {
604
+ "counterfactual": [
605
+ "answerPosition_randomLetter_counterfactual"
606
+ ],
607
+ "score": 0.0
608
+ }
609
+ ]
610
+ }
611
+ ]
612
+ },
613
+ {
614
+ "layer": "11",
615
+ "layer_scores": [
616
+ {
617
+ "intervention": [
618
+ "output_token"
619
+ ],
620
+ "counterfactual_scores": [
621
+ {
622
+ "counterfactual": [
623
+ "randomLetter_counterfactual"
624
+ ],
625
+ "score": 0.0
626
+ },
627
+ {
628
+ "counterfactual": [
629
+ "answerPosition_counterfactual"
630
+ ],
631
+ "score": 0.0
632
+ },
633
+ {
634
+ "counterfactual": [
635
+ "answerPosition_randomLetter_counterfactual"
636
+ ],
637
+ "score": 0.0
638
+ }
639
+ ]
640
+ },
641
+ {
642
+ "intervention": [
643
+ "output_location"
644
+ ],
645
+ "counterfactual_scores": [
646
+ {
647
+ "counterfactual": [
648
+ "randomLetter_counterfactual"
649
+ ],
650
+ "score": 1.0
651
+ },
652
+ {
653
+ "counterfactual": [
654
+ "answerPosition_counterfactual"
655
+ ],
656
+ "score": 0.0
657
+ },
658
+ {
659
+ "counterfactual": [
660
+ "answerPosition_randomLetter_counterfactual"
661
+ ],
662
+ "score": 0.0
663
+ }
664
+ ]
665
+ }
666
+ ]
667
+ },
668
+ {
669
+ "layer": "12",
670
+ "layer_scores": [
671
+ {
672
+ "intervention": [
673
+ "output_token"
674
+ ],
675
+ "counterfactual_scores": [
676
+ {
677
+ "counterfactual": [
678
+ "randomLetter_counterfactual"
679
+ ],
680
+ "score": 0.0
681
+ },
682
+ {
683
+ "counterfactual": [
684
+ "answerPosition_counterfactual"
685
+ ],
686
+ "score": 0.0
687
+ },
688
+ {
689
+ "counterfactual": [
690
+ "answerPosition_randomLetter_counterfactual"
691
+ ],
692
+ "score": 0.0
693
+ }
694
+ ]
695
+ },
696
+ {
697
+ "intervention": [
698
+ "output_location"
699
+ ],
700
+ "counterfactual_scores": [
701
+ {
702
+ "counterfactual": [
703
+ "randomLetter_counterfactual"
704
+ ],
705
+ "score": 1.0
706
+ },
707
+ {
708
+ "counterfactual": [
709
+ "answerPosition_counterfactual"
710
+ ],
711
+ "score": 0.0
712
+ },
713
+ {
714
+ "counterfactual": [
715
+ "answerPosition_randomLetter_counterfactual"
716
+ ],
717
+ "score": 0.0
718
+ }
719
+ ]
720
+ }
721
+ ]
722
+ },
723
+ {
724
+ "layer": "13",
725
+ "layer_scores": [
726
+ {
727
+ "intervention": [
728
+ "output_token"
729
+ ],
730
+ "counterfactual_scores": [
731
+ {
732
+ "counterfactual": [
733
+ "randomLetter_counterfactual"
734
+ ],
735
+ "score": 0.0
736
+ },
737
+ {
738
+ "counterfactual": [
739
+ "answerPosition_counterfactual"
740
+ ],
741
+ "score": 0.0
742
+ },
743
+ {
744
+ "counterfactual": [
745
+ "answerPosition_randomLetter_counterfactual"
746
+ ],
747
+ "score": 0.0
748
+ }
749
+ ]
750
+ },
751
+ {
752
+ "intervention": [
753
+ "output_location"
754
+ ],
755
+ "counterfactual_scores": [
756
+ {
757
+ "counterfactual": [
758
+ "randomLetter_counterfactual"
759
+ ],
760
+ "score": 1.0
761
+ },
762
+ {
763
+ "counterfactual": [
764
+ "answerPosition_counterfactual"
765
+ ],
766
+ "score": 0.0
767
+ },
768
+ {
769
+ "counterfactual": [
770
+ "answerPosition_randomLetter_counterfactual"
771
+ ],
772
+ "score": 0.0
773
+ }
774
+ ]
775
+ }
776
+ ]
777
+ },
778
+ {
779
+ "layer": "14",
780
+ "layer_scores": [
781
+ {
782
+ "intervention": [
783
+ "output_token"
784
+ ],
785
+ "counterfactual_scores": [
786
+ {
787
+ "counterfactual": [
788
+ "randomLetter_counterfactual"
789
+ ],
790
+ "score": 0.0
791
+ },
792
+ {
793
+ "counterfactual": [
794
+ "answerPosition_counterfactual"
795
+ ],
796
+ "score": 0.0
797
+ },
798
+ {
799
+ "counterfactual": [
800
+ "answerPosition_randomLetter_counterfactual"
801
+ ],
802
+ "score": 0.0
803
+ }
804
+ ]
805
+ },
806
+ {
807
+ "intervention": [
808
+ "output_location"
809
+ ],
810
+ "counterfactual_scores": [
811
+ {
812
+ "counterfactual": [
813
+ "randomLetter_counterfactual"
814
+ ],
815
+ "score": 1.0
816
+ },
817
+ {
818
+ "counterfactual": [
819
+ "answerPosition_counterfactual"
820
+ ],
821
+ "score": 0.0
822
+ },
823
+ {
824
+ "counterfactual": [
825
+ "answerPosition_randomLetter_counterfactual"
826
+ ],
827
+ "score": 0.0
828
+ }
829
+ ]
830
+ }
831
+ ]
832
+ },
833
+ {
834
+ "layer": "15",
835
+ "layer_scores": [
836
+ {
837
+ "intervention": [
838
+ "output_token"
839
+ ],
840
+ "counterfactual_scores": [
841
+ {
842
+ "counterfactual": [
843
+ "randomLetter_counterfactual"
844
+ ],
845
+ "score": 0.0
846
+ },
847
+ {
848
+ "counterfactual": [
849
+ "answerPosition_counterfactual"
850
+ ],
851
+ "score": 0.0
852
+ },
853
+ {
854
+ "counterfactual": [
855
+ "answerPosition_randomLetter_counterfactual"
856
+ ],
857
+ "score": 0.0
858
+ }
859
+ ]
860
+ },
861
+ {
862
+ "intervention": [
863
+ "output_location"
864
+ ],
865
+ "counterfactual_scores": [
866
+ {
867
+ "counterfactual": [
868
+ "randomLetter_counterfactual"
869
+ ],
870
+ "score": 1.0
871
+ },
872
+ {
873
+ "counterfactual": [
874
+ "answerPosition_counterfactual"
875
+ ],
876
+ "score": 0.0
877
+ },
878
+ {
879
+ "counterfactual": [
880
+ "answerPosition_randomLetter_counterfactual"
881
+ ],
882
+ "score": 0.0
883
+ }
884
+ ]
885
+ }
886
+ ]
887
+ },
888
+ {
889
+ "layer": "16",
890
+ "layer_scores": [
891
+ {
892
+ "intervention": [
893
+ "output_token"
894
+ ],
895
+ "counterfactual_scores": [
896
+ {
897
+ "counterfactual": [
898
+ "randomLetter_counterfactual"
899
+ ],
900
+ "score": 0.0
901
+ },
902
+ {
903
+ "counterfactual": [
904
+ "answerPosition_counterfactual"
905
+ ],
906
+ "score": 0.0
907
+ },
908
+ {
909
+ "counterfactual": [
910
+ "answerPosition_randomLetter_counterfactual"
911
+ ],
912
+ "score": 0.0
913
+ }
914
+ ]
915
+ },
916
+ {
917
+ "intervention": [
918
+ "output_location"
919
+ ],
920
+ "counterfactual_scores": [
921
+ {
922
+ "counterfactual": [
923
+ "randomLetter_counterfactual"
924
+ ],
925
+ "score": 1.0
926
+ },
927
+ {
928
+ "counterfactual": [
929
+ "answerPosition_counterfactual"
930
+ ],
931
+ "score": 0.0
932
+ },
933
+ {
934
+ "counterfactual": [
935
+ "answerPosition_randomLetter_counterfactual"
936
+ ],
937
+ "score": 0.0
938
+ }
939
+ ]
940
+ }
941
+ ]
942
+ },
943
+ {
944
+ "layer": "17",
945
+ "layer_scores": [
946
+ {
947
+ "intervention": [
948
+ "output_token"
949
+ ],
950
+ "counterfactual_scores": [
951
+ {
952
+ "counterfactual": [
953
+ "randomLetter_counterfactual"
954
+ ],
955
+ "score": 0.0
956
+ },
957
+ {
958
+ "counterfactual": [
959
+ "answerPosition_counterfactual"
960
+ ],
961
+ "score": 0.0
962
+ },
963
+ {
964
+ "counterfactual": [
965
+ "answerPosition_randomLetter_counterfactual"
966
+ ],
967
+ "score": 0.0
968
+ }
969
+ ]
970
+ },
971
+ {
972
+ "intervention": [
973
+ "output_location"
974
+ ],
975
+ "counterfactual_scores": [
976
+ {
977
+ "counterfactual": [
978
+ "randomLetter_counterfactual"
979
+ ],
980
+ "score": 1.0
981
+ },
982
+ {
983
+ "counterfactual": [
984
+ "answerPosition_counterfactual"
985
+ ],
986
+ "score": 0.0
987
+ },
988
+ {
989
+ "counterfactual": [
990
+ "answerPosition_randomLetter_counterfactual"
991
+ ],
992
+ "score": 0.0
993
+ }
994
+ ]
995
+ }
996
+ ]
997
+ },
998
+ {
999
+ "layer": "18",
1000
+ "layer_scores": [
1001
+ {
1002
+ "intervention": [
1003
+ "output_token"
1004
+ ],
1005
+ "counterfactual_scores": [
1006
+ {
1007
+ "counterfactual": [
1008
+ "randomLetter_counterfactual"
1009
+ ],
1010
+ "score": 0.0
1011
+ },
1012
+ {
1013
+ "counterfactual": [
1014
+ "answerPosition_counterfactual"
1015
+ ],
1016
+ "score": 0.0
1017
+ },
1018
+ {
1019
+ "counterfactual": [
1020
+ "answerPosition_randomLetter_counterfactual"
1021
+ ],
1022
+ "score": 0.0
1023
+ }
1024
+ ]
1025
+ },
1026
+ {
1027
+ "intervention": [
1028
+ "output_location"
1029
+ ],
1030
+ "counterfactual_scores": [
1031
+ {
1032
+ "counterfactual": [
1033
+ "randomLetter_counterfactual"
1034
+ ],
1035
+ "score": 1.0
1036
+ },
1037
+ {
1038
+ "counterfactual": [
1039
+ "answerPosition_counterfactual"
1040
+ ],
1041
+ "score": 0.0
1042
+ },
1043
+ {
1044
+ "counterfactual": [
1045
+ "answerPosition_randomLetter_counterfactual"
1046
+ ],
1047
+ "score": 0.0
1048
+ }
1049
+ ]
1050
+ }
1051
+ ]
1052
+ },
1053
+ {
1054
+ "layer": "19",
1055
+ "layer_scores": [
1056
+ {
1057
+ "intervention": [
1058
+ "output_token"
1059
+ ],
1060
+ "counterfactual_scores": [
1061
+ {
1062
+ "counterfactual": [
1063
+ "randomLetter_counterfactual"
1064
+ ],
1065
+ "score": 0.0
1066
+ },
1067
+ {
1068
+ "counterfactual": [
1069
+ "answerPosition_counterfactual"
1070
+ ],
1071
+ "score": 0.0
1072
+ },
1073
+ {
1074
+ "counterfactual": [
1075
+ "answerPosition_randomLetter_counterfactual"
1076
+ ],
1077
+ "score": 0.0
1078
+ }
1079
+ ]
1080
+ },
1081
+ {
1082
+ "intervention": [
1083
+ "output_location"
1084
+ ],
1085
+ "counterfactual_scores": [
1086
+ {
1087
+ "counterfactual": [
1088
+ "randomLetter_counterfactual"
1089
+ ],
1090
+ "score": 1.0
1091
+ },
1092
+ {
1093
+ "counterfactual": [
1094
+ "answerPosition_counterfactual"
1095
+ ],
1096
+ "score": 0.0
1097
+ },
1098
+ {
1099
+ "counterfactual": [
1100
+ "answerPosition_randomLetter_counterfactual"
1101
+ ],
1102
+ "score": 0.0
1103
+ }
1104
+ ]
1105
+ }
1106
+ ]
1107
+ },
1108
+ {
1109
+ "layer": "20",
1110
+ "layer_scores": [
1111
+ {
1112
+ "intervention": [
1113
+ "output_token"
1114
+ ],
1115
+ "counterfactual_scores": [
1116
+ {
1117
+ "counterfactual": [
1118
+ "randomLetter_counterfactual"
1119
+ ],
1120
+ "score": 0.0
1121
+ },
1122
+ {
1123
+ "counterfactual": [
1124
+ "answerPosition_counterfactual"
1125
+ ],
1126
+ "score": 0.0
1127
+ },
1128
+ {
1129
+ "counterfactual": [
1130
+ "answerPosition_randomLetter_counterfactual"
1131
+ ],
1132
+ "score": 0.0
1133
+ }
1134
+ ]
1135
+ },
1136
+ {
1137
+ "intervention": [
1138
+ "output_location"
1139
+ ],
1140
+ "counterfactual_scores": [
1141
+ {
1142
+ "counterfactual": [
1143
+ "randomLetter_counterfactual"
1144
+ ],
1145
+ "score": 1.0
1146
+ },
1147
+ {
1148
+ "counterfactual": [
1149
+ "answerPosition_counterfactual"
1150
+ ],
1151
+ "score": 0.0
1152
+ },
1153
+ {
1154
+ "counterfactual": [
1155
+ "answerPosition_randomLetter_counterfactual"
1156
+ ],
1157
+ "score": 0.0
1158
+ }
1159
+ ]
1160
+ }
1161
+ ]
1162
+ },
1163
+ {
1164
+ "layer": "21",
1165
+ "layer_scores": [
1166
+ {
1167
+ "intervention": [
1168
+ "output_token"
1169
+ ],
1170
+ "counterfactual_scores": [
1171
+ {
1172
+ "counterfactual": [
1173
+ "randomLetter_counterfactual"
1174
+ ],
1175
+ "score": 0.0
1176
+ },
1177
+ {
1178
+ "counterfactual": [
1179
+ "answerPosition_counterfactual"
1180
+ ],
1181
+ "score": 0.0
1182
+ },
1183
+ {
1184
+ "counterfactual": [
1185
+ "answerPosition_randomLetter_counterfactual"
1186
+ ],
1187
+ "score": 0.0
1188
+ }
1189
+ ]
1190
+ },
1191
+ {
1192
+ "intervention": [
1193
+ "output_location"
1194
+ ],
1195
+ "counterfactual_scores": [
1196
+ {
1197
+ "counterfactual": [
1198
+ "randomLetter_counterfactual"
1199
+ ],
1200
+ "score": 1.0
1201
+ },
1202
+ {
1203
+ "counterfactual": [
1204
+ "answerPosition_counterfactual"
1205
+ ],
1206
+ "score": 0.0
1207
+ },
1208
+ {
1209
+ "counterfactual": [
1210
+ "answerPosition_randomLetter_counterfactual"
1211
+ ],
1212
+ "score": 0.0
1213
+ }
1214
+ ]
1215
+ }
1216
+ ]
1217
+ },
1218
+ {
1219
+ "layer": "22",
1220
+ "layer_scores": [
1221
+ {
1222
+ "intervention": [
1223
+ "output_token"
1224
+ ],
1225
+ "counterfactual_scores": [
1226
+ {
1227
+ "counterfactual": [
1228
+ "randomLetter_counterfactual"
1229
+ ],
1230
+ "score": 0.0
1231
+ },
1232
+ {
1233
+ "counterfactual": [
1234
+ "answerPosition_counterfactual"
1235
+ ],
1236
+ "score": 0.0
1237
+ },
1238
+ {
1239
+ "counterfactual": [
1240
+ "answerPosition_randomLetter_counterfactual"
1241
+ ],
1242
+ "score": 0.0
1243
+ }
1244
+ ]
1245
+ },
1246
+ {
1247
+ "intervention": [
1248
+ "output_location"
1249
+ ],
1250
+ "counterfactual_scores": [
1251
+ {
1252
+ "counterfactual": [
1253
+ "randomLetter_counterfactual"
1254
+ ],
1255
+ "score": 1.0
1256
+ },
1257
+ {
1258
+ "counterfactual": [
1259
+ "answerPosition_counterfactual"
1260
+ ],
1261
+ "score": 0.0
1262
+ },
1263
+ {
1264
+ "counterfactual": [
1265
+ "answerPosition_randomLetter_counterfactual"
1266
+ ],
1267
+ "score": 0.0
1268
+ }
1269
+ ]
1270
+ }
1271
+ ]
1272
+ },
1273
+ {
1274
+ "layer": "23",
1275
+ "layer_scores": [
1276
+ {
1277
+ "intervention": [
1278
+ "output_token"
1279
+ ],
1280
+ "counterfactual_scores": [
1281
+ {
1282
+ "counterfactual": [
1283
+ "randomLetter_counterfactual"
1284
+ ],
1285
+ "score": 0.0
1286
+ },
1287
+ {
1288
+ "counterfactual": [
1289
+ "answerPosition_counterfactual"
1290
+ ],
1291
+ "score": 0.0
1292
+ },
1293
+ {
1294
+ "counterfactual": [
1295
+ "answerPosition_randomLetter_counterfactual"
1296
+ ],
1297
+ "score": 0.0
1298
+ }
1299
+ ]
1300
+ },
1301
+ {
1302
+ "intervention": [
1303
+ "output_location"
1304
+ ],
1305
+ "counterfactual_scores": [
1306
+ {
1307
+ "counterfactual": [
1308
+ "randomLetter_counterfactual"
1309
+ ],
1310
+ "score": 1.0
1311
+ },
1312
+ {
1313
+ "counterfactual": [
1314
+ "answerPosition_counterfactual"
1315
+ ],
1316
+ "score": 0.0
1317
+ },
1318
+ {
1319
+ "counterfactual": [
1320
+ "answerPosition_randomLetter_counterfactual"
1321
+ ],
1322
+ "score": 0.0
1323
+ }
1324
+ ]
1325
+ }
1326
+ ]
1327
+ }
1328
+ ]
1329
+ }
1330
+ }
1331
+ ]
1332
+ }
eval-results-mib-causalgraph/submissions/MCQA_results_google_correct_choice_period_token.json ADDED
@@ -0,0 +1,1442 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "method_name": "full_vector",
3
+ "results": [
4
+ {
5
+ "model_id": "Gemma2ForCausalLM",
6
+ "task_scores": {
7
+ "MCQA": [
8
+ {
9
+ "layer": "0",
10
+ "layer_scores": [
11
+ {
12
+ "intervention": [
13
+ "output_token"
14
+ ],
15
+ "counterfactual_scores": [
16
+ {
17
+ "counterfactual": [
18
+ "randomLetter_counterfactual"
19
+ ],
20
+ "score": 0.7333333333333333
21
+ },
22
+ {
23
+ "counterfactual": [
24
+ "answerPosition_counterfactual"
25
+ ],
26
+ "score": 1.0
27
+ },
28
+ {
29
+ "counterfactual": [
30
+ "answerPosition_randomLetter_counterfactual"
31
+ ],
32
+ "score": 0.6333333333333333
33
+ }
34
+ ]
35
+ },
36
+ {
37
+ "intervention": [
38
+ "output_location"
39
+ ],
40
+ "counterfactual_scores": [
41
+ {
42
+ "counterfactual": [
43
+ "randomLetter_counterfactual"
44
+ ],
45
+ "score": 0.1
46
+ },
47
+ {
48
+ "counterfactual": [
49
+ "answerPosition_counterfactual"
50
+ ],
51
+ "score": 1.0
52
+ },
53
+ {
54
+ "counterfactual": [
55
+ "answerPosition_randomLetter_counterfactual"
56
+ ],
57
+ "score": 0.16666666666666666
58
+ }
59
+ ]
60
+ }
61
+ ]
62
+ },
63
+ {
64
+ "layer": "1",
65
+ "layer_scores": [
66
+ {
67
+ "intervention": [
68
+ "output_token"
69
+ ],
70
+ "counterfactual_scores": [
71
+ {
72
+ "counterfactual": [
73
+ "randomLetter_counterfactual"
74
+ ],
75
+ "score": 0.7333333333333333
76
+ },
77
+ {
78
+ "counterfactual": [
79
+ "answerPosition_counterfactual"
80
+ ],
81
+ "score": 0.9333333333333333
82
+ },
83
+ {
84
+ "counterfactual": [
85
+ "answerPosition_randomLetter_counterfactual"
86
+ ],
87
+ "score": 0.6
88
+ }
89
+ ]
90
+ },
91
+ {
92
+ "intervention": [
93
+ "output_location"
94
+ ],
95
+ "counterfactual_scores": [
96
+ {
97
+ "counterfactual": [
98
+ "randomLetter_counterfactual"
99
+ ],
100
+ "score": 0.16666666666666666
101
+ },
102
+ {
103
+ "counterfactual": [
104
+ "answerPosition_counterfactual"
105
+ ],
106
+ "score": 0.9333333333333333
107
+ },
108
+ {
109
+ "counterfactual": [
110
+ "answerPosition_randomLetter_counterfactual"
111
+ ],
112
+ "score": 0.16666666666666666
113
+ }
114
+ ]
115
+ }
116
+ ]
117
+ },
118
+ {
119
+ "layer": "2",
120
+ "layer_scores": [
121
+ {
122
+ "intervention": [
123
+ "output_token"
124
+ ],
125
+ "counterfactual_scores": [
126
+ {
127
+ "counterfactual": [
128
+ "randomLetter_counterfactual"
129
+ ],
130
+ "score": 0.5
131
+ },
132
+ {
133
+ "counterfactual": [
134
+ "answerPosition_counterfactual"
135
+ ],
136
+ "score": 0.9
137
+ },
138
+ {
139
+ "counterfactual": [
140
+ "answerPosition_randomLetter_counterfactual"
141
+ ],
142
+ "score": 0.4
143
+ }
144
+ ]
145
+ },
146
+ {
147
+ "intervention": [
148
+ "output_location"
149
+ ],
150
+ "counterfactual_scores": [
151
+ {
152
+ "counterfactual": [
153
+ "randomLetter_counterfactual"
154
+ ],
155
+ "score": 0.2
156
+ },
157
+ {
158
+ "counterfactual": [
159
+ "answerPosition_counterfactual"
160
+ ],
161
+ "score": 0.9
162
+ },
163
+ {
164
+ "counterfactual": [
165
+ "answerPosition_randomLetter_counterfactual"
166
+ ],
167
+ "score": 0.13333333333333333
168
+ }
169
+ ]
170
+ }
171
+ ]
172
+ },
173
+ {
174
+ "layer": "3",
175
+ "layer_scores": [
176
+ {
177
+ "intervention": [
178
+ "output_token"
179
+ ],
180
+ "counterfactual_scores": [
181
+ {
182
+ "counterfactual": [
183
+ "randomLetter_counterfactual"
184
+ ],
185
+ "score": 0.3333333333333333
186
+ },
187
+ {
188
+ "counterfactual": [
189
+ "answerPosition_counterfactual"
190
+ ],
191
+ "score": 0.9666666666666667
192
+ },
193
+ {
194
+ "counterfactual": [
195
+ "answerPosition_randomLetter_counterfactual"
196
+ ],
197
+ "score": 0.3333333333333333
198
+ }
199
+ ]
200
+ },
201
+ {
202
+ "intervention": [
203
+ "output_location"
204
+ ],
205
+ "counterfactual_scores": [
206
+ {
207
+ "counterfactual": [
208
+ "randomLetter_counterfactual"
209
+ ],
210
+ "score": 0.2
211
+ },
212
+ {
213
+ "counterfactual": [
214
+ "answerPosition_counterfactual"
215
+ ],
216
+ "score": 0.9666666666666667
217
+ },
218
+ {
219
+ "counterfactual": [
220
+ "answerPosition_randomLetter_counterfactual"
221
+ ],
222
+ "score": 0.16666666666666666
223
+ }
224
+ ]
225
+ }
226
+ ]
227
+ },
228
+ {
229
+ "layer": "4",
230
+ "layer_scores": [
231
+ {
232
+ "intervention": [
233
+ "output_token"
234
+ ],
235
+ "counterfactual_scores": [
236
+ {
237
+ "counterfactual": [
238
+ "randomLetter_counterfactual"
239
+ ],
240
+ "score": 0.3333333333333333
241
+ },
242
+ {
243
+ "counterfactual": [
244
+ "answerPosition_counterfactual"
245
+ ],
246
+ "score": 1.0
247
+ },
248
+ {
249
+ "counterfactual": [
250
+ "answerPosition_randomLetter_counterfactual"
251
+ ],
252
+ "score": 0.3333333333333333
253
+ }
254
+ ]
255
+ },
256
+ {
257
+ "intervention": [
258
+ "output_location"
259
+ ],
260
+ "counterfactual_scores": [
261
+ {
262
+ "counterfactual": [
263
+ "randomLetter_counterfactual"
264
+ ],
265
+ "score": 0.2
266
+ },
267
+ {
268
+ "counterfactual": [
269
+ "answerPosition_counterfactual"
270
+ ],
271
+ "score": 1.0
272
+ },
273
+ {
274
+ "counterfactual": [
275
+ "answerPosition_randomLetter_counterfactual"
276
+ ],
277
+ "score": 0.13333333333333333
278
+ }
279
+ ]
280
+ }
281
+ ]
282
+ },
283
+ {
284
+ "layer": "5",
285
+ "layer_scores": [
286
+ {
287
+ "intervention": [
288
+ "output_token"
289
+ ],
290
+ "counterfactual_scores": [
291
+ {
292
+ "counterfactual": [
293
+ "randomLetter_counterfactual"
294
+ ],
295
+ "score": 0.3333333333333333
296
+ },
297
+ {
298
+ "counterfactual": [
299
+ "answerPosition_counterfactual"
300
+ ],
301
+ "score": 0.9
302
+ },
303
+ {
304
+ "counterfactual": [
305
+ "answerPosition_randomLetter_counterfactual"
306
+ ],
307
+ "score": 0.3333333333333333
308
+ }
309
+ ]
310
+ },
311
+ {
312
+ "intervention": [
313
+ "output_location"
314
+ ],
315
+ "counterfactual_scores": [
316
+ {
317
+ "counterfactual": [
318
+ "randomLetter_counterfactual"
319
+ ],
320
+ "score": 0.16666666666666666
321
+ },
322
+ {
323
+ "counterfactual": [
324
+ "answerPosition_counterfactual"
325
+ ],
326
+ "score": 0.9
327
+ },
328
+ {
329
+ "counterfactual": [
330
+ "answerPosition_randomLetter_counterfactual"
331
+ ],
332
+ "score": 0.16666666666666666
333
+ }
334
+ ]
335
+ }
336
+ ]
337
+ },
338
+ {
339
+ "layer": "6",
340
+ "layer_scores": [
341
+ {
342
+ "intervention": [
343
+ "output_token"
344
+ ],
345
+ "counterfactual_scores": [
346
+ {
347
+ "counterfactual": [
348
+ "randomLetter_counterfactual"
349
+ ],
350
+ "score": 0.4666666666666667
351
+ },
352
+ {
353
+ "counterfactual": [
354
+ "answerPosition_counterfactual"
355
+ ],
356
+ "score": 0.7666666666666667
357
+ },
358
+ {
359
+ "counterfactual": [
360
+ "answerPosition_randomLetter_counterfactual"
361
+ ],
362
+ "score": 0.3
363
+ }
364
+ ]
365
+ },
366
+ {
367
+ "intervention": [
368
+ "output_location"
369
+ ],
370
+ "counterfactual_scores": [
371
+ {
372
+ "counterfactual": [
373
+ "randomLetter_counterfactual"
374
+ ],
375
+ "score": 0.2
376
+ },
377
+ {
378
+ "counterfactual": [
379
+ "answerPosition_counterfactual"
380
+ ],
381
+ "score": 0.7666666666666667
382
+ },
383
+ {
384
+ "counterfactual": [
385
+ "answerPosition_randomLetter_counterfactual"
386
+ ],
387
+ "score": 0.13333333333333333
388
+ }
389
+ ]
390
+ }
391
+ ]
392
+ },
393
+ {
394
+ "layer": "7",
395
+ "layer_scores": [
396
+ {
397
+ "intervention": [
398
+ "output_token"
399
+ ],
400
+ "counterfactual_scores": [
401
+ {
402
+ "counterfactual": [
403
+ "randomLetter_counterfactual"
404
+ ],
405
+ "score": 0.43333333333333335
406
+ },
407
+ {
408
+ "counterfactual": [
409
+ "answerPosition_counterfactual"
410
+ ],
411
+ "score": 0.7666666666666667
412
+ },
413
+ {
414
+ "counterfactual": [
415
+ "answerPosition_randomLetter_counterfactual"
416
+ ],
417
+ "score": 0.3
418
+ }
419
+ ]
420
+ },
421
+ {
422
+ "intervention": [
423
+ "output_location"
424
+ ],
425
+ "counterfactual_scores": [
426
+ {
427
+ "counterfactual": [
428
+ "randomLetter_counterfactual"
429
+ ],
430
+ "score": 0.2
431
+ },
432
+ {
433
+ "counterfactual": [
434
+ "answerPosition_counterfactual"
435
+ ],
436
+ "score": 0.7666666666666667
437
+ },
438
+ {
439
+ "counterfactual": [
440
+ "answerPosition_randomLetter_counterfactual"
441
+ ],
442
+ "score": 0.13333333333333333
443
+ }
444
+ ]
445
+ }
446
+ ]
447
+ },
448
+ {
449
+ "layer": "8",
450
+ "layer_scores": [
451
+ {
452
+ "intervention": [
453
+ "output_token"
454
+ ],
455
+ "counterfactual_scores": [
456
+ {
457
+ "counterfactual": [
458
+ "randomLetter_counterfactual"
459
+ ],
460
+ "score": 0.43333333333333335
461
+ },
462
+ {
463
+ "counterfactual": [
464
+ "answerPosition_counterfactual"
465
+ ],
466
+ "score": 0.7666666666666667
467
+ },
468
+ {
469
+ "counterfactual": [
470
+ "answerPosition_randomLetter_counterfactual"
471
+ ],
472
+ "score": 0.3
473
+ }
474
+ ]
475
+ },
476
+ {
477
+ "intervention": [
478
+ "output_location"
479
+ ],
480
+ "counterfactual_scores": [
481
+ {
482
+ "counterfactual": [
483
+ "randomLetter_counterfactual"
484
+ ],
485
+ "score": 0.2
486
+ },
487
+ {
488
+ "counterfactual": [
489
+ "answerPosition_counterfactual"
490
+ ],
491
+ "score": 0.7666666666666667
492
+ },
493
+ {
494
+ "counterfactual": [
495
+ "answerPosition_randomLetter_counterfactual"
496
+ ],
497
+ "score": 0.13333333333333333
498
+ }
499
+ ]
500
+ }
501
+ ]
502
+ },
503
+ {
504
+ "layer": "9",
505
+ "layer_scores": [
506
+ {
507
+ "intervention": [
508
+ "output_token"
509
+ ],
510
+ "counterfactual_scores": [
511
+ {
512
+ "counterfactual": [
513
+ "randomLetter_counterfactual"
514
+ ],
515
+ "score": 0.4666666666666667
516
+ },
517
+ {
518
+ "counterfactual": [
519
+ "answerPosition_counterfactual"
520
+ ],
521
+ "score": 0.7666666666666667
522
+ },
523
+ {
524
+ "counterfactual": [
525
+ "answerPosition_randomLetter_counterfactual"
526
+ ],
527
+ "score": 0.3
528
+ }
529
+ ]
530
+ },
531
+ {
532
+ "intervention": [
533
+ "output_location"
534
+ ],
535
+ "counterfactual_scores": [
536
+ {
537
+ "counterfactual": [
538
+ "randomLetter_counterfactual"
539
+ ],
540
+ "score": 0.2
541
+ },
542
+ {
543
+ "counterfactual": [
544
+ "answerPosition_counterfactual"
545
+ ],
546
+ "score": 0.7666666666666667
547
+ },
548
+ {
549
+ "counterfactual": [
550
+ "answerPosition_randomLetter_counterfactual"
551
+ ],
552
+ "score": 0.13333333333333333
553
+ }
554
+ ]
555
+ }
556
+ ]
557
+ },
558
+ {
559
+ "layer": "10",
560
+ "layer_scores": [
561
+ {
562
+ "intervention": [
563
+ "output_token"
564
+ ],
565
+ "counterfactual_scores": [
566
+ {
567
+ "counterfactual": [
568
+ "randomLetter_counterfactual"
569
+ ],
570
+ "score": 0.5
571
+ },
572
+ {
573
+ "counterfactual": [
574
+ "answerPosition_counterfactual"
575
+ ],
576
+ "score": 0.7333333333333333
577
+ },
578
+ {
579
+ "counterfactual": [
580
+ "answerPosition_randomLetter_counterfactual"
581
+ ],
582
+ "score": 0.3
583
+ }
584
+ ]
585
+ },
586
+ {
587
+ "intervention": [
588
+ "output_location"
589
+ ],
590
+ "counterfactual_scores": [
591
+ {
592
+ "counterfactual": [
593
+ "randomLetter_counterfactual"
594
+ ],
595
+ "score": 0.16666666666666666
596
+ },
597
+ {
598
+ "counterfactual": [
599
+ "answerPosition_counterfactual"
600
+ ],
601
+ "score": 0.7333333333333333
602
+ },
603
+ {
604
+ "counterfactual": [
605
+ "answerPosition_randomLetter_counterfactual"
606
+ ],
607
+ "score": 0.13333333333333333
608
+ }
609
+ ]
610
+ }
611
+ ]
612
+ },
613
+ {
614
+ "layer": "11",
615
+ "layer_scores": [
616
+ {
617
+ "intervention": [
618
+ "output_token"
619
+ ],
620
+ "counterfactual_scores": [
621
+ {
622
+ "counterfactual": [
623
+ "randomLetter_counterfactual"
624
+ ],
625
+ "score": 0.4666666666666667
626
+ },
627
+ {
628
+ "counterfactual": [
629
+ "answerPosition_counterfactual"
630
+ ],
631
+ "score": 0.7333333333333333
632
+ },
633
+ {
634
+ "counterfactual": [
635
+ "answerPosition_randomLetter_counterfactual"
636
+ ],
637
+ "score": 0.3
638
+ }
639
+ ]
640
+ },
641
+ {
642
+ "intervention": [
643
+ "output_location"
644
+ ],
645
+ "counterfactual_scores": [
646
+ {
647
+ "counterfactual": [
648
+ "randomLetter_counterfactual"
649
+ ],
650
+ "score": 0.2
651
+ },
652
+ {
653
+ "counterfactual": [
654
+ "answerPosition_counterfactual"
655
+ ],
656
+ "score": 0.7333333333333333
657
+ },
658
+ {
659
+ "counterfactual": [
660
+ "answerPosition_randomLetter_counterfactual"
661
+ ],
662
+ "score": 0.13333333333333333
663
+ }
664
+ ]
665
+ }
666
+ ]
667
+ },
668
+ {
669
+ "layer": "12",
670
+ "layer_scores": [
671
+ {
672
+ "intervention": [
673
+ "output_token"
674
+ ],
675
+ "counterfactual_scores": [
676
+ {
677
+ "counterfactual": [
678
+ "randomLetter_counterfactual"
679
+ ],
680
+ "score": 0.4666666666666667
681
+ },
682
+ {
683
+ "counterfactual": [
684
+ "answerPosition_counterfactual"
685
+ ],
686
+ "score": 0.7333333333333333
687
+ },
688
+ {
689
+ "counterfactual": [
690
+ "answerPosition_randomLetter_counterfactual"
691
+ ],
692
+ "score": 0.3
693
+ }
694
+ ]
695
+ },
696
+ {
697
+ "intervention": [
698
+ "output_location"
699
+ ],
700
+ "counterfactual_scores": [
701
+ {
702
+ "counterfactual": [
703
+ "randomLetter_counterfactual"
704
+ ],
705
+ "score": 0.2
706
+ },
707
+ {
708
+ "counterfactual": [
709
+ "answerPosition_counterfactual"
710
+ ],
711
+ "score": 0.7333333333333333
712
+ },
713
+ {
714
+ "counterfactual": [
715
+ "answerPosition_randomLetter_counterfactual"
716
+ ],
717
+ "score": 0.13333333333333333
718
+ }
719
+ ]
720
+ }
721
+ ]
722
+ },
723
+ {
724
+ "layer": "13",
725
+ "layer_scores": [
726
+ {
727
+ "intervention": [
728
+ "output_token"
729
+ ],
730
+ "counterfactual_scores": [
731
+ {
732
+ "counterfactual": [
733
+ "randomLetter_counterfactual"
734
+ ],
735
+ "score": 0.43333333333333335
736
+ },
737
+ {
738
+ "counterfactual": [
739
+ "answerPosition_counterfactual"
740
+ ],
741
+ "score": 0.6333333333333333
742
+ },
743
+ {
744
+ "counterfactual": [
745
+ "answerPosition_randomLetter_counterfactual"
746
+ ],
747
+ "score": 0.23333333333333334
748
+ }
749
+ ]
750
+ },
751
+ {
752
+ "intervention": [
753
+ "output_location"
754
+ ],
755
+ "counterfactual_scores": [
756
+ {
757
+ "counterfactual": [
758
+ "randomLetter_counterfactual"
759
+ ],
760
+ "score": 0.2
761
+ },
762
+ {
763
+ "counterfactual": [
764
+ "answerPosition_counterfactual"
765
+ ],
766
+ "score": 0.6333333333333333
767
+ },
768
+ {
769
+ "counterfactual": [
770
+ "answerPosition_randomLetter_counterfactual"
771
+ ],
772
+ "score": 0.16666666666666666
773
+ }
774
+ ]
775
+ }
776
+ ]
777
+ },
778
+ {
779
+ "layer": "14",
780
+ "layer_scores": [
781
+ {
782
+ "intervention": [
783
+ "output_token"
784
+ ],
785
+ "counterfactual_scores": [
786
+ {
787
+ "counterfactual": [
788
+ "randomLetter_counterfactual"
789
+ ],
790
+ "score": 0.4
791
+ },
792
+ {
793
+ "counterfactual": [
794
+ "answerPosition_counterfactual"
795
+ ],
796
+ "score": 0.6333333333333333
797
+ },
798
+ {
799
+ "counterfactual": [
800
+ "answerPosition_randomLetter_counterfactual"
801
+ ],
802
+ "score": 0.2
803
+ }
804
+ ]
805
+ },
806
+ {
807
+ "intervention": [
808
+ "output_location"
809
+ ],
810
+ "counterfactual_scores": [
811
+ {
812
+ "counterfactual": [
813
+ "randomLetter_counterfactual"
814
+ ],
815
+ "score": 0.2
816
+ },
817
+ {
818
+ "counterfactual": [
819
+ "answerPosition_counterfactual"
820
+ ],
821
+ "score": 0.6333333333333333
822
+ },
823
+ {
824
+ "counterfactual": [
825
+ "answerPosition_randomLetter_counterfactual"
826
+ ],
827
+ "score": 0.13333333333333333
828
+ }
829
+ ]
830
+ }
831
+ ]
832
+ },
833
+ {
834
+ "layer": "15",
835
+ "layer_scores": [
836
+ {
837
+ "intervention": [
838
+ "output_token"
839
+ ],
840
+ "counterfactual_scores": [
841
+ {
842
+ "counterfactual": [
843
+ "randomLetter_counterfactual"
844
+ ],
845
+ "score": 0.4
846
+ },
847
+ {
848
+ "counterfactual": [
849
+ "answerPosition_counterfactual"
850
+ ],
851
+ "score": 0.5666666666666667
852
+ },
853
+ {
854
+ "counterfactual": [
855
+ "answerPosition_randomLetter_counterfactual"
856
+ ],
857
+ "score": 0.16666666666666666
858
+ }
859
+ ]
860
+ },
861
+ {
862
+ "intervention": [
863
+ "output_location"
864
+ ],
865
+ "counterfactual_scores": [
866
+ {
867
+ "counterfactual": [
868
+ "randomLetter_counterfactual"
869
+ ],
870
+ "score": 0.2
871
+ },
872
+ {
873
+ "counterfactual": [
874
+ "answerPosition_counterfactual"
875
+ ],
876
+ "score": 0.5666666666666667
877
+ },
878
+ {
879
+ "counterfactual": [
880
+ "answerPosition_randomLetter_counterfactual"
881
+ ],
882
+ "score": 0.1
883
+ }
884
+ ]
885
+ }
886
+ ]
887
+ },
888
+ {
889
+ "layer": "16",
890
+ "layer_scores": [
891
+ {
892
+ "intervention": [
893
+ "output_token"
894
+ ],
895
+ "counterfactual_scores": [
896
+ {
897
+ "counterfactual": [
898
+ "randomLetter_counterfactual"
899
+ ],
900
+ "score": 0.1
901
+ },
902
+ {
903
+ "counterfactual": [
904
+ "answerPosition_counterfactual"
905
+ ],
906
+ "score": 0.2
907
+ },
908
+ {
909
+ "counterfactual": [
910
+ "answerPosition_randomLetter_counterfactual"
911
+ ],
912
+ "score": 0.13333333333333333
913
+ }
914
+ ]
915
+ },
916
+ {
917
+ "intervention": [
918
+ "output_location"
919
+ ],
920
+ "counterfactual_scores": [
921
+ {
922
+ "counterfactual": [
923
+ "randomLetter_counterfactual"
924
+ ],
925
+ "score": 0.9333333333333333
926
+ },
927
+ {
928
+ "counterfactual": [
929
+ "answerPosition_counterfactual"
930
+ ],
931
+ "score": 0.2
932
+ },
933
+ {
934
+ "counterfactual": [
935
+ "answerPosition_randomLetter_counterfactual"
936
+ ],
937
+ "score": 0.0
938
+ }
939
+ ]
940
+ }
941
+ ]
942
+ },
943
+ {
944
+ "layer": "17",
945
+ "layer_scores": [
946
+ {
947
+ "intervention": [
948
+ "output_token"
949
+ ],
950
+ "counterfactual_scores": [
951
+ {
952
+ "counterfactual": [
953
+ "randomLetter_counterfactual"
954
+ ],
955
+ "score": 0.1
956
+ },
957
+ {
958
+ "counterfactual": [
959
+ "answerPosition_counterfactual"
960
+ ],
961
+ "score": 0.2
962
+ },
963
+ {
964
+ "counterfactual": [
965
+ "answerPosition_randomLetter_counterfactual"
966
+ ],
967
+ "score": 0.13333333333333333
968
+ }
969
+ ]
970
+ },
971
+ {
972
+ "intervention": [
973
+ "output_location"
974
+ ],
975
+ "counterfactual_scores": [
976
+ {
977
+ "counterfactual": [
978
+ "randomLetter_counterfactual"
979
+ ],
980
+ "score": 0.9
981
+ },
982
+ {
983
+ "counterfactual": [
984
+ "answerPosition_counterfactual"
985
+ ],
986
+ "score": 0.2
987
+ },
988
+ {
989
+ "counterfactual": [
990
+ "answerPosition_randomLetter_counterfactual"
991
+ ],
992
+ "score": 0.0
993
+ }
994
+ ]
995
+ }
996
+ ]
997
+ },
998
+ {
999
+ "layer": "18",
1000
+ "layer_scores": [
1001
+ {
1002
+ "intervention": [
1003
+ "output_token"
1004
+ ],
1005
+ "counterfactual_scores": [
1006
+ {
1007
+ "counterfactual": [
1008
+ "randomLetter_counterfactual"
1009
+ ],
1010
+ "score": 0.1
1011
+ },
1012
+ {
1013
+ "counterfactual": [
1014
+ "answerPosition_counterfactual"
1015
+ ],
1016
+ "score": 0.0
1017
+ },
1018
+ {
1019
+ "counterfactual": [
1020
+ "answerPosition_randomLetter_counterfactual"
1021
+ ],
1022
+ "score": 0.1
1023
+ }
1024
+ ]
1025
+ },
1026
+ {
1027
+ "intervention": [
1028
+ "output_location"
1029
+ ],
1030
+ "counterfactual_scores": [
1031
+ {
1032
+ "counterfactual": [
1033
+ "randomLetter_counterfactual"
1034
+ ],
1035
+ "score": 0.9333333333333333
1036
+ },
1037
+ {
1038
+ "counterfactual": [
1039
+ "answerPosition_counterfactual"
1040
+ ],
1041
+ "score": 0.0
1042
+ },
1043
+ {
1044
+ "counterfactual": [
1045
+ "answerPosition_randomLetter_counterfactual"
1046
+ ],
1047
+ "score": 0.0
1048
+ }
1049
+ ]
1050
+ }
1051
+ ]
1052
+ },
1053
+ {
1054
+ "layer": "19",
1055
+ "layer_scores": [
1056
+ {
1057
+ "intervention": [
1058
+ "output_token"
1059
+ ],
1060
+ "counterfactual_scores": [
1061
+ {
1062
+ "counterfactual": [
1063
+ "randomLetter_counterfactual"
1064
+ ],
1065
+ "score": 0.06666666666666667
1066
+ },
1067
+ {
1068
+ "counterfactual": [
1069
+ "answerPosition_counterfactual"
1070
+ ],
1071
+ "score": 0.0
1072
+ },
1073
+ {
1074
+ "counterfactual": [
1075
+ "answerPosition_randomLetter_counterfactual"
1076
+ ],
1077
+ "score": 0.1
1078
+ }
1079
+ ]
1080
+ },
1081
+ {
1082
+ "intervention": [
1083
+ "output_location"
1084
+ ],
1085
+ "counterfactual_scores": [
1086
+ {
1087
+ "counterfactual": [
1088
+ "randomLetter_counterfactual"
1089
+ ],
1090
+ "score": 0.9666666666666667
1091
+ },
1092
+ {
1093
+ "counterfactual": [
1094
+ "answerPosition_counterfactual"
1095
+ ],
1096
+ "score": 0.0
1097
+ },
1098
+ {
1099
+ "counterfactual": [
1100
+ "answerPosition_randomLetter_counterfactual"
1101
+ ],
1102
+ "score": 0.0
1103
+ }
1104
+ ]
1105
+ }
1106
+ ]
1107
+ },
1108
+ {
1109
+ "layer": "20",
1110
+ "layer_scores": [
1111
+ {
1112
+ "intervention": [
1113
+ "output_token"
1114
+ ],
1115
+ "counterfactual_scores": [
1116
+ {
1117
+ "counterfactual": [
1118
+ "randomLetter_counterfactual"
1119
+ ],
1120
+ "score": 0.06666666666666667
1121
+ },
1122
+ {
1123
+ "counterfactual": [
1124
+ "answerPosition_counterfactual"
1125
+ ],
1126
+ "score": 0.0
1127
+ },
1128
+ {
1129
+ "counterfactual": [
1130
+ "answerPosition_randomLetter_counterfactual"
1131
+ ],
1132
+ "score": 0.1
1133
+ }
1134
+ ]
1135
+ },
1136
+ {
1137
+ "intervention": [
1138
+ "output_location"
1139
+ ],
1140
+ "counterfactual_scores": [
1141
+ {
1142
+ "counterfactual": [
1143
+ "randomLetter_counterfactual"
1144
+ ],
1145
+ "score": 0.9666666666666667
1146
+ },
1147
+ {
1148
+ "counterfactual": [
1149
+ "answerPosition_counterfactual"
1150
+ ],
1151
+ "score": 0.0
1152
+ },
1153
+ {
1154
+ "counterfactual": [
1155
+ "answerPosition_randomLetter_counterfactual"
1156
+ ],
1157
+ "score": 0.0
1158
+ }
1159
+ ]
1160
+ }
1161
+ ]
1162
+ },
1163
+ {
1164
+ "layer": "21",
1165
+ "layer_scores": [
1166
+ {
1167
+ "intervention": [
1168
+ "output_token"
1169
+ ],
1170
+ "counterfactual_scores": [
1171
+ {
1172
+ "counterfactual": [
1173
+ "randomLetter_counterfactual"
1174
+ ],
1175
+ "score": 0.06666666666666667
1176
+ },
1177
+ {
1178
+ "counterfactual": [
1179
+ "answerPosition_counterfactual"
1180
+ ],
1181
+ "score": 0.0
1182
+ },
1183
+ {
1184
+ "counterfactual": [
1185
+ "answerPosition_randomLetter_counterfactual"
1186
+ ],
1187
+ "score": 0.1
1188
+ }
1189
+ ]
1190
+ },
1191
+ {
1192
+ "intervention": [
1193
+ "output_location"
1194
+ ],
1195
+ "counterfactual_scores": [
1196
+ {
1197
+ "counterfactual": [
1198
+ "randomLetter_counterfactual"
1199
+ ],
1200
+ "score": 0.9666666666666667
1201
+ },
1202
+ {
1203
+ "counterfactual": [
1204
+ "answerPosition_counterfactual"
1205
+ ],
1206
+ "score": 0.0
1207
+ },
1208
+ {
1209
+ "counterfactual": [
1210
+ "answerPosition_randomLetter_counterfactual"
1211
+ ],
1212
+ "score": 0.0
1213
+ }
1214
+ ]
1215
+ }
1216
+ ]
1217
+ },
1218
+ {
1219
+ "layer": "22",
1220
+ "layer_scores": [
1221
+ {
1222
+ "intervention": [
1223
+ "output_token"
1224
+ ],
1225
+ "counterfactual_scores": [
1226
+ {
1227
+ "counterfactual": [
1228
+ "randomLetter_counterfactual"
1229
+ ],
1230
+ "score": 0.03333333333333333
1231
+ },
1232
+ {
1233
+ "counterfactual": [
1234
+ "answerPosition_counterfactual"
1235
+ ],
1236
+ "score": 0.0
1237
+ },
1238
+ {
1239
+ "counterfactual": [
1240
+ "answerPosition_randomLetter_counterfactual"
1241
+ ],
1242
+ "score": 0.1
1243
+ }
1244
+ ]
1245
+ },
1246
+ {
1247
+ "intervention": [
1248
+ "output_location"
1249
+ ],
1250
+ "counterfactual_scores": [
1251
+ {
1252
+ "counterfactual": [
1253
+ "randomLetter_counterfactual"
1254
+ ],
1255
+ "score": 1.0
1256
+ },
1257
+ {
1258
+ "counterfactual": [
1259
+ "answerPosition_counterfactual"
1260
+ ],
1261
+ "score": 0.0
1262
+ },
1263
+ {
1264
+ "counterfactual": [
1265
+ "answerPosition_randomLetter_counterfactual"
1266
+ ],
1267
+ "score": 0.0
1268
+ }
1269
+ ]
1270
+ }
1271
+ ]
1272
+ },
1273
+ {
1274
+ "layer": "23",
1275
+ "layer_scores": [
1276
+ {
1277
+ "intervention": [
1278
+ "output_token"
1279
+ ],
1280
+ "counterfactual_scores": [
1281
+ {
1282
+ "counterfactual": [
1283
+ "randomLetter_counterfactual"
1284
+ ],
1285
+ "score": 0.03333333333333333
1286
+ },
1287
+ {
1288
+ "counterfactual": [
1289
+ "answerPosition_counterfactual"
1290
+ ],
1291
+ "score": 0.0
1292
+ },
1293
+ {
1294
+ "counterfactual": [
1295
+ "answerPosition_randomLetter_counterfactual"
1296
+ ],
1297
+ "score": 0.1
1298
+ }
1299
+ ]
1300
+ },
1301
+ {
1302
+ "intervention": [
1303
+ "output_location"
1304
+ ],
1305
+ "counterfactual_scores": [
1306
+ {
1307
+ "counterfactual": [
1308
+ "randomLetter_counterfactual"
1309
+ ],
1310
+ "score": 1.0
1311
+ },
1312
+ {
1313
+ "counterfactual": [
1314
+ "answerPosition_counterfactual"
1315
+ ],
1316
+ "score": 0.0
1317
+ },
1318
+ {
1319
+ "counterfactual": [
1320
+ "answerPosition_randomLetter_counterfactual"
1321
+ ],
1322
+ "score": 0.0
1323
+ }
1324
+ ]
1325
+ }
1326
+ ]
1327
+ },
1328
+ {
1329
+ "layer": "24",
1330
+ "layer_scores": [
1331
+ {
1332
+ "intervention": [
1333
+ "output_token"
1334
+ ],
1335
+ "counterfactual_scores": [
1336
+ {
1337
+ "counterfactual": [
1338
+ "randomLetter_counterfactual"
1339
+ ],
1340
+ "score": 0.03333333333333333
1341
+ },
1342
+ {
1343
+ "counterfactual": [
1344
+ "answerPosition_counterfactual"
1345
+ ],
1346
+ "score": 0.0
1347
+ },
1348
+ {
1349
+ "counterfactual": [
1350
+ "answerPosition_randomLetter_counterfactual"
1351
+ ],
1352
+ "score": 0.1
1353
+ }
1354
+ ]
1355
+ },
1356
+ {
1357
+ "intervention": [
1358
+ "output_location"
1359
+ ],
1360
+ "counterfactual_scores": [
1361
+ {
1362
+ "counterfactual": [
1363
+ "randomLetter_counterfactual"
1364
+ ],
1365
+ "score": 1.0
1366
+ },
1367
+ {
1368
+ "counterfactual": [
1369
+ "answerPosition_counterfactual"
1370
+ ],
1371
+ "score": 0.0
1372
+ },
1373
+ {
1374
+ "counterfactual": [
1375
+ "answerPosition_randomLetter_counterfactual"
1376
+ ],
1377
+ "score": 0.0
1378
+ }
1379
+ ]
1380
+ }
1381
+ ]
1382
+ },
1383
+ {
1384
+ "layer": "25",
1385
+ "layer_scores": [
1386
+ {
1387
+ "intervention": [
1388
+ "output_token"
1389
+ ],
1390
+ "counterfactual_scores": [
1391
+ {
1392
+ "counterfactual": [
1393
+ "randomLetter_counterfactual"
1394
+ ],
1395
+ "score": 0.03333333333333333
1396
+ },
1397
+ {
1398
+ "counterfactual": [
1399
+ "answerPosition_counterfactual"
1400
+ ],
1401
+ "score": 0.0
1402
+ },
1403
+ {
1404
+ "counterfactual": [
1405
+ "answerPosition_randomLetter_counterfactual"
1406
+ ],
1407
+ "score": 0.1
1408
+ }
1409
+ ]
1410
+ },
1411
+ {
1412
+ "intervention": [
1413
+ "output_location"
1414
+ ],
1415
+ "counterfactual_scores": [
1416
+ {
1417
+ "counterfactual": [
1418
+ "randomLetter_counterfactual"
1419
+ ],
1420
+ "score": 1.0
1421
+ },
1422
+ {
1423
+ "counterfactual": [
1424
+ "answerPosition_counterfactual"
1425
+ ],
1426
+ "score": 0.0
1427
+ },
1428
+ {
1429
+ "counterfactual": [
1430
+ "answerPosition_randomLetter_counterfactual"
1431
+ ],
1432
+ "score": 0.0
1433
+ }
1434
+ ]
1435
+ }
1436
+ ]
1437
+ }
1438
+ ]
1439
+ }
1440
+ }
1441
+ ]
1442
+ }
eval-results-mib-causalgraph/submissions/MCQA_results_google_correct_choice_token.json ADDED
@@ -0,0 +1,1442 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "method_name": "full_vector",
3
+ "results": [
4
+ {
5
+ "model_id": "Gemma2ForCausalLM",
6
+ "task_scores": {
7
+ "MCQA": [
8
+ {
9
+ "layer": "0",
10
+ "layer_scores": [
11
+ {
12
+ "intervention": [
13
+ "output_token"
14
+ ],
15
+ "counterfactual_scores": [
16
+ {
17
+ "counterfactual": [
18
+ "randomLetter_counterfactual"
19
+ ],
20
+ "score": 0.03333333333333333
21
+ },
22
+ {
23
+ "counterfactual": [
24
+ "answerPosition_counterfactual"
25
+ ],
26
+ "score": 0.0
27
+ },
28
+ {
29
+ "counterfactual": [
30
+ "answerPosition_randomLetter_counterfactual"
31
+ ],
32
+ "score": 0.1
33
+ }
34
+ ]
35
+ },
36
+ {
37
+ "intervention": [
38
+ "output_location"
39
+ ],
40
+ "counterfactual_scores": [
41
+ {
42
+ "counterfactual": [
43
+ "randomLetter_counterfactual"
44
+ ],
45
+ "score": 1.0
46
+ },
47
+ {
48
+ "counterfactual": [
49
+ "answerPosition_counterfactual"
50
+ ],
51
+ "score": 0.0
52
+ },
53
+ {
54
+ "counterfactual": [
55
+ "answerPosition_randomLetter_counterfactual"
56
+ ],
57
+ "score": 0.0
58
+ }
59
+ ]
60
+ }
61
+ ]
62
+ },
63
+ {
64
+ "layer": "1",
65
+ "layer_scores": [
66
+ {
67
+ "intervention": [
68
+ "output_token"
69
+ ],
70
+ "counterfactual_scores": [
71
+ {
72
+ "counterfactual": [
73
+ "randomLetter_counterfactual"
74
+ ],
75
+ "score": 0.03333333333333333
76
+ },
77
+ {
78
+ "counterfactual": [
79
+ "answerPosition_counterfactual"
80
+ ],
81
+ "score": 0.0
82
+ },
83
+ {
84
+ "counterfactual": [
85
+ "answerPosition_randomLetter_counterfactual"
86
+ ],
87
+ "score": 0.1
88
+ }
89
+ ]
90
+ },
91
+ {
92
+ "intervention": [
93
+ "output_location"
94
+ ],
95
+ "counterfactual_scores": [
96
+ {
97
+ "counterfactual": [
98
+ "randomLetter_counterfactual"
99
+ ],
100
+ "score": 1.0
101
+ },
102
+ {
103
+ "counterfactual": [
104
+ "answerPosition_counterfactual"
105
+ ],
106
+ "score": 0.0
107
+ },
108
+ {
109
+ "counterfactual": [
110
+ "answerPosition_randomLetter_counterfactual"
111
+ ],
112
+ "score": 0.0
113
+ }
114
+ ]
115
+ }
116
+ ]
117
+ },
118
+ {
119
+ "layer": "2",
120
+ "layer_scores": [
121
+ {
122
+ "intervention": [
123
+ "output_token"
124
+ ],
125
+ "counterfactual_scores": [
126
+ {
127
+ "counterfactual": [
128
+ "randomLetter_counterfactual"
129
+ ],
130
+ "score": 0.03333333333333333
131
+ },
132
+ {
133
+ "counterfactual": [
134
+ "answerPosition_counterfactual"
135
+ ],
136
+ "score": 0.0
137
+ },
138
+ {
139
+ "counterfactual": [
140
+ "answerPosition_randomLetter_counterfactual"
141
+ ],
142
+ "score": 0.1
143
+ }
144
+ ]
145
+ },
146
+ {
147
+ "intervention": [
148
+ "output_location"
149
+ ],
150
+ "counterfactual_scores": [
151
+ {
152
+ "counterfactual": [
153
+ "randomLetter_counterfactual"
154
+ ],
155
+ "score": 1.0
156
+ },
157
+ {
158
+ "counterfactual": [
159
+ "answerPosition_counterfactual"
160
+ ],
161
+ "score": 0.0
162
+ },
163
+ {
164
+ "counterfactual": [
165
+ "answerPosition_randomLetter_counterfactual"
166
+ ],
167
+ "score": 0.0
168
+ }
169
+ ]
170
+ }
171
+ ]
172
+ },
173
+ {
174
+ "layer": "3",
175
+ "layer_scores": [
176
+ {
177
+ "intervention": [
178
+ "output_token"
179
+ ],
180
+ "counterfactual_scores": [
181
+ {
182
+ "counterfactual": [
183
+ "randomLetter_counterfactual"
184
+ ],
185
+ "score": 0.03333333333333333
186
+ },
187
+ {
188
+ "counterfactual": [
189
+ "answerPosition_counterfactual"
190
+ ],
191
+ "score": 0.0
192
+ },
193
+ {
194
+ "counterfactual": [
195
+ "answerPosition_randomLetter_counterfactual"
196
+ ],
197
+ "score": 0.1
198
+ }
199
+ ]
200
+ },
201
+ {
202
+ "intervention": [
203
+ "output_location"
204
+ ],
205
+ "counterfactual_scores": [
206
+ {
207
+ "counterfactual": [
208
+ "randomLetter_counterfactual"
209
+ ],
210
+ "score": 1.0
211
+ },
212
+ {
213
+ "counterfactual": [
214
+ "answerPosition_counterfactual"
215
+ ],
216
+ "score": 0.0
217
+ },
218
+ {
219
+ "counterfactual": [
220
+ "answerPosition_randomLetter_counterfactual"
221
+ ],
222
+ "score": 0.0
223
+ }
224
+ ]
225
+ }
226
+ ]
227
+ },
228
+ {
229
+ "layer": "4",
230
+ "layer_scores": [
231
+ {
232
+ "intervention": [
233
+ "output_token"
234
+ ],
235
+ "counterfactual_scores": [
236
+ {
237
+ "counterfactual": [
238
+ "randomLetter_counterfactual"
239
+ ],
240
+ "score": 0.03333333333333333
241
+ },
242
+ {
243
+ "counterfactual": [
244
+ "answerPosition_counterfactual"
245
+ ],
246
+ "score": 0.0
247
+ },
248
+ {
249
+ "counterfactual": [
250
+ "answerPosition_randomLetter_counterfactual"
251
+ ],
252
+ "score": 0.1
253
+ }
254
+ ]
255
+ },
256
+ {
257
+ "intervention": [
258
+ "output_location"
259
+ ],
260
+ "counterfactual_scores": [
261
+ {
262
+ "counterfactual": [
263
+ "randomLetter_counterfactual"
264
+ ],
265
+ "score": 1.0
266
+ },
267
+ {
268
+ "counterfactual": [
269
+ "answerPosition_counterfactual"
270
+ ],
271
+ "score": 0.0
272
+ },
273
+ {
274
+ "counterfactual": [
275
+ "answerPosition_randomLetter_counterfactual"
276
+ ],
277
+ "score": 0.0
278
+ }
279
+ ]
280
+ }
281
+ ]
282
+ },
283
+ {
284
+ "layer": "5",
285
+ "layer_scores": [
286
+ {
287
+ "intervention": [
288
+ "output_token"
289
+ ],
290
+ "counterfactual_scores": [
291
+ {
292
+ "counterfactual": [
293
+ "randomLetter_counterfactual"
294
+ ],
295
+ "score": 0.03333333333333333
296
+ },
297
+ {
298
+ "counterfactual": [
299
+ "answerPosition_counterfactual"
300
+ ],
301
+ "score": 0.0
302
+ },
303
+ {
304
+ "counterfactual": [
305
+ "answerPosition_randomLetter_counterfactual"
306
+ ],
307
+ "score": 0.1
308
+ }
309
+ ]
310
+ },
311
+ {
312
+ "intervention": [
313
+ "output_location"
314
+ ],
315
+ "counterfactual_scores": [
316
+ {
317
+ "counterfactual": [
318
+ "randomLetter_counterfactual"
319
+ ],
320
+ "score": 1.0
321
+ },
322
+ {
323
+ "counterfactual": [
324
+ "answerPosition_counterfactual"
325
+ ],
326
+ "score": 0.0
327
+ },
328
+ {
329
+ "counterfactual": [
330
+ "answerPosition_randomLetter_counterfactual"
331
+ ],
332
+ "score": 0.0
333
+ }
334
+ ]
335
+ }
336
+ ]
337
+ },
338
+ {
339
+ "layer": "6",
340
+ "layer_scores": [
341
+ {
342
+ "intervention": [
343
+ "output_token"
344
+ ],
345
+ "counterfactual_scores": [
346
+ {
347
+ "counterfactual": [
348
+ "randomLetter_counterfactual"
349
+ ],
350
+ "score": 0.03333333333333333
351
+ },
352
+ {
353
+ "counterfactual": [
354
+ "answerPosition_counterfactual"
355
+ ],
356
+ "score": 0.0
357
+ },
358
+ {
359
+ "counterfactual": [
360
+ "answerPosition_randomLetter_counterfactual"
361
+ ],
362
+ "score": 0.1
363
+ }
364
+ ]
365
+ },
366
+ {
367
+ "intervention": [
368
+ "output_location"
369
+ ],
370
+ "counterfactual_scores": [
371
+ {
372
+ "counterfactual": [
373
+ "randomLetter_counterfactual"
374
+ ],
375
+ "score": 1.0
376
+ },
377
+ {
378
+ "counterfactual": [
379
+ "answerPosition_counterfactual"
380
+ ],
381
+ "score": 0.0
382
+ },
383
+ {
384
+ "counterfactual": [
385
+ "answerPosition_randomLetter_counterfactual"
386
+ ],
387
+ "score": 0.0
388
+ }
389
+ ]
390
+ }
391
+ ]
392
+ },
393
+ {
394
+ "layer": "7",
395
+ "layer_scores": [
396
+ {
397
+ "intervention": [
398
+ "output_token"
399
+ ],
400
+ "counterfactual_scores": [
401
+ {
402
+ "counterfactual": [
403
+ "randomLetter_counterfactual"
404
+ ],
405
+ "score": 0.03333333333333333
406
+ },
407
+ {
408
+ "counterfactual": [
409
+ "answerPosition_counterfactual"
410
+ ],
411
+ "score": 0.0
412
+ },
413
+ {
414
+ "counterfactual": [
415
+ "answerPosition_randomLetter_counterfactual"
416
+ ],
417
+ "score": 0.1
418
+ }
419
+ ]
420
+ },
421
+ {
422
+ "intervention": [
423
+ "output_location"
424
+ ],
425
+ "counterfactual_scores": [
426
+ {
427
+ "counterfactual": [
428
+ "randomLetter_counterfactual"
429
+ ],
430
+ "score": 1.0
431
+ },
432
+ {
433
+ "counterfactual": [
434
+ "answerPosition_counterfactual"
435
+ ],
436
+ "score": 0.0
437
+ },
438
+ {
439
+ "counterfactual": [
440
+ "answerPosition_randomLetter_counterfactual"
441
+ ],
442
+ "score": 0.0
443
+ }
444
+ ]
445
+ }
446
+ ]
447
+ },
448
+ {
449
+ "layer": "8",
450
+ "layer_scores": [
451
+ {
452
+ "intervention": [
453
+ "output_token"
454
+ ],
455
+ "counterfactual_scores": [
456
+ {
457
+ "counterfactual": [
458
+ "randomLetter_counterfactual"
459
+ ],
460
+ "score": 0.03333333333333333
461
+ },
462
+ {
463
+ "counterfactual": [
464
+ "answerPosition_counterfactual"
465
+ ],
466
+ "score": 0.0
467
+ },
468
+ {
469
+ "counterfactual": [
470
+ "answerPosition_randomLetter_counterfactual"
471
+ ],
472
+ "score": 0.1
473
+ }
474
+ ]
475
+ },
476
+ {
477
+ "intervention": [
478
+ "output_location"
479
+ ],
480
+ "counterfactual_scores": [
481
+ {
482
+ "counterfactual": [
483
+ "randomLetter_counterfactual"
484
+ ],
485
+ "score": 1.0
486
+ },
487
+ {
488
+ "counterfactual": [
489
+ "answerPosition_counterfactual"
490
+ ],
491
+ "score": 0.0
492
+ },
493
+ {
494
+ "counterfactual": [
495
+ "answerPosition_randomLetter_counterfactual"
496
+ ],
497
+ "score": 0.0
498
+ }
499
+ ]
500
+ }
501
+ ]
502
+ },
503
+ {
504
+ "layer": "9",
505
+ "layer_scores": [
506
+ {
507
+ "intervention": [
508
+ "output_token"
509
+ ],
510
+ "counterfactual_scores": [
511
+ {
512
+ "counterfactual": [
513
+ "randomLetter_counterfactual"
514
+ ],
515
+ "score": 0.03333333333333333
516
+ },
517
+ {
518
+ "counterfactual": [
519
+ "answerPosition_counterfactual"
520
+ ],
521
+ "score": 0.0
522
+ },
523
+ {
524
+ "counterfactual": [
525
+ "answerPosition_randomLetter_counterfactual"
526
+ ],
527
+ "score": 0.1
528
+ }
529
+ ]
530
+ },
531
+ {
532
+ "intervention": [
533
+ "output_location"
534
+ ],
535
+ "counterfactual_scores": [
536
+ {
537
+ "counterfactual": [
538
+ "randomLetter_counterfactual"
539
+ ],
540
+ "score": 1.0
541
+ },
542
+ {
543
+ "counterfactual": [
544
+ "answerPosition_counterfactual"
545
+ ],
546
+ "score": 0.0
547
+ },
548
+ {
549
+ "counterfactual": [
550
+ "answerPosition_randomLetter_counterfactual"
551
+ ],
552
+ "score": 0.0
553
+ }
554
+ ]
555
+ }
556
+ ]
557
+ },
558
+ {
559
+ "layer": "10",
560
+ "layer_scores": [
561
+ {
562
+ "intervention": [
563
+ "output_token"
564
+ ],
565
+ "counterfactual_scores": [
566
+ {
567
+ "counterfactual": [
568
+ "randomLetter_counterfactual"
569
+ ],
570
+ "score": 0.03333333333333333
571
+ },
572
+ {
573
+ "counterfactual": [
574
+ "answerPosition_counterfactual"
575
+ ],
576
+ "score": 0.0
577
+ },
578
+ {
579
+ "counterfactual": [
580
+ "answerPosition_randomLetter_counterfactual"
581
+ ],
582
+ "score": 0.1
583
+ }
584
+ ]
585
+ },
586
+ {
587
+ "intervention": [
588
+ "output_location"
589
+ ],
590
+ "counterfactual_scores": [
591
+ {
592
+ "counterfactual": [
593
+ "randomLetter_counterfactual"
594
+ ],
595
+ "score": 1.0
596
+ },
597
+ {
598
+ "counterfactual": [
599
+ "answerPosition_counterfactual"
600
+ ],
601
+ "score": 0.0
602
+ },
603
+ {
604
+ "counterfactual": [
605
+ "answerPosition_randomLetter_counterfactual"
606
+ ],
607
+ "score": 0.0
608
+ }
609
+ ]
610
+ }
611
+ ]
612
+ },
613
+ {
614
+ "layer": "11",
615
+ "layer_scores": [
616
+ {
617
+ "intervention": [
618
+ "output_token"
619
+ ],
620
+ "counterfactual_scores": [
621
+ {
622
+ "counterfactual": [
623
+ "randomLetter_counterfactual"
624
+ ],
625
+ "score": 0.03333333333333333
626
+ },
627
+ {
628
+ "counterfactual": [
629
+ "answerPosition_counterfactual"
630
+ ],
631
+ "score": 0.0
632
+ },
633
+ {
634
+ "counterfactual": [
635
+ "answerPosition_randomLetter_counterfactual"
636
+ ],
637
+ "score": 0.1
638
+ }
639
+ ]
640
+ },
641
+ {
642
+ "intervention": [
643
+ "output_location"
644
+ ],
645
+ "counterfactual_scores": [
646
+ {
647
+ "counterfactual": [
648
+ "randomLetter_counterfactual"
649
+ ],
650
+ "score": 1.0
651
+ },
652
+ {
653
+ "counterfactual": [
654
+ "answerPosition_counterfactual"
655
+ ],
656
+ "score": 0.0
657
+ },
658
+ {
659
+ "counterfactual": [
660
+ "answerPosition_randomLetter_counterfactual"
661
+ ],
662
+ "score": 0.0
663
+ }
664
+ ]
665
+ }
666
+ ]
667
+ },
668
+ {
669
+ "layer": "12",
670
+ "layer_scores": [
671
+ {
672
+ "intervention": [
673
+ "output_token"
674
+ ],
675
+ "counterfactual_scores": [
676
+ {
677
+ "counterfactual": [
678
+ "randomLetter_counterfactual"
679
+ ],
680
+ "score": 0.03333333333333333
681
+ },
682
+ {
683
+ "counterfactual": [
684
+ "answerPosition_counterfactual"
685
+ ],
686
+ "score": 0.0
687
+ },
688
+ {
689
+ "counterfactual": [
690
+ "answerPosition_randomLetter_counterfactual"
691
+ ],
692
+ "score": 0.1
693
+ }
694
+ ]
695
+ },
696
+ {
697
+ "intervention": [
698
+ "output_location"
699
+ ],
700
+ "counterfactual_scores": [
701
+ {
702
+ "counterfactual": [
703
+ "randomLetter_counterfactual"
704
+ ],
705
+ "score": 1.0
706
+ },
707
+ {
708
+ "counterfactual": [
709
+ "answerPosition_counterfactual"
710
+ ],
711
+ "score": 0.0
712
+ },
713
+ {
714
+ "counterfactual": [
715
+ "answerPosition_randomLetter_counterfactual"
716
+ ],
717
+ "score": 0.0
718
+ }
719
+ ]
720
+ }
721
+ ]
722
+ },
723
+ {
724
+ "layer": "13",
725
+ "layer_scores": [
726
+ {
727
+ "intervention": [
728
+ "output_token"
729
+ ],
730
+ "counterfactual_scores": [
731
+ {
732
+ "counterfactual": [
733
+ "randomLetter_counterfactual"
734
+ ],
735
+ "score": 0.03333333333333333
736
+ },
737
+ {
738
+ "counterfactual": [
739
+ "answerPosition_counterfactual"
740
+ ],
741
+ "score": 0.0
742
+ },
743
+ {
744
+ "counterfactual": [
745
+ "answerPosition_randomLetter_counterfactual"
746
+ ],
747
+ "score": 0.1
748
+ }
749
+ ]
750
+ },
751
+ {
752
+ "intervention": [
753
+ "output_location"
754
+ ],
755
+ "counterfactual_scores": [
756
+ {
757
+ "counterfactual": [
758
+ "randomLetter_counterfactual"
759
+ ],
760
+ "score": 1.0
761
+ },
762
+ {
763
+ "counterfactual": [
764
+ "answerPosition_counterfactual"
765
+ ],
766
+ "score": 0.0
767
+ },
768
+ {
769
+ "counterfactual": [
770
+ "answerPosition_randomLetter_counterfactual"
771
+ ],
772
+ "score": 0.0
773
+ }
774
+ ]
775
+ }
776
+ ]
777
+ },
778
+ {
779
+ "layer": "14",
780
+ "layer_scores": [
781
+ {
782
+ "intervention": [
783
+ "output_token"
784
+ ],
785
+ "counterfactual_scores": [
786
+ {
787
+ "counterfactual": [
788
+ "randomLetter_counterfactual"
789
+ ],
790
+ "score": 0.03333333333333333
791
+ },
792
+ {
793
+ "counterfactual": [
794
+ "answerPosition_counterfactual"
795
+ ],
796
+ "score": 0.0
797
+ },
798
+ {
799
+ "counterfactual": [
800
+ "answerPosition_randomLetter_counterfactual"
801
+ ],
802
+ "score": 0.1
803
+ }
804
+ ]
805
+ },
806
+ {
807
+ "intervention": [
808
+ "output_location"
809
+ ],
810
+ "counterfactual_scores": [
811
+ {
812
+ "counterfactual": [
813
+ "randomLetter_counterfactual"
814
+ ],
815
+ "score": 1.0
816
+ },
817
+ {
818
+ "counterfactual": [
819
+ "answerPosition_counterfactual"
820
+ ],
821
+ "score": 0.0
822
+ },
823
+ {
824
+ "counterfactual": [
825
+ "answerPosition_randomLetter_counterfactual"
826
+ ],
827
+ "score": 0.0
828
+ }
829
+ ]
830
+ }
831
+ ]
832
+ },
833
+ {
834
+ "layer": "15",
835
+ "layer_scores": [
836
+ {
837
+ "intervention": [
838
+ "output_token"
839
+ ],
840
+ "counterfactual_scores": [
841
+ {
842
+ "counterfactual": [
843
+ "randomLetter_counterfactual"
844
+ ],
845
+ "score": 0.03333333333333333
846
+ },
847
+ {
848
+ "counterfactual": [
849
+ "answerPosition_counterfactual"
850
+ ],
851
+ "score": 0.0
852
+ },
853
+ {
854
+ "counterfactual": [
855
+ "answerPosition_randomLetter_counterfactual"
856
+ ],
857
+ "score": 0.1
858
+ }
859
+ ]
860
+ },
861
+ {
862
+ "intervention": [
863
+ "output_location"
864
+ ],
865
+ "counterfactual_scores": [
866
+ {
867
+ "counterfactual": [
868
+ "randomLetter_counterfactual"
869
+ ],
870
+ "score": 1.0
871
+ },
872
+ {
873
+ "counterfactual": [
874
+ "answerPosition_counterfactual"
875
+ ],
876
+ "score": 0.0
877
+ },
878
+ {
879
+ "counterfactual": [
880
+ "answerPosition_randomLetter_counterfactual"
881
+ ],
882
+ "score": 0.0
883
+ }
884
+ ]
885
+ }
886
+ ]
887
+ },
888
+ {
889
+ "layer": "16",
890
+ "layer_scores": [
891
+ {
892
+ "intervention": [
893
+ "output_token"
894
+ ],
895
+ "counterfactual_scores": [
896
+ {
897
+ "counterfactual": [
898
+ "randomLetter_counterfactual"
899
+ ],
900
+ "score": 0.03333333333333333
901
+ },
902
+ {
903
+ "counterfactual": [
904
+ "answerPosition_counterfactual"
905
+ ],
906
+ "score": 0.0
907
+ },
908
+ {
909
+ "counterfactual": [
910
+ "answerPosition_randomLetter_counterfactual"
911
+ ],
912
+ "score": 0.1
913
+ }
914
+ ]
915
+ },
916
+ {
917
+ "intervention": [
918
+ "output_location"
919
+ ],
920
+ "counterfactual_scores": [
921
+ {
922
+ "counterfactual": [
923
+ "randomLetter_counterfactual"
924
+ ],
925
+ "score": 1.0
926
+ },
927
+ {
928
+ "counterfactual": [
929
+ "answerPosition_counterfactual"
930
+ ],
931
+ "score": 0.0
932
+ },
933
+ {
934
+ "counterfactual": [
935
+ "answerPosition_randomLetter_counterfactual"
936
+ ],
937
+ "score": 0.0
938
+ }
939
+ ]
940
+ }
941
+ ]
942
+ },
943
+ {
944
+ "layer": "17",
945
+ "layer_scores": [
946
+ {
947
+ "intervention": [
948
+ "output_token"
949
+ ],
950
+ "counterfactual_scores": [
951
+ {
952
+ "counterfactual": [
953
+ "randomLetter_counterfactual"
954
+ ],
955
+ "score": 0.03333333333333333
956
+ },
957
+ {
958
+ "counterfactual": [
959
+ "answerPosition_counterfactual"
960
+ ],
961
+ "score": 0.0
962
+ },
963
+ {
964
+ "counterfactual": [
965
+ "answerPosition_randomLetter_counterfactual"
966
+ ],
967
+ "score": 0.1
968
+ }
969
+ ]
970
+ },
971
+ {
972
+ "intervention": [
973
+ "output_location"
974
+ ],
975
+ "counterfactual_scores": [
976
+ {
977
+ "counterfactual": [
978
+ "randomLetter_counterfactual"
979
+ ],
980
+ "score": 1.0
981
+ },
982
+ {
983
+ "counterfactual": [
984
+ "answerPosition_counterfactual"
985
+ ],
986
+ "score": 0.0
987
+ },
988
+ {
989
+ "counterfactual": [
990
+ "answerPosition_randomLetter_counterfactual"
991
+ ],
992
+ "score": 0.0
993
+ }
994
+ ]
995
+ }
996
+ ]
997
+ },
998
+ {
999
+ "layer": "18",
1000
+ "layer_scores": [
1001
+ {
1002
+ "intervention": [
1003
+ "output_token"
1004
+ ],
1005
+ "counterfactual_scores": [
1006
+ {
1007
+ "counterfactual": [
1008
+ "randomLetter_counterfactual"
1009
+ ],
1010
+ "score": 0.03333333333333333
1011
+ },
1012
+ {
1013
+ "counterfactual": [
1014
+ "answerPosition_counterfactual"
1015
+ ],
1016
+ "score": 0.0
1017
+ },
1018
+ {
1019
+ "counterfactual": [
1020
+ "answerPosition_randomLetter_counterfactual"
1021
+ ],
1022
+ "score": 0.1
1023
+ }
1024
+ ]
1025
+ },
1026
+ {
1027
+ "intervention": [
1028
+ "output_location"
1029
+ ],
1030
+ "counterfactual_scores": [
1031
+ {
1032
+ "counterfactual": [
1033
+ "randomLetter_counterfactual"
1034
+ ],
1035
+ "score": 1.0
1036
+ },
1037
+ {
1038
+ "counterfactual": [
1039
+ "answerPosition_counterfactual"
1040
+ ],
1041
+ "score": 0.0
1042
+ },
1043
+ {
1044
+ "counterfactual": [
1045
+ "answerPosition_randomLetter_counterfactual"
1046
+ ],
1047
+ "score": 0.0
1048
+ }
1049
+ ]
1050
+ }
1051
+ ]
1052
+ },
1053
+ {
1054
+ "layer": "19",
1055
+ "layer_scores": [
1056
+ {
1057
+ "intervention": [
1058
+ "output_token"
1059
+ ],
1060
+ "counterfactual_scores": [
1061
+ {
1062
+ "counterfactual": [
1063
+ "randomLetter_counterfactual"
1064
+ ],
1065
+ "score": 0.03333333333333333
1066
+ },
1067
+ {
1068
+ "counterfactual": [
1069
+ "answerPosition_counterfactual"
1070
+ ],
1071
+ "score": 0.0
1072
+ },
1073
+ {
1074
+ "counterfactual": [
1075
+ "answerPosition_randomLetter_counterfactual"
1076
+ ],
1077
+ "score": 0.1
1078
+ }
1079
+ ]
1080
+ },
1081
+ {
1082
+ "intervention": [
1083
+ "output_location"
1084
+ ],
1085
+ "counterfactual_scores": [
1086
+ {
1087
+ "counterfactual": [
1088
+ "randomLetter_counterfactual"
1089
+ ],
1090
+ "score": 1.0
1091
+ },
1092
+ {
1093
+ "counterfactual": [
1094
+ "answerPosition_counterfactual"
1095
+ ],
1096
+ "score": 0.0
1097
+ },
1098
+ {
1099
+ "counterfactual": [
1100
+ "answerPosition_randomLetter_counterfactual"
1101
+ ],
1102
+ "score": 0.0
1103
+ }
1104
+ ]
1105
+ }
1106
+ ]
1107
+ },
1108
+ {
1109
+ "layer": "20",
1110
+ "layer_scores": [
1111
+ {
1112
+ "intervention": [
1113
+ "output_token"
1114
+ ],
1115
+ "counterfactual_scores": [
1116
+ {
1117
+ "counterfactual": [
1118
+ "randomLetter_counterfactual"
1119
+ ],
1120
+ "score": 0.03333333333333333
1121
+ },
1122
+ {
1123
+ "counterfactual": [
1124
+ "answerPosition_counterfactual"
1125
+ ],
1126
+ "score": 0.0
1127
+ },
1128
+ {
1129
+ "counterfactual": [
1130
+ "answerPosition_randomLetter_counterfactual"
1131
+ ],
1132
+ "score": 0.1
1133
+ }
1134
+ ]
1135
+ },
1136
+ {
1137
+ "intervention": [
1138
+ "output_location"
1139
+ ],
1140
+ "counterfactual_scores": [
1141
+ {
1142
+ "counterfactual": [
1143
+ "randomLetter_counterfactual"
1144
+ ],
1145
+ "score": 1.0
1146
+ },
1147
+ {
1148
+ "counterfactual": [
1149
+ "answerPosition_counterfactual"
1150
+ ],
1151
+ "score": 0.0
1152
+ },
1153
+ {
1154
+ "counterfactual": [
1155
+ "answerPosition_randomLetter_counterfactual"
1156
+ ],
1157
+ "score": 0.0
1158
+ }
1159
+ ]
1160
+ }
1161
+ ]
1162
+ },
1163
+ {
1164
+ "layer": "21",
1165
+ "layer_scores": [
1166
+ {
1167
+ "intervention": [
1168
+ "output_token"
1169
+ ],
1170
+ "counterfactual_scores": [
1171
+ {
1172
+ "counterfactual": [
1173
+ "randomLetter_counterfactual"
1174
+ ],
1175
+ "score": 0.03333333333333333
1176
+ },
1177
+ {
1178
+ "counterfactual": [
1179
+ "answerPosition_counterfactual"
1180
+ ],
1181
+ "score": 0.0
1182
+ },
1183
+ {
1184
+ "counterfactual": [
1185
+ "answerPosition_randomLetter_counterfactual"
1186
+ ],
1187
+ "score": 0.1
1188
+ }
1189
+ ]
1190
+ },
1191
+ {
1192
+ "intervention": [
1193
+ "output_location"
1194
+ ],
1195
+ "counterfactual_scores": [
1196
+ {
1197
+ "counterfactual": [
1198
+ "randomLetter_counterfactual"
1199
+ ],
1200
+ "score": 1.0
1201
+ },
1202
+ {
1203
+ "counterfactual": [
1204
+ "answerPosition_counterfactual"
1205
+ ],
1206
+ "score": 0.0
1207
+ },
1208
+ {
1209
+ "counterfactual": [
1210
+ "answerPosition_randomLetter_counterfactual"
1211
+ ],
1212
+ "score": 0.0
1213
+ }
1214
+ ]
1215
+ }
1216
+ ]
1217
+ },
1218
+ {
1219
+ "layer": "22",
1220
+ "layer_scores": [
1221
+ {
1222
+ "intervention": [
1223
+ "output_token"
1224
+ ],
1225
+ "counterfactual_scores": [
1226
+ {
1227
+ "counterfactual": [
1228
+ "randomLetter_counterfactual"
1229
+ ],
1230
+ "score": 0.03333333333333333
1231
+ },
1232
+ {
1233
+ "counterfactual": [
1234
+ "answerPosition_counterfactual"
1235
+ ],
1236
+ "score": 0.0
1237
+ },
1238
+ {
1239
+ "counterfactual": [
1240
+ "answerPosition_randomLetter_counterfactual"
1241
+ ],
1242
+ "score": 0.1
1243
+ }
1244
+ ]
1245
+ },
1246
+ {
1247
+ "intervention": [
1248
+ "output_location"
1249
+ ],
1250
+ "counterfactual_scores": [
1251
+ {
1252
+ "counterfactual": [
1253
+ "randomLetter_counterfactual"
1254
+ ],
1255
+ "score": 1.0
1256
+ },
1257
+ {
1258
+ "counterfactual": [
1259
+ "answerPosition_counterfactual"
1260
+ ],
1261
+ "score": 0.0
1262
+ },
1263
+ {
1264
+ "counterfactual": [
1265
+ "answerPosition_randomLetter_counterfactual"
1266
+ ],
1267
+ "score": 0.0
1268
+ }
1269
+ ]
1270
+ }
1271
+ ]
1272
+ },
1273
+ {
1274
+ "layer": "23",
1275
+ "layer_scores": [
1276
+ {
1277
+ "intervention": [
1278
+ "output_token"
1279
+ ],
1280
+ "counterfactual_scores": [
1281
+ {
1282
+ "counterfactual": [
1283
+ "randomLetter_counterfactual"
1284
+ ],
1285
+ "score": 0.03333333333333333
1286
+ },
1287
+ {
1288
+ "counterfactual": [
1289
+ "answerPosition_counterfactual"
1290
+ ],
1291
+ "score": 0.0
1292
+ },
1293
+ {
1294
+ "counterfactual": [
1295
+ "answerPosition_randomLetter_counterfactual"
1296
+ ],
1297
+ "score": 0.1
1298
+ }
1299
+ ]
1300
+ },
1301
+ {
1302
+ "intervention": [
1303
+ "output_location"
1304
+ ],
1305
+ "counterfactual_scores": [
1306
+ {
1307
+ "counterfactual": [
1308
+ "randomLetter_counterfactual"
1309
+ ],
1310
+ "score": 1.0
1311
+ },
1312
+ {
1313
+ "counterfactual": [
1314
+ "answerPosition_counterfactual"
1315
+ ],
1316
+ "score": 0.0
1317
+ },
1318
+ {
1319
+ "counterfactual": [
1320
+ "answerPosition_randomLetter_counterfactual"
1321
+ ],
1322
+ "score": 0.0
1323
+ }
1324
+ ]
1325
+ }
1326
+ ]
1327
+ },
1328
+ {
1329
+ "layer": "24",
1330
+ "layer_scores": [
1331
+ {
1332
+ "intervention": [
1333
+ "output_token"
1334
+ ],
1335
+ "counterfactual_scores": [
1336
+ {
1337
+ "counterfactual": [
1338
+ "randomLetter_counterfactual"
1339
+ ],
1340
+ "score": 0.03333333333333333
1341
+ },
1342
+ {
1343
+ "counterfactual": [
1344
+ "answerPosition_counterfactual"
1345
+ ],
1346
+ "score": 0.0
1347
+ },
1348
+ {
1349
+ "counterfactual": [
1350
+ "answerPosition_randomLetter_counterfactual"
1351
+ ],
1352
+ "score": 0.1
1353
+ }
1354
+ ]
1355
+ },
1356
+ {
1357
+ "intervention": [
1358
+ "output_location"
1359
+ ],
1360
+ "counterfactual_scores": [
1361
+ {
1362
+ "counterfactual": [
1363
+ "randomLetter_counterfactual"
1364
+ ],
1365
+ "score": 1.0
1366
+ },
1367
+ {
1368
+ "counterfactual": [
1369
+ "answerPosition_counterfactual"
1370
+ ],
1371
+ "score": 0.0
1372
+ },
1373
+ {
1374
+ "counterfactual": [
1375
+ "answerPosition_randomLetter_counterfactual"
1376
+ ],
1377
+ "score": 0.0
1378
+ }
1379
+ ]
1380
+ }
1381
+ ]
1382
+ },
1383
+ {
1384
+ "layer": "25",
1385
+ "layer_scores": [
1386
+ {
1387
+ "intervention": [
1388
+ "output_token"
1389
+ ],
1390
+ "counterfactual_scores": [
1391
+ {
1392
+ "counterfactual": [
1393
+ "randomLetter_counterfactual"
1394
+ ],
1395
+ "score": 0.03333333333333333
1396
+ },
1397
+ {
1398
+ "counterfactual": [
1399
+ "answerPosition_counterfactual"
1400
+ ],
1401
+ "score": 0.0
1402
+ },
1403
+ {
1404
+ "counterfactual": [
1405
+ "answerPosition_randomLetter_counterfactual"
1406
+ ],
1407
+ "score": 0.1
1408
+ }
1409
+ ]
1410
+ },
1411
+ {
1412
+ "intervention": [
1413
+ "output_location"
1414
+ ],
1415
+ "counterfactual_scores": [
1416
+ {
1417
+ "counterfactual": [
1418
+ "randomLetter_counterfactual"
1419
+ ],
1420
+ "score": 1.0
1421
+ },
1422
+ {
1423
+ "counterfactual": [
1424
+ "answerPosition_counterfactual"
1425
+ ],
1426
+ "score": 0.0
1427
+ },
1428
+ {
1429
+ "counterfactual": [
1430
+ "answerPosition_randomLetter_counterfactual"
1431
+ ],
1432
+ "score": 0.0
1433
+ }
1434
+ ]
1435
+ }
1436
+ ]
1437
+ }
1438
+ ]
1439
+ }
1440
+ }
1441
+ ]
1442
+ }
eval-results-mib-causalgraph/submissions/MCQA_results_google_last_token.json ADDED
@@ -0,0 +1,1442 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "method_name": "full_vector",
3
+ "results": [
4
+ {
5
+ "model_id": "Gemma2ForCausalLM",
6
+ "task_scores": {
7
+ "MCQA": [
8
+ {
9
+ "layer": "0",
10
+ "layer_scores": [
11
+ {
12
+ "intervention": [
13
+ "output_token"
14
+ ],
15
+ "counterfactual_scores": [
16
+ {
17
+ "counterfactual": [
18
+ "randomLetter_counterfactual"
19
+ ],
20
+ "score": 0.03333333333333333
21
+ },
22
+ {
23
+ "counterfactual": [
24
+ "answerPosition_counterfactual"
25
+ ],
26
+ "score": 0.0
27
+ },
28
+ {
29
+ "counterfactual": [
30
+ "answerPosition_randomLetter_counterfactual"
31
+ ],
32
+ "score": 0.1
33
+ }
34
+ ]
35
+ },
36
+ {
37
+ "intervention": [
38
+ "output_location"
39
+ ],
40
+ "counterfactual_scores": [
41
+ {
42
+ "counterfactual": [
43
+ "randomLetter_counterfactual"
44
+ ],
45
+ "score": 1.0
46
+ },
47
+ {
48
+ "counterfactual": [
49
+ "answerPosition_counterfactual"
50
+ ],
51
+ "score": 0.0
52
+ },
53
+ {
54
+ "counterfactual": [
55
+ "answerPosition_randomLetter_counterfactual"
56
+ ],
57
+ "score": 0.0
58
+ }
59
+ ]
60
+ }
61
+ ]
62
+ },
63
+ {
64
+ "layer": "1",
65
+ "layer_scores": [
66
+ {
67
+ "intervention": [
68
+ "output_token"
69
+ ],
70
+ "counterfactual_scores": [
71
+ {
72
+ "counterfactual": [
73
+ "randomLetter_counterfactual"
74
+ ],
75
+ "score": 0.03333333333333333
76
+ },
77
+ {
78
+ "counterfactual": [
79
+ "answerPosition_counterfactual"
80
+ ],
81
+ "score": 0.0
82
+ },
83
+ {
84
+ "counterfactual": [
85
+ "answerPosition_randomLetter_counterfactual"
86
+ ],
87
+ "score": 0.1
88
+ }
89
+ ]
90
+ },
91
+ {
92
+ "intervention": [
93
+ "output_location"
94
+ ],
95
+ "counterfactual_scores": [
96
+ {
97
+ "counterfactual": [
98
+ "randomLetter_counterfactual"
99
+ ],
100
+ "score": 1.0
101
+ },
102
+ {
103
+ "counterfactual": [
104
+ "answerPosition_counterfactual"
105
+ ],
106
+ "score": 0.0
107
+ },
108
+ {
109
+ "counterfactual": [
110
+ "answerPosition_randomLetter_counterfactual"
111
+ ],
112
+ "score": 0.0
113
+ }
114
+ ]
115
+ }
116
+ ]
117
+ },
118
+ {
119
+ "layer": "2",
120
+ "layer_scores": [
121
+ {
122
+ "intervention": [
123
+ "output_token"
124
+ ],
125
+ "counterfactual_scores": [
126
+ {
127
+ "counterfactual": [
128
+ "randomLetter_counterfactual"
129
+ ],
130
+ "score": 0.03333333333333333
131
+ },
132
+ {
133
+ "counterfactual": [
134
+ "answerPosition_counterfactual"
135
+ ],
136
+ "score": 0.0
137
+ },
138
+ {
139
+ "counterfactual": [
140
+ "answerPosition_randomLetter_counterfactual"
141
+ ],
142
+ "score": 0.1
143
+ }
144
+ ]
145
+ },
146
+ {
147
+ "intervention": [
148
+ "output_location"
149
+ ],
150
+ "counterfactual_scores": [
151
+ {
152
+ "counterfactual": [
153
+ "randomLetter_counterfactual"
154
+ ],
155
+ "score": 1.0
156
+ },
157
+ {
158
+ "counterfactual": [
159
+ "answerPosition_counterfactual"
160
+ ],
161
+ "score": 0.0
162
+ },
163
+ {
164
+ "counterfactual": [
165
+ "answerPosition_randomLetter_counterfactual"
166
+ ],
167
+ "score": 0.0
168
+ }
169
+ ]
170
+ }
171
+ ]
172
+ },
173
+ {
174
+ "layer": "3",
175
+ "layer_scores": [
176
+ {
177
+ "intervention": [
178
+ "output_token"
179
+ ],
180
+ "counterfactual_scores": [
181
+ {
182
+ "counterfactual": [
183
+ "randomLetter_counterfactual"
184
+ ],
185
+ "score": 0.03333333333333333
186
+ },
187
+ {
188
+ "counterfactual": [
189
+ "answerPosition_counterfactual"
190
+ ],
191
+ "score": 0.0
192
+ },
193
+ {
194
+ "counterfactual": [
195
+ "answerPosition_randomLetter_counterfactual"
196
+ ],
197
+ "score": 0.1
198
+ }
199
+ ]
200
+ },
201
+ {
202
+ "intervention": [
203
+ "output_location"
204
+ ],
205
+ "counterfactual_scores": [
206
+ {
207
+ "counterfactual": [
208
+ "randomLetter_counterfactual"
209
+ ],
210
+ "score": 1.0
211
+ },
212
+ {
213
+ "counterfactual": [
214
+ "answerPosition_counterfactual"
215
+ ],
216
+ "score": 0.0
217
+ },
218
+ {
219
+ "counterfactual": [
220
+ "answerPosition_randomLetter_counterfactual"
221
+ ],
222
+ "score": 0.0
223
+ }
224
+ ]
225
+ }
226
+ ]
227
+ },
228
+ {
229
+ "layer": "4",
230
+ "layer_scores": [
231
+ {
232
+ "intervention": [
233
+ "output_token"
234
+ ],
235
+ "counterfactual_scores": [
236
+ {
237
+ "counterfactual": [
238
+ "randomLetter_counterfactual"
239
+ ],
240
+ "score": 0.03333333333333333
241
+ },
242
+ {
243
+ "counterfactual": [
244
+ "answerPosition_counterfactual"
245
+ ],
246
+ "score": 0.0
247
+ },
248
+ {
249
+ "counterfactual": [
250
+ "answerPosition_randomLetter_counterfactual"
251
+ ],
252
+ "score": 0.1
253
+ }
254
+ ]
255
+ },
256
+ {
257
+ "intervention": [
258
+ "output_location"
259
+ ],
260
+ "counterfactual_scores": [
261
+ {
262
+ "counterfactual": [
263
+ "randomLetter_counterfactual"
264
+ ],
265
+ "score": 1.0
266
+ },
267
+ {
268
+ "counterfactual": [
269
+ "answerPosition_counterfactual"
270
+ ],
271
+ "score": 0.0
272
+ },
273
+ {
274
+ "counterfactual": [
275
+ "answerPosition_randomLetter_counterfactual"
276
+ ],
277
+ "score": 0.0
278
+ }
279
+ ]
280
+ }
281
+ ]
282
+ },
283
+ {
284
+ "layer": "5",
285
+ "layer_scores": [
286
+ {
287
+ "intervention": [
288
+ "output_token"
289
+ ],
290
+ "counterfactual_scores": [
291
+ {
292
+ "counterfactual": [
293
+ "randomLetter_counterfactual"
294
+ ],
295
+ "score": 0.03333333333333333
296
+ },
297
+ {
298
+ "counterfactual": [
299
+ "answerPosition_counterfactual"
300
+ ],
301
+ "score": 0.0
302
+ },
303
+ {
304
+ "counterfactual": [
305
+ "answerPosition_randomLetter_counterfactual"
306
+ ],
307
+ "score": 0.1
308
+ }
309
+ ]
310
+ },
311
+ {
312
+ "intervention": [
313
+ "output_location"
314
+ ],
315
+ "counterfactual_scores": [
316
+ {
317
+ "counterfactual": [
318
+ "randomLetter_counterfactual"
319
+ ],
320
+ "score": 1.0
321
+ },
322
+ {
323
+ "counterfactual": [
324
+ "answerPosition_counterfactual"
325
+ ],
326
+ "score": 0.0
327
+ },
328
+ {
329
+ "counterfactual": [
330
+ "answerPosition_randomLetter_counterfactual"
331
+ ],
332
+ "score": 0.0
333
+ }
334
+ ]
335
+ }
336
+ ]
337
+ },
338
+ {
339
+ "layer": "6",
340
+ "layer_scores": [
341
+ {
342
+ "intervention": [
343
+ "output_token"
344
+ ],
345
+ "counterfactual_scores": [
346
+ {
347
+ "counterfactual": [
348
+ "randomLetter_counterfactual"
349
+ ],
350
+ "score": 0.03333333333333333
351
+ },
352
+ {
353
+ "counterfactual": [
354
+ "answerPosition_counterfactual"
355
+ ],
356
+ "score": 0.0
357
+ },
358
+ {
359
+ "counterfactual": [
360
+ "answerPosition_randomLetter_counterfactual"
361
+ ],
362
+ "score": 0.1
363
+ }
364
+ ]
365
+ },
366
+ {
367
+ "intervention": [
368
+ "output_location"
369
+ ],
370
+ "counterfactual_scores": [
371
+ {
372
+ "counterfactual": [
373
+ "randomLetter_counterfactual"
374
+ ],
375
+ "score": 1.0
376
+ },
377
+ {
378
+ "counterfactual": [
379
+ "answerPosition_counterfactual"
380
+ ],
381
+ "score": 0.0
382
+ },
383
+ {
384
+ "counterfactual": [
385
+ "answerPosition_randomLetter_counterfactual"
386
+ ],
387
+ "score": 0.0
388
+ }
389
+ ]
390
+ }
391
+ ]
392
+ },
393
+ {
394
+ "layer": "7",
395
+ "layer_scores": [
396
+ {
397
+ "intervention": [
398
+ "output_token"
399
+ ],
400
+ "counterfactual_scores": [
401
+ {
402
+ "counterfactual": [
403
+ "randomLetter_counterfactual"
404
+ ],
405
+ "score": 0.03333333333333333
406
+ },
407
+ {
408
+ "counterfactual": [
409
+ "answerPosition_counterfactual"
410
+ ],
411
+ "score": 0.0
412
+ },
413
+ {
414
+ "counterfactual": [
415
+ "answerPosition_randomLetter_counterfactual"
416
+ ],
417
+ "score": 0.1
418
+ }
419
+ ]
420
+ },
421
+ {
422
+ "intervention": [
423
+ "output_location"
424
+ ],
425
+ "counterfactual_scores": [
426
+ {
427
+ "counterfactual": [
428
+ "randomLetter_counterfactual"
429
+ ],
430
+ "score": 1.0
431
+ },
432
+ {
433
+ "counterfactual": [
434
+ "answerPosition_counterfactual"
435
+ ],
436
+ "score": 0.0
437
+ },
438
+ {
439
+ "counterfactual": [
440
+ "answerPosition_randomLetter_counterfactual"
441
+ ],
442
+ "score": 0.0
443
+ }
444
+ ]
445
+ }
446
+ ]
447
+ },
448
+ {
449
+ "layer": "8",
450
+ "layer_scores": [
451
+ {
452
+ "intervention": [
453
+ "output_token"
454
+ ],
455
+ "counterfactual_scores": [
456
+ {
457
+ "counterfactual": [
458
+ "randomLetter_counterfactual"
459
+ ],
460
+ "score": 0.03333333333333333
461
+ },
462
+ {
463
+ "counterfactual": [
464
+ "answerPosition_counterfactual"
465
+ ],
466
+ "score": 0.0
467
+ },
468
+ {
469
+ "counterfactual": [
470
+ "answerPosition_randomLetter_counterfactual"
471
+ ],
472
+ "score": 0.1
473
+ }
474
+ ]
475
+ },
476
+ {
477
+ "intervention": [
478
+ "output_location"
479
+ ],
480
+ "counterfactual_scores": [
481
+ {
482
+ "counterfactual": [
483
+ "randomLetter_counterfactual"
484
+ ],
485
+ "score": 1.0
486
+ },
487
+ {
488
+ "counterfactual": [
489
+ "answerPosition_counterfactual"
490
+ ],
491
+ "score": 0.0
492
+ },
493
+ {
494
+ "counterfactual": [
495
+ "answerPosition_randomLetter_counterfactual"
496
+ ],
497
+ "score": 0.0
498
+ }
499
+ ]
500
+ }
501
+ ]
502
+ },
503
+ {
504
+ "layer": "9",
505
+ "layer_scores": [
506
+ {
507
+ "intervention": [
508
+ "output_token"
509
+ ],
510
+ "counterfactual_scores": [
511
+ {
512
+ "counterfactual": [
513
+ "randomLetter_counterfactual"
514
+ ],
515
+ "score": 0.03333333333333333
516
+ },
517
+ {
518
+ "counterfactual": [
519
+ "answerPosition_counterfactual"
520
+ ],
521
+ "score": 0.0
522
+ },
523
+ {
524
+ "counterfactual": [
525
+ "answerPosition_randomLetter_counterfactual"
526
+ ],
527
+ "score": 0.1
528
+ }
529
+ ]
530
+ },
531
+ {
532
+ "intervention": [
533
+ "output_location"
534
+ ],
535
+ "counterfactual_scores": [
536
+ {
537
+ "counterfactual": [
538
+ "randomLetter_counterfactual"
539
+ ],
540
+ "score": 1.0
541
+ },
542
+ {
543
+ "counterfactual": [
544
+ "answerPosition_counterfactual"
545
+ ],
546
+ "score": 0.0
547
+ },
548
+ {
549
+ "counterfactual": [
550
+ "answerPosition_randomLetter_counterfactual"
551
+ ],
552
+ "score": 0.0
553
+ }
554
+ ]
555
+ }
556
+ ]
557
+ },
558
+ {
559
+ "layer": "10",
560
+ "layer_scores": [
561
+ {
562
+ "intervention": [
563
+ "output_token"
564
+ ],
565
+ "counterfactual_scores": [
566
+ {
567
+ "counterfactual": [
568
+ "randomLetter_counterfactual"
569
+ ],
570
+ "score": 0.03333333333333333
571
+ },
572
+ {
573
+ "counterfactual": [
574
+ "answerPosition_counterfactual"
575
+ ],
576
+ "score": 0.0
577
+ },
578
+ {
579
+ "counterfactual": [
580
+ "answerPosition_randomLetter_counterfactual"
581
+ ],
582
+ "score": 0.1
583
+ }
584
+ ]
585
+ },
586
+ {
587
+ "intervention": [
588
+ "output_location"
589
+ ],
590
+ "counterfactual_scores": [
591
+ {
592
+ "counterfactual": [
593
+ "randomLetter_counterfactual"
594
+ ],
595
+ "score": 1.0
596
+ },
597
+ {
598
+ "counterfactual": [
599
+ "answerPosition_counterfactual"
600
+ ],
601
+ "score": 0.0
602
+ },
603
+ {
604
+ "counterfactual": [
605
+ "answerPosition_randomLetter_counterfactual"
606
+ ],
607
+ "score": 0.0
608
+ }
609
+ ]
610
+ }
611
+ ]
612
+ },
613
+ {
614
+ "layer": "11",
615
+ "layer_scores": [
616
+ {
617
+ "intervention": [
618
+ "output_token"
619
+ ],
620
+ "counterfactual_scores": [
621
+ {
622
+ "counterfactual": [
623
+ "randomLetter_counterfactual"
624
+ ],
625
+ "score": 0.03333333333333333
626
+ },
627
+ {
628
+ "counterfactual": [
629
+ "answerPosition_counterfactual"
630
+ ],
631
+ "score": 0.0
632
+ },
633
+ {
634
+ "counterfactual": [
635
+ "answerPosition_randomLetter_counterfactual"
636
+ ],
637
+ "score": 0.1
638
+ }
639
+ ]
640
+ },
641
+ {
642
+ "intervention": [
643
+ "output_location"
644
+ ],
645
+ "counterfactual_scores": [
646
+ {
647
+ "counterfactual": [
648
+ "randomLetter_counterfactual"
649
+ ],
650
+ "score": 1.0
651
+ },
652
+ {
653
+ "counterfactual": [
654
+ "answerPosition_counterfactual"
655
+ ],
656
+ "score": 0.0
657
+ },
658
+ {
659
+ "counterfactual": [
660
+ "answerPosition_randomLetter_counterfactual"
661
+ ],
662
+ "score": 0.0
663
+ }
664
+ ]
665
+ }
666
+ ]
667
+ },
668
+ {
669
+ "layer": "12",
670
+ "layer_scores": [
671
+ {
672
+ "intervention": [
673
+ "output_token"
674
+ ],
675
+ "counterfactual_scores": [
676
+ {
677
+ "counterfactual": [
678
+ "randomLetter_counterfactual"
679
+ ],
680
+ "score": 0.03333333333333333
681
+ },
682
+ {
683
+ "counterfactual": [
684
+ "answerPosition_counterfactual"
685
+ ],
686
+ "score": 0.0
687
+ },
688
+ {
689
+ "counterfactual": [
690
+ "answerPosition_randomLetter_counterfactual"
691
+ ],
692
+ "score": 0.1
693
+ }
694
+ ]
695
+ },
696
+ {
697
+ "intervention": [
698
+ "output_location"
699
+ ],
700
+ "counterfactual_scores": [
701
+ {
702
+ "counterfactual": [
703
+ "randomLetter_counterfactual"
704
+ ],
705
+ "score": 1.0
706
+ },
707
+ {
708
+ "counterfactual": [
709
+ "answerPosition_counterfactual"
710
+ ],
711
+ "score": 0.0
712
+ },
713
+ {
714
+ "counterfactual": [
715
+ "answerPosition_randomLetter_counterfactual"
716
+ ],
717
+ "score": 0.0
718
+ }
719
+ ]
720
+ }
721
+ ]
722
+ },
723
+ {
724
+ "layer": "13",
725
+ "layer_scores": [
726
+ {
727
+ "intervention": [
728
+ "output_token"
729
+ ],
730
+ "counterfactual_scores": [
731
+ {
732
+ "counterfactual": [
733
+ "randomLetter_counterfactual"
734
+ ],
735
+ "score": 0.03333333333333333
736
+ },
737
+ {
738
+ "counterfactual": [
739
+ "answerPosition_counterfactual"
740
+ ],
741
+ "score": 0.0
742
+ },
743
+ {
744
+ "counterfactual": [
745
+ "answerPosition_randomLetter_counterfactual"
746
+ ],
747
+ "score": 0.1
748
+ }
749
+ ]
750
+ },
751
+ {
752
+ "intervention": [
753
+ "output_location"
754
+ ],
755
+ "counterfactual_scores": [
756
+ {
757
+ "counterfactual": [
758
+ "randomLetter_counterfactual"
759
+ ],
760
+ "score": 1.0
761
+ },
762
+ {
763
+ "counterfactual": [
764
+ "answerPosition_counterfactual"
765
+ ],
766
+ "score": 0.0
767
+ },
768
+ {
769
+ "counterfactual": [
770
+ "answerPosition_randomLetter_counterfactual"
771
+ ],
772
+ "score": 0.0
773
+ }
774
+ ]
775
+ }
776
+ ]
777
+ },
778
+ {
779
+ "layer": "14",
780
+ "layer_scores": [
781
+ {
782
+ "intervention": [
783
+ "output_token"
784
+ ],
785
+ "counterfactual_scores": [
786
+ {
787
+ "counterfactual": [
788
+ "randomLetter_counterfactual"
789
+ ],
790
+ "score": 0.03333333333333333
791
+ },
792
+ {
793
+ "counterfactual": [
794
+ "answerPosition_counterfactual"
795
+ ],
796
+ "score": 0.0
797
+ },
798
+ {
799
+ "counterfactual": [
800
+ "answerPosition_randomLetter_counterfactual"
801
+ ],
802
+ "score": 0.1
803
+ }
804
+ ]
805
+ },
806
+ {
807
+ "intervention": [
808
+ "output_location"
809
+ ],
810
+ "counterfactual_scores": [
811
+ {
812
+ "counterfactual": [
813
+ "randomLetter_counterfactual"
814
+ ],
815
+ "score": 1.0
816
+ },
817
+ {
818
+ "counterfactual": [
819
+ "answerPosition_counterfactual"
820
+ ],
821
+ "score": 0.0
822
+ },
823
+ {
824
+ "counterfactual": [
825
+ "answerPosition_randomLetter_counterfactual"
826
+ ],
827
+ "score": 0.0
828
+ }
829
+ ]
830
+ }
831
+ ]
832
+ },
833
+ {
834
+ "layer": "15",
835
+ "layer_scores": [
836
+ {
837
+ "intervention": [
838
+ "output_token"
839
+ ],
840
+ "counterfactual_scores": [
841
+ {
842
+ "counterfactual": [
843
+ "randomLetter_counterfactual"
844
+ ],
845
+ "score": 0.03333333333333333
846
+ },
847
+ {
848
+ "counterfactual": [
849
+ "answerPosition_counterfactual"
850
+ ],
851
+ "score": 0.0
852
+ },
853
+ {
854
+ "counterfactual": [
855
+ "answerPosition_randomLetter_counterfactual"
856
+ ],
857
+ "score": 0.1
858
+ }
859
+ ]
860
+ },
861
+ {
862
+ "intervention": [
863
+ "output_location"
864
+ ],
865
+ "counterfactual_scores": [
866
+ {
867
+ "counterfactual": [
868
+ "randomLetter_counterfactual"
869
+ ],
870
+ "score": 1.0
871
+ },
872
+ {
873
+ "counterfactual": [
874
+ "answerPosition_counterfactual"
875
+ ],
876
+ "score": 0.0
877
+ },
878
+ {
879
+ "counterfactual": [
880
+ "answerPosition_randomLetter_counterfactual"
881
+ ],
882
+ "score": 0.0
883
+ }
884
+ ]
885
+ }
886
+ ]
887
+ },
888
+ {
889
+ "layer": "16",
890
+ "layer_scores": [
891
+ {
892
+ "intervention": [
893
+ "output_token"
894
+ ],
895
+ "counterfactual_scores": [
896
+ {
897
+ "counterfactual": [
898
+ "randomLetter_counterfactual"
899
+ ],
900
+ "score": 0.03333333333333333
901
+ },
902
+ {
903
+ "counterfactual": [
904
+ "answerPosition_counterfactual"
905
+ ],
906
+ "score": 0.0
907
+ },
908
+ {
909
+ "counterfactual": [
910
+ "answerPosition_randomLetter_counterfactual"
911
+ ],
912
+ "score": 0.1
913
+ }
914
+ ]
915
+ },
916
+ {
917
+ "intervention": [
918
+ "output_location"
919
+ ],
920
+ "counterfactual_scores": [
921
+ {
922
+ "counterfactual": [
923
+ "randomLetter_counterfactual"
924
+ ],
925
+ "score": 1.0
926
+ },
927
+ {
928
+ "counterfactual": [
929
+ "answerPosition_counterfactual"
930
+ ],
931
+ "score": 0.0
932
+ },
933
+ {
934
+ "counterfactual": [
935
+ "answerPosition_randomLetter_counterfactual"
936
+ ],
937
+ "score": 0.03333333333333333
938
+ }
939
+ ]
940
+ }
941
+ ]
942
+ },
943
+ {
944
+ "layer": "17",
945
+ "layer_scores": [
946
+ {
947
+ "intervention": [
948
+ "output_token"
949
+ ],
950
+ "counterfactual_scores": [
951
+ {
952
+ "counterfactual": [
953
+ "randomLetter_counterfactual"
954
+ ],
955
+ "score": 0.16666666666666666
956
+ },
957
+ {
958
+ "counterfactual": [
959
+ "answerPosition_counterfactual"
960
+ ],
961
+ "score": 1.0
962
+ },
963
+ {
964
+ "counterfactual": [
965
+ "answerPosition_randomLetter_counterfactual"
966
+ ],
967
+ "score": 0.23333333333333334
968
+ }
969
+ ]
970
+ },
971
+ {
972
+ "intervention": [
973
+ "output_location"
974
+ ],
975
+ "counterfactual_scores": [
976
+ {
977
+ "counterfactual": [
978
+ "randomLetter_counterfactual"
979
+ ],
980
+ "score": 0.8
981
+ },
982
+ {
983
+ "counterfactual": [
984
+ "answerPosition_counterfactual"
985
+ ],
986
+ "score": 1.0
987
+ },
988
+ {
989
+ "counterfactual": [
990
+ "answerPosition_randomLetter_counterfactual"
991
+ ],
992
+ "score": 0.5666666666666667
993
+ }
994
+ ]
995
+ }
996
+ ]
997
+ },
998
+ {
999
+ "layer": "18",
1000
+ "layer_scores": [
1001
+ {
1002
+ "intervention": [
1003
+ "output_token"
1004
+ ],
1005
+ "counterfactual_scores": [
1006
+ {
1007
+ "counterfactual": [
1008
+ "randomLetter_counterfactual"
1009
+ ],
1010
+ "score": 0.23333333333333334
1011
+ },
1012
+ {
1013
+ "counterfactual": [
1014
+ "answerPosition_counterfactual"
1015
+ ],
1016
+ "score": 1.0
1017
+ },
1018
+ {
1019
+ "counterfactual": [
1020
+ "answerPosition_randomLetter_counterfactual"
1021
+ ],
1022
+ "score": 0.3
1023
+ }
1024
+ ]
1025
+ },
1026
+ {
1027
+ "intervention": [
1028
+ "output_location"
1029
+ ],
1030
+ "counterfactual_scores": [
1031
+ {
1032
+ "counterfactual": [
1033
+ "randomLetter_counterfactual"
1034
+ ],
1035
+ "score": 0.8
1036
+ },
1037
+ {
1038
+ "counterfactual": [
1039
+ "answerPosition_counterfactual"
1040
+ ],
1041
+ "score": 1.0
1042
+ },
1043
+ {
1044
+ "counterfactual": [
1045
+ "answerPosition_randomLetter_counterfactual"
1046
+ ],
1047
+ "score": 0.5
1048
+ }
1049
+ ]
1050
+ }
1051
+ ]
1052
+ },
1053
+ {
1054
+ "layer": "19",
1055
+ "layer_scores": [
1056
+ {
1057
+ "intervention": [
1058
+ "output_token"
1059
+ ],
1060
+ "counterfactual_scores": [
1061
+ {
1062
+ "counterfactual": [
1063
+ "randomLetter_counterfactual"
1064
+ ],
1065
+ "score": 0.4666666666666667
1066
+ },
1067
+ {
1068
+ "counterfactual": [
1069
+ "answerPosition_counterfactual"
1070
+ ],
1071
+ "score": 1.0
1072
+ },
1073
+ {
1074
+ "counterfactual": [
1075
+ "answerPosition_randomLetter_counterfactual"
1076
+ ],
1077
+ "score": 0.4666666666666667
1078
+ }
1079
+ ]
1080
+ },
1081
+ {
1082
+ "intervention": [
1083
+ "output_location"
1084
+ ],
1085
+ "counterfactual_scores": [
1086
+ {
1087
+ "counterfactual": [
1088
+ "randomLetter_counterfactual"
1089
+ ],
1090
+ "score": 0.43333333333333335
1091
+ },
1092
+ {
1093
+ "counterfactual": [
1094
+ "answerPosition_counterfactual"
1095
+ ],
1096
+ "score": 1.0
1097
+ },
1098
+ {
1099
+ "counterfactual": [
1100
+ "answerPosition_randomLetter_counterfactual"
1101
+ ],
1102
+ "score": 0.3333333333333333
1103
+ }
1104
+ ]
1105
+ }
1106
+ ]
1107
+ },
1108
+ {
1109
+ "layer": "20",
1110
+ "layer_scores": [
1111
+ {
1112
+ "intervention": [
1113
+ "output_token"
1114
+ ],
1115
+ "counterfactual_scores": [
1116
+ {
1117
+ "counterfactual": [
1118
+ "randomLetter_counterfactual"
1119
+ ],
1120
+ "score": 0.5
1121
+ },
1122
+ {
1123
+ "counterfactual": [
1124
+ "answerPosition_counterfactual"
1125
+ ],
1126
+ "score": 1.0
1127
+ },
1128
+ {
1129
+ "counterfactual": [
1130
+ "answerPosition_randomLetter_counterfactual"
1131
+ ],
1132
+ "score": 0.4666666666666667
1133
+ }
1134
+ ]
1135
+ },
1136
+ {
1137
+ "intervention": [
1138
+ "output_location"
1139
+ ],
1140
+ "counterfactual_scores": [
1141
+ {
1142
+ "counterfactual": [
1143
+ "randomLetter_counterfactual"
1144
+ ],
1145
+ "score": 0.26666666666666666
1146
+ },
1147
+ {
1148
+ "counterfactual": [
1149
+ "answerPosition_counterfactual"
1150
+ ],
1151
+ "score": 1.0
1152
+ },
1153
+ {
1154
+ "counterfactual": [
1155
+ "answerPosition_randomLetter_counterfactual"
1156
+ ],
1157
+ "score": 0.26666666666666666
1158
+ }
1159
+ ]
1160
+ }
1161
+ ]
1162
+ },
1163
+ {
1164
+ "layer": "21",
1165
+ "layer_scores": [
1166
+ {
1167
+ "intervention": [
1168
+ "output_token"
1169
+ ],
1170
+ "counterfactual_scores": [
1171
+ {
1172
+ "counterfactual": [
1173
+ "randomLetter_counterfactual"
1174
+ ],
1175
+ "score": 0.4666666666666667
1176
+ },
1177
+ {
1178
+ "counterfactual": [
1179
+ "answerPosition_counterfactual"
1180
+ ],
1181
+ "score": 1.0
1182
+ },
1183
+ {
1184
+ "counterfactual": [
1185
+ "answerPosition_randomLetter_counterfactual"
1186
+ ],
1187
+ "score": 0.4666666666666667
1188
+ }
1189
+ ]
1190
+ },
1191
+ {
1192
+ "intervention": [
1193
+ "output_location"
1194
+ ],
1195
+ "counterfactual_scores": [
1196
+ {
1197
+ "counterfactual": [
1198
+ "randomLetter_counterfactual"
1199
+ ],
1200
+ "score": 0.26666666666666666
1201
+ },
1202
+ {
1203
+ "counterfactual": [
1204
+ "answerPosition_counterfactual"
1205
+ ],
1206
+ "score": 1.0
1207
+ },
1208
+ {
1209
+ "counterfactual": [
1210
+ "answerPosition_randomLetter_counterfactual"
1211
+ ],
1212
+ "score": 0.26666666666666666
1213
+ }
1214
+ ]
1215
+ }
1216
+ ]
1217
+ },
1218
+ {
1219
+ "layer": "22",
1220
+ "layer_scores": [
1221
+ {
1222
+ "intervention": [
1223
+ "output_token"
1224
+ ],
1225
+ "counterfactual_scores": [
1226
+ {
1227
+ "counterfactual": [
1228
+ "randomLetter_counterfactual"
1229
+ ],
1230
+ "score": 0.6
1231
+ },
1232
+ {
1233
+ "counterfactual": [
1234
+ "answerPosition_counterfactual"
1235
+ ],
1236
+ "score": 1.0
1237
+ },
1238
+ {
1239
+ "counterfactual": [
1240
+ "answerPosition_randomLetter_counterfactual"
1241
+ ],
1242
+ "score": 0.6333333333333333
1243
+ }
1244
+ ]
1245
+ },
1246
+ {
1247
+ "intervention": [
1248
+ "output_location"
1249
+ ],
1250
+ "counterfactual_scores": [
1251
+ {
1252
+ "counterfactual": [
1253
+ "randomLetter_counterfactual"
1254
+ ],
1255
+ "score": 0.23333333333333334
1256
+ },
1257
+ {
1258
+ "counterfactual": [
1259
+ "answerPosition_counterfactual"
1260
+ ],
1261
+ "score": 1.0
1262
+ },
1263
+ {
1264
+ "counterfactual": [
1265
+ "answerPosition_randomLetter_counterfactual"
1266
+ ],
1267
+ "score": 0.2
1268
+ }
1269
+ ]
1270
+ }
1271
+ ]
1272
+ },
1273
+ {
1274
+ "layer": "23",
1275
+ "layer_scores": [
1276
+ {
1277
+ "intervention": [
1278
+ "output_token"
1279
+ ],
1280
+ "counterfactual_scores": [
1281
+ {
1282
+ "counterfactual": [
1283
+ "randomLetter_counterfactual"
1284
+ ],
1285
+ "score": 0.9666666666666667
1286
+ },
1287
+ {
1288
+ "counterfactual": [
1289
+ "answerPosition_counterfactual"
1290
+ ],
1291
+ "score": 1.0
1292
+ },
1293
+ {
1294
+ "counterfactual": [
1295
+ "answerPosition_randomLetter_counterfactual"
1296
+ ],
1297
+ "score": 0.9
1298
+ }
1299
+ ]
1300
+ },
1301
+ {
1302
+ "intervention": [
1303
+ "output_location"
1304
+ ],
1305
+ "counterfactual_scores": [
1306
+ {
1307
+ "counterfactual": [
1308
+ "randomLetter_counterfactual"
1309
+ ],
1310
+ "score": 0.03333333333333333
1311
+ },
1312
+ {
1313
+ "counterfactual": [
1314
+ "answerPosition_counterfactual"
1315
+ ],
1316
+ "score": 1.0
1317
+ },
1318
+ {
1319
+ "counterfactual": [
1320
+ "answerPosition_randomLetter_counterfactual"
1321
+ ],
1322
+ "score": 0.2
1323
+ }
1324
+ ]
1325
+ }
1326
+ ]
1327
+ },
1328
+ {
1329
+ "layer": "24",
1330
+ "layer_scores": [
1331
+ {
1332
+ "intervention": [
1333
+ "output_token"
1334
+ ],
1335
+ "counterfactual_scores": [
1336
+ {
1337
+ "counterfactual": [
1338
+ "randomLetter_counterfactual"
1339
+ ],
1340
+ "score": 1.0
1341
+ },
1342
+ {
1343
+ "counterfactual": [
1344
+ "answerPosition_counterfactual"
1345
+ ],
1346
+ "score": 1.0
1347
+ },
1348
+ {
1349
+ "counterfactual": [
1350
+ "answerPosition_randomLetter_counterfactual"
1351
+ ],
1352
+ "score": 0.9333333333333333
1353
+ }
1354
+ ]
1355
+ },
1356
+ {
1357
+ "intervention": [
1358
+ "output_location"
1359
+ ],
1360
+ "counterfactual_scores": [
1361
+ {
1362
+ "counterfactual": [
1363
+ "randomLetter_counterfactual"
1364
+ ],
1365
+ "score": 0.03333333333333333
1366
+ },
1367
+ {
1368
+ "counterfactual": [
1369
+ "answerPosition_counterfactual"
1370
+ ],
1371
+ "score": 1.0
1372
+ },
1373
+ {
1374
+ "counterfactual": [
1375
+ "answerPosition_randomLetter_counterfactual"
1376
+ ],
1377
+ "score": 0.2
1378
+ }
1379
+ ]
1380
+ }
1381
+ ]
1382
+ },
1383
+ {
1384
+ "layer": "25",
1385
+ "layer_scores": [
1386
+ {
1387
+ "intervention": [
1388
+ "output_token"
1389
+ ],
1390
+ "counterfactual_scores": [
1391
+ {
1392
+ "counterfactual": [
1393
+ "randomLetter_counterfactual"
1394
+ ],
1395
+ "score": 1.0
1396
+ },
1397
+ {
1398
+ "counterfactual": [
1399
+ "answerPosition_counterfactual"
1400
+ ],
1401
+ "score": 1.0
1402
+ },
1403
+ {
1404
+ "counterfactual": [
1405
+ "answerPosition_randomLetter_counterfactual"
1406
+ ],
1407
+ "score": 0.9333333333333333
1408
+ }
1409
+ ]
1410
+ },
1411
+ {
1412
+ "intervention": [
1413
+ "output_location"
1414
+ ],
1415
+ "counterfactual_scores": [
1416
+ {
1417
+ "counterfactual": [
1418
+ "randomLetter_counterfactual"
1419
+ ],
1420
+ "score": 0.03333333333333333
1421
+ },
1422
+ {
1423
+ "counterfactual": [
1424
+ "answerPosition_counterfactual"
1425
+ ],
1426
+ "score": 1.0
1427
+ },
1428
+ {
1429
+ "counterfactual": [
1430
+ "answerPosition_randomLetter_counterfactual"
1431
+ ],
1432
+ "score": 0.13333333333333333
1433
+ }
1434
+ ]
1435
+ }
1436
+ ]
1437
+ }
1438
+ ]
1439
+ }
1440
+ }
1441
+ ]
1442
+ }
eval-results-mib-causalgraph/submissions/MCQA_results_google_second_to_last_token.json ADDED
@@ -0,0 +1,1442 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "method_name": "full_vector",
3
+ "results": [
4
+ {
5
+ "model_id": "Gemma2ForCausalLM",
6
+ "task_scores": {
7
+ "MCQA": [
8
+ {
9
+ "layer": "0",
10
+ "layer_scores": [
11
+ {
12
+ "intervention": [
13
+ "output_token"
14
+ ],
15
+ "counterfactual_scores": [
16
+ {
17
+ "counterfactual": [
18
+ "randomLetter_counterfactual"
19
+ ],
20
+ "score": 0.03333333333333333
21
+ },
22
+ {
23
+ "counterfactual": [
24
+ "answerPosition_counterfactual"
25
+ ],
26
+ "score": 0.0
27
+ },
28
+ {
29
+ "counterfactual": [
30
+ "answerPosition_randomLetter_counterfactual"
31
+ ],
32
+ "score": 0.1
33
+ }
34
+ ]
35
+ },
36
+ {
37
+ "intervention": [
38
+ "output_location"
39
+ ],
40
+ "counterfactual_scores": [
41
+ {
42
+ "counterfactual": [
43
+ "randomLetter_counterfactual"
44
+ ],
45
+ "score": 1.0
46
+ },
47
+ {
48
+ "counterfactual": [
49
+ "answerPosition_counterfactual"
50
+ ],
51
+ "score": 0.0
52
+ },
53
+ {
54
+ "counterfactual": [
55
+ "answerPosition_randomLetter_counterfactual"
56
+ ],
57
+ "score": 0.0
58
+ }
59
+ ]
60
+ }
61
+ ]
62
+ },
63
+ {
64
+ "layer": "1",
65
+ "layer_scores": [
66
+ {
67
+ "intervention": [
68
+ "output_token"
69
+ ],
70
+ "counterfactual_scores": [
71
+ {
72
+ "counterfactual": [
73
+ "randomLetter_counterfactual"
74
+ ],
75
+ "score": 0.03333333333333333
76
+ },
77
+ {
78
+ "counterfactual": [
79
+ "answerPosition_counterfactual"
80
+ ],
81
+ "score": 0.0
82
+ },
83
+ {
84
+ "counterfactual": [
85
+ "answerPosition_randomLetter_counterfactual"
86
+ ],
87
+ "score": 0.1
88
+ }
89
+ ]
90
+ },
91
+ {
92
+ "intervention": [
93
+ "output_location"
94
+ ],
95
+ "counterfactual_scores": [
96
+ {
97
+ "counterfactual": [
98
+ "randomLetter_counterfactual"
99
+ ],
100
+ "score": 1.0
101
+ },
102
+ {
103
+ "counterfactual": [
104
+ "answerPosition_counterfactual"
105
+ ],
106
+ "score": 0.0
107
+ },
108
+ {
109
+ "counterfactual": [
110
+ "answerPosition_randomLetter_counterfactual"
111
+ ],
112
+ "score": 0.0
113
+ }
114
+ ]
115
+ }
116
+ ]
117
+ },
118
+ {
119
+ "layer": "2",
120
+ "layer_scores": [
121
+ {
122
+ "intervention": [
123
+ "output_token"
124
+ ],
125
+ "counterfactual_scores": [
126
+ {
127
+ "counterfactual": [
128
+ "randomLetter_counterfactual"
129
+ ],
130
+ "score": 0.03333333333333333
131
+ },
132
+ {
133
+ "counterfactual": [
134
+ "answerPosition_counterfactual"
135
+ ],
136
+ "score": 0.0
137
+ },
138
+ {
139
+ "counterfactual": [
140
+ "answerPosition_randomLetter_counterfactual"
141
+ ],
142
+ "score": 0.1
143
+ }
144
+ ]
145
+ },
146
+ {
147
+ "intervention": [
148
+ "output_location"
149
+ ],
150
+ "counterfactual_scores": [
151
+ {
152
+ "counterfactual": [
153
+ "randomLetter_counterfactual"
154
+ ],
155
+ "score": 1.0
156
+ },
157
+ {
158
+ "counterfactual": [
159
+ "answerPosition_counterfactual"
160
+ ],
161
+ "score": 0.0
162
+ },
163
+ {
164
+ "counterfactual": [
165
+ "answerPosition_randomLetter_counterfactual"
166
+ ],
167
+ "score": 0.0
168
+ }
169
+ ]
170
+ }
171
+ ]
172
+ },
173
+ {
174
+ "layer": "3",
175
+ "layer_scores": [
176
+ {
177
+ "intervention": [
178
+ "output_token"
179
+ ],
180
+ "counterfactual_scores": [
181
+ {
182
+ "counterfactual": [
183
+ "randomLetter_counterfactual"
184
+ ],
185
+ "score": 0.03333333333333333
186
+ },
187
+ {
188
+ "counterfactual": [
189
+ "answerPosition_counterfactual"
190
+ ],
191
+ "score": 0.0
192
+ },
193
+ {
194
+ "counterfactual": [
195
+ "answerPosition_randomLetter_counterfactual"
196
+ ],
197
+ "score": 0.1
198
+ }
199
+ ]
200
+ },
201
+ {
202
+ "intervention": [
203
+ "output_location"
204
+ ],
205
+ "counterfactual_scores": [
206
+ {
207
+ "counterfactual": [
208
+ "randomLetter_counterfactual"
209
+ ],
210
+ "score": 1.0
211
+ },
212
+ {
213
+ "counterfactual": [
214
+ "answerPosition_counterfactual"
215
+ ],
216
+ "score": 0.0
217
+ },
218
+ {
219
+ "counterfactual": [
220
+ "answerPosition_randomLetter_counterfactual"
221
+ ],
222
+ "score": 0.0
223
+ }
224
+ ]
225
+ }
226
+ ]
227
+ },
228
+ {
229
+ "layer": "4",
230
+ "layer_scores": [
231
+ {
232
+ "intervention": [
233
+ "output_token"
234
+ ],
235
+ "counterfactual_scores": [
236
+ {
237
+ "counterfactual": [
238
+ "randomLetter_counterfactual"
239
+ ],
240
+ "score": 0.03333333333333333
241
+ },
242
+ {
243
+ "counterfactual": [
244
+ "answerPosition_counterfactual"
245
+ ],
246
+ "score": 0.0
247
+ },
248
+ {
249
+ "counterfactual": [
250
+ "answerPosition_randomLetter_counterfactual"
251
+ ],
252
+ "score": 0.1
253
+ }
254
+ ]
255
+ },
256
+ {
257
+ "intervention": [
258
+ "output_location"
259
+ ],
260
+ "counterfactual_scores": [
261
+ {
262
+ "counterfactual": [
263
+ "randomLetter_counterfactual"
264
+ ],
265
+ "score": 1.0
266
+ },
267
+ {
268
+ "counterfactual": [
269
+ "answerPosition_counterfactual"
270
+ ],
271
+ "score": 0.0
272
+ },
273
+ {
274
+ "counterfactual": [
275
+ "answerPosition_randomLetter_counterfactual"
276
+ ],
277
+ "score": 0.0
278
+ }
279
+ ]
280
+ }
281
+ ]
282
+ },
283
+ {
284
+ "layer": "5",
285
+ "layer_scores": [
286
+ {
287
+ "intervention": [
288
+ "output_token"
289
+ ],
290
+ "counterfactual_scores": [
291
+ {
292
+ "counterfactual": [
293
+ "randomLetter_counterfactual"
294
+ ],
295
+ "score": 0.03333333333333333
296
+ },
297
+ {
298
+ "counterfactual": [
299
+ "answerPosition_counterfactual"
300
+ ],
301
+ "score": 0.0
302
+ },
303
+ {
304
+ "counterfactual": [
305
+ "answerPosition_randomLetter_counterfactual"
306
+ ],
307
+ "score": 0.1
308
+ }
309
+ ]
310
+ },
311
+ {
312
+ "intervention": [
313
+ "output_location"
314
+ ],
315
+ "counterfactual_scores": [
316
+ {
317
+ "counterfactual": [
318
+ "randomLetter_counterfactual"
319
+ ],
320
+ "score": 1.0
321
+ },
322
+ {
323
+ "counterfactual": [
324
+ "answerPosition_counterfactual"
325
+ ],
326
+ "score": 0.0
327
+ },
328
+ {
329
+ "counterfactual": [
330
+ "answerPosition_randomLetter_counterfactual"
331
+ ],
332
+ "score": 0.0
333
+ }
334
+ ]
335
+ }
336
+ ]
337
+ },
338
+ {
339
+ "layer": "6",
340
+ "layer_scores": [
341
+ {
342
+ "intervention": [
343
+ "output_token"
344
+ ],
345
+ "counterfactual_scores": [
346
+ {
347
+ "counterfactual": [
348
+ "randomLetter_counterfactual"
349
+ ],
350
+ "score": 0.03333333333333333
351
+ },
352
+ {
353
+ "counterfactual": [
354
+ "answerPosition_counterfactual"
355
+ ],
356
+ "score": 0.0
357
+ },
358
+ {
359
+ "counterfactual": [
360
+ "answerPosition_randomLetter_counterfactual"
361
+ ],
362
+ "score": 0.1
363
+ }
364
+ ]
365
+ },
366
+ {
367
+ "intervention": [
368
+ "output_location"
369
+ ],
370
+ "counterfactual_scores": [
371
+ {
372
+ "counterfactual": [
373
+ "randomLetter_counterfactual"
374
+ ],
375
+ "score": 1.0
376
+ },
377
+ {
378
+ "counterfactual": [
379
+ "answerPosition_counterfactual"
380
+ ],
381
+ "score": 0.0
382
+ },
383
+ {
384
+ "counterfactual": [
385
+ "answerPosition_randomLetter_counterfactual"
386
+ ],
387
+ "score": 0.0
388
+ }
389
+ ]
390
+ }
391
+ ]
392
+ },
393
+ {
394
+ "layer": "7",
395
+ "layer_scores": [
396
+ {
397
+ "intervention": [
398
+ "output_token"
399
+ ],
400
+ "counterfactual_scores": [
401
+ {
402
+ "counterfactual": [
403
+ "randomLetter_counterfactual"
404
+ ],
405
+ "score": 0.03333333333333333
406
+ },
407
+ {
408
+ "counterfactual": [
409
+ "answerPosition_counterfactual"
410
+ ],
411
+ "score": 0.0
412
+ },
413
+ {
414
+ "counterfactual": [
415
+ "answerPosition_randomLetter_counterfactual"
416
+ ],
417
+ "score": 0.1
418
+ }
419
+ ]
420
+ },
421
+ {
422
+ "intervention": [
423
+ "output_location"
424
+ ],
425
+ "counterfactual_scores": [
426
+ {
427
+ "counterfactual": [
428
+ "randomLetter_counterfactual"
429
+ ],
430
+ "score": 1.0
431
+ },
432
+ {
433
+ "counterfactual": [
434
+ "answerPosition_counterfactual"
435
+ ],
436
+ "score": 0.0
437
+ },
438
+ {
439
+ "counterfactual": [
440
+ "answerPosition_randomLetter_counterfactual"
441
+ ],
442
+ "score": 0.0
443
+ }
444
+ ]
445
+ }
446
+ ]
447
+ },
448
+ {
449
+ "layer": "8",
450
+ "layer_scores": [
451
+ {
452
+ "intervention": [
453
+ "output_token"
454
+ ],
455
+ "counterfactual_scores": [
456
+ {
457
+ "counterfactual": [
458
+ "randomLetter_counterfactual"
459
+ ],
460
+ "score": 0.03333333333333333
461
+ },
462
+ {
463
+ "counterfactual": [
464
+ "answerPosition_counterfactual"
465
+ ],
466
+ "score": 0.0
467
+ },
468
+ {
469
+ "counterfactual": [
470
+ "answerPosition_randomLetter_counterfactual"
471
+ ],
472
+ "score": 0.1
473
+ }
474
+ ]
475
+ },
476
+ {
477
+ "intervention": [
478
+ "output_location"
479
+ ],
480
+ "counterfactual_scores": [
481
+ {
482
+ "counterfactual": [
483
+ "randomLetter_counterfactual"
484
+ ],
485
+ "score": 1.0
486
+ },
487
+ {
488
+ "counterfactual": [
489
+ "answerPosition_counterfactual"
490
+ ],
491
+ "score": 0.0
492
+ },
493
+ {
494
+ "counterfactual": [
495
+ "answerPosition_randomLetter_counterfactual"
496
+ ],
497
+ "score": 0.0
498
+ }
499
+ ]
500
+ }
501
+ ]
502
+ },
503
+ {
504
+ "layer": "9",
505
+ "layer_scores": [
506
+ {
507
+ "intervention": [
508
+ "output_token"
509
+ ],
510
+ "counterfactual_scores": [
511
+ {
512
+ "counterfactual": [
513
+ "randomLetter_counterfactual"
514
+ ],
515
+ "score": 0.03333333333333333
516
+ },
517
+ {
518
+ "counterfactual": [
519
+ "answerPosition_counterfactual"
520
+ ],
521
+ "score": 0.0
522
+ },
523
+ {
524
+ "counterfactual": [
525
+ "answerPosition_randomLetter_counterfactual"
526
+ ],
527
+ "score": 0.1
528
+ }
529
+ ]
530
+ },
531
+ {
532
+ "intervention": [
533
+ "output_location"
534
+ ],
535
+ "counterfactual_scores": [
536
+ {
537
+ "counterfactual": [
538
+ "randomLetter_counterfactual"
539
+ ],
540
+ "score": 1.0
541
+ },
542
+ {
543
+ "counterfactual": [
544
+ "answerPosition_counterfactual"
545
+ ],
546
+ "score": 0.0
547
+ },
548
+ {
549
+ "counterfactual": [
550
+ "answerPosition_randomLetter_counterfactual"
551
+ ],
552
+ "score": 0.0
553
+ }
554
+ ]
555
+ }
556
+ ]
557
+ },
558
+ {
559
+ "layer": "10",
560
+ "layer_scores": [
561
+ {
562
+ "intervention": [
563
+ "output_token"
564
+ ],
565
+ "counterfactual_scores": [
566
+ {
567
+ "counterfactual": [
568
+ "randomLetter_counterfactual"
569
+ ],
570
+ "score": 0.03333333333333333
571
+ },
572
+ {
573
+ "counterfactual": [
574
+ "answerPosition_counterfactual"
575
+ ],
576
+ "score": 0.0
577
+ },
578
+ {
579
+ "counterfactual": [
580
+ "answerPosition_randomLetter_counterfactual"
581
+ ],
582
+ "score": 0.1
583
+ }
584
+ ]
585
+ },
586
+ {
587
+ "intervention": [
588
+ "output_location"
589
+ ],
590
+ "counterfactual_scores": [
591
+ {
592
+ "counterfactual": [
593
+ "randomLetter_counterfactual"
594
+ ],
595
+ "score": 1.0
596
+ },
597
+ {
598
+ "counterfactual": [
599
+ "answerPosition_counterfactual"
600
+ ],
601
+ "score": 0.0
602
+ },
603
+ {
604
+ "counterfactual": [
605
+ "answerPosition_randomLetter_counterfactual"
606
+ ],
607
+ "score": 0.0
608
+ }
609
+ ]
610
+ }
611
+ ]
612
+ },
613
+ {
614
+ "layer": "11",
615
+ "layer_scores": [
616
+ {
617
+ "intervention": [
618
+ "output_token"
619
+ ],
620
+ "counterfactual_scores": [
621
+ {
622
+ "counterfactual": [
623
+ "randomLetter_counterfactual"
624
+ ],
625
+ "score": 0.03333333333333333
626
+ },
627
+ {
628
+ "counterfactual": [
629
+ "answerPosition_counterfactual"
630
+ ],
631
+ "score": 0.0
632
+ },
633
+ {
634
+ "counterfactual": [
635
+ "answerPosition_randomLetter_counterfactual"
636
+ ],
637
+ "score": 0.1
638
+ }
639
+ ]
640
+ },
641
+ {
642
+ "intervention": [
643
+ "output_location"
644
+ ],
645
+ "counterfactual_scores": [
646
+ {
647
+ "counterfactual": [
648
+ "randomLetter_counterfactual"
649
+ ],
650
+ "score": 1.0
651
+ },
652
+ {
653
+ "counterfactual": [
654
+ "answerPosition_counterfactual"
655
+ ],
656
+ "score": 0.0
657
+ },
658
+ {
659
+ "counterfactual": [
660
+ "answerPosition_randomLetter_counterfactual"
661
+ ],
662
+ "score": 0.0
663
+ }
664
+ ]
665
+ }
666
+ ]
667
+ },
668
+ {
669
+ "layer": "12",
670
+ "layer_scores": [
671
+ {
672
+ "intervention": [
673
+ "output_token"
674
+ ],
675
+ "counterfactual_scores": [
676
+ {
677
+ "counterfactual": [
678
+ "randomLetter_counterfactual"
679
+ ],
680
+ "score": 0.03333333333333333
681
+ },
682
+ {
683
+ "counterfactual": [
684
+ "answerPosition_counterfactual"
685
+ ],
686
+ "score": 0.0
687
+ },
688
+ {
689
+ "counterfactual": [
690
+ "answerPosition_randomLetter_counterfactual"
691
+ ],
692
+ "score": 0.1
693
+ }
694
+ ]
695
+ },
696
+ {
697
+ "intervention": [
698
+ "output_location"
699
+ ],
700
+ "counterfactual_scores": [
701
+ {
702
+ "counterfactual": [
703
+ "randomLetter_counterfactual"
704
+ ],
705
+ "score": 1.0
706
+ },
707
+ {
708
+ "counterfactual": [
709
+ "answerPosition_counterfactual"
710
+ ],
711
+ "score": 0.0
712
+ },
713
+ {
714
+ "counterfactual": [
715
+ "answerPosition_randomLetter_counterfactual"
716
+ ],
717
+ "score": 0.0
718
+ }
719
+ ]
720
+ }
721
+ ]
722
+ },
723
+ {
724
+ "layer": "13",
725
+ "layer_scores": [
726
+ {
727
+ "intervention": [
728
+ "output_token"
729
+ ],
730
+ "counterfactual_scores": [
731
+ {
732
+ "counterfactual": [
733
+ "randomLetter_counterfactual"
734
+ ],
735
+ "score": 0.03333333333333333
736
+ },
737
+ {
738
+ "counterfactual": [
739
+ "answerPosition_counterfactual"
740
+ ],
741
+ "score": 0.0
742
+ },
743
+ {
744
+ "counterfactual": [
745
+ "answerPosition_randomLetter_counterfactual"
746
+ ],
747
+ "score": 0.1
748
+ }
749
+ ]
750
+ },
751
+ {
752
+ "intervention": [
753
+ "output_location"
754
+ ],
755
+ "counterfactual_scores": [
756
+ {
757
+ "counterfactual": [
758
+ "randomLetter_counterfactual"
759
+ ],
760
+ "score": 1.0
761
+ },
762
+ {
763
+ "counterfactual": [
764
+ "answerPosition_counterfactual"
765
+ ],
766
+ "score": 0.0
767
+ },
768
+ {
769
+ "counterfactual": [
770
+ "answerPosition_randomLetter_counterfactual"
771
+ ],
772
+ "score": 0.0
773
+ }
774
+ ]
775
+ }
776
+ ]
777
+ },
778
+ {
779
+ "layer": "14",
780
+ "layer_scores": [
781
+ {
782
+ "intervention": [
783
+ "output_token"
784
+ ],
785
+ "counterfactual_scores": [
786
+ {
787
+ "counterfactual": [
788
+ "randomLetter_counterfactual"
789
+ ],
790
+ "score": 0.03333333333333333
791
+ },
792
+ {
793
+ "counterfactual": [
794
+ "answerPosition_counterfactual"
795
+ ],
796
+ "score": 0.0
797
+ },
798
+ {
799
+ "counterfactual": [
800
+ "answerPosition_randomLetter_counterfactual"
801
+ ],
802
+ "score": 0.1
803
+ }
804
+ ]
805
+ },
806
+ {
807
+ "intervention": [
808
+ "output_location"
809
+ ],
810
+ "counterfactual_scores": [
811
+ {
812
+ "counterfactual": [
813
+ "randomLetter_counterfactual"
814
+ ],
815
+ "score": 1.0
816
+ },
817
+ {
818
+ "counterfactual": [
819
+ "answerPosition_counterfactual"
820
+ ],
821
+ "score": 0.0
822
+ },
823
+ {
824
+ "counterfactual": [
825
+ "answerPosition_randomLetter_counterfactual"
826
+ ],
827
+ "score": 0.0
828
+ }
829
+ ]
830
+ }
831
+ ]
832
+ },
833
+ {
834
+ "layer": "15",
835
+ "layer_scores": [
836
+ {
837
+ "intervention": [
838
+ "output_token"
839
+ ],
840
+ "counterfactual_scores": [
841
+ {
842
+ "counterfactual": [
843
+ "randomLetter_counterfactual"
844
+ ],
845
+ "score": 0.03333333333333333
846
+ },
847
+ {
848
+ "counterfactual": [
849
+ "answerPosition_counterfactual"
850
+ ],
851
+ "score": 0.0
852
+ },
853
+ {
854
+ "counterfactual": [
855
+ "answerPosition_randomLetter_counterfactual"
856
+ ],
857
+ "score": 0.1
858
+ }
859
+ ]
860
+ },
861
+ {
862
+ "intervention": [
863
+ "output_location"
864
+ ],
865
+ "counterfactual_scores": [
866
+ {
867
+ "counterfactual": [
868
+ "randomLetter_counterfactual"
869
+ ],
870
+ "score": 1.0
871
+ },
872
+ {
873
+ "counterfactual": [
874
+ "answerPosition_counterfactual"
875
+ ],
876
+ "score": 0.0
877
+ },
878
+ {
879
+ "counterfactual": [
880
+ "answerPosition_randomLetter_counterfactual"
881
+ ],
882
+ "score": 0.0
883
+ }
884
+ ]
885
+ }
886
+ ]
887
+ },
888
+ {
889
+ "layer": "16",
890
+ "layer_scores": [
891
+ {
892
+ "intervention": [
893
+ "output_token"
894
+ ],
895
+ "counterfactual_scores": [
896
+ {
897
+ "counterfactual": [
898
+ "randomLetter_counterfactual"
899
+ ],
900
+ "score": 0.03333333333333333
901
+ },
902
+ {
903
+ "counterfactual": [
904
+ "answerPosition_counterfactual"
905
+ ],
906
+ "score": 0.0
907
+ },
908
+ {
909
+ "counterfactual": [
910
+ "answerPosition_randomLetter_counterfactual"
911
+ ],
912
+ "score": 0.1
913
+ }
914
+ ]
915
+ },
916
+ {
917
+ "intervention": [
918
+ "output_location"
919
+ ],
920
+ "counterfactual_scores": [
921
+ {
922
+ "counterfactual": [
923
+ "randomLetter_counterfactual"
924
+ ],
925
+ "score": 1.0
926
+ },
927
+ {
928
+ "counterfactual": [
929
+ "answerPosition_counterfactual"
930
+ ],
931
+ "score": 0.0
932
+ },
933
+ {
934
+ "counterfactual": [
935
+ "answerPosition_randomLetter_counterfactual"
936
+ ],
937
+ "score": 0.0
938
+ }
939
+ ]
940
+ }
941
+ ]
942
+ },
943
+ {
944
+ "layer": "17",
945
+ "layer_scores": [
946
+ {
947
+ "intervention": [
948
+ "output_token"
949
+ ],
950
+ "counterfactual_scores": [
951
+ {
952
+ "counterfactual": [
953
+ "randomLetter_counterfactual"
954
+ ],
955
+ "score": 0.03333333333333333
956
+ },
957
+ {
958
+ "counterfactual": [
959
+ "answerPosition_counterfactual"
960
+ ],
961
+ "score": 0.0
962
+ },
963
+ {
964
+ "counterfactual": [
965
+ "answerPosition_randomLetter_counterfactual"
966
+ ],
967
+ "score": 0.1
968
+ }
969
+ ]
970
+ },
971
+ {
972
+ "intervention": [
973
+ "output_location"
974
+ ],
975
+ "counterfactual_scores": [
976
+ {
977
+ "counterfactual": [
978
+ "randomLetter_counterfactual"
979
+ ],
980
+ "score": 1.0
981
+ },
982
+ {
983
+ "counterfactual": [
984
+ "answerPosition_counterfactual"
985
+ ],
986
+ "score": 0.0
987
+ },
988
+ {
989
+ "counterfactual": [
990
+ "answerPosition_randomLetter_counterfactual"
991
+ ],
992
+ "score": 0.0
993
+ }
994
+ ]
995
+ }
996
+ ]
997
+ },
998
+ {
999
+ "layer": "18",
1000
+ "layer_scores": [
1001
+ {
1002
+ "intervention": [
1003
+ "output_token"
1004
+ ],
1005
+ "counterfactual_scores": [
1006
+ {
1007
+ "counterfactual": [
1008
+ "randomLetter_counterfactual"
1009
+ ],
1010
+ "score": 0.03333333333333333
1011
+ },
1012
+ {
1013
+ "counterfactual": [
1014
+ "answerPosition_counterfactual"
1015
+ ],
1016
+ "score": 0.0
1017
+ },
1018
+ {
1019
+ "counterfactual": [
1020
+ "answerPosition_randomLetter_counterfactual"
1021
+ ],
1022
+ "score": 0.1
1023
+ }
1024
+ ]
1025
+ },
1026
+ {
1027
+ "intervention": [
1028
+ "output_location"
1029
+ ],
1030
+ "counterfactual_scores": [
1031
+ {
1032
+ "counterfactual": [
1033
+ "randomLetter_counterfactual"
1034
+ ],
1035
+ "score": 1.0
1036
+ },
1037
+ {
1038
+ "counterfactual": [
1039
+ "answerPosition_counterfactual"
1040
+ ],
1041
+ "score": 0.0
1042
+ },
1043
+ {
1044
+ "counterfactual": [
1045
+ "answerPosition_randomLetter_counterfactual"
1046
+ ],
1047
+ "score": 0.0
1048
+ }
1049
+ ]
1050
+ }
1051
+ ]
1052
+ },
1053
+ {
1054
+ "layer": "19",
1055
+ "layer_scores": [
1056
+ {
1057
+ "intervention": [
1058
+ "output_token"
1059
+ ],
1060
+ "counterfactual_scores": [
1061
+ {
1062
+ "counterfactual": [
1063
+ "randomLetter_counterfactual"
1064
+ ],
1065
+ "score": 0.03333333333333333
1066
+ },
1067
+ {
1068
+ "counterfactual": [
1069
+ "answerPosition_counterfactual"
1070
+ ],
1071
+ "score": 0.0
1072
+ },
1073
+ {
1074
+ "counterfactual": [
1075
+ "answerPosition_randomLetter_counterfactual"
1076
+ ],
1077
+ "score": 0.1
1078
+ }
1079
+ ]
1080
+ },
1081
+ {
1082
+ "intervention": [
1083
+ "output_location"
1084
+ ],
1085
+ "counterfactual_scores": [
1086
+ {
1087
+ "counterfactual": [
1088
+ "randomLetter_counterfactual"
1089
+ ],
1090
+ "score": 1.0
1091
+ },
1092
+ {
1093
+ "counterfactual": [
1094
+ "answerPosition_counterfactual"
1095
+ ],
1096
+ "score": 0.0
1097
+ },
1098
+ {
1099
+ "counterfactual": [
1100
+ "answerPosition_randomLetter_counterfactual"
1101
+ ],
1102
+ "score": 0.0
1103
+ }
1104
+ ]
1105
+ }
1106
+ ]
1107
+ },
1108
+ {
1109
+ "layer": "20",
1110
+ "layer_scores": [
1111
+ {
1112
+ "intervention": [
1113
+ "output_token"
1114
+ ],
1115
+ "counterfactual_scores": [
1116
+ {
1117
+ "counterfactual": [
1118
+ "randomLetter_counterfactual"
1119
+ ],
1120
+ "score": 0.03333333333333333
1121
+ },
1122
+ {
1123
+ "counterfactual": [
1124
+ "answerPosition_counterfactual"
1125
+ ],
1126
+ "score": 0.0
1127
+ },
1128
+ {
1129
+ "counterfactual": [
1130
+ "answerPosition_randomLetter_counterfactual"
1131
+ ],
1132
+ "score": 0.1
1133
+ }
1134
+ ]
1135
+ },
1136
+ {
1137
+ "intervention": [
1138
+ "output_location"
1139
+ ],
1140
+ "counterfactual_scores": [
1141
+ {
1142
+ "counterfactual": [
1143
+ "randomLetter_counterfactual"
1144
+ ],
1145
+ "score": 1.0
1146
+ },
1147
+ {
1148
+ "counterfactual": [
1149
+ "answerPosition_counterfactual"
1150
+ ],
1151
+ "score": 0.0
1152
+ },
1153
+ {
1154
+ "counterfactual": [
1155
+ "answerPosition_randomLetter_counterfactual"
1156
+ ],
1157
+ "score": 0.0
1158
+ }
1159
+ ]
1160
+ }
1161
+ ]
1162
+ },
1163
+ {
1164
+ "layer": "21",
1165
+ "layer_scores": [
1166
+ {
1167
+ "intervention": [
1168
+ "output_token"
1169
+ ],
1170
+ "counterfactual_scores": [
1171
+ {
1172
+ "counterfactual": [
1173
+ "randomLetter_counterfactual"
1174
+ ],
1175
+ "score": 0.03333333333333333
1176
+ },
1177
+ {
1178
+ "counterfactual": [
1179
+ "answerPosition_counterfactual"
1180
+ ],
1181
+ "score": 0.0
1182
+ },
1183
+ {
1184
+ "counterfactual": [
1185
+ "answerPosition_randomLetter_counterfactual"
1186
+ ],
1187
+ "score": 0.1
1188
+ }
1189
+ ]
1190
+ },
1191
+ {
1192
+ "intervention": [
1193
+ "output_location"
1194
+ ],
1195
+ "counterfactual_scores": [
1196
+ {
1197
+ "counterfactual": [
1198
+ "randomLetter_counterfactual"
1199
+ ],
1200
+ "score": 1.0
1201
+ },
1202
+ {
1203
+ "counterfactual": [
1204
+ "answerPosition_counterfactual"
1205
+ ],
1206
+ "score": 0.0
1207
+ },
1208
+ {
1209
+ "counterfactual": [
1210
+ "answerPosition_randomLetter_counterfactual"
1211
+ ],
1212
+ "score": 0.0
1213
+ }
1214
+ ]
1215
+ }
1216
+ ]
1217
+ },
1218
+ {
1219
+ "layer": "22",
1220
+ "layer_scores": [
1221
+ {
1222
+ "intervention": [
1223
+ "output_token"
1224
+ ],
1225
+ "counterfactual_scores": [
1226
+ {
1227
+ "counterfactual": [
1228
+ "randomLetter_counterfactual"
1229
+ ],
1230
+ "score": 0.03333333333333333
1231
+ },
1232
+ {
1233
+ "counterfactual": [
1234
+ "answerPosition_counterfactual"
1235
+ ],
1236
+ "score": 0.0
1237
+ },
1238
+ {
1239
+ "counterfactual": [
1240
+ "answerPosition_randomLetter_counterfactual"
1241
+ ],
1242
+ "score": 0.1
1243
+ }
1244
+ ]
1245
+ },
1246
+ {
1247
+ "intervention": [
1248
+ "output_location"
1249
+ ],
1250
+ "counterfactual_scores": [
1251
+ {
1252
+ "counterfactual": [
1253
+ "randomLetter_counterfactual"
1254
+ ],
1255
+ "score": 1.0
1256
+ },
1257
+ {
1258
+ "counterfactual": [
1259
+ "answerPosition_counterfactual"
1260
+ ],
1261
+ "score": 0.0
1262
+ },
1263
+ {
1264
+ "counterfactual": [
1265
+ "answerPosition_randomLetter_counterfactual"
1266
+ ],
1267
+ "score": 0.0
1268
+ }
1269
+ ]
1270
+ }
1271
+ ]
1272
+ },
1273
+ {
1274
+ "layer": "23",
1275
+ "layer_scores": [
1276
+ {
1277
+ "intervention": [
1278
+ "output_token"
1279
+ ],
1280
+ "counterfactual_scores": [
1281
+ {
1282
+ "counterfactual": [
1283
+ "randomLetter_counterfactual"
1284
+ ],
1285
+ "score": 0.03333333333333333
1286
+ },
1287
+ {
1288
+ "counterfactual": [
1289
+ "answerPosition_counterfactual"
1290
+ ],
1291
+ "score": 0.0
1292
+ },
1293
+ {
1294
+ "counterfactual": [
1295
+ "answerPosition_randomLetter_counterfactual"
1296
+ ],
1297
+ "score": 0.1
1298
+ }
1299
+ ]
1300
+ },
1301
+ {
1302
+ "intervention": [
1303
+ "output_location"
1304
+ ],
1305
+ "counterfactual_scores": [
1306
+ {
1307
+ "counterfactual": [
1308
+ "randomLetter_counterfactual"
1309
+ ],
1310
+ "score": 1.0
1311
+ },
1312
+ {
1313
+ "counterfactual": [
1314
+ "answerPosition_counterfactual"
1315
+ ],
1316
+ "score": 0.0
1317
+ },
1318
+ {
1319
+ "counterfactual": [
1320
+ "answerPosition_randomLetter_counterfactual"
1321
+ ],
1322
+ "score": 0.0
1323
+ }
1324
+ ]
1325
+ }
1326
+ ]
1327
+ },
1328
+ {
1329
+ "layer": "24",
1330
+ "layer_scores": [
1331
+ {
1332
+ "intervention": [
1333
+ "output_token"
1334
+ ],
1335
+ "counterfactual_scores": [
1336
+ {
1337
+ "counterfactual": [
1338
+ "randomLetter_counterfactual"
1339
+ ],
1340
+ "score": 0.03333333333333333
1341
+ },
1342
+ {
1343
+ "counterfactual": [
1344
+ "answerPosition_counterfactual"
1345
+ ],
1346
+ "score": 0.0
1347
+ },
1348
+ {
1349
+ "counterfactual": [
1350
+ "answerPosition_randomLetter_counterfactual"
1351
+ ],
1352
+ "score": 0.1
1353
+ }
1354
+ ]
1355
+ },
1356
+ {
1357
+ "intervention": [
1358
+ "output_location"
1359
+ ],
1360
+ "counterfactual_scores": [
1361
+ {
1362
+ "counterfactual": [
1363
+ "randomLetter_counterfactual"
1364
+ ],
1365
+ "score": 1.0
1366
+ },
1367
+ {
1368
+ "counterfactual": [
1369
+ "answerPosition_counterfactual"
1370
+ ],
1371
+ "score": 0.0
1372
+ },
1373
+ {
1374
+ "counterfactual": [
1375
+ "answerPosition_randomLetter_counterfactual"
1376
+ ],
1377
+ "score": 0.0
1378
+ }
1379
+ ]
1380
+ }
1381
+ ]
1382
+ },
1383
+ {
1384
+ "layer": "25",
1385
+ "layer_scores": [
1386
+ {
1387
+ "intervention": [
1388
+ "output_token"
1389
+ ],
1390
+ "counterfactual_scores": [
1391
+ {
1392
+ "counterfactual": [
1393
+ "randomLetter_counterfactual"
1394
+ ],
1395
+ "score": 0.03333333333333333
1396
+ },
1397
+ {
1398
+ "counterfactual": [
1399
+ "answerPosition_counterfactual"
1400
+ ],
1401
+ "score": 0.0
1402
+ },
1403
+ {
1404
+ "counterfactual": [
1405
+ "answerPosition_randomLetter_counterfactual"
1406
+ ],
1407
+ "score": 0.1
1408
+ }
1409
+ ]
1410
+ },
1411
+ {
1412
+ "intervention": [
1413
+ "output_location"
1414
+ ],
1415
+ "counterfactual_scores": [
1416
+ {
1417
+ "counterfactual": [
1418
+ "randomLetter_counterfactual"
1419
+ ],
1420
+ "score": 1.0
1421
+ },
1422
+ {
1423
+ "counterfactual": [
1424
+ "answerPosition_counterfactual"
1425
+ ],
1426
+ "score": 0.0
1427
+ },
1428
+ {
1429
+ "counterfactual": [
1430
+ "answerPosition_randomLetter_counterfactual"
1431
+ ],
1432
+ "score": 0.0
1433
+ }
1434
+ ]
1435
+ }
1436
+ ]
1437
+ }
1438
+ ]
1439
+ }
1440
+ }
1441
+ ]
1442
+ }
eval-results-mib-causalgraph/submissions/MCQA_results_meta-llama_correct_choice_period_token.json ADDED
@@ -0,0 +1,1772 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "method_name": "full_vector",
3
+ "results": [
4
+ {
5
+ "model_id": "LlamaForCausalLM",
6
+ "task_scores": {
7
+ "MCQA": [
8
+ {
9
+ "layer": "0",
10
+ "layer_scores": [
11
+ {
12
+ "intervention": [
13
+ "output_token"
14
+ ],
15
+ "counterfactual_scores": [
16
+ {
17
+ "counterfactual": [
18
+ "randomLetter_counterfactual"
19
+ ],
20
+ "score": 0.9782608695652174
21
+ },
22
+ {
23
+ "counterfactual": [
24
+ "answerPosition_counterfactual"
25
+ ],
26
+ "score": 0.9782608695652174
27
+ },
28
+ {
29
+ "counterfactual": [
30
+ "answerPosition_randomLetter_counterfactual"
31
+ ],
32
+ "score": 0.9347826086956522
33
+ }
34
+ ]
35
+ },
36
+ {
37
+ "intervention": [
38
+ "output_location"
39
+ ],
40
+ "counterfactual_scores": [
41
+ {
42
+ "counterfactual": [
43
+ "randomLetter_counterfactual"
44
+ ],
45
+ "score": 0.043478260869565216
46
+ },
47
+ {
48
+ "counterfactual": [
49
+ "answerPosition_counterfactual"
50
+ ],
51
+ "score": 0.9782608695652174
52
+ },
53
+ {
54
+ "counterfactual": [
55
+ "answerPosition_randomLetter_counterfactual"
56
+ ],
57
+ "score": 0.10869565217391304
58
+ }
59
+ ]
60
+ }
61
+ ]
62
+ },
63
+ {
64
+ "layer": "1",
65
+ "layer_scores": [
66
+ {
67
+ "intervention": [
68
+ "output_token"
69
+ ],
70
+ "counterfactual_scores": [
71
+ {
72
+ "counterfactual": [
73
+ "randomLetter_counterfactual"
74
+ ],
75
+ "score": 0.6521739130434783
76
+ },
77
+ {
78
+ "counterfactual": [
79
+ "answerPosition_counterfactual"
80
+ ],
81
+ "score": 0.8478260869565217
82
+ },
83
+ {
84
+ "counterfactual": [
85
+ "answerPosition_randomLetter_counterfactual"
86
+ ],
87
+ "score": 0.6739130434782609
88
+ }
89
+ ]
90
+ },
91
+ {
92
+ "intervention": [
93
+ "output_location"
94
+ ],
95
+ "counterfactual_scores": [
96
+ {
97
+ "counterfactual": [
98
+ "randomLetter_counterfactual"
99
+ ],
100
+ "score": 0.3695652173913043
101
+ },
102
+ {
103
+ "counterfactual": [
104
+ "answerPosition_counterfactual"
105
+ ],
106
+ "score": 0.8478260869565217
107
+ },
108
+ {
109
+ "counterfactual": [
110
+ "answerPosition_randomLetter_counterfactual"
111
+ ],
112
+ "score": 0.10869565217391304
113
+ }
114
+ ]
115
+ }
116
+ ]
117
+ },
118
+ {
119
+ "layer": "2",
120
+ "layer_scores": [
121
+ {
122
+ "intervention": [
123
+ "output_token"
124
+ ],
125
+ "counterfactual_scores": [
126
+ {
127
+ "counterfactual": [
128
+ "randomLetter_counterfactual"
129
+ ],
130
+ "score": 0.6956521739130435
131
+ },
132
+ {
133
+ "counterfactual": [
134
+ "answerPosition_counterfactual"
135
+ ],
136
+ "score": 0.9565217391304348
137
+ },
138
+ {
139
+ "counterfactual": [
140
+ "answerPosition_randomLetter_counterfactual"
141
+ ],
142
+ "score": 0.6521739130434783
143
+ }
144
+ ]
145
+ },
146
+ {
147
+ "intervention": [
148
+ "output_location"
149
+ ],
150
+ "counterfactual_scores": [
151
+ {
152
+ "counterfactual": [
153
+ "randomLetter_counterfactual"
154
+ ],
155
+ "score": 0.32608695652173914
156
+ },
157
+ {
158
+ "counterfactual": [
159
+ "answerPosition_counterfactual"
160
+ ],
161
+ "score": 0.9565217391304348
162
+ },
163
+ {
164
+ "counterfactual": [
165
+ "answerPosition_randomLetter_counterfactual"
166
+ ],
167
+ "score": 0.10869565217391304
168
+ }
169
+ ]
170
+ }
171
+ ]
172
+ },
173
+ {
174
+ "layer": "3",
175
+ "layer_scores": [
176
+ {
177
+ "intervention": [
178
+ "output_token"
179
+ ],
180
+ "counterfactual_scores": [
181
+ {
182
+ "counterfactual": [
183
+ "randomLetter_counterfactual"
184
+ ],
185
+ "score": 0.717391304347826
186
+ },
187
+ {
188
+ "counterfactual": [
189
+ "answerPosition_counterfactual"
190
+ ],
191
+ "score": 0.9565217391304348
192
+ },
193
+ {
194
+ "counterfactual": [
195
+ "answerPosition_randomLetter_counterfactual"
196
+ ],
197
+ "score": 0.7608695652173914
198
+ }
199
+ ]
200
+ },
201
+ {
202
+ "intervention": [
203
+ "output_location"
204
+ ],
205
+ "counterfactual_scores": [
206
+ {
207
+ "counterfactual": [
208
+ "randomLetter_counterfactual"
209
+ ],
210
+ "score": 0.30434782608695654
211
+ },
212
+ {
213
+ "counterfactual": [
214
+ "answerPosition_counterfactual"
215
+ ],
216
+ "score": 0.9565217391304348
217
+ },
218
+ {
219
+ "counterfactual": [
220
+ "answerPosition_randomLetter_counterfactual"
221
+ ],
222
+ "score": 0.10869565217391304
223
+ }
224
+ ]
225
+ }
226
+ ]
227
+ },
228
+ {
229
+ "layer": "4",
230
+ "layer_scores": [
231
+ {
232
+ "intervention": [
233
+ "output_token"
234
+ ],
235
+ "counterfactual_scores": [
236
+ {
237
+ "counterfactual": [
238
+ "randomLetter_counterfactual"
239
+ ],
240
+ "score": 0.717391304347826
241
+ },
242
+ {
243
+ "counterfactual": [
244
+ "answerPosition_counterfactual"
245
+ ],
246
+ "score": 0.9565217391304348
247
+ },
248
+ {
249
+ "counterfactual": [
250
+ "answerPosition_randomLetter_counterfactual"
251
+ ],
252
+ "score": 0.7608695652173914
253
+ }
254
+ ]
255
+ },
256
+ {
257
+ "intervention": [
258
+ "output_location"
259
+ ],
260
+ "counterfactual_scores": [
261
+ {
262
+ "counterfactual": [
263
+ "randomLetter_counterfactual"
264
+ ],
265
+ "score": 0.30434782608695654
266
+ },
267
+ {
268
+ "counterfactual": [
269
+ "answerPosition_counterfactual"
270
+ ],
271
+ "score": 0.9565217391304348
272
+ },
273
+ {
274
+ "counterfactual": [
275
+ "answerPosition_randomLetter_counterfactual"
276
+ ],
277
+ "score": 0.10869565217391304
278
+ }
279
+ ]
280
+ }
281
+ ]
282
+ },
283
+ {
284
+ "layer": "5",
285
+ "layer_scores": [
286
+ {
287
+ "intervention": [
288
+ "output_token"
289
+ ],
290
+ "counterfactual_scores": [
291
+ {
292
+ "counterfactual": [
293
+ "randomLetter_counterfactual"
294
+ ],
295
+ "score": 0.717391304347826
296
+ },
297
+ {
298
+ "counterfactual": [
299
+ "answerPosition_counterfactual"
300
+ ],
301
+ "score": 0.9565217391304348
302
+ },
303
+ {
304
+ "counterfactual": [
305
+ "answerPosition_randomLetter_counterfactual"
306
+ ],
307
+ "score": 0.782608695652174
308
+ }
309
+ ]
310
+ },
311
+ {
312
+ "intervention": [
313
+ "output_location"
314
+ ],
315
+ "counterfactual_scores": [
316
+ {
317
+ "counterfactual": [
318
+ "randomLetter_counterfactual"
319
+ ],
320
+ "score": 0.30434782608695654
321
+ },
322
+ {
323
+ "counterfactual": [
324
+ "answerPosition_counterfactual"
325
+ ],
326
+ "score": 0.9565217391304348
327
+ },
328
+ {
329
+ "counterfactual": [
330
+ "answerPosition_randomLetter_counterfactual"
331
+ ],
332
+ "score": 0.10869565217391304
333
+ }
334
+ ]
335
+ }
336
+ ]
337
+ },
338
+ {
339
+ "layer": "6",
340
+ "layer_scores": [
341
+ {
342
+ "intervention": [
343
+ "output_token"
344
+ ],
345
+ "counterfactual_scores": [
346
+ {
347
+ "counterfactual": [
348
+ "randomLetter_counterfactual"
349
+ ],
350
+ "score": 0.717391304347826
351
+ },
352
+ {
353
+ "counterfactual": [
354
+ "answerPosition_counterfactual"
355
+ ],
356
+ "score": 0.9565217391304348
357
+ },
358
+ {
359
+ "counterfactual": [
360
+ "answerPosition_randomLetter_counterfactual"
361
+ ],
362
+ "score": 0.8260869565217391
363
+ }
364
+ ]
365
+ },
366
+ {
367
+ "intervention": [
368
+ "output_location"
369
+ ],
370
+ "counterfactual_scores": [
371
+ {
372
+ "counterfactual": [
373
+ "randomLetter_counterfactual"
374
+ ],
375
+ "score": 0.30434782608695654
376
+ },
377
+ {
378
+ "counterfactual": [
379
+ "answerPosition_counterfactual"
380
+ ],
381
+ "score": 0.9565217391304348
382
+ },
383
+ {
384
+ "counterfactual": [
385
+ "answerPosition_randomLetter_counterfactual"
386
+ ],
387
+ "score": 0.10869565217391304
388
+ }
389
+ ]
390
+ }
391
+ ]
392
+ },
393
+ {
394
+ "layer": "7",
395
+ "layer_scores": [
396
+ {
397
+ "intervention": [
398
+ "output_token"
399
+ ],
400
+ "counterfactual_scores": [
401
+ {
402
+ "counterfactual": [
403
+ "randomLetter_counterfactual"
404
+ ],
405
+ "score": 0.717391304347826
406
+ },
407
+ {
408
+ "counterfactual": [
409
+ "answerPosition_counterfactual"
410
+ ],
411
+ "score": 0.9782608695652174
412
+ },
413
+ {
414
+ "counterfactual": [
415
+ "answerPosition_randomLetter_counterfactual"
416
+ ],
417
+ "score": 0.8478260869565217
418
+ }
419
+ ]
420
+ },
421
+ {
422
+ "intervention": [
423
+ "output_location"
424
+ ],
425
+ "counterfactual_scores": [
426
+ {
427
+ "counterfactual": [
428
+ "randomLetter_counterfactual"
429
+ ],
430
+ "score": 0.30434782608695654
431
+ },
432
+ {
433
+ "counterfactual": [
434
+ "answerPosition_counterfactual"
435
+ ],
436
+ "score": 0.9782608695652174
437
+ },
438
+ {
439
+ "counterfactual": [
440
+ "answerPosition_randomLetter_counterfactual"
441
+ ],
442
+ "score": 0.13043478260869565
443
+ }
444
+ ]
445
+ }
446
+ ]
447
+ },
448
+ {
449
+ "layer": "8",
450
+ "layer_scores": [
451
+ {
452
+ "intervention": [
453
+ "output_token"
454
+ ],
455
+ "counterfactual_scores": [
456
+ {
457
+ "counterfactual": [
458
+ "randomLetter_counterfactual"
459
+ ],
460
+ "score": 0.717391304347826
461
+ },
462
+ {
463
+ "counterfactual": [
464
+ "answerPosition_counterfactual"
465
+ ],
466
+ "score": 0.9782608695652174
467
+ },
468
+ {
469
+ "counterfactual": [
470
+ "answerPosition_randomLetter_counterfactual"
471
+ ],
472
+ "score": 0.8260869565217391
473
+ }
474
+ ]
475
+ },
476
+ {
477
+ "intervention": [
478
+ "output_location"
479
+ ],
480
+ "counterfactual_scores": [
481
+ {
482
+ "counterfactual": [
483
+ "randomLetter_counterfactual"
484
+ ],
485
+ "score": 0.30434782608695654
486
+ },
487
+ {
488
+ "counterfactual": [
489
+ "answerPosition_counterfactual"
490
+ ],
491
+ "score": 0.9782608695652174
492
+ },
493
+ {
494
+ "counterfactual": [
495
+ "answerPosition_randomLetter_counterfactual"
496
+ ],
497
+ "score": 0.13043478260869565
498
+ }
499
+ ]
500
+ }
501
+ ]
502
+ },
503
+ {
504
+ "layer": "9",
505
+ "layer_scores": [
506
+ {
507
+ "intervention": [
508
+ "output_token"
509
+ ],
510
+ "counterfactual_scores": [
511
+ {
512
+ "counterfactual": [
513
+ "randomLetter_counterfactual"
514
+ ],
515
+ "score": 0.717391304347826
516
+ },
517
+ {
518
+ "counterfactual": [
519
+ "answerPosition_counterfactual"
520
+ ],
521
+ "score": 0.9782608695652174
522
+ },
523
+ {
524
+ "counterfactual": [
525
+ "answerPosition_randomLetter_counterfactual"
526
+ ],
527
+ "score": 0.782608695652174
528
+ }
529
+ ]
530
+ },
531
+ {
532
+ "intervention": [
533
+ "output_location"
534
+ ],
535
+ "counterfactual_scores": [
536
+ {
537
+ "counterfactual": [
538
+ "randomLetter_counterfactual"
539
+ ],
540
+ "score": 0.30434782608695654
541
+ },
542
+ {
543
+ "counterfactual": [
544
+ "answerPosition_counterfactual"
545
+ ],
546
+ "score": 0.9782608695652174
547
+ },
548
+ {
549
+ "counterfactual": [
550
+ "answerPosition_randomLetter_counterfactual"
551
+ ],
552
+ "score": 0.15217391304347827
553
+ }
554
+ ]
555
+ }
556
+ ]
557
+ },
558
+ {
559
+ "layer": "10",
560
+ "layer_scores": [
561
+ {
562
+ "intervention": [
563
+ "output_token"
564
+ ],
565
+ "counterfactual_scores": [
566
+ {
567
+ "counterfactual": [
568
+ "randomLetter_counterfactual"
569
+ ],
570
+ "score": 0.717391304347826
571
+ },
572
+ {
573
+ "counterfactual": [
574
+ "answerPosition_counterfactual"
575
+ ],
576
+ "score": 0.9782608695652174
577
+ },
578
+ {
579
+ "counterfactual": [
580
+ "answerPosition_randomLetter_counterfactual"
581
+ ],
582
+ "score": 0.782608695652174
583
+ }
584
+ ]
585
+ },
586
+ {
587
+ "intervention": [
588
+ "output_location"
589
+ ],
590
+ "counterfactual_scores": [
591
+ {
592
+ "counterfactual": [
593
+ "randomLetter_counterfactual"
594
+ ],
595
+ "score": 0.30434782608695654
596
+ },
597
+ {
598
+ "counterfactual": [
599
+ "answerPosition_counterfactual"
600
+ ],
601
+ "score": 0.9782608695652174
602
+ },
603
+ {
604
+ "counterfactual": [
605
+ "answerPosition_randomLetter_counterfactual"
606
+ ],
607
+ "score": 0.15217391304347827
608
+ }
609
+ ]
610
+ }
611
+ ]
612
+ },
613
+ {
614
+ "layer": "11",
615
+ "layer_scores": [
616
+ {
617
+ "intervention": [
618
+ "output_token"
619
+ ],
620
+ "counterfactual_scores": [
621
+ {
622
+ "counterfactual": [
623
+ "randomLetter_counterfactual"
624
+ ],
625
+ "score": 0.717391304347826
626
+ },
627
+ {
628
+ "counterfactual": [
629
+ "answerPosition_counterfactual"
630
+ ],
631
+ "score": 0.9782608695652174
632
+ },
633
+ {
634
+ "counterfactual": [
635
+ "answerPosition_randomLetter_counterfactual"
636
+ ],
637
+ "score": 0.782608695652174
638
+ }
639
+ ]
640
+ },
641
+ {
642
+ "intervention": [
643
+ "output_location"
644
+ ],
645
+ "counterfactual_scores": [
646
+ {
647
+ "counterfactual": [
648
+ "randomLetter_counterfactual"
649
+ ],
650
+ "score": 0.30434782608695654
651
+ },
652
+ {
653
+ "counterfactual": [
654
+ "answerPosition_counterfactual"
655
+ ],
656
+ "score": 0.9782608695652174
657
+ },
658
+ {
659
+ "counterfactual": [
660
+ "answerPosition_randomLetter_counterfactual"
661
+ ],
662
+ "score": 0.13043478260869565
663
+ }
664
+ ]
665
+ }
666
+ ]
667
+ },
668
+ {
669
+ "layer": "12",
670
+ "layer_scores": [
671
+ {
672
+ "intervention": [
673
+ "output_token"
674
+ ],
675
+ "counterfactual_scores": [
676
+ {
677
+ "counterfactual": [
678
+ "randomLetter_counterfactual"
679
+ ],
680
+ "score": 0.717391304347826
681
+ },
682
+ {
683
+ "counterfactual": [
684
+ "answerPosition_counterfactual"
685
+ ],
686
+ "score": 0.9782608695652174
687
+ },
688
+ {
689
+ "counterfactual": [
690
+ "answerPosition_randomLetter_counterfactual"
691
+ ],
692
+ "score": 0.6739130434782609
693
+ }
694
+ ]
695
+ },
696
+ {
697
+ "intervention": [
698
+ "output_location"
699
+ ],
700
+ "counterfactual_scores": [
701
+ {
702
+ "counterfactual": [
703
+ "randomLetter_counterfactual"
704
+ ],
705
+ "score": 0.30434782608695654
706
+ },
707
+ {
708
+ "counterfactual": [
709
+ "answerPosition_counterfactual"
710
+ ],
711
+ "score": 0.9782608695652174
712
+ },
713
+ {
714
+ "counterfactual": [
715
+ "answerPosition_randomLetter_counterfactual"
716
+ ],
717
+ "score": 0.13043478260869565
718
+ }
719
+ ]
720
+ }
721
+ ]
722
+ },
723
+ {
724
+ "layer": "13",
725
+ "layer_scores": [
726
+ {
727
+ "intervention": [
728
+ "output_token"
729
+ ],
730
+ "counterfactual_scores": [
731
+ {
732
+ "counterfactual": [
733
+ "randomLetter_counterfactual"
734
+ ],
735
+ "score": 0.717391304347826
736
+ },
737
+ {
738
+ "counterfactual": [
739
+ "answerPosition_counterfactual"
740
+ ],
741
+ "score": 1.0
742
+ },
743
+ {
744
+ "counterfactual": [
745
+ "answerPosition_randomLetter_counterfactual"
746
+ ],
747
+ "score": 0.6739130434782609
748
+ }
749
+ ]
750
+ },
751
+ {
752
+ "intervention": [
753
+ "output_location"
754
+ ],
755
+ "counterfactual_scores": [
756
+ {
757
+ "counterfactual": [
758
+ "randomLetter_counterfactual"
759
+ ],
760
+ "score": 0.30434782608695654
761
+ },
762
+ {
763
+ "counterfactual": [
764
+ "answerPosition_counterfactual"
765
+ ],
766
+ "score": 1.0
767
+ },
768
+ {
769
+ "counterfactual": [
770
+ "answerPosition_randomLetter_counterfactual"
771
+ ],
772
+ "score": 0.15217391304347827
773
+ }
774
+ ]
775
+ }
776
+ ]
777
+ },
778
+ {
779
+ "layer": "14",
780
+ "layer_scores": [
781
+ {
782
+ "intervention": [
783
+ "output_token"
784
+ ],
785
+ "counterfactual_scores": [
786
+ {
787
+ "counterfactual": [
788
+ "randomLetter_counterfactual"
789
+ ],
790
+ "score": 0.6956521739130435
791
+ },
792
+ {
793
+ "counterfactual": [
794
+ "answerPosition_counterfactual"
795
+ ],
796
+ "score": 0.9782608695652174
797
+ },
798
+ {
799
+ "counterfactual": [
800
+ "answerPosition_randomLetter_counterfactual"
801
+ ],
802
+ "score": 0.5652173913043478
803
+ }
804
+ ]
805
+ },
806
+ {
807
+ "intervention": [
808
+ "output_location"
809
+ ],
810
+ "counterfactual_scores": [
811
+ {
812
+ "counterfactual": [
813
+ "randomLetter_counterfactual"
814
+ ],
815
+ "score": 0.30434782608695654
816
+ },
817
+ {
818
+ "counterfactual": [
819
+ "answerPosition_counterfactual"
820
+ ],
821
+ "score": 0.9782608695652174
822
+ },
823
+ {
824
+ "counterfactual": [
825
+ "answerPosition_randomLetter_counterfactual"
826
+ ],
827
+ "score": 0.13043478260869565
828
+ }
829
+ ]
830
+ }
831
+ ]
832
+ },
833
+ {
834
+ "layer": "15",
835
+ "layer_scores": [
836
+ {
837
+ "intervention": [
838
+ "output_token"
839
+ ],
840
+ "counterfactual_scores": [
841
+ {
842
+ "counterfactual": [
843
+ "randomLetter_counterfactual"
844
+ ],
845
+ "score": 0.717391304347826
846
+ },
847
+ {
848
+ "counterfactual": [
849
+ "answerPosition_counterfactual"
850
+ ],
851
+ "score": 0.9782608695652174
852
+ },
853
+ {
854
+ "counterfactual": [
855
+ "answerPosition_randomLetter_counterfactual"
856
+ ],
857
+ "score": 0.5869565217391305
858
+ }
859
+ ]
860
+ },
861
+ {
862
+ "intervention": [
863
+ "output_location"
864
+ ],
865
+ "counterfactual_scores": [
866
+ {
867
+ "counterfactual": [
868
+ "randomLetter_counterfactual"
869
+ ],
870
+ "score": 0.30434782608695654
871
+ },
872
+ {
873
+ "counterfactual": [
874
+ "answerPosition_counterfactual"
875
+ ],
876
+ "score": 0.9782608695652174
877
+ },
878
+ {
879
+ "counterfactual": [
880
+ "answerPosition_randomLetter_counterfactual"
881
+ ],
882
+ "score": 0.10869565217391304
883
+ }
884
+ ]
885
+ }
886
+ ]
887
+ },
888
+ {
889
+ "layer": "16",
890
+ "layer_scores": [
891
+ {
892
+ "intervention": [
893
+ "output_token"
894
+ ],
895
+ "counterfactual_scores": [
896
+ {
897
+ "counterfactual": [
898
+ "randomLetter_counterfactual"
899
+ ],
900
+ "score": 0.043478260869565216
901
+ },
902
+ {
903
+ "counterfactual": [
904
+ "answerPosition_counterfactual"
905
+ ],
906
+ "score": 0.021739130434782608
907
+ },
908
+ {
909
+ "counterfactual": [
910
+ "answerPosition_randomLetter_counterfactual"
911
+ ],
912
+ "score": 0.08695652173913043
913
+ }
914
+ ]
915
+ },
916
+ {
917
+ "intervention": [
918
+ "output_location"
919
+ ],
920
+ "counterfactual_scores": [
921
+ {
922
+ "counterfactual": [
923
+ "randomLetter_counterfactual"
924
+ ],
925
+ "score": 0.9782608695652174
926
+ },
927
+ {
928
+ "counterfactual": [
929
+ "answerPosition_counterfactual"
930
+ ],
931
+ "score": 0.021739130434782608
932
+ },
933
+ {
934
+ "counterfactual": [
935
+ "answerPosition_randomLetter_counterfactual"
936
+ ],
937
+ "score": 0.021739130434782608
938
+ }
939
+ ]
940
+ }
941
+ ]
942
+ },
943
+ {
944
+ "layer": "17",
945
+ "layer_scores": [
946
+ {
947
+ "intervention": [
948
+ "output_token"
949
+ ],
950
+ "counterfactual_scores": [
951
+ {
952
+ "counterfactual": [
953
+ "randomLetter_counterfactual"
954
+ ],
955
+ "score": 0.043478260869565216
956
+ },
957
+ {
958
+ "counterfactual": [
959
+ "answerPosition_counterfactual"
960
+ ],
961
+ "score": 0.021739130434782608
962
+ },
963
+ {
964
+ "counterfactual": [
965
+ "answerPosition_randomLetter_counterfactual"
966
+ ],
967
+ "score": 0.08695652173913043
968
+ }
969
+ ]
970
+ },
971
+ {
972
+ "intervention": [
973
+ "output_location"
974
+ ],
975
+ "counterfactual_scores": [
976
+ {
977
+ "counterfactual": [
978
+ "randomLetter_counterfactual"
979
+ ],
980
+ "score": 0.9782608695652174
981
+ },
982
+ {
983
+ "counterfactual": [
984
+ "answerPosition_counterfactual"
985
+ ],
986
+ "score": 0.021739130434782608
987
+ },
988
+ {
989
+ "counterfactual": [
990
+ "answerPosition_randomLetter_counterfactual"
991
+ ],
992
+ "score": 0.021739130434782608
993
+ }
994
+ ]
995
+ }
996
+ ]
997
+ },
998
+ {
999
+ "layer": "18",
1000
+ "layer_scores": [
1001
+ {
1002
+ "intervention": [
1003
+ "output_token"
1004
+ ],
1005
+ "counterfactual_scores": [
1006
+ {
1007
+ "counterfactual": [
1008
+ "randomLetter_counterfactual"
1009
+ ],
1010
+ "score": 0.043478260869565216
1011
+ },
1012
+ {
1013
+ "counterfactual": [
1014
+ "answerPosition_counterfactual"
1015
+ ],
1016
+ "score": 0.0
1017
+ },
1018
+ {
1019
+ "counterfactual": [
1020
+ "answerPosition_randomLetter_counterfactual"
1021
+ ],
1022
+ "score": 0.06521739130434782
1023
+ }
1024
+ ]
1025
+ },
1026
+ {
1027
+ "intervention": [
1028
+ "output_location"
1029
+ ],
1030
+ "counterfactual_scores": [
1031
+ {
1032
+ "counterfactual": [
1033
+ "randomLetter_counterfactual"
1034
+ ],
1035
+ "score": 0.9782608695652174
1036
+ },
1037
+ {
1038
+ "counterfactual": [
1039
+ "answerPosition_counterfactual"
1040
+ ],
1041
+ "score": 0.0
1042
+ },
1043
+ {
1044
+ "counterfactual": [
1045
+ "answerPosition_randomLetter_counterfactual"
1046
+ ],
1047
+ "score": 0.0
1048
+ }
1049
+ ]
1050
+ }
1051
+ ]
1052
+ },
1053
+ {
1054
+ "layer": "19",
1055
+ "layer_scores": [
1056
+ {
1057
+ "intervention": [
1058
+ "output_token"
1059
+ ],
1060
+ "counterfactual_scores": [
1061
+ {
1062
+ "counterfactual": [
1063
+ "randomLetter_counterfactual"
1064
+ ],
1065
+ "score": 0.043478260869565216
1066
+ },
1067
+ {
1068
+ "counterfactual": [
1069
+ "answerPosition_counterfactual"
1070
+ ],
1071
+ "score": 0.0
1072
+ },
1073
+ {
1074
+ "counterfactual": [
1075
+ "answerPosition_randomLetter_counterfactual"
1076
+ ],
1077
+ "score": 0.06521739130434782
1078
+ }
1079
+ ]
1080
+ },
1081
+ {
1082
+ "intervention": [
1083
+ "output_location"
1084
+ ],
1085
+ "counterfactual_scores": [
1086
+ {
1087
+ "counterfactual": [
1088
+ "randomLetter_counterfactual"
1089
+ ],
1090
+ "score": 0.9782608695652174
1091
+ },
1092
+ {
1093
+ "counterfactual": [
1094
+ "answerPosition_counterfactual"
1095
+ ],
1096
+ "score": 0.0
1097
+ },
1098
+ {
1099
+ "counterfactual": [
1100
+ "answerPosition_randomLetter_counterfactual"
1101
+ ],
1102
+ "score": 0.0
1103
+ }
1104
+ ]
1105
+ }
1106
+ ]
1107
+ },
1108
+ {
1109
+ "layer": "20",
1110
+ "layer_scores": [
1111
+ {
1112
+ "intervention": [
1113
+ "output_token"
1114
+ ],
1115
+ "counterfactual_scores": [
1116
+ {
1117
+ "counterfactual": [
1118
+ "randomLetter_counterfactual"
1119
+ ],
1120
+ "score": 0.043478260869565216
1121
+ },
1122
+ {
1123
+ "counterfactual": [
1124
+ "answerPosition_counterfactual"
1125
+ ],
1126
+ "score": 0.0
1127
+ },
1128
+ {
1129
+ "counterfactual": [
1130
+ "answerPosition_randomLetter_counterfactual"
1131
+ ],
1132
+ "score": 0.06521739130434782
1133
+ }
1134
+ ]
1135
+ },
1136
+ {
1137
+ "intervention": [
1138
+ "output_location"
1139
+ ],
1140
+ "counterfactual_scores": [
1141
+ {
1142
+ "counterfactual": [
1143
+ "randomLetter_counterfactual"
1144
+ ],
1145
+ "score": 0.9782608695652174
1146
+ },
1147
+ {
1148
+ "counterfactual": [
1149
+ "answerPosition_counterfactual"
1150
+ ],
1151
+ "score": 0.0
1152
+ },
1153
+ {
1154
+ "counterfactual": [
1155
+ "answerPosition_randomLetter_counterfactual"
1156
+ ],
1157
+ "score": 0.0
1158
+ }
1159
+ ]
1160
+ }
1161
+ ]
1162
+ },
1163
+ {
1164
+ "layer": "21",
1165
+ "layer_scores": [
1166
+ {
1167
+ "intervention": [
1168
+ "output_token"
1169
+ ],
1170
+ "counterfactual_scores": [
1171
+ {
1172
+ "counterfactual": [
1173
+ "randomLetter_counterfactual"
1174
+ ],
1175
+ "score": 0.043478260869565216
1176
+ },
1177
+ {
1178
+ "counterfactual": [
1179
+ "answerPosition_counterfactual"
1180
+ ],
1181
+ "score": 0.0
1182
+ },
1183
+ {
1184
+ "counterfactual": [
1185
+ "answerPosition_randomLetter_counterfactual"
1186
+ ],
1187
+ "score": 0.06521739130434782
1188
+ }
1189
+ ]
1190
+ },
1191
+ {
1192
+ "intervention": [
1193
+ "output_location"
1194
+ ],
1195
+ "counterfactual_scores": [
1196
+ {
1197
+ "counterfactual": [
1198
+ "randomLetter_counterfactual"
1199
+ ],
1200
+ "score": 0.9782608695652174
1201
+ },
1202
+ {
1203
+ "counterfactual": [
1204
+ "answerPosition_counterfactual"
1205
+ ],
1206
+ "score": 0.0
1207
+ },
1208
+ {
1209
+ "counterfactual": [
1210
+ "answerPosition_randomLetter_counterfactual"
1211
+ ],
1212
+ "score": 0.0
1213
+ }
1214
+ ]
1215
+ }
1216
+ ]
1217
+ },
1218
+ {
1219
+ "layer": "22",
1220
+ "layer_scores": [
1221
+ {
1222
+ "intervention": [
1223
+ "output_token"
1224
+ ],
1225
+ "counterfactual_scores": [
1226
+ {
1227
+ "counterfactual": [
1228
+ "randomLetter_counterfactual"
1229
+ ],
1230
+ "score": 0.021739130434782608
1231
+ },
1232
+ {
1233
+ "counterfactual": [
1234
+ "answerPosition_counterfactual"
1235
+ ],
1236
+ "score": 0.0
1237
+ },
1238
+ {
1239
+ "counterfactual": [
1240
+ "answerPosition_randomLetter_counterfactual"
1241
+ ],
1242
+ "score": 0.06521739130434782
1243
+ }
1244
+ ]
1245
+ },
1246
+ {
1247
+ "intervention": [
1248
+ "output_location"
1249
+ ],
1250
+ "counterfactual_scores": [
1251
+ {
1252
+ "counterfactual": [
1253
+ "randomLetter_counterfactual"
1254
+ ],
1255
+ "score": 1.0
1256
+ },
1257
+ {
1258
+ "counterfactual": [
1259
+ "answerPosition_counterfactual"
1260
+ ],
1261
+ "score": 0.0
1262
+ },
1263
+ {
1264
+ "counterfactual": [
1265
+ "answerPosition_randomLetter_counterfactual"
1266
+ ],
1267
+ "score": 0.0
1268
+ }
1269
+ ]
1270
+ }
1271
+ ]
1272
+ },
1273
+ {
1274
+ "layer": "23",
1275
+ "layer_scores": [
1276
+ {
1277
+ "intervention": [
1278
+ "output_token"
1279
+ ],
1280
+ "counterfactual_scores": [
1281
+ {
1282
+ "counterfactual": [
1283
+ "randomLetter_counterfactual"
1284
+ ],
1285
+ "score": 0.021739130434782608
1286
+ },
1287
+ {
1288
+ "counterfactual": [
1289
+ "answerPosition_counterfactual"
1290
+ ],
1291
+ "score": 0.0
1292
+ },
1293
+ {
1294
+ "counterfactual": [
1295
+ "answerPosition_randomLetter_counterfactual"
1296
+ ],
1297
+ "score": 0.06521739130434782
1298
+ }
1299
+ ]
1300
+ },
1301
+ {
1302
+ "intervention": [
1303
+ "output_location"
1304
+ ],
1305
+ "counterfactual_scores": [
1306
+ {
1307
+ "counterfactual": [
1308
+ "randomLetter_counterfactual"
1309
+ ],
1310
+ "score": 1.0
1311
+ },
1312
+ {
1313
+ "counterfactual": [
1314
+ "answerPosition_counterfactual"
1315
+ ],
1316
+ "score": 0.0
1317
+ },
1318
+ {
1319
+ "counterfactual": [
1320
+ "answerPosition_randomLetter_counterfactual"
1321
+ ],
1322
+ "score": 0.0
1323
+ }
1324
+ ]
1325
+ }
1326
+ ]
1327
+ },
1328
+ {
1329
+ "layer": "24",
1330
+ "layer_scores": [
1331
+ {
1332
+ "intervention": [
1333
+ "output_token"
1334
+ ],
1335
+ "counterfactual_scores": [
1336
+ {
1337
+ "counterfactual": [
1338
+ "randomLetter_counterfactual"
1339
+ ],
1340
+ "score": 0.021739130434782608
1341
+ },
1342
+ {
1343
+ "counterfactual": [
1344
+ "answerPosition_counterfactual"
1345
+ ],
1346
+ "score": 0.0
1347
+ },
1348
+ {
1349
+ "counterfactual": [
1350
+ "answerPosition_randomLetter_counterfactual"
1351
+ ],
1352
+ "score": 0.06521739130434782
1353
+ }
1354
+ ]
1355
+ },
1356
+ {
1357
+ "intervention": [
1358
+ "output_location"
1359
+ ],
1360
+ "counterfactual_scores": [
1361
+ {
1362
+ "counterfactual": [
1363
+ "randomLetter_counterfactual"
1364
+ ],
1365
+ "score": 1.0
1366
+ },
1367
+ {
1368
+ "counterfactual": [
1369
+ "answerPosition_counterfactual"
1370
+ ],
1371
+ "score": 0.0
1372
+ },
1373
+ {
1374
+ "counterfactual": [
1375
+ "answerPosition_randomLetter_counterfactual"
1376
+ ],
1377
+ "score": 0.0
1378
+ }
1379
+ ]
1380
+ }
1381
+ ]
1382
+ },
1383
+ {
1384
+ "layer": "25",
1385
+ "layer_scores": [
1386
+ {
1387
+ "intervention": [
1388
+ "output_token"
1389
+ ],
1390
+ "counterfactual_scores": [
1391
+ {
1392
+ "counterfactual": [
1393
+ "randomLetter_counterfactual"
1394
+ ],
1395
+ "score": 0.021739130434782608
1396
+ },
1397
+ {
1398
+ "counterfactual": [
1399
+ "answerPosition_counterfactual"
1400
+ ],
1401
+ "score": 0.0
1402
+ },
1403
+ {
1404
+ "counterfactual": [
1405
+ "answerPosition_randomLetter_counterfactual"
1406
+ ],
1407
+ "score": 0.06521739130434782
1408
+ }
1409
+ ]
1410
+ },
1411
+ {
1412
+ "intervention": [
1413
+ "output_location"
1414
+ ],
1415
+ "counterfactual_scores": [
1416
+ {
1417
+ "counterfactual": [
1418
+ "randomLetter_counterfactual"
1419
+ ],
1420
+ "score": 1.0
1421
+ },
1422
+ {
1423
+ "counterfactual": [
1424
+ "answerPosition_counterfactual"
1425
+ ],
1426
+ "score": 0.0
1427
+ },
1428
+ {
1429
+ "counterfactual": [
1430
+ "answerPosition_randomLetter_counterfactual"
1431
+ ],
1432
+ "score": 0.0
1433
+ }
1434
+ ]
1435
+ }
1436
+ ]
1437
+ },
1438
+ {
1439
+ "layer": "26",
1440
+ "layer_scores": [
1441
+ {
1442
+ "intervention": [
1443
+ "output_token"
1444
+ ],
1445
+ "counterfactual_scores": [
1446
+ {
1447
+ "counterfactual": [
1448
+ "randomLetter_counterfactual"
1449
+ ],
1450
+ "score": 0.021739130434782608
1451
+ },
1452
+ {
1453
+ "counterfactual": [
1454
+ "answerPosition_counterfactual"
1455
+ ],
1456
+ "score": 0.0
1457
+ },
1458
+ {
1459
+ "counterfactual": [
1460
+ "answerPosition_randomLetter_counterfactual"
1461
+ ],
1462
+ "score": 0.06521739130434782
1463
+ }
1464
+ ]
1465
+ },
1466
+ {
1467
+ "intervention": [
1468
+ "output_location"
1469
+ ],
1470
+ "counterfactual_scores": [
1471
+ {
1472
+ "counterfactual": [
1473
+ "randomLetter_counterfactual"
1474
+ ],
1475
+ "score": 1.0
1476
+ },
1477
+ {
1478
+ "counterfactual": [
1479
+ "answerPosition_counterfactual"
1480
+ ],
1481
+ "score": 0.0
1482
+ },
1483
+ {
1484
+ "counterfactual": [
1485
+ "answerPosition_randomLetter_counterfactual"
1486
+ ],
1487
+ "score": 0.0
1488
+ }
1489
+ ]
1490
+ }
1491
+ ]
1492
+ },
1493
+ {
1494
+ "layer": "27",
1495
+ "layer_scores": [
1496
+ {
1497
+ "intervention": [
1498
+ "output_token"
1499
+ ],
1500
+ "counterfactual_scores": [
1501
+ {
1502
+ "counterfactual": [
1503
+ "randomLetter_counterfactual"
1504
+ ],
1505
+ "score": 0.021739130434782608
1506
+ },
1507
+ {
1508
+ "counterfactual": [
1509
+ "answerPosition_counterfactual"
1510
+ ],
1511
+ "score": 0.0
1512
+ },
1513
+ {
1514
+ "counterfactual": [
1515
+ "answerPosition_randomLetter_counterfactual"
1516
+ ],
1517
+ "score": 0.06521739130434782
1518
+ }
1519
+ ]
1520
+ },
1521
+ {
1522
+ "intervention": [
1523
+ "output_location"
1524
+ ],
1525
+ "counterfactual_scores": [
1526
+ {
1527
+ "counterfactual": [
1528
+ "randomLetter_counterfactual"
1529
+ ],
1530
+ "score": 1.0
1531
+ },
1532
+ {
1533
+ "counterfactual": [
1534
+ "answerPosition_counterfactual"
1535
+ ],
1536
+ "score": 0.0
1537
+ },
1538
+ {
1539
+ "counterfactual": [
1540
+ "answerPosition_randomLetter_counterfactual"
1541
+ ],
1542
+ "score": 0.0
1543
+ }
1544
+ ]
1545
+ }
1546
+ ]
1547
+ },
1548
+ {
1549
+ "layer": "28",
1550
+ "layer_scores": [
1551
+ {
1552
+ "intervention": [
1553
+ "output_token"
1554
+ ],
1555
+ "counterfactual_scores": [
1556
+ {
1557
+ "counterfactual": [
1558
+ "randomLetter_counterfactual"
1559
+ ],
1560
+ "score": 0.021739130434782608
1561
+ },
1562
+ {
1563
+ "counterfactual": [
1564
+ "answerPosition_counterfactual"
1565
+ ],
1566
+ "score": 0.0
1567
+ },
1568
+ {
1569
+ "counterfactual": [
1570
+ "answerPosition_randomLetter_counterfactual"
1571
+ ],
1572
+ "score": 0.06521739130434782
1573
+ }
1574
+ ]
1575
+ },
1576
+ {
1577
+ "intervention": [
1578
+ "output_location"
1579
+ ],
1580
+ "counterfactual_scores": [
1581
+ {
1582
+ "counterfactual": [
1583
+ "randomLetter_counterfactual"
1584
+ ],
1585
+ "score": 1.0
1586
+ },
1587
+ {
1588
+ "counterfactual": [
1589
+ "answerPosition_counterfactual"
1590
+ ],
1591
+ "score": 0.0
1592
+ },
1593
+ {
1594
+ "counterfactual": [
1595
+ "answerPosition_randomLetter_counterfactual"
1596
+ ],
1597
+ "score": 0.0
1598
+ }
1599
+ ]
1600
+ }
1601
+ ]
1602
+ },
1603
+ {
1604
+ "layer": "29",
1605
+ "layer_scores": [
1606
+ {
1607
+ "intervention": [
1608
+ "output_token"
1609
+ ],
1610
+ "counterfactual_scores": [
1611
+ {
1612
+ "counterfactual": [
1613
+ "randomLetter_counterfactual"
1614
+ ],
1615
+ "score": 0.021739130434782608
1616
+ },
1617
+ {
1618
+ "counterfactual": [
1619
+ "answerPosition_counterfactual"
1620
+ ],
1621
+ "score": 0.0
1622
+ },
1623
+ {
1624
+ "counterfactual": [
1625
+ "answerPosition_randomLetter_counterfactual"
1626
+ ],
1627
+ "score": 0.06521739130434782
1628
+ }
1629
+ ]
1630
+ },
1631
+ {
1632
+ "intervention": [
1633
+ "output_location"
1634
+ ],
1635
+ "counterfactual_scores": [
1636
+ {
1637
+ "counterfactual": [
1638
+ "randomLetter_counterfactual"
1639
+ ],
1640
+ "score": 1.0
1641
+ },
1642
+ {
1643
+ "counterfactual": [
1644
+ "answerPosition_counterfactual"
1645
+ ],
1646
+ "score": 0.0
1647
+ },
1648
+ {
1649
+ "counterfactual": [
1650
+ "answerPosition_randomLetter_counterfactual"
1651
+ ],
1652
+ "score": 0.0
1653
+ }
1654
+ ]
1655
+ }
1656
+ ]
1657
+ },
1658
+ {
1659
+ "layer": "30",
1660
+ "layer_scores": [
1661
+ {
1662
+ "intervention": [
1663
+ "output_token"
1664
+ ],
1665
+ "counterfactual_scores": [
1666
+ {
1667
+ "counterfactual": [
1668
+ "randomLetter_counterfactual"
1669
+ ],
1670
+ "score": 0.021739130434782608
1671
+ },
1672
+ {
1673
+ "counterfactual": [
1674
+ "answerPosition_counterfactual"
1675
+ ],
1676
+ "score": 0.0
1677
+ },
1678
+ {
1679
+ "counterfactual": [
1680
+ "answerPosition_randomLetter_counterfactual"
1681
+ ],
1682
+ "score": 0.06521739130434782
1683
+ }
1684
+ ]
1685
+ },
1686
+ {
1687
+ "intervention": [
1688
+ "output_location"
1689
+ ],
1690
+ "counterfactual_scores": [
1691
+ {
1692
+ "counterfactual": [
1693
+ "randomLetter_counterfactual"
1694
+ ],
1695
+ "score": 1.0
1696
+ },
1697
+ {
1698
+ "counterfactual": [
1699
+ "answerPosition_counterfactual"
1700
+ ],
1701
+ "score": 0.0
1702
+ },
1703
+ {
1704
+ "counterfactual": [
1705
+ "answerPosition_randomLetter_counterfactual"
1706
+ ],
1707
+ "score": 0.0
1708
+ }
1709
+ ]
1710
+ }
1711
+ ]
1712
+ },
1713
+ {
1714
+ "layer": "31",
1715
+ "layer_scores": [
1716
+ {
1717
+ "intervention": [
1718
+ "output_token"
1719
+ ],
1720
+ "counterfactual_scores": [
1721
+ {
1722
+ "counterfactual": [
1723
+ "randomLetter_counterfactual"
1724
+ ],
1725
+ "score": 0.021739130434782608
1726
+ },
1727
+ {
1728
+ "counterfactual": [
1729
+ "answerPosition_counterfactual"
1730
+ ],
1731
+ "score": 0.0
1732
+ },
1733
+ {
1734
+ "counterfactual": [
1735
+ "answerPosition_randomLetter_counterfactual"
1736
+ ],
1737
+ "score": 0.06521739130434782
1738
+ }
1739
+ ]
1740
+ },
1741
+ {
1742
+ "intervention": [
1743
+ "output_location"
1744
+ ],
1745
+ "counterfactual_scores": [
1746
+ {
1747
+ "counterfactual": [
1748
+ "randomLetter_counterfactual"
1749
+ ],
1750
+ "score": 1.0
1751
+ },
1752
+ {
1753
+ "counterfactual": [
1754
+ "answerPosition_counterfactual"
1755
+ ],
1756
+ "score": 0.0
1757
+ },
1758
+ {
1759
+ "counterfactual": [
1760
+ "answerPosition_randomLetter_counterfactual"
1761
+ ],
1762
+ "score": 0.0
1763
+ }
1764
+ ]
1765
+ }
1766
+ ]
1767
+ }
1768
+ ]
1769
+ }
1770
+ }
1771
+ ]
1772
+ }
eval-results-mib-causalgraph/submissions/MCQA_results_meta-llama_correct_choice_token.json ADDED
@@ -0,0 +1,1772 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "method_name": "full_vector",
3
+ "results": [
4
+ {
5
+ "model_id": "LlamaForCausalLM",
6
+ "task_scores": {
7
+ "MCQA": [
8
+ {
9
+ "layer": "0",
10
+ "layer_scores": [
11
+ {
12
+ "intervention": [
13
+ "output_token"
14
+ ],
15
+ "counterfactual_scores": [
16
+ {
17
+ "counterfactual": [
18
+ "randomLetter_counterfactual"
19
+ ],
20
+ "score": 0.021739130434782608
21
+ },
22
+ {
23
+ "counterfactual": [
24
+ "answerPosition_counterfactual"
25
+ ],
26
+ "score": 0.0
27
+ },
28
+ {
29
+ "counterfactual": [
30
+ "answerPosition_randomLetter_counterfactual"
31
+ ],
32
+ "score": 0.06521739130434782
33
+ }
34
+ ]
35
+ },
36
+ {
37
+ "intervention": [
38
+ "output_location"
39
+ ],
40
+ "counterfactual_scores": [
41
+ {
42
+ "counterfactual": [
43
+ "randomLetter_counterfactual"
44
+ ],
45
+ "score": 1.0
46
+ },
47
+ {
48
+ "counterfactual": [
49
+ "answerPosition_counterfactual"
50
+ ],
51
+ "score": 0.0
52
+ },
53
+ {
54
+ "counterfactual": [
55
+ "answerPosition_randomLetter_counterfactual"
56
+ ],
57
+ "score": 0.0
58
+ }
59
+ ]
60
+ }
61
+ ]
62
+ },
63
+ {
64
+ "layer": "1",
65
+ "layer_scores": [
66
+ {
67
+ "intervention": [
68
+ "output_token"
69
+ ],
70
+ "counterfactual_scores": [
71
+ {
72
+ "counterfactual": [
73
+ "randomLetter_counterfactual"
74
+ ],
75
+ "score": 0.021739130434782608
76
+ },
77
+ {
78
+ "counterfactual": [
79
+ "answerPosition_counterfactual"
80
+ ],
81
+ "score": 0.0
82
+ },
83
+ {
84
+ "counterfactual": [
85
+ "answerPosition_randomLetter_counterfactual"
86
+ ],
87
+ "score": 0.06521739130434782
88
+ }
89
+ ]
90
+ },
91
+ {
92
+ "intervention": [
93
+ "output_location"
94
+ ],
95
+ "counterfactual_scores": [
96
+ {
97
+ "counterfactual": [
98
+ "randomLetter_counterfactual"
99
+ ],
100
+ "score": 1.0
101
+ },
102
+ {
103
+ "counterfactual": [
104
+ "answerPosition_counterfactual"
105
+ ],
106
+ "score": 0.0
107
+ },
108
+ {
109
+ "counterfactual": [
110
+ "answerPosition_randomLetter_counterfactual"
111
+ ],
112
+ "score": 0.0
113
+ }
114
+ ]
115
+ }
116
+ ]
117
+ },
118
+ {
119
+ "layer": "2",
120
+ "layer_scores": [
121
+ {
122
+ "intervention": [
123
+ "output_token"
124
+ ],
125
+ "counterfactual_scores": [
126
+ {
127
+ "counterfactual": [
128
+ "randomLetter_counterfactual"
129
+ ],
130
+ "score": 0.021739130434782608
131
+ },
132
+ {
133
+ "counterfactual": [
134
+ "answerPosition_counterfactual"
135
+ ],
136
+ "score": 0.0
137
+ },
138
+ {
139
+ "counterfactual": [
140
+ "answerPosition_randomLetter_counterfactual"
141
+ ],
142
+ "score": 0.06521739130434782
143
+ }
144
+ ]
145
+ },
146
+ {
147
+ "intervention": [
148
+ "output_location"
149
+ ],
150
+ "counterfactual_scores": [
151
+ {
152
+ "counterfactual": [
153
+ "randomLetter_counterfactual"
154
+ ],
155
+ "score": 1.0
156
+ },
157
+ {
158
+ "counterfactual": [
159
+ "answerPosition_counterfactual"
160
+ ],
161
+ "score": 0.0
162
+ },
163
+ {
164
+ "counterfactual": [
165
+ "answerPosition_randomLetter_counterfactual"
166
+ ],
167
+ "score": 0.0
168
+ }
169
+ ]
170
+ }
171
+ ]
172
+ },
173
+ {
174
+ "layer": "3",
175
+ "layer_scores": [
176
+ {
177
+ "intervention": [
178
+ "output_token"
179
+ ],
180
+ "counterfactual_scores": [
181
+ {
182
+ "counterfactual": [
183
+ "randomLetter_counterfactual"
184
+ ],
185
+ "score": 0.021739130434782608
186
+ },
187
+ {
188
+ "counterfactual": [
189
+ "answerPosition_counterfactual"
190
+ ],
191
+ "score": 0.0
192
+ },
193
+ {
194
+ "counterfactual": [
195
+ "answerPosition_randomLetter_counterfactual"
196
+ ],
197
+ "score": 0.06521739130434782
198
+ }
199
+ ]
200
+ },
201
+ {
202
+ "intervention": [
203
+ "output_location"
204
+ ],
205
+ "counterfactual_scores": [
206
+ {
207
+ "counterfactual": [
208
+ "randomLetter_counterfactual"
209
+ ],
210
+ "score": 1.0
211
+ },
212
+ {
213
+ "counterfactual": [
214
+ "answerPosition_counterfactual"
215
+ ],
216
+ "score": 0.0
217
+ },
218
+ {
219
+ "counterfactual": [
220
+ "answerPosition_randomLetter_counterfactual"
221
+ ],
222
+ "score": 0.0
223
+ }
224
+ ]
225
+ }
226
+ ]
227
+ },
228
+ {
229
+ "layer": "4",
230
+ "layer_scores": [
231
+ {
232
+ "intervention": [
233
+ "output_token"
234
+ ],
235
+ "counterfactual_scores": [
236
+ {
237
+ "counterfactual": [
238
+ "randomLetter_counterfactual"
239
+ ],
240
+ "score": 0.021739130434782608
241
+ },
242
+ {
243
+ "counterfactual": [
244
+ "answerPosition_counterfactual"
245
+ ],
246
+ "score": 0.0
247
+ },
248
+ {
249
+ "counterfactual": [
250
+ "answerPosition_randomLetter_counterfactual"
251
+ ],
252
+ "score": 0.06521739130434782
253
+ }
254
+ ]
255
+ },
256
+ {
257
+ "intervention": [
258
+ "output_location"
259
+ ],
260
+ "counterfactual_scores": [
261
+ {
262
+ "counterfactual": [
263
+ "randomLetter_counterfactual"
264
+ ],
265
+ "score": 1.0
266
+ },
267
+ {
268
+ "counterfactual": [
269
+ "answerPosition_counterfactual"
270
+ ],
271
+ "score": 0.0
272
+ },
273
+ {
274
+ "counterfactual": [
275
+ "answerPosition_randomLetter_counterfactual"
276
+ ],
277
+ "score": 0.0
278
+ }
279
+ ]
280
+ }
281
+ ]
282
+ },
283
+ {
284
+ "layer": "5",
285
+ "layer_scores": [
286
+ {
287
+ "intervention": [
288
+ "output_token"
289
+ ],
290
+ "counterfactual_scores": [
291
+ {
292
+ "counterfactual": [
293
+ "randomLetter_counterfactual"
294
+ ],
295
+ "score": 0.021739130434782608
296
+ },
297
+ {
298
+ "counterfactual": [
299
+ "answerPosition_counterfactual"
300
+ ],
301
+ "score": 0.0
302
+ },
303
+ {
304
+ "counterfactual": [
305
+ "answerPosition_randomLetter_counterfactual"
306
+ ],
307
+ "score": 0.06521739130434782
308
+ }
309
+ ]
310
+ },
311
+ {
312
+ "intervention": [
313
+ "output_location"
314
+ ],
315
+ "counterfactual_scores": [
316
+ {
317
+ "counterfactual": [
318
+ "randomLetter_counterfactual"
319
+ ],
320
+ "score": 1.0
321
+ },
322
+ {
323
+ "counterfactual": [
324
+ "answerPosition_counterfactual"
325
+ ],
326
+ "score": 0.0
327
+ },
328
+ {
329
+ "counterfactual": [
330
+ "answerPosition_randomLetter_counterfactual"
331
+ ],
332
+ "score": 0.0
333
+ }
334
+ ]
335
+ }
336
+ ]
337
+ },
338
+ {
339
+ "layer": "6",
340
+ "layer_scores": [
341
+ {
342
+ "intervention": [
343
+ "output_token"
344
+ ],
345
+ "counterfactual_scores": [
346
+ {
347
+ "counterfactual": [
348
+ "randomLetter_counterfactual"
349
+ ],
350
+ "score": 0.021739130434782608
351
+ },
352
+ {
353
+ "counterfactual": [
354
+ "answerPosition_counterfactual"
355
+ ],
356
+ "score": 0.0
357
+ },
358
+ {
359
+ "counterfactual": [
360
+ "answerPosition_randomLetter_counterfactual"
361
+ ],
362
+ "score": 0.06521739130434782
363
+ }
364
+ ]
365
+ },
366
+ {
367
+ "intervention": [
368
+ "output_location"
369
+ ],
370
+ "counterfactual_scores": [
371
+ {
372
+ "counterfactual": [
373
+ "randomLetter_counterfactual"
374
+ ],
375
+ "score": 1.0
376
+ },
377
+ {
378
+ "counterfactual": [
379
+ "answerPosition_counterfactual"
380
+ ],
381
+ "score": 0.0
382
+ },
383
+ {
384
+ "counterfactual": [
385
+ "answerPosition_randomLetter_counterfactual"
386
+ ],
387
+ "score": 0.0
388
+ }
389
+ ]
390
+ }
391
+ ]
392
+ },
393
+ {
394
+ "layer": "7",
395
+ "layer_scores": [
396
+ {
397
+ "intervention": [
398
+ "output_token"
399
+ ],
400
+ "counterfactual_scores": [
401
+ {
402
+ "counterfactual": [
403
+ "randomLetter_counterfactual"
404
+ ],
405
+ "score": 0.021739130434782608
406
+ },
407
+ {
408
+ "counterfactual": [
409
+ "answerPosition_counterfactual"
410
+ ],
411
+ "score": 0.0
412
+ },
413
+ {
414
+ "counterfactual": [
415
+ "answerPosition_randomLetter_counterfactual"
416
+ ],
417
+ "score": 0.06521739130434782
418
+ }
419
+ ]
420
+ },
421
+ {
422
+ "intervention": [
423
+ "output_location"
424
+ ],
425
+ "counterfactual_scores": [
426
+ {
427
+ "counterfactual": [
428
+ "randomLetter_counterfactual"
429
+ ],
430
+ "score": 1.0
431
+ },
432
+ {
433
+ "counterfactual": [
434
+ "answerPosition_counterfactual"
435
+ ],
436
+ "score": 0.0
437
+ },
438
+ {
439
+ "counterfactual": [
440
+ "answerPosition_randomLetter_counterfactual"
441
+ ],
442
+ "score": 0.0
443
+ }
444
+ ]
445
+ }
446
+ ]
447
+ },
448
+ {
449
+ "layer": "8",
450
+ "layer_scores": [
451
+ {
452
+ "intervention": [
453
+ "output_token"
454
+ ],
455
+ "counterfactual_scores": [
456
+ {
457
+ "counterfactual": [
458
+ "randomLetter_counterfactual"
459
+ ],
460
+ "score": 0.021739130434782608
461
+ },
462
+ {
463
+ "counterfactual": [
464
+ "answerPosition_counterfactual"
465
+ ],
466
+ "score": 0.0
467
+ },
468
+ {
469
+ "counterfactual": [
470
+ "answerPosition_randomLetter_counterfactual"
471
+ ],
472
+ "score": 0.06521739130434782
473
+ }
474
+ ]
475
+ },
476
+ {
477
+ "intervention": [
478
+ "output_location"
479
+ ],
480
+ "counterfactual_scores": [
481
+ {
482
+ "counterfactual": [
483
+ "randomLetter_counterfactual"
484
+ ],
485
+ "score": 1.0
486
+ },
487
+ {
488
+ "counterfactual": [
489
+ "answerPosition_counterfactual"
490
+ ],
491
+ "score": 0.0
492
+ },
493
+ {
494
+ "counterfactual": [
495
+ "answerPosition_randomLetter_counterfactual"
496
+ ],
497
+ "score": 0.0
498
+ }
499
+ ]
500
+ }
501
+ ]
502
+ },
503
+ {
504
+ "layer": "9",
505
+ "layer_scores": [
506
+ {
507
+ "intervention": [
508
+ "output_token"
509
+ ],
510
+ "counterfactual_scores": [
511
+ {
512
+ "counterfactual": [
513
+ "randomLetter_counterfactual"
514
+ ],
515
+ "score": 0.021739130434782608
516
+ },
517
+ {
518
+ "counterfactual": [
519
+ "answerPosition_counterfactual"
520
+ ],
521
+ "score": 0.0
522
+ },
523
+ {
524
+ "counterfactual": [
525
+ "answerPosition_randomLetter_counterfactual"
526
+ ],
527
+ "score": 0.06521739130434782
528
+ }
529
+ ]
530
+ },
531
+ {
532
+ "intervention": [
533
+ "output_location"
534
+ ],
535
+ "counterfactual_scores": [
536
+ {
537
+ "counterfactual": [
538
+ "randomLetter_counterfactual"
539
+ ],
540
+ "score": 1.0
541
+ },
542
+ {
543
+ "counterfactual": [
544
+ "answerPosition_counterfactual"
545
+ ],
546
+ "score": 0.0
547
+ },
548
+ {
549
+ "counterfactual": [
550
+ "answerPosition_randomLetter_counterfactual"
551
+ ],
552
+ "score": 0.0
553
+ }
554
+ ]
555
+ }
556
+ ]
557
+ },
558
+ {
559
+ "layer": "10",
560
+ "layer_scores": [
561
+ {
562
+ "intervention": [
563
+ "output_token"
564
+ ],
565
+ "counterfactual_scores": [
566
+ {
567
+ "counterfactual": [
568
+ "randomLetter_counterfactual"
569
+ ],
570
+ "score": 0.021739130434782608
571
+ },
572
+ {
573
+ "counterfactual": [
574
+ "answerPosition_counterfactual"
575
+ ],
576
+ "score": 0.0
577
+ },
578
+ {
579
+ "counterfactual": [
580
+ "answerPosition_randomLetter_counterfactual"
581
+ ],
582
+ "score": 0.06521739130434782
583
+ }
584
+ ]
585
+ },
586
+ {
587
+ "intervention": [
588
+ "output_location"
589
+ ],
590
+ "counterfactual_scores": [
591
+ {
592
+ "counterfactual": [
593
+ "randomLetter_counterfactual"
594
+ ],
595
+ "score": 1.0
596
+ },
597
+ {
598
+ "counterfactual": [
599
+ "answerPosition_counterfactual"
600
+ ],
601
+ "score": 0.0
602
+ },
603
+ {
604
+ "counterfactual": [
605
+ "answerPosition_randomLetter_counterfactual"
606
+ ],
607
+ "score": 0.0
608
+ }
609
+ ]
610
+ }
611
+ ]
612
+ },
613
+ {
614
+ "layer": "11",
615
+ "layer_scores": [
616
+ {
617
+ "intervention": [
618
+ "output_token"
619
+ ],
620
+ "counterfactual_scores": [
621
+ {
622
+ "counterfactual": [
623
+ "randomLetter_counterfactual"
624
+ ],
625
+ "score": 0.021739130434782608
626
+ },
627
+ {
628
+ "counterfactual": [
629
+ "answerPosition_counterfactual"
630
+ ],
631
+ "score": 0.0
632
+ },
633
+ {
634
+ "counterfactual": [
635
+ "answerPosition_randomLetter_counterfactual"
636
+ ],
637
+ "score": 0.06521739130434782
638
+ }
639
+ ]
640
+ },
641
+ {
642
+ "intervention": [
643
+ "output_location"
644
+ ],
645
+ "counterfactual_scores": [
646
+ {
647
+ "counterfactual": [
648
+ "randomLetter_counterfactual"
649
+ ],
650
+ "score": 1.0
651
+ },
652
+ {
653
+ "counterfactual": [
654
+ "answerPosition_counterfactual"
655
+ ],
656
+ "score": 0.0
657
+ },
658
+ {
659
+ "counterfactual": [
660
+ "answerPosition_randomLetter_counterfactual"
661
+ ],
662
+ "score": 0.0
663
+ }
664
+ ]
665
+ }
666
+ ]
667
+ },
668
+ {
669
+ "layer": "12",
670
+ "layer_scores": [
671
+ {
672
+ "intervention": [
673
+ "output_token"
674
+ ],
675
+ "counterfactual_scores": [
676
+ {
677
+ "counterfactual": [
678
+ "randomLetter_counterfactual"
679
+ ],
680
+ "score": 0.021739130434782608
681
+ },
682
+ {
683
+ "counterfactual": [
684
+ "answerPosition_counterfactual"
685
+ ],
686
+ "score": 0.0
687
+ },
688
+ {
689
+ "counterfactual": [
690
+ "answerPosition_randomLetter_counterfactual"
691
+ ],
692
+ "score": 0.06521739130434782
693
+ }
694
+ ]
695
+ },
696
+ {
697
+ "intervention": [
698
+ "output_location"
699
+ ],
700
+ "counterfactual_scores": [
701
+ {
702
+ "counterfactual": [
703
+ "randomLetter_counterfactual"
704
+ ],
705
+ "score": 1.0
706
+ },
707
+ {
708
+ "counterfactual": [
709
+ "answerPosition_counterfactual"
710
+ ],
711
+ "score": 0.0
712
+ },
713
+ {
714
+ "counterfactual": [
715
+ "answerPosition_randomLetter_counterfactual"
716
+ ],
717
+ "score": 0.0
718
+ }
719
+ ]
720
+ }
721
+ ]
722
+ },
723
+ {
724
+ "layer": "13",
725
+ "layer_scores": [
726
+ {
727
+ "intervention": [
728
+ "output_token"
729
+ ],
730
+ "counterfactual_scores": [
731
+ {
732
+ "counterfactual": [
733
+ "randomLetter_counterfactual"
734
+ ],
735
+ "score": 0.021739130434782608
736
+ },
737
+ {
738
+ "counterfactual": [
739
+ "answerPosition_counterfactual"
740
+ ],
741
+ "score": 0.0
742
+ },
743
+ {
744
+ "counterfactual": [
745
+ "answerPosition_randomLetter_counterfactual"
746
+ ],
747
+ "score": 0.06521739130434782
748
+ }
749
+ ]
750
+ },
751
+ {
752
+ "intervention": [
753
+ "output_location"
754
+ ],
755
+ "counterfactual_scores": [
756
+ {
757
+ "counterfactual": [
758
+ "randomLetter_counterfactual"
759
+ ],
760
+ "score": 1.0
761
+ },
762
+ {
763
+ "counterfactual": [
764
+ "answerPosition_counterfactual"
765
+ ],
766
+ "score": 0.0
767
+ },
768
+ {
769
+ "counterfactual": [
770
+ "answerPosition_randomLetter_counterfactual"
771
+ ],
772
+ "score": 0.0
773
+ }
774
+ ]
775
+ }
776
+ ]
777
+ },
778
+ {
779
+ "layer": "14",
780
+ "layer_scores": [
781
+ {
782
+ "intervention": [
783
+ "output_token"
784
+ ],
785
+ "counterfactual_scores": [
786
+ {
787
+ "counterfactual": [
788
+ "randomLetter_counterfactual"
789
+ ],
790
+ "score": 0.021739130434782608
791
+ },
792
+ {
793
+ "counterfactual": [
794
+ "answerPosition_counterfactual"
795
+ ],
796
+ "score": 0.0
797
+ },
798
+ {
799
+ "counterfactual": [
800
+ "answerPosition_randomLetter_counterfactual"
801
+ ],
802
+ "score": 0.06521739130434782
803
+ }
804
+ ]
805
+ },
806
+ {
807
+ "intervention": [
808
+ "output_location"
809
+ ],
810
+ "counterfactual_scores": [
811
+ {
812
+ "counterfactual": [
813
+ "randomLetter_counterfactual"
814
+ ],
815
+ "score": 1.0
816
+ },
817
+ {
818
+ "counterfactual": [
819
+ "answerPosition_counterfactual"
820
+ ],
821
+ "score": 0.0
822
+ },
823
+ {
824
+ "counterfactual": [
825
+ "answerPosition_randomLetter_counterfactual"
826
+ ],
827
+ "score": 0.0
828
+ }
829
+ ]
830
+ }
831
+ ]
832
+ },
833
+ {
834
+ "layer": "15",
835
+ "layer_scores": [
836
+ {
837
+ "intervention": [
838
+ "output_token"
839
+ ],
840
+ "counterfactual_scores": [
841
+ {
842
+ "counterfactual": [
843
+ "randomLetter_counterfactual"
844
+ ],
845
+ "score": 0.021739130434782608
846
+ },
847
+ {
848
+ "counterfactual": [
849
+ "answerPosition_counterfactual"
850
+ ],
851
+ "score": 0.0
852
+ },
853
+ {
854
+ "counterfactual": [
855
+ "answerPosition_randomLetter_counterfactual"
856
+ ],
857
+ "score": 0.06521739130434782
858
+ }
859
+ ]
860
+ },
861
+ {
862
+ "intervention": [
863
+ "output_location"
864
+ ],
865
+ "counterfactual_scores": [
866
+ {
867
+ "counterfactual": [
868
+ "randomLetter_counterfactual"
869
+ ],
870
+ "score": 1.0
871
+ },
872
+ {
873
+ "counterfactual": [
874
+ "answerPosition_counterfactual"
875
+ ],
876
+ "score": 0.0
877
+ },
878
+ {
879
+ "counterfactual": [
880
+ "answerPosition_randomLetter_counterfactual"
881
+ ],
882
+ "score": 0.0
883
+ }
884
+ ]
885
+ }
886
+ ]
887
+ },
888
+ {
889
+ "layer": "16",
890
+ "layer_scores": [
891
+ {
892
+ "intervention": [
893
+ "output_token"
894
+ ],
895
+ "counterfactual_scores": [
896
+ {
897
+ "counterfactual": [
898
+ "randomLetter_counterfactual"
899
+ ],
900
+ "score": 0.021739130434782608
901
+ },
902
+ {
903
+ "counterfactual": [
904
+ "answerPosition_counterfactual"
905
+ ],
906
+ "score": 0.0
907
+ },
908
+ {
909
+ "counterfactual": [
910
+ "answerPosition_randomLetter_counterfactual"
911
+ ],
912
+ "score": 0.06521739130434782
913
+ }
914
+ ]
915
+ },
916
+ {
917
+ "intervention": [
918
+ "output_location"
919
+ ],
920
+ "counterfactual_scores": [
921
+ {
922
+ "counterfactual": [
923
+ "randomLetter_counterfactual"
924
+ ],
925
+ "score": 1.0
926
+ },
927
+ {
928
+ "counterfactual": [
929
+ "answerPosition_counterfactual"
930
+ ],
931
+ "score": 0.0
932
+ },
933
+ {
934
+ "counterfactual": [
935
+ "answerPosition_randomLetter_counterfactual"
936
+ ],
937
+ "score": 0.0
938
+ }
939
+ ]
940
+ }
941
+ ]
942
+ },
943
+ {
944
+ "layer": "17",
945
+ "layer_scores": [
946
+ {
947
+ "intervention": [
948
+ "output_token"
949
+ ],
950
+ "counterfactual_scores": [
951
+ {
952
+ "counterfactual": [
953
+ "randomLetter_counterfactual"
954
+ ],
955
+ "score": 0.021739130434782608
956
+ },
957
+ {
958
+ "counterfactual": [
959
+ "answerPosition_counterfactual"
960
+ ],
961
+ "score": 0.0
962
+ },
963
+ {
964
+ "counterfactual": [
965
+ "answerPosition_randomLetter_counterfactual"
966
+ ],
967
+ "score": 0.06521739130434782
968
+ }
969
+ ]
970
+ },
971
+ {
972
+ "intervention": [
973
+ "output_location"
974
+ ],
975
+ "counterfactual_scores": [
976
+ {
977
+ "counterfactual": [
978
+ "randomLetter_counterfactual"
979
+ ],
980
+ "score": 1.0
981
+ },
982
+ {
983
+ "counterfactual": [
984
+ "answerPosition_counterfactual"
985
+ ],
986
+ "score": 0.0
987
+ },
988
+ {
989
+ "counterfactual": [
990
+ "answerPosition_randomLetter_counterfactual"
991
+ ],
992
+ "score": 0.0
993
+ }
994
+ ]
995
+ }
996
+ ]
997
+ },
998
+ {
999
+ "layer": "18",
1000
+ "layer_scores": [
1001
+ {
1002
+ "intervention": [
1003
+ "output_token"
1004
+ ],
1005
+ "counterfactual_scores": [
1006
+ {
1007
+ "counterfactual": [
1008
+ "randomLetter_counterfactual"
1009
+ ],
1010
+ "score": 0.021739130434782608
1011
+ },
1012
+ {
1013
+ "counterfactual": [
1014
+ "answerPosition_counterfactual"
1015
+ ],
1016
+ "score": 0.0
1017
+ },
1018
+ {
1019
+ "counterfactual": [
1020
+ "answerPosition_randomLetter_counterfactual"
1021
+ ],
1022
+ "score": 0.06521739130434782
1023
+ }
1024
+ ]
1025
+ },
1026
+ {
1027
+ "intervention": [
1028
+ "output_location"
1029
+ ],
1030
+ "counterfactual_scores": [
1031
+ {
1032
+ "counterfactual": [
1033
+ "randomLetter_counterfactual"
1034
+ ],
1035
+ "score": 1.0
1036
+ },
1037
+ {
1038
+ "counterfactual": [
1039
+ "answerPosition_counterfactual"
1040
+ ],
1041
+ "score": 0.0
1042
+ },
1043
+ {
1044
+ "counterfactual": [
1045
+ "answerPosition_randomLetter_counterfactual"
1046
+ ],
1047
+ "score": 0.0
1048
+ }
1049
+ ]
1050
+ }
1051
+ ]
1052
+ },
1053
+ {
1054
+ "layer": "19",
1055
+ "layer_scores": [
1056
+ {
1057
+ "intervention": [
1058
+ "output_token"
1059
+ ],
1060
+ "counterfactual_scores": [
1061
+ {
1062
+ "counterfactual": [
1063
+ "randomLetter_counterfactual"
1064
+ ],
1065
+ "score": 0.021739130434782608
1066
+ },
1067
+ {
1068
+ "counterfactual": [
1069
+ "answerPosition_counterfactual"
1070
+ ],
1071
+ "score": 0.0
1072
+ },
1073
+ {
1074
+ "counterfactual": [
1075
+ "answerPosition_randomLetter_counterfactual"
1076
+ ],
1077
+ "score": 0.06521739130434782
1078
+ }
1079
+ ]
1080
+ },
1081
+ {
1082
+ "intervention": [
1083
+ "output_location"
1084
+ ],
1085
+ "counterfactual_scores": [
1086
+ {
1087
+ "counterfactual": [
1088
+ "randomLetter_counterfactual"
1089
+ ],
1090
+ "score": 1.0
1091
+ },
1092
+ {
1093
+ "counterfactual": [
1094
+ "answerPosition_counterfactual"
1095
+ ],
1096
+ "score": 0.0
1097
+ },
1098
+ {
1099
+ "counterfactual": [
1100
+ "answerPosition_randomLetter_counterfactual"
1101
+ ],
1102
+ "score": 0.0
1103
+ }
1104
+ ]
1105
+ }
1106
+ ]
1107
+ },
1108
+ {
1109
+ "layer": "20",
1110
+ "layer_scores": [
1111
+ {
1112
+ "intervention": [
1113
+ "output_token"
1114
+ ],
1115
+ "counterfactual_scores": [
1116
+ {
1117
+ "counterfactual": [
1118
+ "randomLetter_counterfactual"
1119
+ ],
1120
+ "score": 0.021739130434782608
1121
+ },
1122
+ {
1123
+ "counterfactual": [
1124
+ "answerPosition_counterfactual"
1125
+ ],
1126
+ "score": 0.0
1127
+ },
1128
+ {
1129
+ "counterfactual": [
1130
+ "answerPosition_randomLetter_counterfactual"
1131
+ ],
1132
+ "score": 0.06521739130434782
1133
+ }
1134
+ ]
1135
+ },
1136
+ {
1137
+ "intervention": [
1138
+ "output_location"
1139
+ ],
1140
+ "counterfactual_scores": [
1141
+ {
1142
+ "counterfactual": [
1143
+ "randomLetter_counterfactual"
1144
+ ],
1145
+ "score": 1.0
1146
+ },
1147
+ {
1148
+ "counterfactual": [
1149
+ "answerPosition_counterfactual"
1150
+ ],
1151
+ "score": 0.0
1152
+ },
1153
+ {
1154
+ "counterfactual": [
1155
+ "answerPosition_randomLetter_counterfactual"
1156
+ ],
1157
+ "score": 0.0
1158
+ }
1159
+ ]
1160
+ }
1161
+ ]
1162
+ },
1163
+ {
1164
+ "layer": "21",
1165
+ "layer_scores": [
1166
+ {
1167
+ "intervention": [
1168
+ "output_token"
1169
+ ],
1170
+ "counterfactual_scores": [
1171
+ {
1172
+ "counterfactual": [
1173
+ "randomLetter_counterfactual"
1174
+ ],
1175
+ "score": 0.021739130434782608
1176
+ },
1177
+ {
1178
+ "counterfactual": [
1179
+ "answerPosition_counterfactual"
1180
+ ],
1181
+ "score": 0.0
1182
+ },
1183
+ {
1184
+ "counterfactual": [
1185
+ "answerPosition_randomLetter_counterfactual"
1186
+ ],
1187
+ "score": 0.06521739130434782
1188
+ }
1189
+ ]
1190
+ },
1191
+ {
1192
+ "intervention": [
1193
+ "output_location"
1194
+ ],
1195
+ "counterfactual_scores": [
1196
+ {
1197
+ "counterfactual": [
1198
+ "randomLetter_counterfactual"
1199
+ ],
1200
+ "score": 1.0
1201
+ },
1202
+ {
1203
+ "counterfactual": [
1204
+ "answerPosition_counterfactual"
1205
+ ],
1206
+ "score": 0.0
1207
+ },
1208
+ {
1209
+ "counterfactual": [
1210
+ "answerPosition_randomLetter_counterfactual"
1211
+ ],
1212
+ "score": 0.0
1213
+ }
1214
+ ]
1215
+ }
1216
+ ]
1217
+ },
1218
+ {
1219
+ "layer": "22",
1220
+ "layer_scores": [
1221
+ {
1222
+ "intervention": [
1223
+ "output_token"
1224
+ ],
1225
+ "counterfactual_scores": [
1226
+ {
1227
+ "counterfactual": [
1228
+ "randomLetter_counterfactual"
1229
+ ],
1230
+ "score": 0.021739130434782608
1231
+ },
1232
+ {
1233
+ "counterfactual": [
1234
+ "answerPosition_counterfactual"
1235
+ ],
1236
+ "score": 0.0
1237
+ },
1238
+ {
1239
+ "counterfactual": [
1240
+ "answerPosition_randomLetter_counterfactual"
1241
+ ],
1242
+ "score": 0.06521739130434782
1243
+ }
1244
+ ]
1245
+ },
1246
+ {
1247
+ "intervention": [
1248
+ "output_location"
1249
+ ],
1250
+ "counterfactual_scores": [
1251
+ {
1252
+ "counterfactual": [
1253
+ "randomLetter_counterfactual"
1254
+ ],
1255
+ "score": 1.0
1256
+ },
1257
+ {
1258
+ "counterfactual": [
1259
+ "answerPosition_counterfactual"
1260
+ ],
1261
+ "score": 0.0
1262
+ },
1263
+ {
1264
+ "counterfactual": [
1265
+ "answerPosition_randomLetter_counterfactual"
1266
+ ],
1267
+ "score": 0.0
1268
+ }
1269
+ ]
1270
+ }
1271
+ ]
1272
+ },
1273
+ {
1274
+ "layer": "23",
1275
+ "layer_scores": [
1276
+ {
1277
+ "intervention": [
1278
+ "output_token"
1279
+ ],
1280
+ "counterfactual_scores": [
1281
+ {
1282
+ "counterfactual": [
1283
+ "randomLetter_counterfactual"
1284
+ ],
1285
+ "score": 0.021739130434782608
1286
+ },
1287
+ {
1288
+ "counterfactual": [
1289
+ "answerPosition_counterfactual"
1290
+ ],
1291
+ "score": 0.0
1292
+ },
1293
+ {
1294
+ "counterfactual": [
1295
+ "answerPosition_randomLetter_counterfactual"
1296
+ ],
1297
+ "score": 0.06521739130434782
1298
+ }
1299
+ ]
1300
+ },
1301
+ {
1302
+ "intervention": [
1303
+ "output_location"
1304
+ ],
1305
+ "counterfactual_scores": [
1306
+ {
1307
+ "counterfactual": [
1308
+ "randomLetter_counterfactual"
1309
+ ],
1310
+ "score": 1.0
1311
+ },
1312
+ {
1313
+ "counterfactual": [
1314
+ "answerPosition_counterfactual"
1315
+ ],
1316
+ "score": 0.0
1317
+ },
1318
+ {
1319
+ "counterfactual": [
1320
+ "answerPosition_randomLetter_counterfactual"
1321
+ ],
1322
+ "score": 0.0
1323
+ }
1324
+ ]
1325
+ }
1326
+ ]
1327
+ },
1328
+ {
1329
+ "layer": "24",
1330
+ "layer_scores": [
1331
+ {
1332
+ "intervention": [
1333
+ "output_token"
1334
+ ],
1335
+ "counterfactual_scores": [
1336
+ {
1337
+ "counterfactual": [
1338
+ "randomLetter_counterfactual"
1339
+ ],
1340
+ "score": 0.021739130434782608
1341
+ },
1342
+ {
1343
+ "counterfactual": [
1344
+ "answerPosition_counterfactual"
1345
+ ],
1346
+ "score": 0.0
1347
+ },
1348
+ {
1349
+ "counterfactual": [
1350
+ "answerPosition_randomLetter_counterfactual"
1351
+ ],
1352
+ "score": 0.06521739130434782
1353
+ }
1354
+ ]
1355
+ },
1356
+ {
1357
+ "intervention": [
1358
+ "output_location"
1359
+ ],
1360
+ "counterfactual_scores": [
1361
+ {
1362
+ "counterfactual": [
1363
+ "randomLetter_counterfactual"
1364
+ ],
1365
+ "score": 1.0
1366
+ },
1367
+ {
1368
+ "counterfactual": [
1369
+ "answerPosition_counterfactual"
1370
+ ],
1371
+ "score": 0.0
1372
+ },
1373
+ {
1374
+ "counterfactual": [
1375
+ "answerPosition_randomLetter_counterfactual"
1376
+ ],
1377
+ "score": 0.0
1378
+ }
1379
+ ]
1380
+ }
1381
+ ]
1382
+ },
1383
+ {
1384
+ "layer": "25",
1385
+ "layer_scores": [
1386
+ {
1387
+ "intervention": [
1388
+ "output_token"
1389
+ ],
1390
+ "counterfactual_scores": [
1391
+ {
1392
+ "counterfactual": [
1393
+ "randomLetter_counterfactual"
1394
+ ],
1395
+ "score": 0.021739130434782608
1396
+ },
1397
+ {
1398
+ "counterfactual": [
1399
+ "answerPosition_counterfactual"
1400
+ ],
1401
+ "score": 0.0
1402
+ },
1403
+ {
1404
+ "counterfactual": [
1405
+ "answerPosition_randomLetter_counterfactual"
1406
+ ],
1407
+ "score": 0.06521739130434782
1408
+ }
1409
+ ]
1410
+ },
1411
+ {
1412
+ "intervention": [
1413
+ "output_location"
1414
+ ],
1415
+ "counterfactual_scores": [
1416
+ {
1417
+ "counterfactual": [
1418
+ "randomLetter_counterfactual"
1419
+ ],
1420
+ "score": 1.0
1421
+ },
1422
+ {
1423
+ "counterfactual": [
1424
+ "answerPosition_counterfactual"
1425
+ ],
1426
+ "score": 0.0
1427
+ },
1428
+ {
1429
+ "counterfactual": [
1430
+ "answerPosition_randomLetter_counterfactual"
1431
+ ],
1432
+ "score": 0.0
1433
+ }
1434
+ ]
1435
+ }
1436
+ ]
1437
+ },
1438
+ {
1439
+ "layer": "26",
1440
+ "layer_scores": [
1441
+ {
1442
+ "intervention": [
1443
+ "output_token"
1444
+ ],
1445
+ "counterfactual_scores": [
1446
+ {
1447
+ "counterfactual": [
1448
+ "randomLetter_counterfactual"
1449
+ ],
1450
+ "score": 0.021739130434782608
1451
+ },
1452
+ {
1453
+ "counterfactual": [
1454
+ "answerPosition_counterfactual"
1455
+ ],
1456
+ "score": 0.0
1457
+ },
1458
+ {
1459
+ "counterfactual": [
1460
+ "answerPosition_randomLetter_counterfactual"
1461
+ ],
1462
+ "score": 0.06521739130434782
1463
+ }
1464
+ ]
1465
+ },
1466
+ {
1467
+ "intervention": [
1468
+ "output_location"
1469
+ ],
1470
+ "counterfactual_scores": [
1471
+ {
1472
+ "counterfactual": [
1473
+ "randomLetter_counterfactual"
1474
+ ],
1475
+ "score": 1.0
1476
+ },
1477
+ {
1478
+ "counterfactual": [
1479
+ "answerPosition_counterfactual"
1480
+ ],
1481
+ "score": 0.0
1482
+ },
1483
+ {
1484
+ "counterfactual": [
1485
+ "answerPosition_randomLetter_counterfactual"
1486
+ ],
1487
+ "score": 0.0
1488
+ }
1489
+ ]
1490
+ }
1491
+ ]
1492
+ },
1493
+ {
1494
+ "layer": "27",
1495
+ "layer_scores": [
1496
+ {
1497
+ "intervention": [
1498
+ "output_token"
1499
+ ],
1500
+ "counterfactual_scores": [
1501
+ {
1502
+ "counterfactual": [
1503
+ "randomLetter_counterfactual"
1504
+ ],
1505
+ "score": 0.021739130434782608
1506
+ },
1507
+ {
1508
+ "counterfactual": [
1509
+ "answerPosition_counterfactual"
1510
+ ],
1511
+ "score": 0.0
1512
+ },
1513
+ {
1514
+ "counterfactual": [
1515
+ "answerPosition_randomLetter_counterfactual"
1516
+ ],
1517
+ "score": 0.06521739130434782
1518
+ }
1519
+ ]
1520
+ },
1521
+ {
1522
+ "intervention": [
1523
+ "output_location"
1524
+ ],
1525
+ "counterfactual_scores": [
1526
+ {
1527
+ "counterfactual": [
1528
+ "randomLetter_counterfactual"
1529
+ ],
1530
+ "score": 1.0
1531
+ },
1532
+ {
1533
+ "counterfactual": [
1534
+ "answerPosition_counterfactual"
1535
+ ],
1536
+ "score": 0.0
1537
+ },
1538
+ {
1539
+ "counterfactual": [
1540
+ "answerPosition_randomLetter_counterfactual"
1541
+ ],
1542
+ "score": 0.0
1543
+ }
1544
+ ]
1545
+ }
1546
+ ]
1547
+ },
1548
+ {
1549
+ "layer": "28",
1550
+ "layer_scores": [
1551
+ {
1552
+ "intervention": [
1553
+ "output_token"
1554
+ ],
1555
+ "counterfactual_scores": [
1556
+ {
1557
+ "counterfactual": [
1558
+ "randomLetter_counterfactual"
1559
+ ],
1560
+ "score": 0.021739130434782608
1561
+ },
1562
+ {
1563
+ "counterfactual": [
1564
+ "answerPosition_counterfactual"
1565
+ ],
1566
+ "score": 0.0
1567
+ },
1568
+ {
1569
+ "counterfactual": [
1570
+ "answerPosition_randomLetter_counterfactual"
1571
+ ],
1572
+ "score": 0.06521739130434782
1573
+ }
1574
+ ]
1575
+ },
1576
+ {
1577
+ "intervention": [
1578
+ "output_location"
1579
+ ],
1580
+ "counterfactual_scores": [
1581
+ {
1582
+ "counterfactual": [
1583
+ "randomLetter_counterfactual"
1584
+ ],
1585
+ "score": 1.0
1586
+ },
1587
+ {
1588
+ "counterfactual": [
1589
+ "answerPosition_counterfactual"
1590
+ ],
1591
+ "score": 0.0
1592
+ },
1593
+ {
1594
+ "counterfactual": [
1595
+ "answerPosition_randomLetter_counterfactual"
1596
+ ],
1597
+ "score": 0.0
1598
+ }
1599
+ ]
1600
+ }
1601
+ ]
1602
+ },
1603
+ {
1604
+ "layer": "29",
1605
+ "layer_scores": [
1606
+ {
1607
+ "intervention": [
1608
+ "output_token"
1609
+ ],
1610
+ "counterfactual_scores": [
1611
+ {
1612
+ "counterfactual": [
1613
+ "randomLetter_counterfactual"
1614
+ ],
1615
+ "score": 0.021739130434782608
1616
+ },
1617
+ {
1618
+ "counterfactual": [
1619
+ "answerPosition_counterfactual"
1620
+ ],
1621
+ "score": 0.0
1622
+ },
1623
+ {
1624
+ "counterfactual": [
1625
+ "answerPosition_randomLetter_counterfactual"
1626
+ ],
1627
+ "score": 0.06521739130434782
1628
+ }
1629
+ ]
1630
+ },
1631
+ {
1632
+ "intervention": [
1633
+ "output_location"
1634
+ ],
1635
+ "counterfactual_scores": [
1636
+ {
1637
+ "counterfactual": [
1638
+ "randomLetter_counterfactual"
1639
+ ],
1640
+ "score": 1.0
1641
+ },
1642
+ {
1643
+ "counterfactual": [
1644
+ "answerPosition_counterfactual"
1645
+ ],
1646
+ "score": 0.0
1647
+ },
1648
+ {
1649
+ "counterfactual": [
1650
+ "answerPosition_randomLetter_counterfactual"
1651
+ ],
1652
+ "score": 0.0
1653
+ }
1654
+ ]
1655
+ }
1656
+ ]
1657
+ },
1658
+ {
1659
+ "layer": "30",
1660
+ "layer_scores": [
1661
+ {
1662
+ "intervention": [
1663
+ "output_token"
1664
+ ],
1665
+ "counterfactual_scores": [
1666
+ {
1667
+ "counterfactual": [
1668
+ "randomLetter_counterfactual"
1669
+ ],
1670
+ "score": 0.021739130434782608
1671
+ },
1672
+ {
1673
+ "counterfactual": [
1674
+ "answerPosition_counterfactual"
1675
+ ],
1676
+ "score": 0.0
1677
+ },
1678
+ {
1679
+ "counterfactual": [
1680
+ "answerPosition_randomLetter_counterfactual"
1681
+ ],
1682
+ "score": 0.06521739130434782
1683
+ }
1684
+ ]
1685
+ },
1686
+ {
1687
+ "intervention": [
1688
+ "output_location"
1689
+ ],
1690
+ "counterfactual_scores": [
1691
+ {
1692
+ "counterfactual": [
1693
+ "randomLetter_counterfactual"
1694
+ ],
1695
+ "score": 1.0
1696
+ },
1697
+ {
1698
+ "counterfactual": [
1699
+ "answerPosition_counterfactual"
1700
+ ],
1701
+ "score": 0.0
1702
+ },
1703
+ {
1704
+ "counterfactual": [
1705
+ "answerPosition_randomLetter_counterfactual"
1706
+ ],
1707
+ "score": 0.0
1708
+ }
1709
+ ]
1710
+ }
1711
+ ]
1712
+ },
1713
+ {
1714
+ "layer": "31",
1715
+ "layer_scores": [
1716
+ {
1717
+ "intervention": [
1718
+ "output_token"
1719
+ ],
1720
+ "counterfactual_scores": [
1721
+ {
1722
+ "counterfactual": [
1723
+ "randomLetter_counterfactual"
1724
+ ],
1725
+ "score": 0.021739130434782608
1726
+ },
1727
+ {
1728
+ "counterfactual": [
1729
+ "answerPosition_counterfactual"
1730
+ ],
1731
+ "score": 0.0
1732
+ },
1733
+ {
1734
+ "counterfactual": [
1735
+ "answerPosition_randomLetter_counterfactual"
1736
+ ],
1737
+ "score": 0.06521739130434782
1738
+ }
1739
+ ]
1740
+ },
1741
+ {
1742
+ "intervention": [
1743
+ "output_location"
1744
+ ],
1745
+ "counterfactual_scores": [
1746
+ {
1747
+ "counterfactual": [
1748
+ "randomLetter_counterfactual"
1749
+ ],
1750
+ "score": 1.0
1751
+ },
1752
+ {
1753
+ "counterfactual": [
1754
+ "answerPosition_counterfactual"
1755
+ ],
1756
+ "score": 0.0
1757
+ },
1758
+ {
1759
+ "counterfactual": [
1760
+ "answerPosition_randomLetter_counterfactual"
1761
+ ],
1762
+ "score": 0.0
1763
+ }
1764
+ ]
1765
+ }
1766
+ ]
1767
+ }
1768
+ ]
1769
+ }
1770
+ }
1771
+ ]
1772
+ }
eval-results-mib-causalgraph/submissions/MCQA_results_meta-llama_last_token.json ADDED
@@ -0,0 +1,1772 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "method_name": "full_vector",
3
+ "results": [
4
+ {
5
+ "model_id": "LlamaForCausalLM",
6
+ "task_scores": {
7
+ "MCQA": [
8
+ {
9
+ "layer": "0",
10
+ "layer_scores": [
11
+ {
12
+ "intervention": [
13
+ "output_token"
14
+ ],
15
+ "counterfactual_scores": [
16
+ {
17
+ "counterfactual": [
18
+ "randomLetter_counterfactual"
19
+ ],
20
+ "score": 0.021739130434782608
21
+ },
22
+ {
23
+ "counterfactual": [
24
+ "answerPosition_counterfactual"
25
+ ],
26
+ "score": 0.0
27
+ },
28
+ {
29
+ "counterfactual": [
30
+ "answerPosition_randomLetter_counterfactual"
31
+ ],
32
+ "score": 0.06521739130434782
33
+ }
34
+ ]
35
+ },
36
+ {
37
+ "intervention": [
38
+ "output_location"
39
+ ],
40
+ "counterfactual_scores": [
41
+ {
42
+ "counterfactual": [
43
+ "randomLetter_counterfactual"
44
+ ],
45
+ "score": 1.0
46
+ },
47
+ {
48
+ "counterfactual": [
49
+ "answerPosition_counterfactual"
50
+ ],
51
+ "score": 0.0
52
+ },
53
+ {
54
+ "counterfactual": [
55
+ "answerPosition_randomLetter_counterfactual"
56
+ ],
57
+ "score": 0.0
58
+ }
59
+ ]
60
+ }
61
+ ]
62
+ },
63
+ {
64
+ "layer": "1",
65
+ "layer_scores": [
66
+ {
67
+ "intervention": [
68
+ "output_token"
69
+ ],
70
+ "counterfactual_scores": [
71
+ {
72
+ "counterfactual": [
73
+ "randomLetter_counterfactual"
74
+ ],
75
+ "score": 0.021739130434782608
76
+ },
77
+ {
78
+ "counterfactual": [
79
+ "answerPosition_counterfactual"
80
+ ],
81
+ "score": 0.0
82
+ },
83
+ {
84
+ "counterfactual": [
85
+ "answerPosition_randomLetter_counterfactual"
86
+ ],
87
+ "score": 0.06521739130434782
88
+ }
89
+ ]
90
+ },
91
+ {
92
+ "intervention": [
93
+ "output_location"
94
+ ],
95
+ "counterfactual_scores": [
96
+ {
97
+ "counterfactual": [
98
+ "randomLetter_counterfactual"
99
+ ],
100
+ "score": 1.0
101
+ },
102
+ {
103
+ "counterfactual": [
104
+ "answerPosition_counterfactual"
105
+ ],
106
+ "score": 0.0
107
+ },
108
+ {
109
+ "counterfactual": [
110
+ "answerPosition_randomLetter_counterfactual"
111
+ ],
112
+ "score": 0.0
113
+ }
114
+ ]
115
+ }
116
+ ]
117
+ },
118
+ {
119
+ "layer": "2",
120
+ "layer_scores": [
121
+ {
122
+ "intervention": [
123
+ "output_token"
124
+ ],
125
+ "counterfactual_scores": [
126
+ {
127
+ "counterfactual": [
128
+ "randomLetter_counterfactual"
129
+ ],
130
+ "score": 0.021739130434782608
131
+ },
132
+ {
133
+ "counterfactual": [
134
+ "answerPosition_counterfactual"
135
+ ],
136
+ "score": 0.0
137
+ },
138
+ {
139
+ "counterfactual": [
140
+ "answerPosition_randomLetter_counterfactual"
141
+ ],
142
+ "score": 0.06521739130434782
143
+ }
144
+ ]
145
+ },
146
+ {
147
+ "intervention": [
148
+ "output_location"
149
+ ],
150
+ "counterfactual_scores": [
151
+ {
152
+ "counterfactual": [
153
+ "randomLetter_counterfactual"
154
+ ],
155
+ "score": 1.0
156
+ },
157
+ {
158
+ "counterfactual": [
159
+ "answerPosition_counterfactual"
160
+ ],
161
+ "score": 0.0
162
+ },
163
+ {
164
+ "counterfactual": [
165
+ "answerPosition_randomLetter_counterfactual"
166
+ ],
167
+ "score": 0.0
168
+ }
169
+ ]
170
+ }
171
+ ]
172
+ },
173
+ {
174
+ "layer": "3",
175
+ "layer_scores": [
176
+ {
177
+ "intervention": [
178
+ "output_token"
179
+ ],
180
+ "counterfactual_scores": [
181
+ {
182
+ "counterfactual": [
183
+ "randomLetter_counterfactual"
184
+ ],
185
+ "score": 0.021739130434782608
186
+ },
187
+ {
188
+ "counterfactual": [
189
+ "answerPosition_counterfactual"
190
+ ],
191
+ "score": 0.0
192
+ },
193
+ {
194
+ "counterfactual": [
195
+ "answerPosition_randomLetter_counterfactual"
196
+ ],
197
+ "score": 0.06521739130434782
198
+ }
199
+ ]
200
+ },
201
+ {
202
+ "intervention": [
203
+ "output_location"
204
+ ],
205
+ "counterfactual_scores": [
206
+ {
207
+ "counterfactual": [
208
+ "randomLetter_counterfactual"
209
+ ],
210
+ "score": 1.0
211
+ },
212
+ {
213
+ "counterfactual": [
214
+ "answerPosition_counterfactual"
215
+ ],
216
+ "score": 0.0
217
+ },
218
+ {
219
+ "counterfactual": [
220
+ "answerPosition_randomLetter_counterfactual"
221
+ ],
222
+ "score": 0.0
223
+ }
224
+ ]
225
+ }
226
+ ]
227
+ },
228
+ {
229
+ "layer": "4",
230
+ "layer_scores": [
231
+ {
232
+ "intervention": [
233
+ "output_token"
234
+ ],
235
+ "counterfactual_scores": [
236
+ {
237
+ "counterfactual": [
238
+ "randomLetter_counterfactual"
239
+ ],
240
+ "score": 0.021739130434782608
241
+ },
242
+ {
243
+ "counterfactual": [
244
+ "answerPosition_counterfactual"
245
+ ],
246
+ "score": 0.0
247
+ },
248
+ {
249
+ "counterfactual": [
250
+ "answerPosition_randomLetter_counterfactual"
251
+ ],
252
+ "score": 0.06521739130434782
253
+ }
254
+ ]
255
+ },
256
+ {
257
+ "intervention": [
258
+ "output_location"
259
+ ],
260
+ "counterfactual_scores": [
261
+ {
262
+ "counterfactual": [
263
+ "randomLetter_counterfactual"
264
+ ],
265
+ "score": 1.0
266
+ },
267
+ {
268
+ "counterfactual": [
269
+ "answerPosition_counterfactual"
270
+ ],
271
+ "score": 0.0
272
+ },
273
+ {
274
+ "counterfactual": [
275
+ "answerPosition_randomLetter_counterfactual"
276
+ ],
277
+ "score": 0.0
278
+ }
279
+ ]
280
+ }
281
+ ]
282
+ },
283
+ {
284
+ "layer": "5",
285
+ "layer_scores": [
286
+ {
287
+ "intervention": [
288
+ "output_token"
289
+ ],
290
+ "counterfactual_scores": [
291
+ {
292
+ "counterfactual": [
293
+ "randomLetter_counterfactual"
294
+ ],
295
+ "score": 0.021739130434782608
296
+ },
297
+ {
298
+ "counterfactual": [
299
+ "answerPosition_counterfactual"
300
+ ],
301
+ "score": 0.0
302
+ },
303
+ {
304
+ "counterfactual": [
305
+ "answerPosition_randomLetter_counterfactual"
306
+ ],
307
+ "score": 0.06521739130434782
308
+ }
309
+ ]
310
+ },
311
+ {
312
+ "intervention": [
313
+ "output_location"
314
+ ],
315
+ "counterfactual_scores": [
316
+ {
317
+ "counterfactual": [
318
+ "randomLetter_counterfactual"
319
+ ],
320
+ "score": 1.0
321
+ },
322
+ {
323
+ "counterfactual": [
324
+ "answerPosition_counterfactual"
325
+ ],
326
+ "score": 0.0
327
+ },
328
+ {
329
+ "counterfactual": [
330
+ "answerPosition_randomLetter_counterfactual"
331
+ ],
332
+ "score": 0.0
333
+ }
334
+ ]
335
+ }
336
+ ]
337
+ },
338
+ {
339
+ "layer": "6",
340
+ "layer_scores": [
341
+ {
342
+ "intervention": [
343
+ "output_token"
344
+ ],
345
+ "counterfactual_scores": [
346
+ {
347
+ "counterfactual": [
348
+ "randomLetter_counterfactual"
349
+ ],
350
+ "score": 0.021739130434782608
351
+ },
352
+ {
353
+ "counterfactual": [
354
+ "answerPosition_counterfactual"
355
+ ],
356
+ "score": 0.0
357
+ },
358
+ {
359
+ "counterfactual": [
360
+ "answerPosition_randomLetter_counterfactual"
361
+ ],
362
+ "score": 0.06521739130434782
363
+ }
364
+ ]
365
+ },
366
+ {
367
+ "intervention": [
368
+ "output_location"
369
+ ],
370
+ "counterfactual_scores": [
371
+ {
372
+ "counterfactual": [
373
+ "randomLetter_counterfactual"
374
+ ],
375
+ "score": 1.0
376
+ },
377
+ {
378
+ "counterfactual": [
379
+ "answerPosition_counterfactual"
380
+ ],
381
+ "score": 0.0
382
+ },
383
+ {
384
+ "counterfactual": [
385
+ "answerPosition_randomLetter_counterfactual"
386
+ ],
387
+ "score": 0.0
388
+ }
389
+ ]
390
+ }
391
+ ]
392
+ },
393
+ {
394
+ "layer": "7",
395
+ "layer_scores": [
396
+ {
397
+ "intervention": [
398
+ "output_token"
399
+ ],
400
+ "counterfactual_scores": [
401
+ {
402
+ "counterfactual": [
403
+ "randomLetter_counterfactual"
404
+ ],
405
+ "score": 0.021739130434782608
406
+ },
407
+ {
408
+ "counterfactual": [
409
+ "answerPosition_counterfactual"
410
+ ],
411
+ "score": 0.0
412
+ },
413
+ {
414
+ "counterfactual": [
415
+ "answerPosition_randomLetter_counterfactual"
416
+ ],
417
+ "score": 0.06521739130434782
418
+ }
419
+ ]
420
+ },
421
+ {
422
+ "intervention": [
423
+ "output_location"
424
+ ],
425
+ "counterfactual_scores": [
426
+ {
427
+ "counterfactual": [
428
+ "randomLetter_counterfactual"
429
+ ],
430
+ "score": 1.0
431
+ },
432
+ {
433
+ "counterfactual": [
434
+ "answerPosition_counterfactual"
435
+ ],
436
+ "score": 0.0
437
+ },
438
+ {
439
+ "counterfactual": [
440
+ "answerPosition_randomLetter_counterfactual"
441
+ ],
442
+ "score": 0.0
443
+ }
444
+ ]
445
+ }
446
+ ]
447
+ },
448
+ {
449
+ "layer": "8",
450
+ "layer_scores": [
451
+ {
452
+ "intervention": [
453
+ "output_token"
454
+ ],
455
+ "counterfactual_scores": [
456
+ {
457
+ "counterfactual": [
458
+ "randomLetter_counterfactual"
459
+ ],
460
+ "score": 0.021739130434782608
461
+ },
462
+ {
463
+ "counterfactual": [
464
+ "answerPosition_counterfactual"
465
+ ],
466
+ "score": 0.0
467
+ },
468
+ {
469
+ "counterfactual": [
470
+ "answerPosition_randomLetter_counterfactual"
471
+ ],
472
+ "score": 0.06521739130434782
473
+ }
474
+ ]
475
+ },
476
+ {
477
+ "intervention": [
478
+ "output_location"
479
+ ],
480
+ "counterfactual_scores": [
481
+ {
482
+ "counterfactual": [
483
+ "randomLetter_counterfactual"
484
+ ],
485
+ "score": 1.0
486
+ },
487
+ {
488
+ "counterfactual": [
489
+ "answerPosition_counterfactual"
490
+ ],
491
+ "score": 0.0
492
+ },
493
+ {
494
+ "counterfactual": [
495
+ "answerPosition_randomLetter_counterfactual"
496
+ ],
497
+ "score": 0.0
498
+ }
499
+ ]
500
+ }
501
+ ]
502
+ },
503
+ {
504
+ "layer": "9",
505
+ "layer_scores": [
506
+ {
507
+ "intervention": [
508
+ "output_token"
509
+ ],
510
+ "counterfactual_scores": [
511
+ {
512
+ "counterfactual": [
513
+ "randomLetter_counterfactual"
514
+ ],
515
+ "score": 0.021739130434782608
516
+ },
517
+ {
518
+ "counterfactual": [
519
+ "answerPosition_counterfactual"
520
+ ],
521
+ "score": 0.0
522
+ },
523
+ {
524
+ "counterfactual": [
525
+ "answerPosition_randomLetter_counterfactual"
526
+ ],
527
+ "score": 0.06521739130434782
528
+ }
529
+ ]
530
+ },
531
+ {
532
+ "intervention": [
533
+ "output_location"
534
+ ],
535
+ "counterfactual_scores": [
536
+ {
537
+ "counterfactual": [
538
+ "randomLetter_counterfactual"
539
+ ],
540
+ "score": 1.0
541
+ },
542
+ {
543
+ "counterfactual": [
544
+ "answerPosition_counterfactual"
545
+ ],
546
+ "score": 0.0
547
+ },
548
+ {
549
+ "counterfactual": [
550
+ "answerPosition_randomLetter_counterfactual"
551
+ ],
552
+ "score": 0.0
553
+ }
554
+ ]
555
+ }
556
+ ]
557
+ },
558
+ {
559
+ "layer": "10",
560
+ "layer_scores": [
561
+ {
562
+ "intervention": [
563
+ "output_token"
564
+ ],
565
+ "counterfactual_scores": [
566
+ {
567
+ "counterfactual": [
568
+ "randomLetter_counterfactual"
569
+ ],
570
+ "score": 0.021739130434782608
571
+ },
572
+ {
573
+ "counterfactual": [
574
+ "answerPosition_counterfactual"
575
+ ],
576
+ "score": 0.0
577
+ },
578
+ {
579
+ "counterfactual": [
580
+ "answerPosition_randomLetter_counterfactual"
581
+ ],
582
+ "score": 0.06521739130434782
583
+ }
584
+ ]
585
+ },
586
+ {
587
+ "intervention": [
588
+ "output_location"
589
+ ],
590
+ "counterfactual_scores": [
591
+ {
592
+ "counterfactual": [
593
+ "randomLetter_counterfactual"
594
+ ],
595
+ "score": 1.0
596
+ },
597
+ {
598
+ "counterfactual": [
599
+ "answerPosition_counterfactual"
600
+ ],
601
+ "score": 0.0
602
+ },
603
+ {
604
+ "counterfactual": [
605
+ "answerPosition_randomLetter_counterfactual"
606
+ ],
607
+ "score": 0.0
608
+ }
609
+ ]
610
+ }
611
+ ]
612
+ },
613
+ {
614
+ "layer": "11",
615
+ "layer_scores": [
616
+ {
617
+ "intervention": [
618
+ "output_token"
619
+ ],
620
+ "counterfactual_scores": [
621
+ {
622
+ "counterfactual": [
623
+ "randomLetter_counterfactual"
624
+ ],
625
+ "score": 0.021739130434782608
626
+ },
627
+ {
628
+ "counterfactual": [
629
+ "answerPosition_counterfactual"
630
+ ],
631
+ "score": 0.0
632
+ },
633
+ {
634
+ "counterfactual": [
635
+ "answerPosition_randomLetter_counterfactual"
636
+ ],
637
+ "score": 0.06521739130434782
638
+ }
639
+ ]
640
+ },
641
+ {
642
+ "intervention": [
643
+ "output_location"
644
+ ],
645
+ "counterfactual_scores": [
646
+ {
647
+ "counterfactual": [
648
+ "randomLetter_counterfactual"
649
+ ],
650
+ "score": 1.0
651
+ },
652
+ {
653
+ "counterfactual": [
654
+ "answerPosition_counterfactual"
655
+ ],
656
+ "score": 0.0
657
+ },
658
+ {
659
+ "counterfactual": [
660
+ "answerPosition_randomLetter_counterfactual"
661
+ ],
662
+ "score": 0.0
663
+ }
664
+ ]
665
+ }
666
+ ]
667
+ },
668
+ {
669
+ "layer": "12",
670
+ "layer_scores": [
671
+ {
672
+ "intervention": [
673
+ "output_token"
674
+ ],
675
+ "counterfactual_scores": [
676
+ {
677
+ "counterfactual": [
678
+ "randomLetter_counterfactual"
679
+ ],
680
+ "score": 0.021739130434782608
681
+ },
682
+ {
683
+ "counterfactual": [
684
+ "answerPosition_counterfactual"
685
+ ],
686
+ "score": 0.0
687
+ },
688
+ {
689
+ "counterfactual": [
690
+ "answerPosition_randomLetter_counterfactual"
691
+ ],
692
+ "score": 0.06521739130434782
693
+ }
694
+ ]
695
+ },
696
+ {
697
+ "intervention": [
698
+ "output_location"
699
+ ],
700
+ "counterfactual_scores": [
701
+ {
702
+ "counterfactual": [
703
+ "randomLetter_counterfactual"
704
+ ],
705
+ "score": 1.0
706
+ },
707
+ {
708
+ "counterfactual": [
709
+ "answerPosition_counterfactual"
710
+ ],
711
+ "score": 0.0
712
+ },
713
+ {
714
+ "counterfactual": [
715
+ "answerPosition_randomLetter_counterfactual"
716
+ ],
717
+ "score": 0.0
718
+ }
719
+ ]
720
+ }
721
+ ]
722
+ },
723
+ {
724
+ "layer": "13",
725
+ "layer_scores": [
726
+ {
727
+ "intervention": [
728
+ "output_token"
729
+ ],
730
+ "counterfactual_scores": [
731
+ {
732
+ "counterfactual": [
733
+ "randomLetter_counterfactual"
734
+ ],
735
+ "score": 0.021739130434782608
736
+ },
737
+ {
738
+ "counterfactual": [
739
+ "answerPosition_counterfactual"
740
+ ],
741
+ "score": 0.0
742
+ },
743
+ {
744
+ "counterfactual": [
745
+ "answerPosition_randomLetter_counterfactual"
746
+ ],
747
+ "score": 0.06521739130434782
748
+ }
749
+ ]
750
+ },
751
+ {
752
+ "intervention": [
753
+ "output_location"
754
+ ],
755
+ "counterfactual_scores": [
756
+ {
757
+ "counterfactual": [
758
+ "randomLetter_counterfactual"
759
+ ],
760
+ "score": 1.0
761
+ },
762
+ {
763
+ "counterfactual": [
764
+ "answerPosition_counterfactual"
765
+ ],
766
+ "score": 0.0
767
+ },
768
+ {
769
+ "counterfactual": [
770
+ "answerPosition_randomLetter_counterfactual"
771
+ ],
772
+ "score": 0.0
773
+ }
774
+ ]
775
+ }
776
+ ]
777
+ },
778
+ {
779
+ "layer": "14",
780
+ "layer_scores": [
781
+ {
782
+ "intervention": [
783
+ "output_token"
784
+ ],
785
+ "counterfactual_scores": [
786
+ {
787
+ "counterfactual": [
788
+ "randomLetter_counterfactual"
789
+ ],
790
+ "score": 0.021739130434782608
791
+ },
792
+ {
793
+ "counterfactual": [
794
+ "answerPosition_counterfactual"
795
+ ],
796
+ "score": 0.0
797
+ },
798
+ {
799
+ "counterfactual": [
800
+ "answerPosition_randomLetter_counterfactual"
801
+ ],
802
+ "score": 0.06521739130434782
803
+ }
804
+ ]
805
+ },
806
+ {
807
+ "intervention": [
808
+ "output_location"
809
+ ],
810
+ "counterfactual_scores": [
811
+ {
812
+ "counterfactual": [
813
+ "randomLetter_counterfactual"
814
+ ],
815
+ "score": 1.0
816
+ },
817
+ {
818
+ "counterfactual": [
819
+ "answerPosition_counterfactual"
820
+ ],
821
+ "score": 0.0
822
+ },
823
+ {
824
+ "counterfactual": [
825
+ "answerPosition_randomLetter_counterfactual"
826
+ ],
827
+ "score": 0.0
828
+ }
829
+ ]
830
+ }
831
+ ]
832
+ },
833
+ {
834
+ "layer": "15",
835
+ "layer_scores": [
836
+ {
837
+ "intervention": [
838
+ "output_token"
839
+ ],
840
+ "counterfactual_scores": [
841
+ {
842
+ "counterfactual": [
843
+ "randomLetter_counterfactual"
844
+ ],
845
+ "score": 0.021739130434782608
846
+ },
847
+ {
848
+ "counterfactual": [
849
+ "answerPosition_counterfactual"
850
+ ],
851
+ "score": 0.0
852
+ },
853
+ {
854
+ "counterfactual": [
855
+ "answerPosition_randomLetter_counterfactual"
856
+ ],
857
+ "score": 0.06521739130434782
858
+ }
859
+ ]
860
+ },
861
+ {
862
+ "intervention": [
863
+ "output_location"
864
+ ],
865
+ "counterfactual_scores": [
866
+ {
867
+ "counterfactual": [
868
+ "randomLetter_counterfactual"
869
+ ],
870
+ "score": 1.0
871
+ },
872
+ {
873
+ "counterfactual": [
874
+ "answerPosition_counterfactual"
875
+ ],
876
+ "score": 0.0
877
+ },
878
+ {
879
+ "counterfactual": [
880
+ "answerPosition_randomLetter_counterfactual"
881
+ ],
882
+ "score": 0.0
883
+ }
884
+ ]
885
+ }
886
+ ]
887
+ },
888
+ {
889
+ "layer": "16",
890
+ "layer_scores": [
891
+ {
892
+ "intervention": [
893
+ "output_token"
894
+ ],
895
+ "counterfactual_scores": [
896
+ {
897
+ "counterfactual": [
898
+ "randomLetter_counterfactual"
899
+ ],
900
+ "score": 0.021739130434782608
901
+ },
902
+ {
903
+ "counterfactual": [
904
+ "answerPosition_counterfactual"
905
+ ],
906
+ "score": 0.0
907
+ },
908
+ {
909
+ "counterfactual": [
910
+ "answerPosition_randomLetter_counterfactual"
911
+ ],
912
+ "score": 0.06521739130434782
913
+ }
914
+ ]
915
+ },
916
+ {
917
+ "intervention": [
918
+ "output_location"
919
+ ],
920
+ "counterfactual_scores": [
921
+ {
922
+ "counterfactual": [
923
+ "randomLetter_counterfactual"
924
+ ],
925
+ "score": 1.0
926
+ },
927
+ {
928
+ "counterfactual": [
929
+ "answerPosition_counterfactual"
930
+ ],
931
+ "score": 0.0
932
+ },
933
+ {
934
+ "counterfactual": [
935
+ "answerPosition_randomLetter_counterfactual"
936
+ ],
937
+ "score": 0.0
938
+ }
939
+ ]
940
+ }
941
+ ]
942
+ },
943
+ {
944
+ "layer": "17",
945
+ "layer_scores": [
946
+ {
947
+ "intervention": [
948
+ "output_token"
949
+ ],
950
+ "counterfactual_scores": [
951
+ {
952
+ "counterfactual": [
953
+ "randomLetter_counterfactual"
954
+ ],
955
+ "score": 0.15217391304347827
956
+ },
957
+ {
958
+ "counterfactual": [
959
+ "answerPosition_counterfactual"
960
+ ],
961
+ "score": 0.9782608695652174
962
+ },
963
+ {
964
+ "counterfactual": [
965
+ "answerPosition_randomLetter_counterfactual"
966
+ ],
967
+ "score": 0.2391304347826087
968
+ }
969
+ ]
970
+ },
971
+ {
972
+ "intervention": [
973
+ "output_location"
974
+ ],
975
+ "counterfactual_scores": [
976
+ {
977
+ "counterfactual": [
978
+ "randomLetter_counterfactual"
979
+ ],
980
+ "score": 0.782608695652174
981
+ },
982
+ {
983
+ "counterfactual": [
984
+ "answerPosition_counterfactual"
985
+ ],
986
+ "score": 0.9782608695652174
987
+ },
988
+ {
989
+ "counterfactual": [
990
+ "answerPosition_randomLetter_counterfactual"
991
+ ],
992
+ "score": 0.2608695652173913
993
+ }
994
+ ]
995
+ }
996
+ ]
997
+ },
998
+ {
999
+ "layer": "18",
1000
+ "layer_scores": [
1001
+ {
1002
+ "intervention": [
1003
+ "output_token"
1004
+ ],
1005
+ "counterfactual_scores": [
1006
+ {
1007
+ "counterfactual": [
1008
+ "randomLetter_counterfactual"
1009
+ ],
1010
+ "score": 0.1956521739130435
1011
+ },
1012
+ {
1013
+ "counterfactual": [
1014
+ "answerPosition_counterfactual"
1015
+ ],
1016
+ "score": 1.0
1017
+ },
1018
+ {
1019
+ "counterfactual": [
1020
+ "answerPosition_randomLetter_counterfactual"
1021
+ ],
1022
+ "score": 0.30434782608695654
1023
+ }
1024
+ ]
1025
+ },
1026
+ {
1027
+ "intervention": [
1028
+ "output_location"
1029
+ ],
1030
+ "counterfactual_scores": [
1031
+ {
1032
+ "counterfactual": [
1033
+ "randomLetter_counterfactual"
1034
+ ],
1035
+ "score": 0.6086956521739131
1036
+ },
1037
+ {
1038
+ "counterfactual": [
1039
+ "answerPosition_counterfactual"
1040
+ ],
1041
+ "score": 1.0
1042
+ },
1043
+ {
1044
+ "counterfactual": [
1045
+ "answerPosition_randomLetter_counterfactual"
1046
+ ],
1047
+ "score": 0.391304347826087
1048
+ }
1049
+ ]
1050
+ }
1051
+ ]
1052
+ },
1053
+ {
1054
+ "layer": "19",
1055
+ "layer_scores": [
1056
+ {
1057
+ "intervention": [
1058
+ "output_token"
1059
+ ],
1060
+ "counterfactual_scores": [
1061
+ {
1062
+ "counterfactual": [
1063
+ "randomLetter_counterfactual"
1064
+ ],
1065
+ "score": 0.17391304347826086
1066
+ },
1067
+ {
1068
+ "counterfactual": [
1069
+ "answerPosition_counterfactual"
1070
+ ],
1071
+ "score": 1.0
1072
+ },
1073
+ {
1074
+ "counterfactual": [
1075
+ "answerPosition_randomLetter_counterfactual"
1076
+ ],
1077
+ "score": 0.2608695652173913
1078
+ }
1079
+ ]
1080
+ },
1081
+ {
1082
+ "intervention": [
1083
+ "output_location"
1084
+ ],
1085
+ "counterfactual_scores": [
1086
+ {
1087
+ "counterfactual": [
1088
+ "randomLetter_counterfactual"
1089
+ ],
1090
+ "score": 0.6739130434782609
1091
+ },
1092
+ {
1093
+ "counterfactual": [
1094
+ "answerPosition_counterfactual"
1095
+ ],
1096
+ "score": 1.0
1097
+ },
1098
+ {
1099
+ "counterfactual": [
1100
+ "answerPosition_randomLetter_counterfactual"
1101
+ ],
1102
+ "score": 0.3695652173913043
1103
+ }
1104
+ ]
1105
+ }
1106
+ ]
1107
+ },
1108
+ {
1109
+ "layer": "20",
1110
+ "layer_scores": [
1111
+ {
1112
+ "intervention": [
1113
+ "output_token"
1114
+ ],
1115
+ "counterfactual_scores": [
1116
+ {
1117
+ "counterfactual": [
1118
+ "randomLetter_counterfactual"
1119
+ ],
1120
+ "score": 0.2391304347826087
1121
+ },
1122
+ {
1123
+ "counterfactual": [
1124
+ "answerPosition_counterfactual"
1125
+ ],
1126
+ "score": 1.0
1127
+ },
1128
+ {
1129
+ "counterfactual": [
1130
+ "answerPosition_randomLetter_counterfactual"
1131
+ ],
1132
+ "score": 0.30434782608695654
1133
+ }
1134
+ ]
1135
+ },
1136
+ {
1137
+ "intervention": [
1138
+ "output_location"
1139
+ ],
1140
+ "counterfactual_scores": [
1141
+ {
1142
+ "counterfactual": [
1143
+ "randomLetter_counterfactual"
1144
+ ],
1145
+ "score": 0.6086956521739131
1146
+ },
1147
+ {
1148
+ "counterfactual": [
1149
+ "answerPosition_counterfactual"
1150
+ ],
1151
+ "score": 1.0
1152
+ },
1153
+ {
1154
+ "counterfactual": [
1155
+ "answerPosition_randomLetter_counterfactual"
1156
+ ],
1157
+ "score": 0.391304347826087
1158
+ }
1159
+ ]
1160
+ }
1161
+ ]
1162
+ },
1163
+ {
1164
+ "layer": "21",
1165
+ "layer_scores": [
1166
+ {
1167
+ "intervention": [
1168
+ "output_token"
1169
+ ],
1170
+ "counterfactual_scores": [
1171
+ {
1172
+ "counterfactual": [
1173
+ "randomLetter_counterfactual"
1174
+ ],
1175
+ "score": 0.2608695652173913
1176
+ },
1177
+ {
1178
+ "counterfactual": [
1179
+ "answerPosition_counterfactual"
1180
+ ],
1181
+ "score": 1.0
1182
+ },
1183
+ {
1184
+ "counterfactual": [
1185
+ "answerPosition_randomLetter_counterfactual"
1186
+ ],
1187
+ "score": 0.32608695652173914
1188
+ }
1189
+ ]
1190
+ },
1191
+ {
1192
+ "intervention": [
1193
+ "output_location"
1194
+ ],
1195
+ "counterfactual_scores": [
1196
+ {
1197
+ "counterfactual": [
1198
+ "randomLetter_counterfactual"
1199
+ ],
1200
+ "score": 0.5869565217391305
1201
+ },
1202
+ {
1203
+ "counterfactual": [
1204
+ "answerPosition_counterfactual"
1205
+ ],
1206
+ "score": 1.0
1207
+ },
1208
+ {
1209
+ "counterfactual": [
1210
+ "answerPosition_randomLetter_counterfactual"
1211
+ ],
1212
+ "score": 0.391304347826087
1213
+ }
1214
+ ]
1215
+ }
1216
+ ]
1217
+ },
1218
+ {
1219
+ "layer": "22",
1220
+ "layer_scores": [
1221
+ {
1222
+ "intervention": [
1223
+ "output_token"
1224
+ ],
1225
+ "counterfactual_scores": [
1226
+ {
1227
+ "counterfactual": [
1228
+ "randomLetter_counterfactual"
1229
+ ],
1230
+ "score": 0.4782608695652174
1231
+ },
1232
+ {
1233
+ "counterfactual": [
1234
+ "answerPosition_counterfactual"
1235
+ ],
1236
+ "score": 1.0
1237
+ },
1238
+ {
1239
+ "counterfactual": [
1240
+ "answerPosition_randomLetter_counterfactual"
1241
+ ],
1242
+ "score": 0.5869565217391305
1243
+ }
1244
+ ]
1245
+ },
1246
+ {
1247
+ "intervention": [
1248
+ "output_location"
1249
+ ],
1250
+ "counterfactual_scores": [
1251
+ {
1252
+ "counterfactual": [
1253
+ "randomLetter_counterfactual"
1254
+ ],
1255
+ "score": 0.34782608695652173
1256
+ },
1257
+ {
1258
+ "counterfactual": [
1259
+ "answerPosition_counterfactual"
1260
+ ],
1261
+ "score": 1.0
1262
+ },
1263
+ {
1264
+ "counterfactual": [
1265
+ "answerPosition_randomLetter_counterfactual"
1266
+ ],
1267
+ "score": 0.2391304347826087
1268
+ }
1269
+ ]
1270
+ }
1271
+ ]
1272
+ },
1273
+ {
1274
+ "layer": "23",
1275
+ "layer_scores": [
1276
+ {
1277
+ "intervention": [
1278
+ "output_token"
1279
+ ],
1280
+ "counterfactual_scores": [
1281
+ {
1282
+ "counterfactual": [
1283
+ "randomLetter_counterfactual"
1284
+ ],
1285
+ "score": 0.6956521739130435
1286
+ },
1287
+ {
1288
+ "counterfactual": [
1289
+ "answerPosition_counterfactual"
1290
+ ],
1291
+ "score": 1.0
1292
+ },
1293
+ {
1294
+ "counterfactual": [
1295
+ "answerPosition_randomLetter_counterfactual"
1296
+ ],
1297
+ "score": 0.717391304347826
1298
+ }
1299
+ ]
1300
+ },
1301
+ {
1302
+ "intervention": [
1303
+ "output_location"
1304
+ ],
1305
+ "counterfactual_scores": [
1306
+ {
1307
+ "counterfactual": [
1308
+ "randomLetter_counterfactual"
1309
+ ],
1310
+ "score": 0.2391304347826087
1311
+ },
1312
+ {
1313
+ "counterfactual": [
1314
+ "answerPosition_counterfactual"
1315
+ ],
1316
+ "score": 1.0
1317
+ },
1318
+ {
1319
+ "counterfactual": [
1320
+ "answerPosition_randomLetter_counterfactual"
1321
+ ],
1322
+ "score": 0.1956521739130435
1323
+ }
1324
+ ]
1325
+ }
1326
+ ]
1327
+ },
1328
+ {
1329
+ "layer": "24",
1330
+ "layer_scores": [
1331
+ {
1332
+ "intervention": [
1333
+ "output_token"
1334
+ ],
1335
+ "counterfactual_scores": [
1336
+ {
1337
+ "counterfactual": [
1338
+ "randomLetter_counterfactual"
1339
+ ],
1340
+ "score": 1.0
1341
+ },
1342
+ {
1343
+ "counterfactual": [
1344
+ "answerPosition_counterfactual"
1345
+ ],
1346
+ "score": 1.0
1347
+ },
1348
+ {
1349
+ "counterfactual": [
1350
+ "answerPosition_randomLetter_counterfactual"
1351
+ ],
1352
+ "score": 0.9782608695652174
1353
+ }
1354
+ ]
1355
+ },
1356
+ {
1357
+ "intervention": [
1358
+ "output_location"
1359
+ ],
1360
+ "counterfactual_scores": [
1361
+ {
1362
+ "counterfactual": [
1363
+ "randomLetter_counterfactual"
1364
+ ],
1365
+ "score": 0.021739130434782608
1366
+ },
1367
+ {
1368
+ "counterfactual": [
1369
+ "answerPosition_counterfactual"
1370
+ ],
1371
+ "score": 1.0
1372
+ },
1373
+ {
1374
+ "counterfactual": [
1375
+ "answerPosition_randomLetter_counterfactual"
1376
+ ],
1377
+ "score": 0.13043478260869565
1378
+ }
1379
+ ]
1380
+ }
1381
+ ]
1382
+ },
1383
+ {
1384
+ "layer": "25",
1385
+ "layer_scores": [
1386
+ {
1387
+ "intervention": [
1388
+ "output_token"
1389
+ ],
1390
+ "counterfactual_scores": [
1391
+ {
1392
+ "counterfactual": [
1393
+ "randomLetter_counterfactual"
1394
+ ],
1395
+ "score": 1.0
1396
+ },
1397
+ {
1398
+ "counterfactual": [
1399
+ "answerPosition_counterfactual"
1400
+ ],
1401
+ "score": 1.0
1402
+ },
1403
+ {
1404
+ "counterfactual": [
1405
+ "answerPosition_randomLetter_counterfactual"
1406
+ ],
1407
+ "score": 0.9782608695652174
1408
+ }
1409
+ ]
1410
+ },
1411
+ {
1412
+ "intervention": [
1413
+ "output_location"
1414
+ ],
1415
+ "counterfactual_scores": [
1416
+ {
1417
+ "counterfactual": [
1418
+ "randomLetter_counterfactual"
1419
+ ],
1420
+ "score": 0.021739130434782608
1421
+ },
1422
+ {
1423
+ "counterfactual": [
1424
+ "answerPosition_counterfactual"
1425
+ ],
1426
+ "score": 1.0
1427
+ },
1428
+ {
1429
+ "counterfactual": [
1430
+ "answerPosition_randomLetter_counterfactual"
1431
+ ],
1432
+ "score": 0.13043478260869565
1433
+ }
1434
+ ]
1435
+ }
1436
+ ]
1437
+ },
1438
+ {
1439
+ "layer": "26",
1440
+ "layer_scores": [
1441
+ {
1442
+ "intervention": [
1443
+ "output_token"
1444
+ ],
1445
+ "counterfactual_scores": [
1446
+ {
1447
+ "counterfactual": [
1448
+ "randomLetter_counterfactual"
1449
+ ],
1450
+ "score": 1.0
1451
+ },
1452
+ {
1453
+ "counterfactual": [
1454
+ "answerPosition_counterfactual"
1455
+ ],
1456
+ "score": 1.0
1457
+ },
1458
+ {
1459
+ "counterfactual": [
1460
+ "answerPosition_randomLetter_counterfactual"
1461
+ ],
1462
+ "score": 0.9782608695652174
1463
+ }
1464
+ ]
1465
+ },
1466
+ {
1467
+ "intervention": [
1468
+ "output_location"
1469
+ ],
1470
+ "counterfactual_scores": [
1471
+ {
1472
+ "counterfactual": [
1473
+ "randomLetter_counterfactual"
1474
+ ],
1475
+ "score": 0.021739130434782608
1476
+ },
1477
+ {
1478
+ "counterfactual": [
1479
+ "answerPosition_counterfactual"
1480
+ ],
1481
+ "score": 1.0
1482
+ },
1483
+ {
1484
+ "counterfactual": [
1485
+ "answerPosition_randomLetter_counterfactual"
1486
+ ],
1487
+ "score": 0.13043478260869565
1488
+ }
1489
+ ]
1490
+ }
1491
+ ]
1492
+ },
1493
+ {
1494
+ "layer": "27",
1495
+ "layer_scores": [
1496
+ {
1497
+ "intervention": [
1498
+ "output_token"
1499
+ ],
1500
+ "counterfactual_scores": [
1501
+ {
1502
+ "counterfactual": [
1503
+ "randomLetter_counterfactual"
1504
+ ],
1505
+ "score": 1.0
1506
+ },
1507
+ {
1508
+ "counterfactual": [
1509
+ "answerPosition_counterfactual"
1510
+ ],
1511
+ "score": 1.0
1512
+ },
1513
+ {
1514
+ "counterfactual": [
1515
+ "answerPosition_randomLetter_counterfactual"
1516
+ ],
1517
+ "score": 0.9782608695652174
1518
+ }
1519
+ ]
1520
+ },
1521
+ {
1522
+ "intervention": [
1523
+ "output_location"
1524
+ ],
1525
+ "counterfactual_scores": [
1526
+ {
1527
+ "counterfactual": [
1528
+ "randomLetter_counterfactual"
1529
+ ],
1530
+ "score": 0.021739130434782608
1531
+ },
1532
+ {
1533
+ "counterfactual": [
1534
+ "answerPosition_counterfactual"
1535
+ ],
1536
+ "score": 1.0
1537
+ },
1538
+ {
1539
+ "counterfactual": [
1540
+ "answerPosition_randomLetter_counterfactual"
1541
+ ],
1542
+ "score": 0.13043478260869565
1543
+ }
1544
+ ]
1545
+ }
1546
+ ]
1547
+ },
1548
+ {
1549
+ "layer": "28",
1550
+ "layer_scores": [
1551
+ {
1552
+ "intervention": [
1553
+ "output_token"
1554
+ ],
1555
+ "counterfactual_scores": [
1556
+ {
1557
+ "counterfactual": [
1558
+ "randomLetter_counterfactual"
1559
+ ],
1560
+ "score": 1.0
1561
+ },
1562
+ {
1563
+ "counterfactual": [
1564
+ "answerPosition_counterfactual"
1565
+ ],
1566
+ "score": 1.0
1567
+ },
1568
+ {
1569
+ "counterfactual": [
1570
+ "answerPosition_randomLetter_counterfactual"
1571
+ ],
1572
+ "score": 0.9782608695652174
1573
+ }
1574
+ ]
1575
+ },
1576
+ {
1577
+ "intervention": [
1578
+ "output_location"
1579
+ ],
1580
+ "counterfactual_scores": [
1581
+ {
1582
+ "counterfactual": [
1583
+ "randomLetter_counterfactual"
1584
+ ],
1585
+ "score": 0.021739130434782608
1586
+ },
1587
+ {
1588
+ "counterfactual": [
1589
+ "answerPosition_counterfactual"
1590
+ ],
1591
+ "score": 1.0
1592
+ },
1593
+ {
1594
+ "counterfactual": [
1595
+ "answerPosition_randomLetter_counterfactual"
1596
+ ],
1597
+ "score": 0.13043478260869565
1598
+ }
1599
+ ]
1600
+ }
1601
+ ]
1602
+ },
1603
+ {
1604
+ "layer": "29",
1605
+ "layer_scores": [
1606
+ {
1607
+ "intervention": [
1608
+ "output_token"
1609
+ ],
1610
+ "counterfactual_scores": [
1611
+ {
1612
+ "counterfactual": [
1613
+ "randomLetter_counterfactual"
1614
+ ],
1615
+ "score": 1.0
1616
+ },
1617
+ {
1618
+ "counterfactual": [
1619
+ "answerPosition_counterfactual"
1620
+ ],
1621
+ "score": 1.0
1622
+ },
1623
+ {
1624
+ "counterfactual": [
1625
+ "answerPosition_randomLetter_counterfactual"
1626
+ ],
1627
+ "score": 0.9782608695652174
1628
+ }
1629
+ ]
1630
+ },
1631
+ {
1632
+ "intervention": [
1633
+ "output_location"
1634
+ ],
1635
+ "counterfactual_scores": [
1636
+ {
1637
+ "counterfactual": [
1638
+ "randomLetter_counterfactual"
1639
+ ],
1640
+ "score": 0.021739130434782608
1641
+ },
1642
+ {
1643
+ "counterfactual": [
1644
+ "answerPosition_counterfactual"
1645
+ ],
1646
+ "score": 1.0
1647
+ },
1648
+ {
1649
+ "counterfactual": [
1650
+ "answerPosition_randomLetter_counterfactual"
1651
+ ],
1652
+ "score": 0.13043478260869565
1653
+ }
1654
+ ]
1655
+ }
1656
+ ]
1657
+ },
1658
+ {
1659
+ "layer": "30",
1660
+ "layer_scores": [
1661
+ {
1662
+ "intervention": [
1663
+ "output_token"
1664
+ ],
1665
+ "counterfactual_scores": [
1666
+ {
1667
+ "counterfactual": [
1668
+ "randomLetter_counterfactual"
1669
+ ],
1670
+ "score": 1.0
1671
+ },
1672
+ {
1673
+ "counterfactual": [
1674
+ "answerPosition_counterfactual"
1675
+ ],
1676
+ "score": 1.0
1677
+ },
1678
+ {
1679
+ "counterfactual": [
1680
+ "answerPosition_randomLetter_counterfactual"
1681
+ ],
1682
+ "score": 1.0
1683
+ }
1684
+ ]
1685
+ },
1686
+ {
1687
+ "intervention": [
1688
+ "output_location"
1689
+ ],
1690
+ "counterfactual_scores": [
1691
+ {
1692
+ "counterfactual": [
1693
+ "randomLetter_counterfactual"
1694
+ ],
1695
+ "score": 0.021739130434782608
1696
+ },
1697
+ {
1698
+ "counterfactual": [
1699
+ "answerPosition_counterfactual"
1700
+ ],
1701
+ "score": 1.0
1702
+ },
1703
+ {
1704
+ "counterfactual": [
1705
+ "answerPosition_randomLetter_counterfactual"
1706
+ ],
1707
+ "score": 0.13043478260869565
1708
+ }
1709
+ ]
1710
+ }
1711
+ ]
1712
+ },
1713
+ {
1714
+ "layer": "31",
1715
+ "layer_scores": [
1716
+ {
1717
+ "intervention": [
1718
+ "output_token"
1719
+ ],
1720
+ "counterfactual_scores": [
1721
+ {
1722
+ "counterfactual": [
1723
+ "randomLetter_counterfactual"
1724
+ ],
1725
+ "score": 1.0
1726
+ },
1727
+ {
1728
+ "counterfactual": [
1729
+ "answerPosition_counterfactual"
1730
+ ],
1731
+ "score": 1.0
1732
+ },
1733
+ {
1734
+ "counterfactual": [
1735
+ "answerPosition_randomLetter_counterfactual"
1736
+ ],
1737
+ "score": 1.0
1738
+ }
1739
+ ]
1740
+ },
1741
+ {
1742
+ "intervention": [
1743
+ "output_location"
1744
+ ],
1745
+ "counterfactual_scores": [
1746
+ {
1747
+ "counterfactual": [
1748
+ "randomLetter_counterfactual"
1749
+ ],
1750
+ "score": 0.021739130434782608
1751
+ },
1752
+ {
1753
+ "counterfactual": [
1754
+ "answerPosition_counterfactual"
1755
+ ],
1756
+ "score": 1.0
1757
+ },
1758
+ {
1759
+ "counterfactual": [
1760
+ "answerPosition_randomLetter_counterfactual"
1761
+ ],
1762
+ "score": 0.13043478260869565
1763
+ }
1764
+ ]
1765
+ }
1766
+ ]
1767
+ }
1768
+ ]
1769
+ }
1770
+ }
1771
+ ]
1772
+ }
eval-results-mib-causalgraph/submissions/MCQA_results_meta-llama_second_to_last_token.json ADDED
@@ -0,0 +1,1772 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "method_name": "full_vector",
3
+ "results": [
4
+ {
5
+ "model_id": "LlamaForCausalLM",
6
+ "task_scores": {
7
+ "MCQA": [
8
+ {
9
+ "layer": "0",
10
+ "layer_scores": [
11
+ {
12
+ "intervention": [
13
+ "output_token"
14
+ ],
15
+ "counterfactual_scores": [
16
+ {
17
+ "counterfactual": [
18
+ "randomLetter_counterfactual"
19
+ ],
20
+ "score": 0.021739130434782608
21
+ },
22
+ {
23
+ "counterfactual": [
24
+ "answerPosition_counterfactual"
25
+ ],
26
+ "score": 0.0
27
+ },
28
+ {
29
+ "counterfactual": [
30
+ "answerPosition_randomLetter_counterfactual"
31
+ ],
32
+ "score": 0.06521739130434782
33
+ }
34
+ ]
35
+ },
36
+ {
37
+ "intervention": [
38
+ "output_location"
39
+ ],
40
+ "counterfactual_scores": [
41
+ {
42
+ "counterfactual": [
43
+ "randomLetter_counterfactual"
44
+ ],
45
+ "score": 1.0
46
+ },
47
+ {
48
+ "counterfactual": [
49
+ "answerPosition_counterfactual"
50
+ ],
51
+ "score": 0.0
52
+ },
53
+ {
54
+ "counterfactual": [
55
+ "answerPosition_randomLetter_counterfactual"
56
+ ],
57
+ "score": 0.0
58
+ }
59
+ ]
60
+ }
61
+ ]
62
+ },
63
+ {
64
+ "layer": "1",
65
+ "layer_scores": [
66
+ {
67
+ "intervention": [
68
+ "output_token"
69
+ ],
70
+ "counterfactual_scores": [
71
+ {
72
+ "counterfactual": [
73
+ "randomLetter_counterfactual"
74
+ ],
75
+ "score": 0.021739130434782608
76
+ },
77
+ {
78
+ "counterfactual": [
79
+ "answerPosition_counterfactual"
80
+ ],
81
+ "score": 0.0
82
+ },
83
+ {
84
+ "counterfactual": [
85
+ "answerPosition_randomLetter_counterfactual"
86
+ ],
87
+ "score": 0.06521739130434782
88
+ }
89
+ ]
90
+ },
91
+ {
92
+ "intervention": [
93
+ "output_location"
94
+ ],
95
+ "counterfactual_scores": [
96
+ {
97
+ "counterfactual": [
98
+ "randomLetter_counterfactual"
99
+ ],
100
+ "score": 1.0
101
+ },
102
+ {
103
+ "counterfactual": [
104
+ "answerPosition_counterfactual"
105
+ ],
106
+ "score": 0.0
107
+ },
108
+ {
109
+ "counterfactual": [
110
+ "answerPosition_randomLetter_counterfactual"
111
+ ],
112
+ "score": 0.0
113
+ }
114
+ ]
115
+ }
116
+ ]
117
+ },
118
+ {
119
+ "layer": "2",
120
+ "layer_scores": [
121
+ {
122
+ "intervention": [
123
+ "output_token"
124
+ ],
125
+ "counterfactual_scores": [
126
+ {
127
+ "counterfactual": [
128
+ "randomLetter_counterfactual"
129
+ ],
130
+ "score": 0.021739130434782608
131
+ },
132
+ {
133
+ "counterfactual": [
134
+ "answerPosition_counterfactual"
135
+ ],
136
+ "score": 0.0
137
+ },
138
+ {
139
+ "counterfactual": [
140
+ "answerPosition_randomLetter_counterfactual"
141
+ ],
142
+ "score": 0.06521739130434782
143
+ }
144
+ ]
145
+ },
146
+ {
147
+ "intervention": [
148
+ "output_location"
149
+ ],
150
+ "counterfactual_scores": [
151
+ {
152
+ "counterfactual": [
153
+ "randomLetter_counterfactual"
154
+ ],
155
+ "score": 1.0
156
+ },
157
+ {
158
+ "counterfactual": [
159
+ "answerPosition_counterfactual"
160
+ ],
161
+ "score": 0.0
162
+ },
163
+ {
164
+ "counterfactual": [
165
+ "answerPosition_randomLetter_counterfactual"
166
+ ],
167
+ "score": 0.0
168
+ }
169
+ ]
170
+ }
171
+ ]
172
+ },
173
+ {
174
+ "layer": "3",
175
+ "layer_scores": [
176
+ {
177
+ "intervention": [
178
+ "output_token"
179
+ ],
180
+ "counterfactual_scores": [
181
+ {
182
+ "counterfactual": [
183
+ "randomLetter_counterfactual"
184
+ ],
185
+ "score": 0.021739130434782608
186
+ },
187
+ {
188
+ "counterfactual": [
189
+ "answerPosition_counterfactual"
190
+ ],
191
+ "score": 0.0
192
+ },
193
+ {
194
+ "counterfactual": [
195
+ "answerPosition_randomLetter_counterfactual"
196
+ ],
197
+ "score": 0.06521739130434782
198
+ }
199
+ ]
200
+ },
201
+ {
202
+ "intervention": [
203
+ "output_location"
204
+ ],
205
+ "counterfactual_scores": [
206
+ {
207
+ "counterfactual": [
208
+ "randomLetter_counterfactual"
209
+ ],
210
+ "score": 1.0
211
+ },
212
+ {
213
+ "counterfactual": [
214
+ "answerPosition_counterfactual"
215
+ ],
216
+ "score": 0.0
217
+ },
218
+ {
219
+ "counterfactual": [
220
+ "answerPosition_randomLetter_counterfactual"
221
+ ],
222
+ "score": 0.0
223
+ }
224
+ ]
225
+ }
226
+ ]
227
+ },
228
+ {
229
+ "layer": "4",
230
+ "layer_scores": [
231
+ {
232
+ "intervention": [
233
+ "output_token"
234
+ ],
235
+ "counterfactual_scores": [
236
+ {
237
+ "counterfactual": [
238
+ "randomLetter_counterfactual"
239
+ ],
240
+ "score": 0.021739130434782608
241
+ },
242
+ {
243
+ "counterfactual": [
244
+ "answerPosition_counterfactual"
245
+ ],
246
+ "score": 0.0
247
+ },
248
+ {
249
+ "counterfactual": [
250
+ "answerPosition_randomLetter_counterfactual"
251
+ ],
252
+ "score": 0.06521739130434782
253
+ }
254
+ ]
255
+ },
256
+ {
257
+ "intervention": [
258
+ "output_location"
259
+ ],
260
+ "counterfactual_scores": [
261
+ {
262
+ "counterfactual": [
263
+ "randomLetter_counterfactual"
264
+ ],
265
+ "score": 1.0
266
+ },
267
+ {
268
+ "counterfactual": [
269
+ "answerPosition_counterfactual"
270
+ ],
271
+ "score": 0.0
272
+ },
273
+ {
274
+ "counterfactual": [
275
+ "answerPosition_randomLetter_counterfactual"
276
+ ],
277
+ "score": 0.0
278
+ }
279
+ ]
280
+ }
281
+ ]
282
+ },
283
+ {
284
+ "layer": "5",
285
+ "layer_scores": [
286
+ {
287
+ "intervention": [
288
+ "output_token"
289
+ ],
290
+ "counterfactual_scores": [
291
+ {
292
+ "counterfactual": [
293
+ "randomLetter_counterfactual"
294
+ ],
295
+ "score": 0.021739130434782608
296
+ },
297
+ {
298
+ "counterfactual": [
299
+ "answerPosition_counterfactual"
300
+ ],
301
+ "score": 0.0
302
+ },
303
+ {
304
+ "counterfactual": [
305
+ "answerPosition_randomLetter_counterfactual"
306
+ ],
307
+ "score": 0.06521739130434782
308
+ }
309
+ ]
310
+ },
311
+ {
312
+ "intervention": [
313
+ "output_location"
314
+ ],
315
+ "counterfactual_scores": [
316
+ {
317
+ "counterfactual": [
318
+ "randomLetter_counterfactual"
319
+ ],
320
+ "score": 1.0
321
+ },
322
+ {
323
+ "counterfactual": [
324
+ "answerPosition_counterfactual"
325
+ ],
326
+ "score": 0.0
327
+ },
328
+ {
329
+ "counterfactual": [
330
+ "answerPosition_randomLetter_counterfactual"
331
+ ],
332
+ "score": 0.0
333
+ }
334
+ ]
335
+ }
336
+ ]
337
+ },
338
+ {
339
+ "layer": "6",
340
+ "layer_scores": [
341
+ {
342
+ "intervention": [
343
+ "output_token"
344
+ ],
345
+ "counterfactual_scores": [
346
+ {
347
+ "counterfactual": [
348
+ "randomLetter_counterfactual"
349
+ ],
350
+ "score": 0.021739130434782608
351
+ },
352
+ {
353
+ "counterfactual": [
354
+ "answerPosition_counterfactual"
355
+ ],
356
+ "score": 0.0
357
+ },
358
+ {
359
+ "counterfactual": [
360
+ "answerPosition_randomLetter_counterfactual"
361
+ ],
362
+ "score": 0.06521739130434782
363
+ }
364
+ ]
365
+ },
366
+ {
367
+ "intervention": [
368
+ "output_location"
369
+ ],
370
+ "counterfactual_scores": [
371
+ {
372
+ "counterfactual": [
373
+ "randomLetter_counterfactual"
374
+ ],
375
+ "score": 1.0
376
+ },
377
+ {
378
+ "counterfactual": [
379
+ "answerPosition_counterfactual"
380
+ ],
381
+ "score": 0.0
382
+ },
383
+ {
384
+ "counterfactual": [
385
+ "answerPosition_randomLetter_counterfactual"
386
+ ],
387
+ "score": 0.0
388
+ }
389
+ ]
390
+ }
391
+ ]
392
+ },
393
+ {
394
+ "layer": "7",
395
+ "layer_scores": [
396
+ {
397
+ "intervention": [
398
+ "output_token"
399
+ ],
400
+ "counterfactual_scores": [
401
+ {
402
+ "counterfactual": [
403
+ "randomLetter_counterfactual"
404
+ ],
405
+ "score": 0.021739130434782608
406
+ },
407
+ {
408
+ "counterfactual": [
409
+ "answerPosition_counterfactual"
410
+ ],
411
+ "score": 0.0
412
+ },
413
+ {
414
+ "counterfactual": [
415
+ "answerPosition_randomLetter_counterfactual"
416
+ ],
417
+ "score": 0.06521739130434782
418
+ }
419
+ ]
420
+ },
421
+ {
422
+ "intervention": [
423
+ "output_location"
424
+ ],
425
+ "counterfactual_scores": [
426
+ {
427
+ "counterfactual": [
428
+ "randomLetter_counterfactual"
429
+ ],
430
+ "score": 1.0
431
+ },
432
+ {
433
+ "counterfactual": [
434
+ "answerPosition_counterfactual"
435
+ ],
436
+ "score": 0.0
437
+ },
438
+ {
439
+ "counterfactual": [
440
+ "answerPosition_randomLetter_counterfactual"
441
+ ],
442
+ "score": 0.0
443
+ }
444
+ ]
445
+ }
446
+ ]
447
+ },
448
+ {
449
+ "layer": "8",
450
+ "layer_scores": [
451
+ {
452
+ "intervention": [
453
+ "output_token"
454
+ ],
455
+ "counterfactual_scores": [
456
+ {
457
+ "counterfactual": [
458
+ "randomLetter_counterfactual"
459
+ ],
460
+ "score": 0.021739130434782608
461
+ },
462
+ {
463
+ "counterfactual": [
464
+ "answerPosition_counterfactual"
465
+ ],
466
+ "score": 0.0
467
+ },
468
+ {
469
+ "counterfactual": [
470
+ "answerPosition_randomLetter_counterfactual"
471
+ ],
472
+ "score": 0.06521739130434782
473
+ }
474
+ ]
475
+ },
476
+ {
477
+ "intervention": [
478
+ "output_location"
479
+ ],
480
+ "counterfactual_scores": [
481
+ {
482
+ "counterfactual": [
483
+ "randomLetter_counterfactual"
484
+ ],
485
+ "score": 1.0
486
+ },
487
+ {
488
+ "counterfactual": [
489
+ "answerPosition_counterfactual"
490
+ ],
491
+ "score": 0.0
492
+ },
493
+ {
494
+ "counterfactual": [
495
+ "answerPosition_randomLetter_counterfactual"
496
+ ],
497
+ "score": 0.0
498
+ }
499
+ ]
500
+ }
501
+ ]
502
+ },
503
+ {
504
+ "layer": "9",
505
+ "layer_scores": [
506
+ {
507
+ "intervention": [
508
+ "output_token"
509
+ ],
510
+ "counterfactual_scores": [
511
+ {
512
+ "counterfactual": [
513
+ "randomLetter_counterfactual"
514
+ ],
515
+ "score": 0.021739130434782608
516
+ },
517
+ {
518
+ "counterfactual": [
519
+ "answerPosition_counterfactual"
520
+ ],
521
+ "score": 0.0
522
+ },
523
+ {
524
+ "counterfactual": [
525
+ "answerPosition_randomLetter_counterfactual"
526
+ ],
527
+ "score": 0.06521739130434782
528
+ }
529
+ ]
530
+ },
531
+ {
532
+ "intervention": [
533
+ "output_location"
534
+ ],
535
+ "counterfactual_scores": [
536
+ {
537
+ "counterfactual": [
538
+ "randomLetter_counterfactual"
539
+ ],
540
+ "score": 1.0
541
+ },
542
+ {
543
+ "counterfactual": [
544
+ "answerPosition_counterfactual"
545
+ ],
546
+ "score": 0.0
547
+ },
548
+ {
549
+ "counterfactual": [
550
+ "answerPosition_randomLetter_counterfactual"
551
+ ],
552
+ "score": 0.0
553
+ }
554
+ ]
555
+ }
556
+ ]
557
+ },
558
+ {
559
+ "layer": "10",
560
+ "layer_scores": [
561
+ {
562
+ "intervention": [
563
+ "output_token"
564
+ ],
565
+ "counterfactual_scores": [
566
+ {
567
+ "counterfactual": [
568
+ "randomLetter_counterfactual"
569
+ ],
570
+ "score": 0.021739130434782608
571
+ },
572
+ {
573
+ "counterfactual": [
574
+ "answerPosition_counterfactual"
575
+ ],
576
+ "score": 0.0
577
+ },
578
+ {
579
+ "counterfactual": [
580
+ "answerPosition_randomLetter_counterfactual"
581
+ ],
582
+ "score": 0.06521739130434782
583
+ }
584
+ ]
585
+ },
586
+ {
587
+ "intervention": [
588
+ "output_location"
589
+ ],
590
+ "counterfactual_scores": [
591
+ {
592
+ "counterfactual": [
593
+ "randomLetter_counterfactual"
594
+ ],
595
+ "score": 1.0
596
+ },
597
+ {
598
+ "counterfactual": [
599
+ "answerPosition_counterfactual"
600
+ ],
601
+ "score": 0.0
602
+ },
603
+ {
604
+ "counterfactual": [
605
+ "answerPosition_randomLetter_counterfactual"
606
+ ],
607
+ "score": 0.0
608
+ }
609
+ ]
610
+ }
611
+ ]
612
+ },
613
+ {
614
+ "layer": "11",
615
+ "layer_scores": [
616
+ {
617
+ "intervention": [
618
+ "output_token"
619
+ ],
620
+ "counterfactual_scores": [
621
+ {
622
+ "counterfactual": [
623
+ "randomLetter_counterfactual"
624
+ ],
625
+ "score": 0.021739130434782608
626
+ },
627
+ {
628
+ "counterfactual": [
629
+ "answerPosition_counterfactual"
630
+ ],
631
+ "score": 0.0
632
+ },
633
+ {
634
+ "counterfactual": [
635
+ "answerPosition_randomLetter_counterfactual"
636
+ ],
637
+ "score": 0.06521739130434782
638
+ }
639
+ ]
640
+ },
641
+ {
642
+ "intervention": [
643
+ "output_location"
644
+ ],
645
+ "counterfactual_scores": [
646
+ {
647
+ "counterfactual": [
648
+ "randomLetter_counterfactual"
649
+ ],
650
+ "score": 1.0
651
+ },
652
+ {
653
+ "counterfactual": [
654
+ "answerPosition_counterfactual"
655
+ ],
656
+ "score": 0.0
657
+ },
658
+ {
659
+ "counterfactual": [
660
+ "answerPosition_randomLetter_counterfactual"
661
+ ],
662
+ "score": 0.0
663
+ }
664
+ ]
665
+ }
666
+ ]
667
+ },
668
+ {
669
+ "layer": "12",
670
+ "layer_scores": [
671
+ {
672
+ "intervention": [
673
+ "output_token"
674
+ ],
675
+ "counterfactual_scores": [
676
+ {
677
+ "counterfactual": [
678
+ "randomLetter_counterfactual"
679
+ ],
680
+ "score": 0.021739130434782608
681
+ },
682
+ {
683
+ "counterfactual": [
684
+ "answerPosition_counterfactual"
685
+ ],
686
+ "score": 0.0
687
+ },
688
+ {
689
+ "counterfactual": [
690
+ "answerPosition_randomLetter_counterfactual"
691
+ ],
692
+ "score": 0.06521739130434782
693
+ }
694
+ ]
695
+ },
696
+ {
697
+ "intervention": [
698
+ "output_location"
699
+ ],
700
+ "counterfactual_scores": [
701
+ {
702
+ "counterfactual": [
703
+ "randomLetter_counterfactual"
704
+ ],
705
+ "score": 1.0
706
+ },
707
+ {
708
+ "counterfactual": [
709
+ "answerPosition_counterfactual"
710
+ ],
711
+ "score": 0.0
712
+ },
713
+ {
714
+ "counterfactual": [
715
+ "answerPosition_randomLetter_counterfactual"
716
+ ],
717
+ "score": 0.0
718
+ }
719
+ ]
720
+ }
721
+ ]
722
+ },
723
+ {
724
+ "layer": "13",
725
+ "layer_scores": [
726
+ {
727
+ "intervention": [
728
+ "output_token"
729
+ ],
730
+ "counterfactual_scores": [
731
+ {
732
+ "counterfactual": [
733
+ "randomLetter_counterfactual"
734
+ ],
735
+ "score": 0.021739130434782608
736
+ },
737
+ {
738
+ "counterfactual": [
739
+ "answerPosition_counterfactual"
740
+ ],
741
+ "score": 0.0
742
+ },
743
+ {
744
+ "counterfactual": [
745
+ "answerPosition_randomLetter_counterfactual"
746
+ ],
747
+ "score": 0.06521739130434782
748
+ }
749
+ ]
750
+ },
751
+ {
752
+ "intervention": [
753
+ "output_location"
754
+ ],
755
+ "counterfactual_scores": [
756
+ {
757
+ "counterfactual": [
758
+ "randomLetter_counterfactual"
759
+ ],
760
+ "score": 1.0
761
+ },
762
+ {
763
+ "counterfactual": [
764
+ "answerPosition_counterfactual"
765
+ ],
766
+ "score": 0.0
767
+ },
768
+ {
769
+ "counterfactual": [
770
+ "answerPosition_randomLetter_counterfactual"
771
+ ],
772
+ "score": 0.0
773
+ }
774
+ ]
775
+ }
776
+ ]
777
+ },
778
+ {
779
+ "layer": "14",
780
+ "layer_scores": [
781
+ {
782
+ "intervention": [
783
+ "output_token"
784
+ ],
785
+ "counterfactual_scores": [
786
+ {
787
+ "counterfactual": [
788
+ "randomLetter_counterfactual"
789
+ ],
790
+ "score": 0.021739130434782608
791
+ },
792
+ {
793
+ "counterfactual": [
794
+ "answerPosition_counterfactual"
795
+ ],
796
+ "score": 0.0
797
+ },
798
+ {
799
+ "counterfactual": [
800
+ "answerPosition_randomLetter_counterfactual"
801
+ ],
802
+ "score": 0.06521739130434782
803
+ }
804
+ ]
805
+ },
806
+ {
807
+ "intervention": [
808
+ "output_location"
809
+ ],
810
+ "counterfactual_scores": [
811
+ {
812
+ "counterfactual": [
813
+ "randomLetter_counterfactual"
814
+ ],
815
+ "score": 1.0
816
+ },
817
+ {
818
+ "counterfactual": [
819
+ "answerPosition_counterfactual"
820
+ ],
821
+ "score": 0.0
822
+ },
823
+ {
824
+ "counterfactual": [
825
+ "answerPosition_randomLetter_counterfactual"
826
+ ],
827
+ "score": 0.0
828
+ }
829
+ ]
830
+ }
831
+ ]
832
+ },
833
+ {
834
+ "layer": "15",
835
+ "layer_scores": [
836
+ {
837
+ "intervention": [
838
+ "output_token"
839
+ ],
840
+ "counterfactual_scores": [
841
+ {
842
+ "counterfactual": [
843
+ "randomLetter_counterfactual"
844
+ ],
845
+ "score": 0.021739130434782608
846
+ },
847
+ {
848
+ "counterfactual": [
849
+ "answerPosition_counterfactual"
850
+ ],
851
+ "score": 0.0
852
+ },
853
+ {
854
+ "counterfactual": [
855
+ "answerPosition_randomLetter_counterfactual"
856
+ ],
857
+ "score": 0.06521739130434782
858
+ }
859
+ ]
860
+ },
861
+ {
862
+ "intervention": [
863
+ "output_location"
864
+ ],
865
+ "counterfactual_scores": [
866
+ {
867
+ "counterfactual": [
868
+ "randomLetter_counterfactual"
869
+ ],
870
+ "score": 1.0
871
+ },
872
+ {
873
+ "counterfactual": [
874
+ "answerPosition_counterfactual"
875
+ ],
876
+ "score": 0.0
877
+ },
878
+ {
879
+ "counterfactual": [
880
+ "answerPosition_randomLetter_counterfactual"
881
+ ],
882
+ "score": 0.0
883
+ }
884
+ ]
885
+ }
886
+ ]
887
+ },
888
+ {
889
+ "layer": "16",
890
+ "layer_scores": [
891
+ {
892
+ "intervention": [
893
+ "output_token"
894
+ ],
895
+ "counterfactual_scores": [
896
+ {
897
+ "counterfactual": [
898
+ "randomLetter_counterfactual"
899
+ ],
900
+ "score": 0.021739130434782608
901
+ },
902
+ {
903
+ "counterfactual": [
904
+ "answerPosition_counterfactual"
905
+ ],
906
+ "score": 0.0
907
+ },
908
+ {
909
+ "counterfactual": [
910
+ "answerPosition_randomLetter_counterfactual"
911
+ ],
912
+ "score": 0.06521739130434782
913
+ }
914
+ ]
915
+ },
916
+ {
917
+ "intervention": [
918
+ "output_location"
919
+ ],
920
+ "counterfactual_scores": [
921
+ {
922
+ "counterfactual": [
923
+ "randomLetter_counterfactual"
924
+ ],
925
+ "score": 1.0
926
+ },
927
+ {
928
+ "counterfactual": [
929
+ "answerPosition_counterfactual"
930
+ ],
931
+ "score": 0.0
932
+ },
933
+ {
934
+ "counterfactual": [
935
+ "answerPosition_randomLetter_counterfactual"
936
+ ],
937
+ "score": 0.0
938
+ }
939
+ ]
940
+ }
941
+ ]
942
+ },
943
+ {
944
+ "layer": "17",
945
+ "layer_scores": [
946
+ {
947
+ "intervention": [
948
+ "output_token"
949
+ ],
950
+ "counterfactual_scores": [
951
+ {
952
+ "counterfactual": [
953
+ "randomLetter_counterfactual"
954
+ ],
955
+ "score": 0.021739130434782608
956
+ },
957
+ {
958
+ "counterfactual": [
959
+ "answerPosition_counterfactual"
960
+ ],
961
+ "score": 0.0
962
+ },
963
+ {
964
+ "counterfactual": [
965
+ "answerPosition_randomLetter_counterfactual"
966
+ ],
967
+ "score": 0.06521739130434782
968
+ }
969
+ ]
970
+ },
971
+ {
972
+ "intervention": [
973
+ "output_location"
974
+ ],
975
+ "counterfactual_scores": [
976
+ {
977
+ "counterfactual": [
978
+ "randomLetter_counterfactual"
979
+ ],
980
+ "score": 1.0
981
+ },
982
+ {
983
+ "counterfactual": [
984
+ "answerPosition_counterfactual"
985
+ ],
986
+ "score": 0.0
987
+ },
988
+ {
989
+ "counterfactual": [
990
+ "answerPosition_randomLetter_counterfactual"
991
+ ],
992
+ "score": 0.0
993
+ }
994
+ ]
995
+ }
996
+ ]
997
+ },
998
+ {
999
+ "layer": "18",
1000
+ "layer_scores": [
1001
+ {
1002
+ "intervention": [
1003
+ "output_token"
1004
+ ],
1005
+ "counterfactual_scores": [
1006
+ {
1007
+ "counterfactual": [
1008
+ "randomLetter_counterfactual"
1009
+ ],
1010
+ "score": 0.021739130434782608
1011
+ },
1012
+ {
1013
+ "counterfactual": [
1014
+ "answerPosition_counterfactual"
1015
+ ],
1016
+ "score": 0.0
1017
+ },
1018
+ {
1019
+ "counterfactual": [
1020
+ "answerPosition_randomLetter_counterfactual"
1021
+ ],
1022
+ "score": 0.06521739130434782
1023
+ }
1024
+ ]
1025
+ },
1026
+ {
1027
+ "intervention": [
1028
+ "output_location"
1029
+ ],
1030
+ "counterfactual_scores": [
1031
+ {
1032
+ "counterfactual": [
1033
+ "randomLetter_counterfactual"
1034
+ ],
1035
+ "score": 1.0
1036
+ },
1037
+ {
1038
+ "counterfactual": [
1039
+ "answerPosition_counterfactual"
1040
+ ],
1041
+ "score": 0.0
1042
+ },
1043
+ {
1044
+ "counterfactual": [
1045
+ "answerPosition_randomLetter_counterfactual"
1046
+ ],
1047
+ "score": 0.0
1048
+ }
1049
+ ]
1050
+ }
1051
+ ]
1052
+ },
1053
+ {
1054
+ "layer": "19",
1055
+ "layer_scores": [
1056
+ {
1057
+ "intervention": [
1058
+ "output_token"
1059
+ ],
1060
+ "counterfactual_scores": [
1061
+ {
1062
+ "counterfactual": [
1063
+ "randomLetter_counterfactual"
1064
+ ],
1065
+ "score": 0.021739130434782608
1066
+ },
1067
+ {
1068
+ "counterfactual": [
1069
+ "answerPosition_counterfactual"
1070
+ ],
1071
+ "score": 0.0
1072
+ },
1073
+ {
1074
+ "counterfactual": [
1075
+ "answerPosition_randomLetter_counterfactual"
1076
+ ],
1077
+ "score": 0.06521739130434782
1078
+ }
1079
+ ]
1080
+ },
1081
+ {
1082
+ "intervention": [
1083
+ "output_location"
1084
+ ],
1085
+ "counterfactual_scores": [
1086
+ {
1087
+ "counterfactual": [
1088
+ "randomLetter_counterfactual"
1089
+ ],
1090
+ "score": 1.0
1091
+ },
1092
+ {
1093
+ "counterfactual": [
1094
+ "answerPosition_counterfactual"
1095
+ ],
1096
+ "score": 0.0
1097
+ },
1098
+ {
1099
+ "counterfactual": [
1100
+ "answerPosition_randomLetter_counterfactual"
1101
+ ],
1102
+ "score": 0.0
1103
+ }
1104
+ ]
1105
+ }
1106
+ ]
1107
+ },
1108
+ {
1109
+ "layer": "20",
1110
+ "layer_scores": [
1111
+ {
1112
+ "intervention": [
1113
+ "output_token"
1114
+ ],
1115
+ "counterfactual_scores": [
1116
+ {
1117
+ "counterfactual": [
1118
+ "randomLetter_counterfactual"
1119
+ ],
1120
+ "score": 0.021739130434782608
1121
+ },
1122
+ {
1123
+ "counterfactual": [
1124
+ "answerPosition_counterfactual"
1125
+ ],
1126
+ "score": 0.0
1127
+ },
1128
+ {
1129
+ "counterfactual": [
1130
+ "answerPosition_randomLetter_counterfactual"
1131
+ ],
1132
+ "score": 0.06521739130434782
1133
+ }
1134
+ ]
1135
+ },
1136
+ {
1137
+ "intervention": [
1138
+ "output_location"
1139
+ ],
1140
+ "counterfactual_scores": [
1141
+ {
1142
+ "counterfactual": [
1143
+ "randomLetter_counterfactual"
1144
+ ],
1145
+ "score": 1.0
1146
+ },
1147
+ {
1148
+ "counterfactual": [
1149
+ "answerPosition_counterfactual"
1150
+ ],
1151
+ "score": 0.0
1152
+ },
1153
+ {
1154
+ "counterfactual": [
1155
+ "answerPosition_randomLetter_counterfactual"
1156
+ ],
1157
+ "score": 0.0
1158
+ }
1159
+ ]
1160
+ }
1161
+ ]
1162
+ },
1163
+ {
1164
+ "layer": "21",
1165
+ "layer_scores": [
1166
+ {
1167
+ "intervention": [
1168
+ "output_token"
1169
+ ],
1170
+ "counterfactual_scores": [
1171
+ {
1172
+ "counterfactual": [
1173
+ "randomLetter_counterfactual"
1174
+ ],
1175
+ "score": 0.021739130434782608
1176
+ },
1177
+ {
1178
+ "counterfactual": [
1179
+ "answerPosition_counterfactual"
1180
+ ],
1181
+ "score": 0.0
1182
+ },
1183
+ {
1184
+ "counterfactual": [
1185
+ "answerPosition_randomLetter_counterfactual"
1186
+ ],
1187
+ "score": 0.06521739130434782
1188
+ }
1189
+ ]
1190
+ },
1191
+ {
1192
+ "intervention": [
1193
+ "output_location"
1194
+ ],
1195
+ "counterfactual_scores": [
1196
+ {
1197
+ "counterfactual": [
1198
+ "randomLetter_counterfactual"
1199
+ ],
1200
+ "score": 1.0
1201
+ },
1202
+ {
1203
+ "counterfactual": [
1204
+ "answerPosition_counterfactual"
1205
+ ],
1206
+ "score": 0.0
1207
+ },
1208
+ {
1209
+ "counterfactual": [
1210
+ "answerPosition_randomLetter_counterfactual"
1211
+ ],
1212
+ "score": 0.0
1213
+ }
1214
+ ]
1215
+ }
1216
+ ]
1217
+ },
1218
+ {
1219
+ "layer": "22",
1220
+ "layer_scores": [
1221
+ {
1222
+ "intervention": [
1223
+ "output_token"
1224
+ ],
1225
+ "counterfactual_scores": [
1226
+ {
1227
+ "counterfactual": [
1228
+ "randomLetter_counterfactual"
1229
+ ],
1230
+ "score": 0.021739130434782608
1231
+ },
1232
+ {
1233
+ "counterfactual": [
1234
+ "answerPosition_counterfactual"
1235
+ ],
1236
+ "score": 0.0
1237
+ },
1238
+ {
1239
+ "counterfactual": [
1240
+ "answerPosition_randomLetter_counterfactual"
1241
+ ],
1242
+ "score": 0.06521739130434782
1243
+ }
1244
+ ]
1245
+ },
1246
+ {
1247
+ "intervention": [
1248
+ "output_location"
1249
+ ],
1250
+ "counterfactual_scores": [
1251
+ {
1252
+ "counterfactual": [
1253
+ "randomLetter_counterfactual"
1254
+ ],
1255
+ "score": 1.0
1256
+ },
1257
+ {
1258
+ "counterfactual": [
1259
+ "answerPosition_counterfactual"
1260
+ ],
1261
+ "score": 0.0
1262
+ },
1263
+ {
1264
+ "counterfactual": [
1265
+ "answerPosition_randomLetter_counterfactual"
1266
+ ],
1267
+ "score": 0.0
1268
+ }
1269
+ ]
1270
+ }
1271
+ ]
1272
+ },
1273
+ {
1274
+ "layer": "23",
1275
+ "layer_scores": [
1276
+ {
1277
+ "intervention": [
1278
+ "output_token"
1279
+ ],
1280
+ "counterfactual_scores": [
1281
+ {
1282
+ "counterfactual": [
1283
+ "randomLetter_counterfactual"
1284
+ ],
1285
+ "score": 0.021739130434782608
1286
+ },
1287
+ {
1288
+ "counterfactual": [
1289
+ "answerPosition_counterfactual"
1290
+ ],
1291
+ "score": 0.0
1292
+ },
1293
+ {
1294
+ "counterfactual": [
1295
+ "answerPosition_randomLetter_counterfactual"
1296
+ ],
1297
+ "score": 0.06521739130434782
1298
+ }
1299
+ ]
1300
+ },
1301
+ {
1302
+ "intervention": [
1303
+ "output_location"
1304
+ ],
1305
+ "counterfactual_scores": [
1306
+ {
1307
+ "counterfactual": [
1308
+ "randomLetter_counterfactual"
1309
+ ],
1310
+ "score": 1.0
1311
+ },
1312
+ {
1313
+ "counterfactual": [
1314
+ "answerPosition_counterfactual"
1315
+ ],
1316
+ "score": 0.0
1317
+ },
1318
+ {
1319
+ "counterfactual": [
1320
+ "answerPosition_randomLetter_counterfactual"
1321
+ ],
1322
+ "score": 0.0
1323
+ }
1324
+ ]
1325
+ }
1326
+ ]
1327
+ },
1328
+ {
1329
+ "layer": "24",
1330
+ "layer_scores": [
1331
+ {
1332
+ "intervention": [
1333
+ "output_token"
1334
+ ],
1335
+ "counterfactual_scores": [
1336
+ {
1337
+ "counterfactual": [
1338
+ "randomLetter_counterfactual"
1339
+ ],
1340
+ "score": 0.021739130434782608
1341
+ },
1342
+ {
1343
+ "counterfactual": [
1344
+ "answerPosition_counterfactual"
1345
+ ],
1346
+ "score": 0.0
1347
+ },
1348
+ {
1349
+ "counterfactual": [
1350
+ "answerPosition_randomLetter_counterfactual"
1351
+ ],
1352
+ "score": 0.06521739130434782
1353
+ }
1354
+ ]
1355
+ },
1356
+ {
1357
+ "intervention": [
1358
+ "output_location"
1359
+ ],
1360
+ "counterfactual_scores": [
1361
+ {
1362
+ "counterfactual": [
1363
+ "randomLetter_counterfactual"
1364
+ ],
1365
+ "score": 1.0
1366
+ },
1367
+ {
1368
+ "counterfactual": [
1369
+ "answerPosition_counterfactual"
1370
+ ],
1371
+ "score": 0.0
1372
+ },
1373
+ {
1374
+ "counterfactual": [
1375
+ "answerPosition_randomLetter_counterfactual"
1376
+ ],
1377
+ "score": 0.0
1378
+ }
1379
+ ]
1380
+ }
1381
+ ]
1382
+ },
1383
+ {
1384
+ "layer": "25",
1385
+ "layer_scores": [
1386
+ {
1387
+ "intervention": [
1388
+ "output_token"
1389
+ ],
1390
+ "counterfactual_scores": [
1391
+ {
1392
+ "counterfactual": [
1393
+ "randomLetter_counterfactual"
1394
+ ],
1395
+ "score": 0.021739130434782608
1396
+ },
1397
+ {
1398
+ "counterfactual": [
1399
+ "answerPosition_counterfactual"
1400
+ ],
1401
+ "score": 0.0
1402
+ },
1403
+ {
1404
+ "counterfactual": [
1405
+ "answerPosition_randomLetter_counterfactual"
1406
+ ],
1407
+ "score": 0.06521739130434782
1408
+ }
1409
+ ]
1410
+ },
1411
+ {
1412
+ "intervention": [
1413
+ "output_location"
1414
+ ],
1415
+ "counterfactual_scores": [
1416
+ {
1417
+ "counterfactual": [
1418
+ "randomLetter_counterfactual"
1419
+ ],
1420
+ "score": 1.0
1421
+ },
1422
+ {
1423
+ "counterfactual": [
1424
+ "answerPosition_counterfactual"
1425
+ ],
1426
+ "score": 0.0
1427
+ },
1428
+ {
1429
+ "counterfactual": [
1430
+ "answerPosition_randomLetter_counterfactual"
1431
+ ],
1432
+ "score": 0.0
1433
+ }
1434
+ ]
1435
+ }
1436
+ ]
1437
+ },
1438
+ {
1439
+ "layer": "26",
1440
+ "layer_scores": [
1441
+ {
1442
+ "intervention": [
1443
+ "output_token"
1444
+ ],
1445
+ "counterfactual_scores": [
1446
+ {
1447
+ "counterfactual": [
1448
+ "randomLetter_counterfactual"
1449
+ ],
1450
+ "score": 0.021739130434782608
1451
+ },
1452
+ {
1453
+ "counterfactual": [
1454
+ "answerPosition_counterfactual"
1455
+ ],
1456
+ "score": 0.0
1457
+ },
1458
+ {
1459
+ "counterfactual": [
1460
+ "answerPosition_randomLetter_counterfactual"
1461
+ ],
1462
+ "score": 0.06521739130434782
1463
+ }
1464
+ ]
1465
+ },
1466
+ {
1467
+ "intervention": [
1468
+ "output_location"
1469
+ ],
1470
+ "counterfactual_scores": [
1471
+ {
1472
+ "counterfactual": [
1473
+ "randomLetter_counterfactual"
1474
+ ],
1475
+ "score": 1.0
1476
+ },
1477
+ {
1478
+ "counterfactual": [
1479
+ "answerPosition_counterfactual"
1480
+ ],
1481
+ "score": 0.0
1482
+ },
1483
+ {
1484
+ "counterfactual": [
1485
+ "answerPosition_randomLetter_counterfactual"
1486
+ ],
1487
+ "score": 0.0
1488
+ }
1489
+ ]
1490
+ }
1491
+ ]
1492
+ },
1493
+ {
1494
+ "layer": "27",
1495
+ "layer_scores": [
1496
+ {
1497
+ "intervention": [
1498
+ "output_token"
1499
+ ],
1500
+ "counterfactual_scores": [
1501
+ {
1502
+ "counterfactual": [
1503
+ "randomLetter_counterfactual"
1504
+ ],
1505
+ "score": 0.021739130434782608
1506
+ },
1507
+ {
1508
+ "counterfactual": [
1509
+ "answerPosition_counterfactual"
1510
+ ],
1511
+ "score": 0.0
1512
+ },
1513
+ {
1514
+ "counterfactual": [
1515
+ "answerPosition_randomLetter_counterfactual"
1516
+ ],
1517
+ "score": 0.06521739130434782
1518
+ }
1519
+ ]
1520
+ },
1521
+ {
1522
+ "intervention": [
1523
+ "output_location"
1524
+ ],
1525
+ "counterfactual_scores": [
1526
+ {
1527
+ "counterfactual": [
1528
+ "randomLetter_counterfactual"
1529
+ ],
1530
+ "score": 1.0
1531
+ },
1532
+ {
1533
+ "counterfactual": [
1534
+ "answerPosition_counterfactual"
1535
+ ],
1536
+ "score": 0.0
1537
+ },
1538
+ {
1539
+ "counterfactual": [
1540
+ "answerPosition_randomLetter_counterfactual"
1541
+ ],
1542
+ "score": 0.0
1543
+ }
1544
+ ]
1545
+ }
1546
+ ]
1547
+ },
1548
+ {
1549
+ "layer": "28",
1550
+ "layer_scores": [
1551
+ {
1552
+ "intervention": [
1553
+ "output_token"
1554
+ ],
1555
+ "counterfactual_scores": [
1556
+ {
1557
+ "counterfactual": [
1558
+ "randomLetter_counterfactual"
1559
+ ],
1560
+ "score": 0.021739130434782608
1561
+ },
1562
+ {
1563
+ "counterfactual": [
1564
+ "answerPosition_counterfactual"
1565
+ ],
1566
+ "score": 0.0
1567
+ },
1568
+ {
1569
+ "counterfactual": [
1570
+ "answerPosition_randomLetter_counterfactual"
1571
+ ],
1572
+ "score": 0.06521739130434782
1573
+ }
1574
+ ]
1575
+ },
1576
+ {
1577
+ "intervention": [
1578
+ "output_location"
1579
+ ],
1580
+ "counterfactual_scores": [
1581
+ {
1582
+ "counterfactual": [
1583
+ "randomLetter_counterfactual"
1584
+ ],
1585
+ "score": 1.0
1586
+ },
1587
+ {
1588
+ "counterfactual": [
1589
+ "answerPosition_counterfactual"
1590
+ ],
1591
+ "score": 0.0
1592
+ },
1593
+ {
1594
+ "counterfactual": [
1595
+ "answerPosition_randomLetter_counterfactual"
1596
+ ],
1597
+ "score": 0.0
1598
+ }
1599
+ ]
1600
+ }
1601
+ ]
1602
+ },
1603
+ {
1604
+ "layer": "29",
1605
+ "layer_scores": [
1606
+ {
1607
+ "intervention": [
1608
+ "output_token"
1609
+ ],
1610
+ "counterfactual_scores": [
1611
+ {
1612
+ "counterfactual": [
1613
+ "randomLetter_counterfactual"
1614
+ ],
1615
+ "score": 0.021739130434782608
1616
+ },
1617
+ {
1618
+ "counterfactual": [
1619
+ "answerPosition_counterfactual"
1620
+ ],
1621
+ "score": 0.0
1622
+ },
1623
+ {
1624
+ "counterfactual": [
1625
+ "answerPosition_randomLetter_counterfactual"
1626
+ ],
1627
+ "score": 0.06521739130434782
1628
+ }
1629
+ ]
1630
+ },
1631
+ {
1632
+ "intervention": [
1633
+ "output_location"
1634
+ ],
1635
+ "counterfactual_scores": [
1636
+ {
1637
+ "counterfactual": [
1638
+ "randomLetter_counterfactual"
1639
+ ],
1640
+ "score": 1.0
1641
+ },
1642
+ {
1643
+ "counterfactual": [
1644
+ "answerPosition_counterfactual"
1645
+ ],
1646
+ "score": 0.0
1647
+ },
1648
+ {
1649
+ "counterfactual": [
1650
+ "answerPosition_randomLetter_counterfactual"
1651
+ ],
1652
+ "score": 0.0
1653
+ }
1654
+ ]
1655
+ }
1656
+ ]
1657
+ },
1658
+ {
1659
+ "layer": "30",
1660
+ "layer_scores": [
1661
+ {
1662
+ "intervention": [
1663
+ "output_token"
1664
+ ],
1665
+ "counterfactual_scores": [
1666
+ {
1667
+ "counterfactual": [
1668
+ "randomLetter_counterfactual"
1669
+ ],
1670
+ "score": 0.021739130434782608
1671
+ },
1672
+ {
1673
+ "counterfactual": [
1674
+ "answerPosition_counterfactual"
1675
+ ],
1676
+ "score": 0.0
1677
+ },
1678
+ {
1679
+ "counterfactual": [
1680
+ "answerPosition_randomLetter_counterfactual"
1681
+ ],
1682
+ "score": 0.06521739130434782
1683
+ }
1684
+ ]
1685
+ },
1686
+ {
1687
+ "intervention": [
1688
+ "output_location"
1689
+ ],
1690
+ "counterfactual_scores": [
1691
+ {
1692
+ "counterfactual": [
1693
+ "randomLetter_counterfactual"
1694
+ ],
1695
+ "score": 1.0
1696
+ },
1697
+ {
1698
+ "counterfactual": [
1699
+ "answerPosition_counterfactual"
1700
+ ],
1701
+ "score": 0.0
1702
+ },
1703
+ {
1704
+ "counterfactual": [
1705
+ "answerPosition_randomLetter_counterfactual"
1706
+ ],
1707
+ "score": 0.0
1708
+ }
1709
+ ]
1710
+ }
1711
+ ]
1712
+ },
1713
+ {
1714
+ "layer": "31",
1715
+ "layer_scores": [
1716
+ {
1717
+ "intervention": [
1718
+ "output_token"
1719
+ ],
1720
+ "counterfactual_scores": [
1721
+ {
1722
+ "counterfactual": [
1723
+ "randomLetter_counterfactual"
1724
+ ],
1725
+ "score": 0.021739130434782608
1726
+ },
1727
+ {
1728
+ "counterfactual": [
1729
+ "answerPosition_counterfactual"
1730
+ ],
1731
+ "score": 0.0
1732
+ },
1733
+ {
1734
+ "counterfactual": [
1735
+ "answerPosition_randomLetter_counterfactual"
1736
+ ],
1737
+ "score": 0.06521739130434782
1738
+ }
1739
+ ]
1740
+ },
1741
+ {
1742
+ "intervention": [
1743
+ "output_location"
1744
+ ],
1745
+ "counterfactual_scores": [
1746
+ {
1747
+ "counterfactual": [
1748
+ "randomLetter_counterfactual"
1749
+ ],
1750
+ "score": 1.0
1751
+ },
1752
+ {
1753
+ "counterfactual": [
1754
+ "answerPosition_counterfactual"
1755
+ ],
1756
+ "score": 0.0
1757
+ },
1758
+ {
1759
+ "counterfactual": [
1760
+ "answerPosition_randomLetter_counterfactual"
1761
+ ],
1762
+ "score": 0.0
1763
+ }
1764
+ ]
1765
+ }
1766
+ ]
1767
+ }
1768
+ ]
1769
+ }
1770
+ }
1771
+ ]
1772
+ }
eval-results-mib-subgraph/.gitattributes ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.lz4 filter=lfs diff=lfs merge=lfs -text
12
+ *.mds filter=lfs diff=lfs merge=lfs -text
13
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
14
+ *.model filter=lfs diff=lfs merge=lfs -text
15
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
16
+ *.npy filter=lfs diff=lfs merge=lfs -text
17
+ *.npz filter=lfs diff=lfs merge=lfs -text
18
+ *.onnx filter=lfs diff=lfs merge=lfs -text
19
+ *.ot filter=lfs diff=lfs merge=lfs -text
20
+ *.parquet filter=lfs diff=lfs merge=lfs -text
21
+ *.pb filter=lfs diff=lfs merge=lfs -text
22
+ *.pickle filter=lfs diff=lfs merge=lfs -text
23
+ *.pkl filter=lfs diff=lfs merge=lfs -text
24
+ *.pt filter=lfs diff=lfs merge=lfs -text
25
+ *.pth filter=lfs diff=lfs merge=lfs -text
26
+ *.rar filter=lfs diff=lfs merge=lfs -text
27
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
28
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
29
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
30
+ *.tar filter=lfs diff=lfs merge=lfs -text
31
+ *.tflite filter=lfs diff=lfs merge=lfs -text
32
+ *.tgz filter=lfs diff=lfs merge=lfs -text
33
+ *.wasm filter=lfs diff=lfs merge=lfs -text
34
+ *.xz filter=lfs diff=lfs merge=lfs -text
35
+ *.zip filter=lfs diff=lfs merge=lfs -text
36
+ *.zst filter=lfs diff=lfs merge=lfs -text
37
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
38
+ # Audio files - uncompressed
39
+ *.pcm filter=lfs diff=lfs merge=lfs -text
40
+ *.sam filter=lfs diff=lfs merge=lfs -text
41
+ *.raw filter=lfs diff=lfs merge=lfs -text
42
+ # Audio files - compressed
43
+ *.aac filter=lfs diff=lfs merge=lfs -text
44
+ *.flac filter=lfs diff=lfs merge=lfs -text
45
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
46
+ *.ogg filter=lfs diff=lfs merge=lfs -text
47
+ *.wav filter=lfs diff=lfs merge=lfs -text
48
+ # Image files - uncompressed
49
+ *.bmp filter=lfs diff=lfs merge=lfs -text
50
+ *.gif filter=lfs diff=lfs merge=lfs -text
51
+ *.png filter=lfs diff=lfs merge=lfs -text
52
+ *.tiff filter=lfs diff=lfs merge=lfs -text
53
+ # Image files - compressed
54
+ *.jpg filter=lfs diff=lfs merge=lfs -text
55
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
56
+ *.webp filter=lfs diff=lfs merge=lfs -text
57
+ # Video files - compressed
58
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
59
+ *.webm filter=lfs diff=lfs merge=lfs -text
eval-results-mib-subgraph/README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ ---
eval-results-mib-subgraph/baselines/EAP-IG-activations_CF.json ADDED
@@ -0,0 +1,337 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "method_name": "EAP-IG-activations (CF)",
3
+ "results": [
4
+ {
5
+ "model_id": "llama3",
6
+ "scores": {
7
+ "ioi": {
8
+ "edge_counts": [
9
+ 1147.0,
10
+ 2643.0,
11
+ 7461.0,
12
+ 15315.0,
13
+ 31091.0,
14
+ 78709.0,
15
+ 157644.0,
16
+ 314446.0,
17
+ 779049.0,
18
+ 1592881.0
19
+ ],
20
+ "faithfulness": [
21
+ 0.03076923076923077,
22
+ 0.16923076923076924,
23
+ 0.5303846153846153,
24
+ 0.8692307692307693,
25
+ 0.9415384615384615,
26
+ 1.0307692307692307,
27
+ 1.0769230769230769,
28
+ 1.083076923076923,
29
+ 1.0923076923076922,
30
+ 1.0
31
+ ]
32
+ },
33
+ "mcqa": {
34
+ "edge_counts": [
35
+ 1150.0,
36
+ 2654.0,
37
+ 7471.0,
38
+ 15345.0,
39
+ 31203.0,
40
+ 78828.0,
41
+ 158056.0,
42
+ 315717.0,
43
+ 777259.0,
44
+ 1592881.0
45
+ ],
46
+ "faithfulness": [
47
+ 0.0019305019305019305,
48
+ 0.005791505791505791,
49
+ 0.019305019305019305,
50
+ 0.04247104247104247,
51
+ 0.03088803088803089,
52
+ -0.06563706563706563,
53
+ -0.09652509652509653,
54
+ -0.12355212355212356,
55
+ -0.15057915057915058,
56
+ 1.0
57
+ ]
58
+ },
59
+ "arithmetic_addition": {
60
+ "edge_counts": [
61
+ 1193.0,
62
+ 2659.0,
63
+ 7427.0,
64
+ 15403.0,
65
+ 31137.0,
66
+ 78007.0,
67
+ 156372.0,
68
+ 311572.0,
69
+ 767443.0,
70
+ 1592881.0
71
+ ],
72
+ "faithfulness": [
73
+ 0.002352941176470588,
74
+ 0.002352941176470588,
75
+ 0.018823529411764704,
76
+ 0.07529411764705882,
77
+ 0.17647058823529413,
78
+ 0.30823529411764705,
79
+ 0.32,
80
+ 0.30941176470588233,
81
+ 0.31058823529411766,
82
+ 1.0
83
+ ]
84
+ },
85
+ "arc_challenge": {
86
+ "edge_counts": [
87
+ 912.0,
88
+ 2002.0,
89
+ 6665.0,
90
+ 14879.0,
91
+ 30445.0,
92
+ 77949.0,
93
+ 156864.0,
94
+ 312741.0,
95
+ 748529.0,
96
+ 1592881.0
97
+ ],
98
+ "faithfulness": [
99
+ 0.0,
100
+ 0.0,
101
+ 0.0,
102
+ 0.0,
103
+ 0.0,
104
+ 0.0,
105
+ 0.0,
106
+ 0.0024096385542168677,
107
+ 0.03132530120481928,
108
+ 1.0
109
+ ]
110
+ },
111
+ "arithmetic_subtraction": {
112
+ "edge_counts": [
113
+ 1257.0,
114
+ 2773.0,
115
+ 7536.0,
116
+ 15449.0,
117
+ 31202.0,
118
+ 78608.0,
119
+ 157421.0,
120
+ 313702.0,
121
+ 773168.0,
122
+ 1592881.0
123
+ ],
124
+ "faithfulness": [
125
+ 0.8605769230769231,
126
+ 0.9326923076923077,
127
+ 1.0192307692307692,
128
+ 1.0032051282051282,
129
+ 0.9967948717948718,
130
+ 0.9551282051282052,
131
+ 0.9487179487179487,
132
+ 0.9551282051282052,
133
+ 0.9583333333333334,
134
+ 1.0
135
+ ]
136
+ },
137
+ "arc_easy": {
138
+ "edge_counts": [
139
+ 757.0,
140
+ 1896.0,
141
+ 6791.0,
142
+ 14846.0,
143
+ 30465.0,
144
+ 78485.0,
145
+ 156768.0,
146
+ 313872.0,
147
+ 747107.0,
148
+ 1592881.0
149
+ ],
150
+ "faithfulness": [
151
+ 0.0,
152
+ 0.0023584905660377358,
153
+ 0.0023584905660377358,
154
+ 0.0023584905660377358,
155
+ 0.0023584905660377358,
156
+ 0.0023584905660377358,
157
+ -0.01179245283018868,
158
+ -0.009433962264150943,
159
+ 0.007075471698113208,
160
+ 1.0
161
+ ]
162
+ }
163
+ }
164
+ },
165
+ {
166
+ "model_id": "gemma2",
167
+ "scores": {
168
+ "ioi": {
169
+ "edge_counts": [
170
+ 29.0,
171
+ 58.0,
172
+ 127.0,
173
+ 232.0,
174
+ 744.0,
175
+ 2697.0,
176
+ 6288.0,
177
+ 13926.0,
178
+ 37109.0,
179
+ 74218.0
180
+ ],
181
+ "faithfulness": [
182
+ 0.0,
183
+ 0.0,
184
+ 0.0,
185
+ 0.022508038585209004,
186
+ 0.364951768488746,
187
+ 0.5610932475884244,
188
+ 0.6302250803858521,
189
+ 0.657556270096463,
190
+ 0.6366559485530546,
191
+ 1.0
192
+ ]
193
+ },
194
+ "mcqa": {
195
+ "edge_counts": [
196
+ 46.0,
197
+ 88.0,
198
+ 152.0,
199
+ 319.0,
200
+ 885.0,
201
+ 3049.0,
202
+ 6845.0,
203
+ 14362.0,
204
+ 36021.0,
205
+ 74218.0
206
+ ],
207
+ "faithfulness": [
208
+ 0.025510204081632654,
209
+ 0.06292517006802721,
210
+ 0.07993197278911565,
211
+ 0.17687074829931973,
212
+ 0.32950680272108845,
213
+ 0.8775510204081632,
214
+ 1.251700680272109,
215
+ 1.4829931972789117,
216
+ 1.6258503401360545,
217
+ 1.0
218
+ ]
219
+ },
220
+ "arc_easy": {
221
+ "edge_counts": [
222
+ 44.0,
223
+ 78.0,
224
+ 131.0,
225
+ 260.0,
226
+ 596.0,
227
+ 2290.0,
228
+ 5683.0,
229
+ 13168.0,
230
+ 36989.0,
231
+ 74218.0
232
+ ],
233
+ "faithfulness": [
234
+ -0.0017730496453900709,
235
+ 0.0,
236
+ 0.0,
237
+ -0.0035460992907801418,
238
+ 0.07801418439716312,
239
+ 0.14361702127659576,
240
+ 0.18085106382978725,
241
+ 0.2666223404255319,
242
+ 0.12677304964539007,
243
+ 1.0
244
+ ]
245
+ }
246
+ }
247
+ },
248
+ {
249
+ "model_id": "gpt2",
250
+ "scores": {
251
+ "ioi": {
252
+ "edge_counts": [
253
+ 1.0,
254
+ 5.0,
255
+ 31.0,
256
+ 116.0,
257
+ 302.0,
258
+ 1211.0,
259
+ 2824.0,
260
+ 5987.0,
261
+ 14986.0,
262
+ 32491.0
263
+ ],
264
+ "faithfulness": [
265
+ 0.0,
266
+ 0.0,
267
+ 0.0,
268
+ 0.00020081878619851592,
269
+ -0.0020080188226028373,
270
+ -0.005186060633885513,
271
+ -0.006654032437846079,
272
+ 0.0611483737769071,
273
+ 0.15180992495124734,
274
+ 1.0
275
+ ]
276
+ }
277
+ }
278
+ },
279
+ {
280
+ "model_id": "qwen2.5",
281
+ "scores": {
282
+ "mcqa": {
283
+ "edge_counts": [
284
+ 142.0,
285
+ 267.0,
286
+ 724.0,
287
+ 1612.0,
288
+ 3432.0,
289
+ 8817.0,
290
+ 17776.0,
291
+ 35593.0,
292
+ 87039.0,
293
+ 179749.0
294
+ ],
295
+ "faithfulness": [
296
+ 0.317246835443038,
297
+ 0.45450949367088606,
298
+ 0.5593354430379747,
299
+ 0.5229430379746836,
300
+ 0.5704113924050633,
301
+ 0.5514240506329114,
302
+ 0.7088607594936709,
303
+ 0.6930379746835443,
304
+ 0.7341772151898734,
305
+ 1.0
306
+ ]
307
+ },
308
+ "ioi": {
309
+ "edge_counts": [
310
+ 133.0,
311
+ 285.0,
312
+ 748.0,
313
+ 1634.0,
314
+ 3321.0,
315
+ 8607.0,
316
+ 17300.0,
317
+ 34763.0,
318
+ 87704.0,
319
+ 179749.0
320
+ ],
321
+ "faithfulness": [
322
+ 0.44820872274143303,
323
+ 0.6884735202492211,
324
+ 1.1090342679127725,
325
+ 1.3707165109034267,
326
+ 1.557632398753894,
327
+ 1.7757009345794392,
328
+ 1.8193146417445483,
329
+ 1.8255451713395638,
330
+ 1.8130841121495327,
331
+ 1.0
332
+ ]
333
+ }
334
+ }
335
+ }
336
+ ]
337
+ }
eval-results-mib-subgraph/baselines/EAP-IG-inputs_CF.json ADDED
@@ -0,0 +1,311 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "method_name": "EAP-IG-inputs (CF)",
3
+ "results": [
4
+ {
5
+ "model_id": "llama3",
6
+ "scores": {
7
+ "arithmetic_addition": {
8
+ "edge_counts": [
9
+ 1197.0,
10
+ 2573.0,
11
+ 7447.0,
12
+ 15416.0,
13
+ 31203.0,
14
+ 78484.0,
15
+ 156784.0,
16
+ 312065.0,
17
+ 769311.0,
18
+ 1592881.0
19
+ ],
20
+ "faithfulness": [
21
+ 1.0141176470588236,
22
+ 1.051764705882353,
23
+ 1.0635294117647058,
24
+ 1.0776470588235294,
25
+ 1.0258823529411765,
26
+ 0.9623529411764706,
27
+ 0.9058823529411765,
28
+ 0.8776470588235294,
29
+ 0.8564705882352941,
30
+ 1.0
31
+ ]
32
+ },
33
+ "arithmetic_subtraction": {
34
+ "edge_counts": [
35
+ 1193.0,
36
+ 2592.0,
37
+ 7327.0,
38
+ 15198.0,
39
+ 30956.0,
40
+ 78020.0,
41
+ 156182.0,
42
+ 311361.0,
43
+ 768765.0,
44
+ 1592881.0
45
+ ],
46
+ "faithfulness": [
47
+ 1.1570512820512822,
48
+ 1.1923076923076923,
49
+ 1.2211538461538463,
50
+ 1.189102564102564,
51
+ 1.1346153846153846,
52
+ 1.1057692307692308,
53
+ 1.0769230769230769,
54
+ 1.0320512820512822,
55
+ 1.0256410256410255,
56
+ 1.0
57
+ ]
58
+ },
59
+ "arc_challenge": {
60
+ "edge_counts": [
61
+ 1072.0,
62
+ 2186.0,
63
+ 6683.0,
64
+ 14838.0,
65
+ 30868.0,
66
+ 78580.0,
67
+ 157911.0,
68
+ 313964.0,
69
+ 783154.0,
70
+ 1592881.0
71
+ ],
72
+ "faithfulness": [
73
+ 0.0,
74
+ 0.0,
75
+ 0.0,
76
+ 0.0024390243902439024,
77
+ 0.004878048780487805,
78
+ 0.06341463414634146,
79
+ 0.07073170731707316,
80
+ 0.05853658536585366,
81
+ 0.07926829268292683,
82
+ 1.0
83
+ ]
84
+ },
85
+ "arc_easy": {
86
+ "edge_counts": [
87
+ 1239.0,
88
+ 2680.0,
89
+ 7481.0,
90
+ 15381.0,
91
+ 31252.0,
92
+ 78706.0,
93
+ 157634.0,
94
+ 314876.0,
95
+ 777087.0,
96
+ 1592881.0
97
+ ],
98
+ "faithfulness": [
99
+ 0.9208333333333333,
100
+ 0.9916666666666667,
101
+ 1.2041666666666666,
102
+ 1.1833333333333333,
103
+ 1.0625,
104
+ 1.0,
105
+ 1.075,
106
+ 1.05,
107
+ 1.05,
108
+ 1.0
109
+ ]
110
+ },
111
+ "mcqa": {
112
+ "edge_counts": [
113
+ 1248.0,
114
+ 2764.0,
115
+ 7492.0,
116
+ 15370.0,
117
+ 31235.0,
118
+ 78615.0,
119
+ 157388.0,
120
+ 314400.0,
121
+ 777958.0,
122
+ 1592881.0
123
+ ],
124
+ "faithfulness": [
125
+ 0.6019076305220884,
126
+ 0.8032128514056225,
127
+ 1.0461847389558232,
128
+ 1.0983935742971886,
129
+ 1.0140562248995983,
130
+ 0.8152610441767069,
131
+ 0.8815261044176707,
132
+ 1.0301204819277108,
133
+ 1.2429718875502007,
134
+ 1.0
135
+ ]
136
+ }
137
+ }
138
+ },
139
+ {
140
+ "model_id": "qwen2.5",
141
+ "scores": {
142
+ "ioi": {
143
+ "edge_counts": [
144
+ 124.0,
145
+ 287.0,
146
+ 757.0,
147
+ 1603.0,
148
+ 3306.0,
149
+ 8490.0,
150
+ 17268.0,
151
+ 34572.0,
152
+ 87156.0,
153
+ 179749.0
154
+ ],
155
+ "faithfulness": [
156
+ 0.5985202492211839,
157
+ 0.9314641744548287,
158
+ 1.6635514018691588,
159
+ 1.925233644859813,
160
+ 2.155763239875389,
161
+ 2.2554517133956384,
162
+ 2.2679127725856696,
163
+ 2.2803738317757007,
164
+ 2.2679127725856696,
165
+ 1.0
166
+ ]
167
+ },
168
+ "mcqa": {
169
+ "edge_counts": [
170
+ 131.0,
171
+ 267.0,
172
+ 751.0,
173
+ 1592.0,
174
+ 3412.0,
175
+ 8807.0,
176
+ 17709.0,
177
+ 35540.0,
178
+ 87132.0,
179
+ 179749.0
180
+ ],
181
+ "faithfulness": [
182
+ 0.42764195583596215,
183
+ 0.7066246056782335,
184
+ 0.8517350157728707,
185
+ 1.0094637223974763,
186
+ 1.2176656151419558,
187
+ 1.337539432176656,
188
+ 1.2618296529968454,
189
+ 1.1230283911671923,
190
+ 1.0725552050473186,
191
+ 1.0
192
+ ]
193
+ }
194
+ }
195
+ },
196
+ {
197
+ "model_id": "gemma2",
198
+ "scores": {
199
+ "mcqa": {
200
+ "edge_counts": [
201
+ 41.0,
202
+ 84.0,
203
+ 173.0,
204
+ 381.0,
205
+ 1003.0,
206
+ 3195.0,
207
+ 6914.0,
208
+ 14394.0,
209
+ 35995.0,
210
+ 74218.0
211
+ ],
212
+ "faithfulness": [
213
+ 0.0,
214
+ 0.10374149659863946,
215
+ 0.27253401360544216,
216
+ 0.4574829931972789,
217
+ 0.9217687074829932,
218
+ 1.3129251700680271,
219
+ 1.564625850340136,
220
+ 1.6734693877551021,
221
+ 1.91156462585034,
222
+ 1.0
223
+ ]
224
+ },
225
+ "ioi": {
226
+ "edge_counts": [
227
+ 28.0,
228
+ 77.0,
229
+ 269.0,
230
+ 624.0,
231
+ 1342.0,
232
+ 3526.0,
233
+ 7322.0,
234
+ 14745.0,
235
+ 36995.0,
236
+ 74218.0
237
+ ],
238
+ "faithfulness": [
239
+ 0.04180064308681672,
240
+ 0.36897106109324757,
241
+ 1.77491961414791,
242
+ 2.707395498392283,
243
+ 3.247588424437299,
244
+ 4.147909967845659,
245
+ 4.508038585209003,
246
+ 4.713826366559486,
247
+ 4.816720257234727,
248
+ 1.0
249
+ ]
250
+ },
251
+ "arc_easy": {
252
+ "edge_counts": [
253
+ 38.0,
254
+ 107.0,
255
+ 228.0,
256
+ 443.0,
257
+ 988.0,
258
+ 3232.0,
259
+ 7086.0,
260
+ 14674.0,
261
+ 36641.0,
262
+ 74218.0
263
+ ],
264
+ "faithfulness": [
265
+ 0.0736196319018405,
266
+ 0.18404907975460122,
267
+ 0.4302147239263804,
268
+ 0.9447852760736196,
269
+ 1.1748466257668713,
270
+ 1.549079754601227,
271
+ 1.6349693251533743,
272
+ 1.7576687116564418,
273
+ 1.9171779141104295,
274
+ 1.0
275
+ ]
276
+ }
277
+ }
278
+ },
279
+ {
280
+ "model_id": "gpt2",
281
+ "scores": {
282
+ "ioi": {
283
+ "edge_counts": [
284
+ 28.17708396911621,
285
+ 74.19965362548828,
286
+ 317.2265625,
287
+ 881.1848754882812,
288
+ 2312.229248046875,
289
+ 7085.25,
290
+ 15016.50390625,
291
+ 30456.0,
292
+ 76492.484375,
293
+ 152985.0
294
+ ],
295
+ "faithfulness": [
296
+ 0.11661430390775049,
297
+ 0.4642177695188903,
298
+ 1.4695986430872536,
299
+ 1.8416996434283752,
300
+ 2.103407867294637,
301
+ 2.3131087656712164,
302
+ 2.385959455571554,
303
+ 2.406932236515815,
304
+ 2.365694611464723,
305
+ 1.0
306
+ ]
307
+ }
308
+ }
309
+ }
310
+ ]
311
+ }
eval-results-mib-subgraph/baselines/EAP_CF.json ADDED
@@ -0,0 +1,285 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "method_name": "EAP (CF)",
3
+ "results": [
4
+ {
5
+ "model_id": "gemma2",
6
+ "scores": {
7
+ "mcqa": {
8
+ "edge_counts": [
9
+ 40.0,
10
+ 76.0,
11
+ 176.0,
12
+ 332.0,
13
+ 962.0,
14
+ 3256.0,
15
+ 6866.0,
16
+ 14441.0,
17
+ 36027.0,
18
+ 74218.0
19
+ ],
20
+ "faithfulness": [
21
+ 0.05782312925170068,
22
+ 0.06462585034013606,
23
+ 0.13435374149659865,
24
+ 0.19727891156462585,
25
+ 0.3521471088435374,
26
+ 0.9591836734693877,
27
+ 1.2857142857142858,
28
+ 1.435374149659864,
29
+ 1.5510204081632653,
30
+ 1.0
31
+ ]
32
+ },
33
+ "arc_easy": {
34
+ "edge_counts": [
35
+ 33.0,
36
+ 85.0,
37
+ 231.0,
38
+ 451.0,
39
+ 1054.0,
40
+ 3305.0,
41
+ 7024.0,
42
+ 14506.0,
43
+ 36031.0,
44
+ 74218.0
45
+ ],
46
+ "faithfulness": [
47
+ 0.04447852760736196,
48
+ 0.06441717791411043,
49
+ 0.21779141104294478,
50
+ 0.3159509202453988,
51
+ 0.5076687116564417,
52
+ 0.8374233128834356,
53
+ 1.0245398773006136,
54
+ 1.138036809815951,
55
+ 1.334355828220859,
56
+ 1.0
57
+ ]
58
+ },
59
+ "ioi": {
60
+ "edge_counts": [
61
+ 1193.0,
62
+ 2592.0,
63
+ 7327.0,
64
+ 15198.0,
65
+ 30956.0,
66
+ 78020.0,
67
+ 156182.0,
68
+ 311361.0,
69
+ 768765.0,
70
+ 1592881.0
71
+ ],
72
+ "faithfulness": [
73
+ 0.08709677419354839,
74
+ 0.16612903225806452,
75
+ 0.37661290322580643,
76
+ 0.5518145161290322,
77
+ 0.8741935483870967,
78
+ 1.0354838709677419,
79
+ 1.3612903225806452,
80
+ 1.3612903225806452,
81
+ 1.4645161290322581,
82
+ 1.0
83
+ ]
84
+ }
85
+ }
86
+ },
87
+ {
88
+ "model_id": "qwen2.5",
89
+ "scores": {
90
+ "ioi": {
91
+ "edge_counts": [
92
+ 110.0,
93
+ 246.0,
94
+ 653.0,
95
+ 1455.0,
96
+ 3114.0,
97
+ 8393.0,
98
+ 17273.0,
99
+ 34695.0,
100
+ 87690.0,
101
+ 179749.0
102
+ ],
103
+ "faithfulness": [
104
+ 0.0,
105
+ 0.0,
106
+ 0.0,
107
+ 0.0,
108
+ 0.0,
109
+ 0.0,
110
+ 0.01557632398753894,
111
+ 0.021806853582554516,
112
+ 0.018691588785046728,
113
+ 1.0
114
+ ]
115
+ },
116
+ "mcqa": {
117
+ "edge_counts": [
118
+ 128.0,
119
+ 262.0,
120
+ 724.0,
121
+ 1586.0,
122
+ 3413.0,
123
+ 8813.0,
124
+ 17768.0,
125
+ 35535.0,
126
+ 86556.0,
127
+ 179749.0
128
+ ],
129
+ "faithfulness": [
130
+ 0.27681388012618297,
131
+ 0.3994479495268139,
132
+ 0.5441640378548895,
133
+ 0.5291798107255521,
134
+ 0.48738170347003157,
135
+ 0.5977917981072555,
136
+ 0.7523659305993691,
137
+ 0.6876971608832808,
138
+ 0.667192429022082,
139
+ 1.0
140
+ ]
141
+ }
142
+ }
143
+ },
144
+ {
145
+ "model_id": "gpt2",
146
+ "scores": {
147
+ "ioi": {
148
+ "edge_counts": [
149
+ 28.17708396911621,
150
+ 74.19965362548828,
151
+ 317.2265625,
152
+ 881.1848754882812,
153
+ 2312.229248046875,
154
+ 7085.25,
155
+ 15016.50390625,
156
+ 30456.0,
157
+ 76492.484375,
158
+ 152985.0
159
+ ],
160
+ "faithfulness": [
161
+ 0.0,
162
+ 0.22449694731820746,
163
+ 0.5048270710448762,
164
+ 0.7968936313069007,
165
+ 0.976166532167683,
166
+ 1.2088630716953397,
167
+ 1.232255211567972,
168
+ 1.2651923955277595,
169
+ 1.3237581777016123,
170
+ 1.0
171
+ ]
172
+ }
173
+ }
174
+ },
175
+ {
176
+ "model_id": "llama3",
177
+ "scores": {
178
+ "arithmetic_subtraction": {
179
+ "edge_counts": [
180
+ 1101.0,
181
+ 2521.0,
182
+ 7330.0,
183
+ 15268.0,
184
+ 31128.0,
185
+ 78568.0,
186
+ 157165.0,
187
+ 313324.0,
188
+ 773309.0,
189
+ 1592881.0
190
+ ],
191
+ "faithfulness": [
192
+ 0.21634615384615385,
193
+ 0.3076923076923077,
194
+ 0.4110576923076923,
195
+ 0.4467147435897436,
196
+ 0.49854767628205127,
197
+ 0.5422676282051282,
198
+ 0.5584935897435898,
199
+ 0.5737179487179487,
200
+ 0.577323717948718,
201
+ 1.0
202
+ ]
203
+ },
204
+ "arithmetic_addition": {
205
+ "edge_counts": [
206
+ 1105.0,
207
+ 2636.0,
208
+ 7488.0,
209
+ 15424.0,
210
+ 31195.0,
211
+ 78423.0,
212
+ 156792.0,
213
+ 312466.0,
214
+ 776673.0,
215
+ 1592881.0
216
+ ],
217
+ "faithfulness": [
218
+ 0.08,
219
+ 0.13411764705882354,
220
+ 0.18352941176470589,
221
+ 0.22588235294117648,
222
+ 0.2847058823529412,
223
+ 0.3588235294117647,
224
+ 0.37529411764705883,
225
+ 0.3841176470588235,
226
+ 0.3988235294117647,
227
+ 1.0
228
+ ]
229
+ },
230
+ "mcqa": {
231
+ "edge_counts": [
232
+ 1204.0,
233
+ 2743.0,
234
+ 7462.0,
235
+ 15300.0,
236
+ 31172.0,
237
+ 78615.0,
238
+ 157496.0,
239
+ 314048.0,
240
+ 767228.0,
241
+ 1592881.0
242
+ ],
243
+ "faithfulness": [
244
+ 0.014056224899598393,
245
+ 0.04618473895582329,
246
+ 0.1144578313253012,
247
+ 0.19678714859437751,
248
+ 0.19076305220883535,
249
+ 0.3644578313253012,
250
+ 0.5144327309236948,
251
+ 0.7590361445783133,
252
+ 0.7570281124497992,
253
+ 1.0
254
+ ]
255
+ },
256
+ "arc_challenge": {
257
+ "edge_counts": [
258
+ 1080.0,
259
+ 2633.0,
260
+ 6870.0,
261
+ 14850.0,
262
+ 30338.0,
263
+ 77983.0,
264
+ 156741.0,
265
+ 306677.0,
266
+ 781379.0,
267
+ 1592881.0
268
+ ],
269
+ "faithfulness": [
270
+ -0.0024390243902439024,
271
+ 0.0,
272
+ 0.0,
273
+ 0.0,
274
+ 0.0,
275
+ 0.0,
276
+ 0.0,
277
+ 0.014634146341463415,
278
+ 0.007317073170731708,
279
+ 1.0
280
+ ]
281
+ }
282
+ }
283
+ }
284
+ ]
285
+ }
eval-results-mib-subgraph/baselines/EAP_OA.json ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "method_name": "EAP (OA)",
3
+ "results": [
4
+ {
5
+ "model_id": "gpt2",
6
+ "scores": {
7
+ "ioi": {
8
+ "edge_counts": [
9
+ 3.0,
10
+ 8.0,
11
+ 18.0,
12
+ 66.0,
13
+ 119.0,
14
+ 616.0,
15
+ 2699.0,
16
+ 6055.0,
17
+ 14988.0,
18
+ 32491.0
19
+ ],
20
+ "faithfulness": [
21
+ 0.0,
22
+ 0.0,
23
+ 0.0,
24
+ -0.02605376953427554,
25
+ -0.027599262798968967,
26
+ 0.018682773460249002,
27
+ 0.3035174289407458,
28
+ 0.905198792680162,
29
+ 1.2374291327745524,
30
+ 1.0
31
+ ]
32
+ }
33
+ }
34
+ },
35
+ {
36
+ "model_id": "qwen2.5",
37
+ "scores": {
38
+ "mcqa": {
39
+ "edge_counts": [
40
+ 19.0,
41
+ 81.0,
42
+ 203.0,
43
+ 360.0,
44
+ 872.0,
45
+ 7146.0,
46
+ 17450.0,
47
+ 35719.0,
48
+ 87921.0,
49
+ 179749.0
50
+ ],
51
+ "faithfulness": [
52
+ 0.0,
53
+ -0.0031645569620253164,
54
+ -0.006329113924050633,
55
+ -0.015822784810126583,
56
+ 0.03164556962025317,
57
+ 0.09651898734177215,
58
+ 0.189873417721519,
59
+ 0.13132911392405064,
60
+ 0.07278481012658228,
61
+ 1.0
62
+ ]
63
+ },
64
+ "ioi": {
65
+ "edge_counts": [
66
+ 7.0,
67
+ 62.0,
68
+ 163.0,
69
+ 393.0,
70
+ 868.0,
71
+ 7103.0,
72
+ 17098.0,
73
+ 35450.0,
74
+ 88180.0,
75
+ 179749.0
76
+ ],
77
+ "faithfulness": [
78
+ 0.0,
79
+ 0.0,
80
+ 0.0,
81
+ 0.16043613707165108,
82
+ 0.22429906542056074,
83
+ 0.38317757009345793,
84
+ 0.5595794392523364,
85
+ 0.6433021806853583,
86
+ 0.6448598130841121,
87
+ 1.0
88
+ ]
89
+ }
90
+ }
91
+ }
92
+ ]
93
+ }
eval-results-mib-subgraph/baselines/EAP_mean.json ADDED
@@ -0,0 +1,311 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "method_name": "EAP (mean)",
3
+ "results": [
4
+ {
5
+ "model_id": "gemma2",
6
+ "scores": {
7
+ "mcqa": {
8
+ "edge_counts": [
9
+ 31.0,
10
+ 71.0,
11
+ 139.0,
12
+ 266.0,
13
+ 746.0,
14
+ 2672.0,
15
+ 6280.0,
16
+ 13367.0,
17
+ 36357.0,
18
+ 74218.0
19
+ ],
20
+ "faithfulness": [
21
+ 0.0,
22
+ 0.0,
23
+ -0.003401360544217687,
24
+ -0.01020408163265306,
25
+ 0.07312925170068027,
26
+ 0.06292517006802721,
27
+ -0.006802721088435374,
28
+ -0.1326530612244898,
29
+ -0.22108843537414966,
30
+ 1.0
31
+ ]
32
+ },
33
+ "arc_easy": {
34
+ "edge_counts": [
35
+ 32.0,
36
+ 66.0,
37
+ 154.0,
38
+ 266.0,
39
+ 717.0,
40
+ 2773.0,
41
+ 6332.0,
42
+ 13490.0,
43
+ 36992.0,
44
+ 74218.0
45
+ ],
46
+ "faithfulness": [
47
+ 0.0,
48
+ 0.0,
49
+ -0.0015337423312883436,
50
+ 0.032208588957055216,
51
+ 0.05061349693251534,
52
+ 0.08742331288343558,
53
+ 0.0705521472392638,
54
+ 0.1303680981595092,
55
+ 0.09202453987730061,
56
+ 1.0
57
+ ]
58
+ },
59
+ "ioi": {
60
+ "edge_counts": [
61
+ 14.0,
62
+ 28.0,
63
+ 103.0,
64
+ 296.0,
65
+ 866.0,
66
+ 3074.0,
67
+ 6433.0,
68
+ 13536.0,
69
+ 36793.0,
70
+ 74218.0
71
+ ],
72
+ "faithfulness": [
73
+ 0.0,
74
+ 0.0,
75
+ 0.02572347266881029,
76
+ 0.26688102893890675,
77
+ 0.2990353697749196,
78
+ 0.39147909967845657,
79
+ 0.5315012057877814,
80
+ 0.5836012861736335,
81
+ 0.6213826366559485,
82
+ 1.0
83
+ ]
84
+ }
85
+ }
86
+ },
87
+ {
88
+ "model_id": "llama3",
89
+ "scores": {
90
+ "arc_easy": {
91
+ "edge_counts": [
92
+ 1062.0,
93
+ 2532.0,
94
+ 7119.0,
95
+ 14925.0,
96
+ 30758.0,
97
+ 78089.0,
98
+ 156448.0,
99
+ 313277.0,
100
+ 777481.0,
101
+ 1592881.0
102
+ ],
103
+ "faithfulness": [
104
+ 0.0,
105
+ 0.0,
106
+ 0.0020833333333333333,
107
+ 0.035416666666666666,
108
+ 0.075,
109
+ 0.115625,
110
+ 0.13854166666666667,
111
+ 0.13958333333333334,
112
+ 0.12291666666666666,
113
+ 1.0
114
+ ]
115
+ },
116
+ "arc_challenge": {
117
+ "edge_counts": [
118
+ 1040.0,
119
+ 2529.0,
120
+ 6078.0,
121
+ 14242.0,
122
+ 29098.0,
123
+ 74505.0,
124
+ 152010.0,
125
+ 306198.0,
126
+ 778836.0,
127
+ 1592881.0
128
+ ],
129
+ "faithfulness": [
130
+ -0.0024390243902439024,
131
+ -0.0024390243902439024,
132
+ 0.0,
133
+ 0.004878048780487805,
134
+ 0.05365853658536585,
135
+ 0.02926829268292683,
136
+ 0.02195121951219512,
137
+ 0.03170731707317073,
138
+ 0.1048780487804878,
139
+ 1.0
140
+ ]
141
+ },
142
+ "arithmetic_subtraction": {
143
+ "edge_counts": [
144
+ 1125.0,
145
+ 2578.0,
146
+ 7432.0,
147
+ 15224.0,
148
+ 30876.0,
149
+ 77771.0,
150
+ 155258.0,
151
+ 309527.0,
152
+ 774834.0,
153
+ 1592881.0
154
+ ],
155
+ "faithfulness": [
156
+ 0.0,
157
+ 0.003205128205128205,
158
+ 0.016025641025641024,
159
+ 0.028846153846153848,
160
+ 0.041666666666666664,
161
+ 0.10256410256410256,
162
+ 0.10096153846153846,
163
+ 0.11538461538461539,
164
+ 0.11538461538461539,
165
+ 1.0
166
+ ]
167
+ },
168
+ "mcqa": {
169
+ "edge_counts": [
170
+ 1044.0,
171
+ 2503.0,
172
+ 7278.0,
173
+ 15027.0,
174
+ 30930.0,
175
+ 78647.0,
176
+ 157997.0,
177
+ 315572.0,
178
+ 777353.0,
179
+ 1592881.0
180
+ ],
181
+ "faithfulness": [
182
+ 0.0,
183
+ 0.004016064257028112,
184
+ 0.008032128514056224,
185
+ 0.012048192771084338,
186
+ 0.050200803212851405,
187
+ 0.13253012048192772,
188
+ 0.09437751004016064,
189
+ 0.10843373493975904,
190
+ 0.11646586345381527,
191
+ 1.0
192
+ ]
193
+ },
194
+ "arithmetic_addition": {
195
+ "edge_counts": [
196
+ 1148.0,
197
+ 2687.0,
198
+ 7431.0,
199
+ 15219.0,
200
+ 30812.0,
201
+ 77526.0,
202
+ 154898.0,
203
+ 307910.0,
204
+ 769045.0,
205
+ 1592881.0
206
+ ],
207
+ "faithfulness": [
208
+ 0.0,
209
+ 0.0,
210
+ 0.002352941176470588,
211
+ 0.018823529411764704,
212
+ 0.07058823529411765,
213
+ 0.18352941176470589,
214
+ 0.27647058823529413,
215
+ 0.31411764705882356,
216
+ 0.31882352941176473,
217
+ 1.0
218
+ ]
219
+ }
220
+ }
221
+ },
222
+ {
223
+ "model_id": "gpt2",
224
+ "scores": {
225
+ "ioi": {
226
+ "edge_counts": [
227
+ 28.17708396911621,
228
+ 74.19965362548828,
229
+ 317.2265625,
230
+ 881.1848754882812,
231
+ 2312.229248046875,
232
+ 7085.25,
233
+ 15016.50390625,
234
+ 30456.0,
235
+ 76492.484375,
236
+ 152985.0
237
+ ],
238
+ "faithfulness": [
239
+ 0.0,
240
+ 0.0,
241
+ 0.0,
242
+ -0.00020504474302805614,
243
+ -0.0014366993239439858,
244
+ 0.0008672395297865946,
245
+ -0.012577542580169451,
246
+ 0.08428255131781394,
247
+ 0.05067079889015518,
248
+ 1.0
249
+ ]
250
+ }
251
+ }
252
+ },
253
+ {
254
+ "model_id": "qwen2.5",
255
+ "scores": {
256
+ "mcqa": {
257
+ "edge_counts": [
258
+ 58.0,
259
+ 119.0,
260
+ 434.0,
261
+ 1327.0,
262
+ 3168.0,
263
+ 8434.0,
264
+ 17304.0,
265
+ 35310.0,
266
+ 87249.0,
267
+ 179749.0
268
+ ],
269
+ "faithfulness": [
270
+ 0.0,
271
+ 0.0,
272
+ 0.006329113924050633,
273
+ 0.006329113924050633,
274
+ -0.0189873417721519,
275
+ -0.022151898734177215,
276
+ 0.006329113924050633,
277
+ 0.0,
278
+ 0.1439873417721519,
279
+ 1.0
280
+ ]
281
+ },
282
+ "ioi": {
283
+ "edge_counts": [
284
+ 26.0,
285
+ 85.0,
286
+ 298.0,
287
+ 842.0,
288
+ 2512.0,
289
+ 7868.0,
290
+ 16655.0,
291
+ 34761.0,
292
+ 86664.0,
293
+ 179749.0
294
+ ],
295
+ "faithfulness": [
296
+ 0.0,
297
+ 0.0,
298
+ 0.0,
299
+ 0.0,
300
+ 0.0,
301
+ 0.5311039719626168,
302
+ 0.5891744548286605,
303
+ 0.6456386292834891,
304
+ 0.6401869158878505,
305
+ 1.0
306
+ ]
307
+ }
308
+ }
309
+ }
310
+ ]
311
+ }
eval-results-mib-subgraph/baselines/IFR.json ADDED
@@ -0,0 +1,285 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "method_name": "IFR",
3
+ "results": [
4
+ {
5
+ "model_id": "gemma2",
6
+ "scores": {
7
+ "ioi": {
8
+ "edge_counts": [
9
+ 0.0,
10
+ 0.0,
11
+ 0.0,
12
+ 0.0,
13
+ 0.0,
14
+ 60.0,
15
+ 697.0,
16
+ 2012.0,
17
+ 27442.0,
18
+ 74218.0
19
+ ],
20
+ "faithfulness": [
21
+ 0.0,
22
+ 0.0,
23
+ 0.0,
24
+ 0.0,
25
+ 0.0,
26
+ 0.0,
27
+ 0.0,
28
+ 0.0,
29
+ 0.0,
30
+ 1.0
31
+ ]
32
+ },
33
+ "arc_easy": {
34
+ "edge_counts": [
35
+ 0.0,
36
+ 0.0,
37
+ 0.0,
38
+ 0.0,
39
+ 0.0,
40
+ 34.0,
41
+ 569.0,
42
+ 1843.0,
43
+ 23583.0,
44
+ 74218.0
45
+ ],
46
+ "faithfulness": [
47
+ 0.0,
48
+ 0.0,
49
+ 0.0,
50
+ 0.0,
51
+ 0.0,
52
+ 0.0,
53
+ 0.0,
54
+ 0.0,
55
+ 0.0,
56
+ 1.0
57
+ ]
58
+ },
59
+ "mcqa": {
60
+ "edge_counts": [
61
+ 0.0,
62
+ 0.0,
63
+ 0.0,
64
+ 0.0,
65
+ 0.0,
66
+ 48.0,
67
+ 282.0,
68
+ 1912.0,
69
+ 36317.0,
70
+ 74218.0
71
+ ],
72
+ "faithfulness": [
73
+ 0.0,
74
+ 0.0,
75
+ 0.0,
76
+ 0.0,
77
+ 0.0,
78
+ 0.0,
79
+ 0.0,
80
+ 0.0,
81
+ 0.18877551020408162,
82
+ 1.0
83
+ ]
84
+ }
85
+ }
86
+ },
87
+ {
88
+ "model_id": "llama3",
89
+ "scores": {
90
+ "arc_easy": {
91
+ "edge_counts": [
92
+ 0.0,
93
+ 0.0,
94
+ 0.0,
95
+ 0.0,
96
+ 0.0,
97
+ 0.0,
98
+ 0.0,
99
+ 81121.0,
100
+ 678541.0,
101
+ 1592881.0
102
+ ],
103
+ "faithfulness": [
104
+ 0.0,
105
+ 0.0,
106
+ 0.0,
107
+ 0.0,
108
+ 0.0,
109
+ 0.0,
110
+ 0.0,
111
+ 0.0,
112
+ 0.24895833333333334,
113
+ 1.0
114
+ ]
115
+ },
116
+ "arithmetic_addition": {
117
+ "edge_counts": [
118
+ 0.0,
119
+ 0.0,
120
+ 0.0,
121
+ 0.0,
122
+ 0.0,
123
+ 0.0,
124
+ 19009.0,
125
+ 245201.0,
126
+ 728253.0,
127
+ 1592881.0
128
+ ],
129
+ "faithfulness": [
130
+ 0.0,
131
+ 0.0,
132
+ 0.0,
133
+ 0.0,
134
+ 0.0,
135
+ 0.0,
136
+ 0.0,
137
+ 0.9435294117647058,
138
+ 1.3129411764705883,
139
+ 1.0
140
+ ]
141
+ },
142
+ "arithmetic_subtraction": {
143
+ "edge_counts": [
144
+ 0.0,
145
+ 0.0,
146
+ 0.0,
147
+ 0.0,
148
+ 0.0,
149
+ 1719.0,
150
+ 104811.0,
151
+ 210163.0,
152
+ 726283.0,
153
+ 1592881.0
154
+ ],
155
+ "faithfulness": [
156
+ 0.0,
157
+ 0.0,
158
+ 0.0,
159
+ 0.0,
160
+ 0.0,
161
+ 0.0,
162
+ 0.39503205128205127,
163
+ 0.9198717948717948,
164
+ 1.2788461538461537,
165
+ 1.0
166
+ ]
167
+ },
168
+ "mcqa": {
169
+ "edge_counts": [
170
+ 0.0,
171
+ 0.0,
172
+ 0.0,
173
+ 0.0,
174
+ 0.0,
175
+ 0.0,
176
+ 11420.0,
177
+ 249100.0,
178
+ 687383.0,
179
+ 1592881.0
180
+ ],
181
+ "faithfulness": [
182
+ 0.0,
183
+ 0.0,
184
+ 0.0,
185
+ 0.0,
186
+ 0.0,
187
+ 0.0,
188
+ 0.0,
189
+ 0.07228915662650602,
190
+ 0.7018072289156626,
191
+ 1.0
192
+ ]
193
+ }
194
+ }
195
+ },
196
+ {
197
+ "model_id": "gpt2",
198
+ "scores": {
199
+ "ioi": {
200
+ "edge_counts": [
201
+ 0.0,
202
+ 0.0,
203
+ 1.0,
204
+ 5.0,
205
+ 18.0,
206
+ 61.0,
207
+ 431.0,
208
+ 2217.0,
209
+ 11133.0,
210
+ 32491.0
211
+ ],
212
+ "faithfulness": [
213
+ 0.0,
214
+ 0.0,
215
+ 0.0,
216
+ 0.0,
217
+ 9.699480413473062e-05,
218
+ 0.002604457555290081,
219
+ 0.005239274585697969,
220
+ 0.007735479313229455,
221
+ 0.8286768164229479,
222
+ 1.0
223
+ ]
224
+ }
225
+ }
226
+ },
227
+ {
228
+ "model_id": "qwen2.5",
229
+ "scores": {
230
+ "mcqa": {
231
+ "edge_counts": [
232
+ 0.0,
233
+ 0.0,
234
+ 0.0,
235
+ 0.0,
236
+ 0.0,
237
+ 0.0,
238
+ 0.0,
239
+ 6114.0,
240
+ 78382.0,
241
+ 179749.0
242
+ ],
243
+ "faithfulness": [
244
+ 0.0,
245
+ 0.0,
246
+ 0.0,
247
+ 0.0,
248
+ 0.0,
249
+ 0.0,
250
+ 0.0,
251
+ 0.0,
252
+ 0.6593059936908517,
253
+ 1.0
254
+ ]
255
+ },
256
+ "ioi": {
257
+ "edge_counts": [
258
+ 0.0,
259
+ 0.0,
260
+ 0.0,
261
+ 0.0,
262
+ 0.0,
263
+ 18.0,
264
+ 18.0,
265
+ 8519.0,
266
+ 77692.0,
267
+ 179749.0
268
+ ],
269
+ "faithfulness": [
270
+ 0.0,
271
+ 0.0,
272
+ 0.0,
273
+ 0.0,
274
+ 0.0,
275
+ 0.0,
276
+ 0.0,
277
+ 0.0,
278
+ 0.20249221183800623,
279
+ 1.0
280
+ ]
281
+ }
282
+ }
283
+ }
284
+ ]
285
+ }
eval-results-mib-subgraph/baselines/NAP-IG-inputs_CF.json ADDED
@@ -0,0 +1,337 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "method_name": "NAP-IG-inputs (CF)",
3
+ "results": [
4
+ {
5
+ "model_id": "qwen2.5",
6
+ "scores": {
7
+ "ioi": {
8
+ "edge_counts": [
9
+ 109.0,
10
+ 256.0,
11
+ 673.0,
12
+ 897.0,
13
+ 1203.0,
14
+ 2237.0,
15
+ 6224.0,
16
+ 16053.0,
17
+ 57617.0,
18
+ 179749.0
19
+ ],
20
+ "faithfulness": [
21
+ 0.47001557632398755,
22
+ 0.6954828660436138,
23
+ 1.087227414330218,
24
+ 1.087227414330218,
25
+ 1.1059190031152648,
26
+ 1.1246105919003115,
27
+ 1.2024922118380061,
28
+ 1.2118380062305296,
29
+ 1.3582554517133956,
30
+ 1.0
31
+ ]
32
+ },
33
+ "mcqa": {
34
+ "edge_counts": [
35
+ 15.861607551574707,
36
+ 39.65401840209961,
37
+ 223.78125,
38
+ 978.140625,
39
+ 3732.670654296875,
40
+ 15233.4462890625,
41
+ 34149.96484375,
42
+ 71549.9765625,
43
+ 185915.046875,
44
+ 372913.0
45
+ ],
46
+ "faithfulness": [
47
+ -0.0031645569620253164,
48
+ 0.0,
49
+ 0.0,
50
+ 0.012658227848101266,
51
+ 0.0379746835443038,
52
+ 0.11708860759493671,
53
+ 0.2120253164556962,
54
+ 0.27610759493670883,
55
+ 0.49802215189873417,
56
+ 1.0
57
+ ]
58
+ }
59
+ }
60
+ },
61
+ {
62
+ "model_id": "gemma2",
63
+ "scores": {
64
+ "mcqa": {
65
+ "edge_counts": [
66
+ 20.42838478088379,
67
+ 76.078125,
68
+ 334.8502502441406,
69
+ 1029.2239990234375,
70
+ 2481.416748046875,
71
+ 7191.0,
72
+ 15016.498046875,
73
+ 30315.009765625,
74
+ 76492.484375,
75
+ 152985.0
76
+ ],
77
+ "faithfulness": [
78
+ 0.006802721088435374,
79
+ 0.01020408163265306,
80
+ 0.03571428571428571,
81
+ 0.07482993197278912,
82
+ 0.16071428571428573,
83
+ 0.5068027210884354,
84
+ 1.119047619047619,
85
+ 2.17687074829932,
86
+ 3.061224489795918,
87
+ 1.0
88
+ ]
89
+ },
90
+ "arc_easy": {
91
+ "edge_counts": [
92
+ 20.19357681274414,
93
+ 76.078125,
94
+ 296.078125,
95
+ 852.9869995117188,
96
+ 2340.427001953125,
97
+ 7191.0,
98
+ 14946.0,
99
+ 30456.0,
100
+ 76492.5078125,
101
+ 152985.0
102
+ ],
103
+ "faithfulness": [
104
+ 0.009202453987730062,
105
+ 0.01687116564417178,
106
+ 0.04141104294478527,
107
+ 0.08282208588957055,
108
+ 0.18251533742331288,
109
+ 0.4854294478527607,
110
+ 1.0582822085889572,
111
+ 1.7699386503067485,
112
+ 2.303680981595092,
113
+ 1.0
114
+ ]
115
+ },
116
+ "ioi": {
117
+ "edge_counts": [
118
+ 5.87022590637207,
119
+ 28.17708396911621,
120
+ 155.0885467529297,
121
+ 505.2126770019531,
122
+ 1362.8992919921875,
123
+ 5111.25,
124
+ 13254.0,
125
+ 29892.0,
126
+ 76492.515625,
127
+ 152985.0
128
+ ],
129
+ "faithfulness": [
130
+ 0.0,
131
+ 0.0,
132
+ 0.0,
133
+ 0.006430868167202572,
134
+ 0.04823151125401929,
135
+ 0.2877813504823151,
136
+ 0.882636655948553,
137
+ 1.6463022508038585,
138
+ 2.1607717041800645,
139
+ 1.0
140
+ ]
141
+ }
142
+ }
143
+ },
144
+ {
145
+ "model_id": "gpt2",
146
+ "scores": {
147
+ "ioi": {
148
+ "edge_counts": [
149
+ 1.25,
150
+ 4.70703125,
151
+ 23.515625,
152
+ 67.46744537353516,
153
+ 197.77734375,
154
+ 1279.3802490234375,
155
+ 3971.90234375,
156
+ 13030.8916015625,
157
+ 34932.5,
158
+ 69865.0
159
+ ],
160
+ "faithfulness": [
161
+ -9.80428417467502e-07,
162
+ 7.268693439845274e-06,
163
+ -3.6850585346192315e-05,
164
+ 0.00037178521747896964,
165
+ 0.005384073366337484,
166
+ 0.05083179885016707,
167
+ 0.17342828703671462,
168
+ 0.4600666180824943,
169
+ 1.0150521456067374,
170
+ 1.0
171
+ ]
172
+ }
173
+ }
174
+ },
175
+ {
176
+ "model_id": "llama3",
177
+ "scores": {
178
+ "arithmetic_addition": {
179
+ "edge_counts": [
180
+ 66.583740234375,
181
+ 238.85595703125,
182
+ 2013.551513671875,
183
+ 8403.009765625,
184
+ 32026.939453125,
185
+ 120814.765625,
186
+ 276828.1875,
187
+ 615173.8125,
188
+ 1639407.0,
189
+ 3281985.0
190
+ ],
191
+ "faithfulness": [
192
+ 0.0,
193
+ -0.002352941176470588,
194
+ 0.0,
195
+ 0.002352941176470588,
196
+ 0.01647058823529412,
197
+ 0.08,
198
+ 0.1811764705882353,
199
+ 0.2858823529411765,
200
+ 0.36764705882352944,
201
+ 1.0
202
+ ]
203
+ },
204
+ "mcqa": {
205
+ "edge_counts": [
206
+ 453.403564453125,
207
+ 1490.20751953125,
208
+ 6706.5537109375,
209
+ 19596.455078125,
210
+ 50291.8125,
211
+ 148085.3125,
212
+ 313294.65625,
213
+ 649420.5,
214
+ 1639406.875,
215
+ 3281985.0
216
+ ],
217
+ "faithfulness": [
218
+ 0.0019305019305019305,
219
+ 0.0019305019305019305,
220
+ 0.007722007722007722,
221
+ 0.025096525096525095,
222
+ 0.11196911196911197,
223
+ 0.9884169884169884,
224
+ 1.9652509652509653,
225
+ 2.305019305019305,
226
+ 2.4208494208494207,
227
+ 1.0
228
+ ]
229
+ },
230
+ "arc_challenge": {
231
+ "edge_counts": [
232
+ 24.308349609375,
233
+ 107.80224609375,
234
+ 317.0947265625,
235
+ 697.6083984375,
236
+ 7103.00390625,
237
+ 77689.2890625,
238
+ 278096.5625,
239
+ 649420.5,
240
+ 1640992.625,
241
+ 3281985.0
242
+ ],
243
+ "faithfulness": [
244
+ -0.0024390243902439024,
245
+ 0.0,
246
+ 0.0,
247
+ 0.0,
248
+ 0.0,
249
+ 0.0,
250
+ 0.0,
251
+ 0.004878048780487805,
252
+ 0.012195121951219513,
253
+ 1.0
254
+ ]
255
+ },
256
+ "arithmetic_subtraction": {
257
+ "edge_counts": [
258
+ 53.901123046875,
259
+ 257.8798828125,
260
+ 1839.1494140625,
261
+ 9068.91015625,
262
+ 32851.39453125,
263
+ 120022.0390625,
264
+ 273340.0625,
265
+ 613271.0625,
266
+ 1639407.0,
267
+ 3281985.0
268
+ ],
269
+ "faithfulness": [
270
+ 0.0,
271
+ 0.0,
272
+ 0.003205128205128205,
273
+ 0.0,
274
+ 0.00641025641025641,
275
+ 0.022435897435897436,
276
+ 0.08012820512820513,
277
+ 0.17467948717948717,
278
+ 0.2644230769230769,
279
+ 1.0
280
+ ]
281
+ },
282
+ "ioi": {
283
+ "edge_counts": [
284
+ 211.376953125,
285
+ 629.9033203125,
286
+ 2991.26025390625,
287
+ 9248.595703125,
288
+ 27418.44140625,
289
+ 109716.296875,
290
+ 271754.59375,
291
+ 614539.625,
292
+ 1640992.5,
293
+ 3281985.0
294
+ ],
295
+ "faithfulness": [
296
+ 0.0,
297
+ 0.0,
298
+ 0.0,
299
+ 0.003076923076923077,
300
+ 0.015384615384615385,
301
+ 0.07384615384615385,
302
+ 0.14923076923076922,
303
+ 0.2323076923076923,
304
+ 0.2846153846153846,
305
+ 1.0
306
+ ]
307
+ },
308
+ "arc_easy": {
309
+ "edge_counts": [
310
+ 17.967041015625,
311
+ 93.005859375,
312
+ 301.239990234375,
313
+ 697.6083984375,
314
+ 8751.9150390625,
315
+ 80226.0859375,
316
+ 281901.75,
317
+ 648152.1875,
318
+ 1640992.375,
319
+ 3281985.0
320
+ ],
321
+ "faithfulness": [
322
+ 0.0,
323
+ 0.0,
324
+ 0.0,
325
+ 0.0,
326
+ 0.0,
327
+ 0.0,
328
+ 0.0,
329
+ 0.005,
330
+ 0.015,
331
+ 1.0
332
+ ]
333
+ }
334
+ }
335
+ }
336
+ ]
337
+ }
eval-results-mib-subgraph/baselines/NAP_CF.json ADDED
@@ -0,0 +1,337 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "method_name": "NAP (CF)",
3
+ "results": [
4
+ {
5
+ "model_id": "gemma2",
6
+ "scores": {
7
+ "arc_easy": {
8
+ "edge_counts": [
9
+ 28.17708396911621,
10
+ 74.19965362548828,
11
+ 317.2265625,
12
+ 881.1848754882812,
13
+ 2312.229248046875,
14
+ 7085.25,
15
+ 15016.50390625,
16
+ 30456.0,
17
+ 76492.484375,
18
+ 152985.0
19
+ ],
20
+ "faithfulness": [
21
+ 0.004601226993865031,
22
+ 0.006134969325153374,
23
+ 0.013803680981595092,
24
+ 0.02607361963190184,
25
+ 0.04754601226993865,
26
+ 0.12423312883435583,
27
+ 0.24233128834355827,
28
+ 0.5751533742331288,
29
+ 1.6411042944785277,
30
+ 1.0
31
+ ]
32
+ },
33
+ "mcqa": {
34
+ "edge_counts": [
35
+ 44.14409637451172,
36
+ 142.76388549804688,
37
+ 553.3840942382812,
38
+ 1318.2525634765625,
39
+ 2904.385498046875,
40
+ 7578.75,
41
+ 15298.49609375,
42
+ 30597.0078125,
43
+ 76492.4765625,
44
+ 152985.0
45
+ ],
46
+ "faithfulness": [
47
+ 0.0,
48
+ -0.003401360544217687,
49
+ -0.003401360544217687,
50
+ -0.0017006802721088435,
51
+ 0.0017006802721088435,
52
+ 0.02040816326530612,
53
+ 0.022108843537414966,
54
+ 0.10204081632653061,
55
+ 0.24489795918367346,
56
+ 1.0
57
+ ]
58
+ },
59
+ "ioi": {
60
+ "edge_counts": [
61
+ 78.42621612548828,
62
+ 195.36111450195312,
63
+ 587.4566040039062,
64
+ 1273.60595703125,
65
+ 2791.59375,
66
+ 7614.0,
67
+ 15298.5,
68
+ 30596.99609375,
69
+ 76492.484375,
70
+ 152985.0
71
+ ],
72
+ "faithfulness": [
73
+ 0.0,
74
+ 0.0,
75
+ 0.0,
76
+ 0.0,
77
+ 0.0,
78
+ 0.003215434083601286,
79
+ 0.01929260450160772,
80
+ 0.07395498392282958,
81
+ 0.1607717041800643,
82
+ 1.0
83
+ ]
84
+ }
85
+ }
86
+ },
87
+ {
88
+ "model_id": "llama3",
89
+ "scores": {
90
+ "arithmetic_subtraction": {
91
+ "edge_counts": [
92
+ 210.320068359375,
93
+ 596.0830078125,
94
+ 3139.23779296875,
95
+ 11288.572265625,
96
+ 34563.7265625,
97
+ 117960.8671875,
98
+ 271754.5625,
99
+ 614539.5,
100
+ 1639406.875,
101
+ 3281985.0
102
+ ],
103
+ "faithfulness": [
104
+ 0.0,
105
+ 0.0,
106
+ 0.0,
107
+ 0.0,
108
+ 0.003205128205128205,
109
+ 0.00641025641025641,
110
+ 0.01282051282051282,
111
+ 0.041666666666666664,
112
+ 0.08012820512820513,
113
+ 1.0
114
+ ]
115
+ },
116
+ "arc_easy": {
117
+ "edge_counts": [
118
+ 22.194580078125,
119
+ 59.185546875,
120
+ 248.390869140625,
121
+ 507.3515625,
122
+ 9259.2734375,
123
+ 77213.6328125,
124
+ 270803.25,
125
+ 646249.5625,
126
+ 1640992.125,
127
+ 3281985.0
128
+ ],
129
+ "faithfulness": [
130
+ 0.0,
131
+ 0.0,
132
+ 0.0,
133
+ 0.0,
134
+ 0.0,
135
+ 0.0,
136
+ 0.0,
137
+ 0.0,
138
+ 0.02,
139
+ 1.0
140
+ ]
141
+ },
142
+ "arc_challenge": {
143
+ "edge_counts": [
144
+ 31.70654296875,
145
+ 90.89208984375,
146
+ 232.5361328125,
147
+ 792.73681640625,
148
+ 9893.4697265625,
149
+ 64371.125,
150
+ 143329.125,
151
+ 628492.0,
152
+ 1640992.375,
153
+ 3281985.0
154
+ ],
155
+ "faithfulness": [
156
+ 0.0,
157
+ 0.0,
158
+ 0.0,
159
+ 0.0,
160
+ 0.0,
161
+ 0.0,
162
+ 0.0,
163
+ 0.0,
164
+ 0.012195121951219513,
165
+ 1.0
166
+ ]
167
+ },
168
+ "ioi": {
169
+ "edge_counts": [
170
+ 361.45458984375,
171
+ 1048.4296875,
172
+ 4175.08056640625,
173
+ 11753.64453125,
174
+ 31984.65625,
175
+ 109082.09375,
176
+ 260973.203125,
177
+ 612637.0,
178
+ 1640992.375,
179
+ 3281985.0
180
+ ],
181
+ "faithfulness": [
182
+ 0.0,
183
+ 0.0,
184
+ 0.0,
185
+ 0.0,
186
+ 0.0,
187
+ 0.0,
188
+ 0.009230769230769232,
189
+ 0.015384615384615385,
190
+ 0.015384615384615385,
191
+ 1.0
192
+ ]
193
+ },
194
+ "arithmetic_addition": {
195
+ "edge_counts": [
196
+ 161.703369140625,
197
+ 450.23291015625,
198
+ 2742.869384765625,
199
+ 10559.25390625,
200
+ 34246.625,
201
+ 117802.3125,
202
+ 275559.8125,
203
+ 616442.0625,
204
+ 1639407.0,
205
+ 3281985.0
206
+ ],
207
+ "faithfulness": [
208
+ 0.0,
209
+ -0.002352941176470588,
210
+ 0.0,
211
+ 0.0,
212
+ 0.0,
213
+ 0.004705882352941176,
214
+ 0.011764705882352941,
215
+ 0.0,
216
+ 0.0,
217
+ 1.0
218
+ ]
219
+ },
220
+ "mcqa": {
221
+ "edge_counts": [
222
+ 700.714599609375,
223
+ 2084.1767578125,
224
+ 9259.166015625,
225
+ 23591.84765625,
226
+ 54921.44140625,
227
+ 153634.5,
228
+ 320587.96875,
229
+ 655762.5625,
230
+ 1640992.375,
231
+ 3281985.0
232
+ ],
233
+ "faithfulness": [
234
+ 0.0,
235
+ 0.0,
236
+ 0.0019305019305019305,
237
+ 0.0019305019305019305,
238
+ 0.0,
239
+ -0.007722007722007722,
240
+ -0.02702702702702703,
241
+ -0.07722007722007722,
242
+ -0.05791505791505792,
243
+ 1.0
244
+ ]
245
+ }
246
+ }
247
+ },
248
+ {
249
+ "model_id": "qwen2.5",
250
+ "scores": {
251
+ "ioi": {
252
+ "edge_counts": [
253
+ 32.8046875,
254
+ 88.68080139160156,
255
+ 348.3046875,
256
+ 1010.625,
257
+ 3039.563720703125,
258
+ 11605.576171875,
259
+ 29601.451171875,
260
+ 70466.984375,
261
+ 185914.984375,
262
+ 372913.0
263
+ ],
264
+ "faithfulness": [
265
+ 0.0,
266
+ 0.0,
267
+ 0.0,
268
+ 0.0,
269
+ 0.006230529595015576,
270
+ 0.024922118380062305,
271
+ 0.04984423676012461,
272
+ 0.06853582554517133,
273
+ 0.08099688473520249,
274
+ 1.0
275
+ ]
276
+ },
277
+ "mcqa": {
278
+ "edge_counts": [
279
+ 21.98995590209961,
280
+ 100.21651458740234,
281
+ 516.140625,
282
+ 1844.390625,
283
+ 5357.140625,
284
+ 17507.634765625,
285
+ 37182.30859375,
286
+ 74582.3984375,
287
+ 186456.53125,
288
+ 372913.0
289
+ ],
290
+ "faithfulness": [
291
+ -0.0031645569620253164,
292
+ -0.0031645569620253164,
293
+ -0.0031645569620253164,
294
+ -0.0031645569620253164,
295
+ -0.0031645569620253164,
296
+ 0.012658227848101266,
297
+ 0.012658227848101266,
298
+ 0.03481012658227848,
299
+ 0.056962025316455694,
300
+ 1.0
301
+ ]
302
+ }
303
+ }
304
+ },
305
+ {
306
+ "model_id": "gpt2",
307
+ "scores": {
308
+ "ioi": {
309
+ "edge_counts": [
310
+ 3.28125,
311
+ 10.35546875,
312
+ 31.35416603088379,
313
+ 83.15755462646484,
314
+ 279.3997497558594,
315
+ 1350.0208740234375,
316
+ 4442.87890625,
317
+ 12654.0947265625,
318
+ 34932.49609375,
319
+ 69865.0
320
+ ],
321
+ "faithfulness": [
322
+ -3.380787646439662e-08,
323
+ 0.0,
324
+ -1.5450199544229257e-05,
325
+ -6.119225640055789e-06,
326
+ 0.00011721190770206309,
327
+ 0.0020236718694058528,
328
+ 0.015380555318712599,
329
+ 0.03909438029925786,
330
+ 0.04813088759942643,
331
+ 1.0
332
+ ]
333
+ }
334
+ }
335
+ }
336
+ ]
337
+ }
eval-results-mib-subgraph/submissions/results_2024-10-2T13-36-121.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"method_name": "EAP-IG (mean)", "results": [
2
+ {"model_id": "meta-llama/Llama-3.1-8B", "scores": {
3
+ "ioi": {
4
+ "edge_counts": [10.0, 29.0, 117.0, 269.0, 561.0, 1570.0, 3194.0, 6386.0, 16245.0, 32491.0],
5
+ "faithfulness": [0.11454112510535433,0.14123527363014815,0.3197643850972241,0.47765884872924175,0.7701570853704176,1.3201798748760563,2.037825774185549,2.651813181821849,3.27612042118584,1.0]},
6
+ "mcqa": {
7
+ "edge_counts": [10.0, 21.0, 94.0, 241.0, 527.0, 1469.0, 3046.0, 6036.0, 14832.0, 32491.0],
8
+ "faithfulness": [[0.02677059664121319,0.1965060952906922,0.449060470868564,0.7604756153676078,0.786575587658478,1.106011020720112,1.3436645156597262,1.5466349080478032,1.4914126224418107,1.0]]}
9
+ }},
10
+ {"model_id": "Qwen/Qwen2-1.5B", "scores": {
11
+ "ioi": {
12
+ "edge_counts": [],
13
+ "faithfulness": []},
14
+ "mcqa": {
15
+ "edge_counts": [],
16
+ "faithfulness": []}
17
+ }}
18
+ ]
19
+ }
src/__pycache__/about.cpython-310.pyc ADDED
Binary file (5.87 kB). View file
 
src/__pycache__/envs.cpython-310.pyc ADDED
Binary file (904 Bytes). View file
 
src/__pycache__/populate.cpython-310.pyc ADDED
Binary file (6.73 kB). View file
 
src/display/__pycache__/css_html_js.cpython-310.pyc ADDED
Binary file (1.94 kB). View file
 
src/display/__pycache__/formatting.cpython-310.pyc ADDED
Binary file (1.43 kB). View file
 
src/display/__pycache__/utils.cpython-310.pyc ADDED
Binary file (9.01 kB). View file
 
src/leaderboard/__pycache__/read_evals.cpython-310.pyc ADDED
Binary file (12.2 kB). View file
 
src/leaderboard/read_evals.py CHANGED
@@ -99,7 +99,7 @@ class EvalResult_MIB_SUBGRAPH:
99
 
100
 
101
 
102
- def to_dict(self):
103
  """Converts the Eval Result to a dict for dataframe display"""
104
  data_dict = {
105
  "eval_name": self.eval_name,
@@ -140,8 +140,8 @@ class EvalResult_MIB_SUBGRAPH:
140
  if result is None or result[0] is None:
141
  continue
142
 
143
- area_under, _, _ = result
144
- score = area_under
145
  data_dict[col_name] = round(score, 2)
146
  all_scores.append(score)
147
 
 
99
 
100
 
101
 
102
+ def to_dict(self, metric_type="F+"):
103
  """Converts the Eval Result to a dict for dataframe display"""
104
  data_dict = {
105
  "eval_name": self.eval_name,
 
140
  if result is None or result[0] is None:
141
  continue
142
 
143
+ area_under, area_from_100, _ = result
144
+ score = area_under if metric_type == "F+" else area_from_100
145
  data_dict[col_name] = round(score, 2)
146
  all_scores.append(score)
147
 
src/populate.py CHANGED
@@ -42,14 +42,15 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
42
 
43
 
44
 
45
- def get_leaderboard_df_mib_subgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
 
46
  """Creates a dataframe from all the MIB experiment results"""
47
  # print(f"results_path is {results_path}, requests_path is {requests_path}")
48
  raw_data = get_raw_eval_results_mib_subgraph(results_path, requests_path)
49
  # print(f"raw_data is {raw_data}")
50
 
51
  # Convert each result to dict format
52
- all_data_json = [v.to_dict() for v in raw_data]
53
  # print(f"all_data_json is {pd.DataFrame.from_records(all_data_json)}")
54
 
55
  # Convert to dataframe
 
42
 
43
 
44
 
45
+ def get_leaderboard_df_mib_subgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list,
46
+ metric_type = "F+") -> pd.DataFrame:
47
  """Creates a dataframe from all the MIB experiment results"""
48
  # print(f"results_path is {results_path}, requests_path is {requests_path}")
49
  raw_data = get_raw_eval_results_mib_subgraph(results_path, requests_path)
50
  # print(f"raw_data is {raw_data}")
51
 
52
  # Convert each result to dict format
53
+ all_data_json = [v.to_dict(metric_type=metric_type) for v in raw_data]
54
  # print(f"all_data_json is {pd.DataFrame.from_records(all_data_json)}")
55
 
56
  # Convert to dataframe
src/submission/__pycache__/check_validity.cpython-310.pyc ADDED
Binary file (5.49 kB). View file
 
src/submission/__pycache__/submit.cpython-310.pyc ADDED
Binary file (2.62 kB). View file