Aaron Mueller committed
Commit 11e2149 · 1 Parent(s): f8ca5d3

code cleanup

app.py CHANGED
@@ -42,83 +42,6 @@ from src.submission.check_validity import verify_circuit_submission, verify_caus
42
 
43
  from src.about import TasksMib_Subgraph, TasksMib_Causalgraph
44
 
45
- # class SmartSelectColumns(SelectColumns):
46
- # """
47
- # Enhanced SelectColumns component with basic filtering functionality.
48
- # """
49
- # def __init__(
50
- # self,
51
- # benchmark_keywords: Optional[List[str]] = None,
52
- # model_keywords: Optional[List[str]] = None,
53
- # initial_selected: Optional[List[str]] = None,
54
- # **kwargs
55
- # ):
56
- # """
57
- # Initialize SmartSelectColumns with minimal configuration.
58
-
59
- # Args:
60
- # benchmark_keywords: List of benchmark names to filter by
61
- # model_keywords: List of model names to filter by
62
- # initial_selected: List of columns to show initially
63
- # """
64
- # super().__init__(**kwargs)
65
- # self.benchmark_keywords = benchmark_keywords or []
66
- # self.model_keywords = model_keywords or []
67
- # self.initial_selected = initial_selected or []
68
-
69
- # def get_filtered_groups(self, df: pd.DataFrame) -> Dict[str, List[str]]:
70
- # """
71
- # Create column groups based on simple substring matching.
72
- # """
73
- # filtered_groups = {}
74
-
75
- # # Create benchmark groups
76
- # for benchmark in self.benchmark_keywords:
77
- # matching_cols = [
78
- # col for col in df.columns
79
- # if benchmark in col.lower()
80
- # ]
81
- # if matching_cols:
82
- # group_name = f"Benchmark group for {benchmark}"
83
- # filtered_groups[group_name] = matching_cols
84
-
85
- # # Create model groups
86
- # for model in self.model_keywords:
87
- # matching_cols = [
88
- # col for col in df.columns
89
- # if model in col.lower()
90
- # ]
91
- # if matching_cols:
92
- # group_name = f"Model group for {model}"
93
- # filtered_groups[group_name] = matching_cols
94
-
95
- # return filtered_groups
96
-
97
- # def update(
98
- # self,
99
- # value: Union[pd.DataFrame, Dict[str, List[str]], Any]
100
- # ) -> Dict:
101
- # """Update component with new values."""
102
- # if isinstance(value, pd.DataFrame):
103
- # choices = list(value.columns)
104
- # selected = self.initial_selected if self.initial_selected else choices
105
- # filtered_cols = self.get_filtered_groups(value)
106
-
107
- # return {
108
- # "choices": choices,
109
- # "value": selected,
110
- # "filtered_cols": filtered_cols
111
- # }
112
-
113
- # if hasattr(value, '__dataclass_fields__'):
114
- # field_names = [field.name for field in fields(value)]
115
- # return {
116
- # "choices": field_names,
117
- # "value": self.initial_selected if self.initial_selected else field_names
118
- # }
119
-
120
- # return super().update(value)
121
-
122
  from gradio_leaderboard import SelectColumns, Leaderboard
123
  import pandas as pd
124
  from typing import List, Dict, Optional
@@ -290,16 +213,11 @@ LEADERBOARD_DF_MIB_SUBGRAPH_FPL = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_M
290
  LEADERBOARD_DF_MIB_SUBGRAPH_FEQ = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, COLS_MIB_SUBGRAPH, BENCHMARK_COLS_MIB_SUBGRAPH,
291
  metric_type="CMD")
292
 
293
- # LEADERBOARD_DF_MIB_CAUSALGRAPH = get_leaderboard_df_mib_causalgraph(EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_CAUSALGRAPH, BENCHMARK_COLS_MIB_CAUSALGRAPH)
294
  # In app.py, modify the LEADERBOARD initialization
295
  LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AGGREGATED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AVERAGED = get_leaderboard_df_mib_causalgraph(
296
  EVAL_RESULTS_MIB_CAUSALGRAPH_PATH
297
  )
298
 
299
-
300
- # LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
301
- # LEADERBOARD_DF_MULTIMODAL = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS_MULTIMODAL, BENCHMARK_COLS_MULTIMODAL)
302
-
303
  (
304
  finished_eval_queue_df_subgraph,
305
  pending_eval_queue_df_subgraph,
@@ -320,8 +238,6 @@ def init_leaderboard_mib_subgraph(dataframe, track):
320
 
321
  print("\nDebugging DataFrame columns:", dataframe.columns.tolist())
322
 
323
- # First, create our display name mapping
324
- # This is like creating a translation dictionary between internal names and display names
325
  model_name_mapping = {
326
  "qwen2_5": "Qwen-2.5",
327
  "gpt2": "GPT-2",
@@ -377,12 +293,8 @@ def init_leaderboard_mib_subgraph(dataframe, track):
377
  # Combine all groups using display names
378
  all_groups = benchmark_groups + model_groups
379
  all_columns = [col for group in all_groups for col in group]
380
-
381
- # Important: We need to rename our DataFrame columns to match display names
382
 
383
  renamed_df = dataframe.rename(columns=display_mapping)
384
- # all_columns = [c.name for c in fields(AutoEvalColumn_mib_subgraph) if c.displayed_by_default]
385
- # all_columns = [c.name for c in fields(AutoEvalColumn_mib_subgraph)]
386
  all_columns = renamed_df.columns.tolist()
387
 
388
 
@@ -390,45 +302,10 @@ def init_leaderboard_mib_subgraph(dataframe, track):
390
  return Leaderboard(
391
  value=renamed_df, # Use DataFrame with display names
392
  datatype=[c.type for c in fields(AutoEvalColumn_mib_subgraph)],
393
- # select_columns=SelectColumns(
394
- # default_selection=all_columns, # Now contains display names
395
- # label="Filter Results:",
396
- # ),
397
  search_columns=["Method"],
398
  hide_columns=["eval_name"],
399
  interactive=False,
400
  ), renamed_df
401
-
402
-
403
-
404
- # @dataclass
405
- # class TaskMIB_Causalgraph:
406
- # benchmark: str # task name in json (ioi/arithmetic)
407
- # models: list[str] # list of models to show as sub-columns
408
- # col_name: str # display name in leaderboard
409
- # metrics: list[str] # metrics to store (average_score)
410
-
411
- # class TasksMib_Causalgraph(Enum):
412
- # task0 = TaskMIB_Subgraph("ioi", ["GPT2ForCausalLM"], "ioi_task", ["average_score"])
413
- # task1 = TaskMIB_Subgraph("mcqa", ["Qwen2ForCausalLM", "Gemma2ForCausalLM", "LlamaForCausalLM"], "4_answer_MCQA", ["average_score"])
414
- # task2 = TaskMIB_Subgraph("arithmetic_addition", ["Gemma2ForCausalLM", "LlamaForCausalLM"], "arithmetic_addition", ["average_score"])
415
- # task3 = TaskMIB_Subgraph("arc_easy", ["Gemma2ForCausalLM", "LlamaForCausalLM"], "arc_easy", ["average_score"])
416
-
417
- # @classmethod
418
- # def get_all_tasks(cls):
419
- # """Returns a list of all task benchmarks"""
420
- # return [task.value.benchmark for task in cls]
421
-
422
- # @classmethod
423
- # def get_all_models(cls):
424
- # """Returns a list of all unique models across all tasks"""
425
- # models = set()
426
- # for task in cls:
427
- # models.update(task.value.models)
428
- # return sorted(list(models))
429
-
430
- # ioi_task
431
- # 4_answer_MCQA
432
 
433
 
434
  def init_leaderboard_mib_causalgraph(dataframe, track):
@@ -694,11 +571,6 @@ with demo:
694
  # Then modify the Causal Graph tab section
695
  with gr.TabItem("Causal Variable Localization", elem_id="causalgraph", id=1):
696
  with gr.Tabs() as causalgraph_tabs:
697
- # with gr.TabItem("Detailed View", id=0):
698
- # leaderboard_detailed, data = init_leaderboard_mib_causalgraph(
699
- # LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED,
700
- # "Causal Graph"
701
- # )
702
  with gr.TabItem("Highest View", id=0):
703
  gr.Markdown("""
704
  ### Filtering Options
@@ -759,11 +631,6 @@ with demo:
759
  inputs=[original_leaderboard, task_substring_checkbox, model_substring_checkbox],
760
  outputs=leaderboard_averaged
761
  )
762
-
763
- # leaderboard_averaged, data = init_leaderboard_mib_causalgraph(
764
- # LEADERBOARD_DF_MIB_CAUSALGRAPH_AVERAGED,
765
- # "Causal Graph"
766
- # )
767
 
768
  with gr.TabItem("Submit", elem_id="llm-benchmark-tab-table", id=2):
769
  # Track selection
@@ -776,7 +643,6 @@ with demo:
776
  elem_id="track_selector"
777
  )
778
 
779
- # with gr.Group(visible=False) as circuit_ui:
780
  with gr.Column(visible=False, elem_id="bordered-column") as circuit_ui:
781
  with gr.Row():
782
  gr.Markdown(EVALUATION_QUEUE_TEXT_SUBGRAPH, elem_classes="markdown-text")
@@ -799,33 +665,13 @@ with demo:
799
  "within those submodules (e.g., MLP1 neuron 295)?"
800
  )
801
 
802
- # with gr.Group(visible=False) as causal_ui:
803
  with gr.Column(visible=False, elem_id="bordered-column") as causal_ui:
804
  gr.Markdown(EVALUATION_QUEUE_TEXT_CAUSALVARIABLE, elem_classes="markdown-text")
805
- """
806
- with gr.Row():
807
- layer = gr.Number(
808
- label="Layer Number",
809
- precision=0,
810
- minimum=0,
811
- info="Integer specifying the model layer"
812
- )
813
- token_position = gr.Number(
814
- label="Token Position",
815
- precision=0,
816
- minimum=0,
817
- info="Integer specifying token position"
818
- )
819
- """
820
  with gr.Row():
821
  hf_repo_cg = gr.Textbox(
822
  label="HuggingFace Repository URL",
823
  placeholder="https://huggingface.co/username/repo/path",
824
  info="Must be a valid HuggingFace URL pointing to a file containing the trained featurizer (.pt). " )
825
- # code_upload = gr.File(
826
- # label="Upload Python file implementing your featurization function",
827
- # file_types=[".py"],
828
- # )
829
 
830
  # Common fields
831
  with gr.Group():
@@ -884,11 +730,6 @@ with demo:
884
  submission_errors, submission_warnings = verify_circuit_submission(hf_repo, level)
885
 
886
  elif not breaking_error:
887
- # if not (isinstance(layer, int) and isinstance(token_position, int)):
888
- # errors.append("Layer and token position must be integers")
889
- # if not code_upload:
890
- # errors.append("Code file upload is required")
891
-
892
  submission_errors, submission_warnings = verify_causal_variable_submission(hf_repo)
893
 
894
  if not breaking_error:
@@ -986,17 +827,18 @@ with demo:
  - Maximum 2 valid submissions per HuggingFace account per week
  - Invalid submissions don't count toward your limit
  - Rate limit tracked on a rolling basis: a submission no longer counts toward the limit as soon as 7 days have passed since the submission time
+ - The queues can take up to an hour to update; don't fret if your submission doesn't show up immediately!
  """)
 
- # with gr.Row():
- # with gr.Accordion("📙 Citation", open=False):
- # citation_button = gr.Textbox(
- # value=CITATION_BUTTON_TEXT,
- # label=CITATION_BUTTON_LABEL,
- # lines=20,
- # elem_id="citation-button",
- # show_copy_button=True,
- # )
+ with gr.Row():
+ with gr.Accordion("📙 Citation", open=False):
+ citation_button = gr.Textbox(
+ value=CITATION_BUTTON_TEXT,
+ label=CITATION_BUTTON_LABEL,
+ lines=10,
+ elem_id="citation-button",
+ show_copy_button=True,
+ )
 
  scheduler = BackgroundScheduler()
  scheduler.add_job(restart_space, "interval", seconds=1800)
src/about.py CHANGED
@@ -166,12 +166,12 @@ It will keep the PENDING status until it has been run on the private test set.
  """
 
  CITATION_BUTTON_LABEL = "If you would like to cite these results, please cite the MIB paper, as well as the author(s) of the method(s) whose results you cite!"
- CITATION_BUTTON_TEXT = r"""
- @article{mib-2025,
- title = {{MIB}: A Mechanistic Interpretability Benchmark},
- author = {Aaron Mueller and Atticus Geiger and Sarah Wiegreffe and Dana Arad and Iv{\'a}n Arcuschin and Adam Belfki and Yik Siu Chan and Jaden Fiotto-Kaufman and Tal Haklay and Michael Hanna and Jing Huang and Rohan Gupta and Yaniv Nikankin and Hadas Orgad and Nikhil Prakash and Anja Reusch and Aruna Sankaranarayanan and Shun Shao and Alessandro Stolfo and Martin Tutek and Amir Zur and David Bau and Yonatan Belinkov},
- year = {2025},
- note = {To appear},
- journal = {arXiv preprint}
+ CITATION_BUTTON_TEXT = r"""@article{mib-2025,
+ title = {{MIB}: A Mechanistic Interpretability Benchmark},
+ author = {Aaron Mueller and Atticus Geiger and Sarah Wiegreffe and Dana Arad and Iv{\'a}n Arcuschin and Adam Belfki and Yik Siu Chan and Jaden Fiotto-Kaufman and Tal Haklay and Michael Hanna and Jing Huang and Rohan Gupta and Yaniv Nikankin and Hadas Orgad and Nikhil Prakash and Anja Reusch and Aruna Sankaranarayanan and Shun Shao and Alessandro Stolfo and Martin Tutek and Amir Zur and David Bau and Yonatan Belinkov},
+ year = {2025},
+ journal = {CoRR},
+ volume = {arXiv:2504.13151},
+ url = {https://arxiv.org/abs/2504.13151v1}
  }
  """
src/display/utils.py CHANGED
@@ -53,12 +53,6 @@ AutoEvalColumnMultimodal = make_dataclass("AutoEvalColumnMultimodal", auto_eval_
53
 
54
 
55
 
56
-
57
-
58
-
59
-
60
-
61
-
62
  ##############################################################################################################
63
  # Version 3
64
  auto_eval_column_dict_mib_subgraph = []
@@ -95,10 +89,6 @@ for field in auto_eval_column_dict_mib_subgraph:
95
  print(f"Field name: {field[0]}, Display name: {field[2].name}")
96
 
97
 
98
-
99
-
100
-
101
-
102
  # Create the dataclass for MIB columns
103
  AutoEvalColumn_mib_subgraph = make_dataclass("AutoEvalColumn_mib_subgraph", auto_eval_column_dict_mib_subgraph, frozen=True)
104
 
@@ -118,15 +108,6 @@ COLS_MIB_CAUSALGRAPH = []
118
  BENCHMARK_COLS_MIB_CAUSALGRAPH = []
119
 
120
 
121
-
122
-
123
-
124
-
125
-
126
-
127
-
128
-
129
-
130
  auto_eval_column_dict_mib_causalgraph = []
131
 
132
  # Only include Method column as required
@@ -154,40 +135,6 @@ AutoEvalColumn_mib_causalgraph = make_dataclass(
154
  frozen=True
155
  )
156
 
157
-
158
-
159
-
160
-
161
-
162
-
163
- # # Column selection for display
164
- # COLS_MIB_CAUSALGRAPH = [c.name for c in fields(AutoEvalColumn_mib_causalgraph) if not c.hidden]
165
-
166
-
167
- # BENCHMARK_COLS_MIB_CAUSALGRAPH = [f"{model}_{task.value.benchmark}_{intervention}".lower()
168
- # for task in TasksMib_Causalgraph
169
- # for model in task.value.models
170
- # for intervention in task.value.interventions]
171
-
172
-
173
-
174
-
175
-
176
-
177
-
178
-
179
-
180
-
181
-
182
-
183
-
184
-
185
-
186
-
187
-
188
-
189
-
190
-
191
 
192
  ## For the queue columns in the submission tab
193
  @dataclass(frozen=True)
@@ -213,156 +160,4 @@ COLS_MULTIMODAL = [c.name for c in fields(AutoEvalColumnMultimodal) if not c.hid
213
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
214
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
215
 
216
- BENCHMARK_COLS = [t.value.col_name for t in Tasks]
217
- # BENCHMARK_COLS_MULTIMODAL = [t.value.col_name for t in TasksMultimodal]
218
-
219
- TEXT_TASKS = {
220
- "glue": ["cola", "sst2", "mrpc", "qqp", "mnli", "mnli-mm", "qnli", "rte",
221
- "boolq", "multirc", "wsc"],
222
- # Lots of BLiMP tasks – use verifier function below to see if you've included everything.
223
- "blimp": ["adjunct_island","anaphor_gender_agreement","anaphor_number_agreement","animate_subject_passive","animate_subject_trans",
224
- "causative","complex_NP_island","coordinate_structure_constraint_complex_left_branch","coordinate_structure_constraint_object_extraction","determiner_noun_agreement_1",
225
- "determiner_noun_agreement_2","determiner_noun_agreement_irregular_1","determiner_noun_agreement_irregular_2","determiner_noun_agreement_with_adjective_1",
226
- "determiner_noun_agreement_with_adj_2","determiner_noun_agreement_with_adj_irregular_1","determiner_noun_agreement_with_adj_irregular_2","distractor_agreement_relational_noun",
227
- "distractor_agreement_relative_clause","drop_argument","ellipsis_n_bar_1","ellipsis_n_bar_2",
228
- "existential_there_object_raising", "existential_there_quantifiers_1",
229
- "existential_there_quantifiers_2", "existential_there_subject_raising", "expletive_it_object_raising",
230
- "inchoative", "intransitive","irregular_past_participle_adjectives", "irregular_past_participle_verbs",
231
- "irregular_plural_subject_verb_agreement_1", "irregular_plural_subject_verb_agreement_2", "left_branch_island_echo_question", "left_branch_island_simple_question",
232
- "matrix_question_npi_licensor_present", "npi_present_1", "npi_present_2", "only_npi_licensor_present", "only_npi_scope", "passive_1", "passive_2",
233
- "principle_A_case_1", "principle_A_case_2", "principle_A_c_command", "principle_A_domain_1",
234
- "principle_A_domain_2", "principle_A_domain_3", "principle_A_reconstruction", "regular_plural_subject_verb_agreement_1",
235
- "regular_plural_subject_verb_agreement_2", "sentential_negation_npi_licensor_present", "sentential_negation_npi_scope", "sentential_subject_island",
236
- "superlative_quantifiers_1", "superlative_quantifiers_2", "tough_vs_raising_1", "tough_vs_raising_2",
237
- "transitive", "wh_island", "wh_questions_object_gap", "wh_questions_subject_gap",
238
- "wh_questions_subject_gap_long_distance", "wh_vs_that_no_gap", "wh_vs_that_no_gap_long_distance", "wh_vs_that_with_gap",
239
- "wh_vs_that_with_gap_long_distance"
240
- ],
241
- "blimp_supplement": ["hypernym", "qa_congruence_easy", "qa_congruence_tricky",
242
- "subject_aux_inversion", "turn_taking"],
243
- "ewok": ["agent-properties", "material-dynamics", "material-properties", "physical-dynamics",
244
- "physical-interactions", "physical-relations", "quantitative-properties",
245
- "social-interactions", "social-properties", "social-relations", "spatial-relations"]
246
- }
247
-
248
- VISION_TASKS = {
249
- "vqa": ["vqa"],
250
- "winoground": ["winoground"],
251
- "devbench": ["lex-viz_vocab", "gram-trog", "sem-things"]
252
- }
253
-
254
- NUM_EXPECTED_EXAMPLES = {
255
- "glue": {
256
- "cola": 522,
257
- "sst2": 436,
258
- "mrpc": 204,
259
- "qqp": 20215,
260
- "mnli": 4908,
261
- "mnli-mm": 4916,
262
- "qnli": 2732,
263
- "rte": 139,
264
- "boolq": 1635,
265
- "multirc": 2424,
266
- "wsc": 52
267
- },
268
- "blimp": {
269
- "adjunct_island": 928,
270
- "anaphor_gender_agreement": 971,
271
- "anaphor_number_agreement": 931,
272
- "animate_subject_passive": 895,
273
- "animate_subject_trans": 923,
274
- "causative": 818,
275
- "complex_NP_island": 846,
276
- "coordinate_structure_constraint_complex_left_branch": 906,
277
- "coordinate_structure_constraint_object_extraction": 949,
278
- "determiner_noun_agreement_1": 929,
279
- "determiner_noun_agreement_2": 931,
280
- "determiner_noun_agreement_irregular_1": 681,
281
- "determiner_noun_agreement_irregular_2": 820,
282
- "determiner_noun_agreement_with_adjective_1": 933,
283
- "determiner_noun_agreement_with_adj_2": 941,
284
- "determiner_noun_agreement_with_adj_irregular_1": 718,
285
- "determiner_noun_agreement_with_adj_irregular_2": 840,
286
- "distractor_agreement_relational_noun": 788,
287
- "distractor_agreement_relative_clause": 871,
288
- "drop_argument": 920,
289
- "ellipsis_n_bar_1": 802,
290
- "ellipsis_n_bar_2": 828,
291
- "existential_there_object_raising": 812,
292
- "existential_there_quantifiers_1": 930,
293
- "existential_there_quantifiers_2": 911,
294
- "existential_there_subject_raising": 924,
295
- "expletive_it_object_raising": 759,
296
- "inchoative": 855,
297
- "intransitive": 868,
298
- "irregular_past_participle_adjectives": 961,
299
- "irregular_past_participle_verbs": 942,
300
- "irregular_plural_subject_verb_agreement_1": 804,
301
- "irregular_plural_subject_verb_agreement_2": 892,
302
- "left_branch_island_echo_question": 947,
303
- "left_branch_island_simple_question": 951,
304
- "matrix_question_npi_licensor_present": 929,
305
- "npi_present_1": 909,
306
- "npi_present_2": 914,
307
- "only_npi_licensor_present": 882,
308
- "only_npi_scope": 837,
309
- "passive_1": 840,
310
- "passive_2": 903,
311
- "principle_A_case_1": 912,
312
- "principle_A_case_2": 915,
313
- "principle_A_c_command": 946,
314
- "principle_A_domain_1": 914,
315
- "principle_A_domain_2": 915,
316
- "principle_A_domain_3": 941,
317
- "principle_A_reconstruction": 967,
318
- "regular_plural_subject_verb_agreement_1": 890,
319
- "regular_plural_subject_verb_agreement_2": 945,
320
- "sentential_negation_npi_licensor_present": 919,
321
- "sentential_negation_npi_scope": 871,
322
- "sentential_subject_island": 961,
323
- "superlative_quantifiers_1": 979,
324
- "superlative_quantifiers_2": 986,
325
- "tough_vs_raising_1": 948,
326
- "tough_vs_raising_2": 920,
327
- "transitive": 868,
328
- "wh_island": 960,
329
- "wh_questions_object_gap": 859,
330
- "wh_questions_subject_gap": 898,
331
- "wh_questions_subject_gap_long_distance": 857,
332
- "wh_vs_that_no_gap": 861,
333
- "wh_vs_that_no_gap_long_distance": 875,
334
- "wh_vs_that_with_gap": 919,
335
- "wh_vs_that_with_gap_long_distance": 910
336
- },
337
- "blimp_supplement": {
338
- "hypernym": 842,
339
- "qa_congruence_easy": 64,
340
- "qa_congruence_tricky": 165,
341
- "subject_aux_inversion": 3867,
342
- "turn_taking": 280
343
- },
344
- "ewok": {
345
- "agent-properties": 2210,
346
- "material-dynamics": 770,
347
- "material-properties": 170,
348
- "physical-dynamics": 120,
349
- "physical-interactions": 556,
350
- "physical-relations": 818,
351
- "quantitative-properties": 314,
352
- "social-interactions": 294,
353
- "social-properties": 328,
354
- "social-relations": 1548,
355
- "spatial-relations": 490
356
- },
357
- "vqa": {
358
- "vqa": 25230
359
- },
360
- "winoground": {
361
- "winoground": 746
362
- },
363
- "devbench": {
364
- "lex-viz_vocab": 119,
365
- "gram-trog": 76,
366
- "sem-things": 1854
367
- }
368
- }
+ BENCHMARK_COLS = [t.value.col_name for t in Tasks]
src/leaderboard/read_evals.py CHANGED
@@ -123,11 +123,9 @@ class EvalResult_MIB_SUBGRAPH:
123
 
124
  # Initialize all possible columns with '-'
125
  expected_models = TasksMib_Subgraph.get_all_models()
126
- # expected_tasks = TasksMib_Subgraph.get_all_tasks()
127
 
128
  for task in TasksMib_Subgraph:
129
  for model in task.value.models:
130
- # print(f"task is {task}, task.value.benchmark is {task.value.benchmark}, model is {model}")
131
  data_dict[f"{task.value.benchmark}_{model}"] = '-'
132
 
133
  all_scores = []
@@ -167,11 +165,8 @@ class EvalResult_MIB_SUBGRAPH:
167
  def get_raw_eval_results_mib_subgraph(results_path: str) -> List[EvalResult_MIB_SUBGRAPH]:
168
  """From the path of the results folder root, extract all needed info for MIB results"""
169
  model_result_filepaths = []
170
-
171
- # print(f"results_path is {results_path}")
172
-
173
  for root, dirnames, files in os.walk(results_path):
174
- # print(f"root is {root}, dirnames is {dirnames}, files is {files}")
175
  # We should only have json files in model results
176
  if len(files) == 0 or any([not f.endswith(".json") for f in files]):
177
  continue
@@ -185,14 +180,11 @@ def get_raw_eval_results_mib_subgraph(results_path: str) -> List[EvalResult_MIB_
185
  for file in files:
186
  model_result_filepaths.append(os.path.join(root, file))
187
 
188
- # print(f"model_result_filepaths is {model_result_filepaths}")
189
-
190
  eval_results = []
191
  for model_result_filepath in model_result_filepaths:
192
  try:
193
  eval_result = EvalResult_MIB_SUBGRAPH("", "", {}) # Create empty instance
194
  result = eval_result.init_from_json_file(model_result_filepath)
195
- # print(f"eval_result.init_from_json_file(model_result_filepath) is {result}")
196
  # Verify the result can be converted to dict format
197
  result.to_dict()
198
  eval_results.append(result)
@@ -236,9 +228,6 @@ def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
236
  return aggregated_df
237
 
238
 
239
-
240
-
241
-
242
  def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
243
  """
244
  Creates a DataFrame where columns are model_task and cells are averaged over interventions.
 
src/submission/check_validity.py CHANGED
@@ -16,7 +16,6 @@ from huggingface_hub.hf_api import ModelInfo
16
  from transformers import AutoConfig
17
  from transformers.models.auto.tokenization_auto import AutoTokenizer
18
 
19
- from src.display.utils import TEXT_TASKS, VISION_TASKS, NUM_EXPECTED_EXAMPLES
20
  from src.envs import EVAL_REQUESTS_SUBGRAPH, EVAL_REQUESTS_CAUSALGRAPH
21
 
22
  TASKS = ["ioi", "mcqa", "arithmetic-addition", "arithmetic-subtraction", "arc-easy", "arc-challenge"]
@@ -246,72 +245,6 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
246
  return set(file_names), users_to_submission_dates
247
 
248
 
249
- def is_valid_predictions(predictions: dict) -> tuple[bool, str]:
250
- out_msg = ""
251
- for task in TEXT_TASKS:
252
- if task not in predictions:
253
- out_msg = f"Error: {task} not present"
254
- break
255
- for subtask in TEXT_TASKS[task]:
256
- if subtask not in predictions[task]:
257
- out_msg = f"Error: {subtask} not present under {task}"
258
- break
259
- if out_msg != "":
260
- break
261
- if "vqa" in predictions or "winoground" in predictions or "devbench" in predictions:
262
- for task in VISION_TASKS:
263
- if task not in predictions:
264
- out_msg = f"Error: {task} not present"
265
- break
266
- for subtask in VISION_TASKS[task]:
267
- if subtask not in predictions[task]:
268
- out_msg = f"Error: {subtask} not present under {task}"
269
- break
270
- if out_msg != "":
271
- break
272
-
273
- # Make sure all examples have predictions, and that predictions are the correct type
274
- for task in predictions:
275
- for subtask in predictions[task]:
276
- if task == "devbench":
277
- a = np.array(predictions[task][subtask]["predictions"])
278
- if subtask == "sem-things":
279
- required_shape = (1854, 1854)
280
- elif subtask == "gram-trog":
281
- required_shape = (76, 4, 1)
282
- elif subtask == "lex-viz_vocab":
283
- required_shape = (119, 4, 1)
284
- if a.shape[0] != required_shape[0] or a.shape[1] != required_shape[1]:
285
- out_msg = f"Error: Wrong shape for results for `{subtask}` in `{task}`."
286
- break
287
- if not str(a.dtype).startswith("float"):
288
- out_msg = f"Error: Results for `{subtask}` ({task}) \
289
- should be floats but aren't."
290
- break
291
- continue
292
-
293
- num_expected_examples = NUM_EXPECTED_EXAMPLES[task][subtask]
294
- if len(predictions[task][subtask]["predictions"]) != num_expected_examples:
295
- out_msg = f"Error: {subtask} has the wrong number of examples."
296
- break
297
-
298
- if task == "glue":
299
- if type(predictions[task][subtask]["predictions"][0]["pred"]) != int:
300
- out_msg = f"Error: results for `{subtask}` (`{task}`) should be integers but aren't."
301
- break
302
- else:
303
- if type(predictions[task][subtask]["predictions"][0]["pred"]) != str:
304
- out_msg = f"Error: results for `{subtask}` (`{task}`) should be strings but aren't."
305
- break
306
-
307
- if out_msg != "":
308
- break
309
-
310
- if out_msg != "":
311
- return False, out_msg
312
- return True, "Upload successful."
313
-
314
-
315
  def _format_time(earliest_time):
316
  time_left = (earliest_time.tz_convert("UTC") + timedelta(weeks=1)) - pd.Timestamp.utcnow()
317
  hours = time_left.seconds // 3600
 
src/submission/submit.py CHANGED
@@ -9,7 +9,6 @@ from src.submission.check_validity import (
9
  already_submitted_models,
10
  get_model_size,
11
  is_model_on_hub,
12
- is_valid_predictions,
13
  parse_huggingface_url
14
  )
15
  import gradio as gr
@@ -89,101 +88,6 @@ def upload_to_queue(track, hf_repo_circ, hf_repo_cg, level, method_name, contact
89
  gr.Column(visible=False)
90
  ]
91
 
92
- def add_new_eval(
93
- model_name: str,
94
- model_id: str,
95
- revision: str,
96
- track: str,
97
- predictions: dict,
98
- ):
99
- global REQUESTED_MODELS
100
- global USERS_TO_SUBMISSION_DATES
101
- if not REQUESTED_MODELS:
102
- REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
103
-
104
- out_message = ""
105
-
106
- user_name = ""
107
- model_path = model_name
108
- if "/" in model_name:
109
- user_name = model_name.split("/")[0]
110
- model_path = model_name.split("/")[1]
111
-
112
- current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
113
-
114
- if track is None:
115
- return styled_error("Please select a track.")
116
-
117
- # Does the model actually exist?
118
- if revision == "":
119
- revision = "main"
120
-
121
- out_message = ""
122
-
123
- # Is the model info correctly filled?
124
- print("Made it before 1")
125
- try:
126
- model_info = API.model_info(repo_id=model_id, revision=revision)
127
- except Exception:
128
- out_message += styled_warning("Could not get your model information. The leaderboard entry will not have a link to its HF repo.") + "<br>"
129
- print("Made it after 1")
130
-
131
- try:
132
- predictions_OK, error_msg = is_valid_predictions(predictions)
133
- if not predictions_OK:
134
- return styled_error(error_msg) + "<br>"
135
- except:
136
- return styled_error(error_msg) + "<br>"
137
-
138
- print("Made it after 3")
139
-
140
- # Seems good, creating the eval
141
- print("Adding new eval")
142
-
143
- eval_entry = {
144
- "model_name": model_name,
145
- "hf_repo": model_id,
146
- "revision": revision,
147
- "track": track,
148
- "predictions": predictions,
149
- "status": "PENDING",
150
- "submitted_time": current_time,
151
- }
152
-
153
- print("Made it after 4")
154
-
155
- # Check for duplicate submission
156
- if f"{model_name}_{revision}_{track}" in REQUESTED_MODELS:
157
- return styled_error("A model with this name has been already submitted.")
158
-
159
- print("Creating eval file")
160
- OUT_DIR = f"{EVAL_REQUESTS}/{user_name}"
161
- os.makedirs(OUT_DIR, exist_ok=True)
162
- out_path = f"{OUT_DIR}/{model_path}_{revision}_eval_request_False_{track}.json"
163
-
164
- print("Made it after 5")
165
-
166
- with open(out_path, "w") as f:
167
- f.write(json.dumps(eval_entry))
168
-
169
- print("Uploading eval file")
170
- API.upload_file(
171
- path_or_fileobj=out_path,
172
- path_in_repo=out_path.split("eval-queue/")[1],
173
- repo_id=QUEUE_REPO,
174
- repo_type="dataset",
175
- commit_message=f"Add {model_name} to eval queue",
176
- )
177
-
178
- print("Made it after 6")
179
-
180
- # Remove the local file
181
- os.remove(out_path)
182
-
183
- return styled_message(
184
- "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the request to show in the PENDING list."
185
- )
186
-
187
  def remove_submission(track: str, method_name: str, _id: str):
188
  if track is None:
189
  return gr.Textbox(f"Please select a track.", visible=True)
 