Aaron Mueller committed
Commit 11e2149 · 1 Parent(s): f8ca5d3

code cleanup

app.py CHANGED
@@ -42,83 +42,6 @@ from src.submission.check_validity import verify_circuit_submission, verify_caus
42
 
43
  from src.about import TasksMib_Subgraph, TasksMib_Causalgraph
44
 
45
- # class SmartSelectColumns(SelectColumns):
46
- # """
47
- # Enhanced SelectColumns component with basic filtering functionality.
48
- # """
49
- # def __init__(
50
- # self,
51
- # benchmark_keywords: Optional[List[str]] = None,
52
- # model_keywords: Optional[List[str]] = None,
53
- # initial_selected: Optional[List[str]] = None,
54
- # **kwargs
55
- # ):
56
- # """
57
- # Initialize SmartSelectColumns with minimal configuration.
58
-
59
- # Args:
60
- # benchmark_keywords: List of benchmark names to filter by
61
- # model_keywords: List of model names to filter by
62
- # initial_selected: List of columns to show initially
63
- # """
64
- # super().__init__(**kwargs)
65
- # self.benchmark_keywords = benchmark_keywords or []
66
- # self.model_keywords = model_keywords or []
67
- # self.initial_selected = initial_selected or []
68
-
69
- # def get_filtered_groups(self, df: pd.DataFrame) -> Dict[str, List[str]]:
70
- # """
71
- # Create column groups based on simple substring matching.
72
- # """
73
- # filtered_groups = {}
74
-
75
- # # Create benchmark groups
76
- # for benchmark in self.benchmark_keywords:
77
- # matching_cols = [
78
- # col for col in df.columns
79
- # if benchmark in col.lower()
80
- # ]
81
- # if matching_cols:
82
- # group_name = f"Benchmark group for {benchmark}"
83
- # filtered_groups[group_name] = matching_cols
84
-
85
- # # Create model groups
86
- # for model in self.model_keywords:
87
- # matching_cols = [
88
- # col for col in df.columns
89
- # if model in col.lower()
90
- # ]
91
- # if matching_cols:
92
- # group_name = f"Model group for {model}"
93
- # filtered_groups[group_name] = matching_cols
94
-
95
- # return filtered_groups
96
-
97
- # def update(
98
- # self,
99
- # value: Union[pd.DataFrame, Dict[str, List[str]], Any]
100
- # ) -> Dict:
101
- # """Update component with new values."""
102
- # if isinstance(value, pd.DataFrame):
103
- # choices = list(value.columns)
104
- # selected = self.initial_selected if self.initial_selected else choices
105
- # filtered_cols = self.get_filtered_groups(value)
106
-
107
- # return {
108
- # "choices": choices,
109
- # "value": selected,
110
- # "filtered_cols": filtered_cols
111
- # }
112
-
113
- # if hasattr(value, '__dataclass_fields__'):
114
- # field_names = [field.name for field in fields(value)]
115
- # return {
116
- # "choices": field_names,
117
- # "value": self.initial_selected if self.initial_selected else field_names
118
- # }
119
-
120
- # return super().update(value)
121
-
122
  from gradio_leaderboard import SelectColumns, Leaderboard
123
  import pandas as pd
124
  from typing import List, Dict, Optional
@@ -290,16 +213,11 @@ LEADERBOARD_DF_MIB_SUBGRAPH_FPL = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_M
290
  LEADERBOARD_DF_MIB_SUBGRAPH_FEQ = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, COLS_MIB_SUBGRAPH, BENCHMARK_COLS_MIB_SUBGRAPH,
291
  metric_type="CMD")
292
 
293
- # LEADERBOARD_DF_MIB_CAUSALGRAPH = get_leaderboard_df_mib_causalgraph(EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_CAUSALGRAPH, BENCHMARK_COLS_MIB_CAUSALGRAPH)
294
  # In app.py, modify the LEADERBOARD initialization
295
  LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AGGREGATED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AVERAGED = get_leaderboard_df_mib_causalgraph(
296
  EVAL_RESULTS_MIB_CAUSALGRAPH_PATH
297
  )
298
 
299
-
300
- # LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
301
- # LEADERBOARD_DF_MULTIMODAL = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS_MULTIMODAL, BENCHMARK_COLS_MULTIMODAL)
302
-
303
  (
304
  finished_eval_queue_df_subgraph,
305
  pending_eval_queue_df_subgraph,
@@ -320,8 +238,6 @@ def init_leaderboard_mib_subgraph(dataframe, track):
320
 
321
  print("\nDebugging DataFrame columns:", dataframe.columns.tolist())
322
 
323
- # First, create our display name mapping
324
- # This is like creating a translation dictionary between internal names and display names
325
  model_name_mapping = {
326
  "qwen2_5": "Qwen-2.5",
327
  "gpt2": "GPT-2",
@@ -377,12 +293,8 @@ def init_leaderboard_mib_subgraph(dataframe, track):
377
  # Combine all groups using display names
378
  all_groups = benchmark_groups + model_groups
379
  all_columns = [col for group in all_groups for col in group]
380
-
381
- # Important: We need to rename our DataFrame columns to match display names
382
 
383
  renamed_df = dataframe.rename(columns=display_mapping)
384
- # all_columns = [c.name for c in fields(AutoEvalColumn_mib_subgraph) if c.displayed_by_default]
385
- # all_columns = [c.name for c in fields(AutoEvalColumn_mib_subgraph)]
386
  all_columns = renamed_df.columns.tolist()
387
 
388
 
@@ -390,45 +302,10 @@ def init_leaderboard_mib_subgraph(dataframe, track):
390
  return Leaderboard(
391
  value=renamed_df, # Use DataFrame with display names
392
  datatype=[c.type for c in fields(AutoEvalColumn_mib_subgraph)],
393
- # select_columns=SelectColumns(
394
- # default_selection=all_columns, # Now contains display names
395
- # label="Filter Results:",
396
- # ),
397
  search_columns=["Method"],
398
  hide_columns=["eval_name"],
399
  interactive=False,
400
  ), renamed_df
401
-
402
-
403
-
404
- # @dataclass
405
- # class TaskMIB_Causalgraph:
406
- # benchmark: str # task name in json (ioi/arithmetic)
407
- # models: list[str] # list of models to show as sub-columns
408
- # col_name: str # display name in leaderboard
409
- # metrics: list[str] # metrics to store (average_score)
410
-
411
- # class TasksMib_Causalgraph(Enum):
412
- # task0 = TaskMIB_Subgraph("ioi", ["GPT2ForCausalLM"], "ioi_task", ["average_score"])
413
- # task1 = TaskMIB_Subgraph("mcqa", ["Qwen2ForCausalLM", "Gemma2ForCausalLM", "LlamaForCausalLM"], "4_answer_MCQA", ["average_score"])
414
- # task2 = TaskMIB_Subgraph("arithmetic_addition", ["Gemma2ForCausalLM", "LlamaForCausalLM"], "arithmetic_addition", ["average_score"])
415
- # task3 = TaskMIB_Subgraph("arc_easy", ["Gemma2ForCausalLM", "LlamaForCausalLM"], "arc_easy", ["average_score"])
416
-
417
- # @classmethod
418
- # def get_all_tasks(cls):
419
- # """Returns a list of all task benchmarks"""
420
- # return [task.value.benchmark for task in cls]
421
-
422
- # @classmethod
423
- # def get_all_models(cls):
424
- # """Returns a list of all unique models across all tasks"""
425
- # models = set()
426
- # for task in cls:
427
- # models.update(task.value.models)
428
- # return sorted(list(models))
429
-
430
- # ioi_task
431
- # 4_answer_MCQA
432
 
433
 
434
  def init_leaderboard_mib_causalgraph(dataframe, track):
@@ -694,11 +571,6 @@ with demo:
694
  # Then modify the Causal Graph tab section
695
  with gr.TabItem("Causal Variable Localization", elem_id="causalgraph", id=1):
696
  with gr.Tabs() as causalgraph_tabs:
697
- # with gr.TabItem("Detailed View", id=0):
698
- # leaderboard_detailed, data = init_leaderboard_mib_causalgraph(
699
- # LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED,
700
- # "Causal Graph"
701
- # )
702
  with gr.TabItem("Highest View", id=0):
703
  gr.Markdown("""
704
  ### Filtering Options
@@ -759,11 +631,6 @@ with demo:
759
  inputs=[original_leaderboard, task_substring_checkbox, model_substring_checkbox],
760
  outputs=leaderboard_averaged
761
  )
762
-
763
- # leaderboard_averaged, data = init_leaderboard_mib_causalgraph(
764
- # LEADERBOARD_DF_MIB_CAUSALGRAPH_AVERAGED,
765
- # "Causal Graph"
766
- # )
767
 
768
  with gr.TabItem("Submit", elem_id="llm-benchmark-tab-table", id=2):
769
  # Track selection
@@ -776,7 +643,6 @@ with demo:
776
  elem_id="track_selector"
777
  )
778
 
779
- # with gr.Group(visible=False) as circuit_ui:
780
  with gr.Column(visible=False, elem_id="bordered-column") as circuit_ui:
781
  with gr.Row():
782
  gr.Markdown(EVALUATION_QUEUE_TEXT_SUBGRAPH, elem_classes="markdown-text")
@@ -799,33 +665,13 @@ with demo:
799
  "within those submodules (e.g., MLP1 neuron 295)?"
800
  )
801
 
802
- # with gr.Group(visible=False) as causal_ui:
803
  with gr.Column(visible=False, elem_id="bordered-column") as causal_ui:
804
  gr.Markdown(EVALUATION_QUEUE_TEXT_CAUSALVARIABLE, elem_classes="markdown-text")
805
- """
806
- with gr.Row():
807
- layer = gr.Number(
808
- label="Layer Number",
809
- precision=0,
810
- minimum=0,
811
- info="Integer specifying the model layer"
812
- )
813
- token_position = gr.Number(
814
- label="Token Position",
815
- precision=0,
816
- minimum=0,
817
- info="Integer specifying token position"
818
- )
819
- """
820
  with gr.Row():
821
  hf_repo_cg = gr.Textbox(
822
  label="HuggingFace Repository URL",
823
  placeholder="https://huggingface.co/username/repo/path",
824
  info="Must be a valid HuggingFace URL pointing to a file containing the trained featurizer (.pt). " )
825
- # code_upload = gr.File(
826
- # label="Upload Python file implementing your featurization function",
827
- # file_types=[".py"],
828
- # )
829
 
830
  # Common fields
831
  with gr.Group():
@@ -884,11 +730,6 @@ with demo:
884
  submission_errors, submission_warnings = verify_circuit_submission(hf_repo, level)
885
 
886
  elif not breaking_error:
887
- # if not (isinstance(layer, int) and isinstance(token_position, int)):
888
- # errors.append("Layer and token position must be integers")
889
- # if not code_upload:
890
- # errors.append("Code file upload is required")
891
-
892
  submission_errors, submission_warnings = verify_causal_variable_submission(hf_repo)
893
 
894
  if not breaking_error:
@@ -986,17 +827,18 @@ with demo:
  - Maximum 2 valid submissions per HuggingFace account per week
  - Invalid submissions don't count toward your limit
  - Rate limit tracked on a rolling basis: a submission no longer counts toward the limit as soon as 7 days have passed since the submission time
+ - The queues can take up to an hour to update; don't fret if your submission doesn't show up immediately!
  """)
 
- # with gr.Row():
- # with gr.Accordion("📙 Citation", open=False):
- # citation_button = gr.Textbox(
- # value=CITATION_BUTTON_TEXT,
- # label=CITATION_BUTTON_LABEL,
- # lines=20,
- # elem_id="citation-button",
- # show_copy_button=True,
- # )
+ with gr.Row():
+ with gr.Accordion("📙 Citation", open=False):
+ citation_button = gr.Textbox(
+ value=CITATION_BUTTON_TEXT,
+ label=CITATION_BUTTON_LABEL,
+ lines=10,
+ elem_id="citation-button",
+ show_copy_button=True,
+ )
 
  scheduler = BackgroundScheduler()
  scheduler.add_job(restart_space, "interval", seconds=1800)
src/about.py CHANGED
@@ -166,12 +166,12 @@ It will keep the PENDING status until it has been run on the private test set.
  """
 
  CITATION_BUTTON_LABEL = "If you would like to cite these results, please cite the MIB paper, as well as the author(s) of the method(s) whose results you cite!"
- CITATION_BUTTON_TEXT = r"""
- @article{mib-2025,
- title = {{MIB}: A Mechanistic Interpretability Benchmark},
- author = {Aaron Mueller and Atticus Geiger and Sarah Wiegreffe and Dana Arad and Iv{\'a}n Arcuschin and Adam Belfki and Yik Siu Chan and Jaden Fiotto-Kaufman and Tal Haklay and Michael Hanna and Jing Huang and Rohan Gupta and Yaniv Nikankin and Hadas Orgad and Nikhil Prakash and Anja Reusch and Aruna Sankaranarayanan and Shun Shao and Alessandro Stolfo and Martin Tutek and Amir Zur and David Bau and Yonatan Belinkov},
- year = {2025},
- note = {To appear},
- journal = {arXiv preprint}
+ CITATION_BUTTON_TEXT = r"""@article{mib-2025,
+ title = {{MIB}: A Mechanistic Interpretability Benchmark},
+ author = {Aaron Mueller and Atticus Geiger and Sarah Wiegreffe and Dana Arad and Iv{\'a}n Arcuschin and Adam Belfki and Yik Siu Chan and Jaden Fiotto-Kaufman and Tal Haklay and Michael Hanna and Jing Huang and Rohan Gupta and Yaniv Nikankin and Hadas Orgad and Nikhil Prakash and Anja Reusch and Aruna Sankaranarayanan and Shun Shao and Alessandro Stolfo and Martin Tutek and Amir Zur and David Bau and Yonatan Belinkov},
+ year = {2025},
+ journal = {CoRR},
+ volume = {arXiv:2504.13151},
+ url = {https://arxiv.org/abs/2504.13151v1}
  }
  """
src/display/utils.py CHANGED
@@ -53,12 +53,6 @@ AutoEvalColumnMultimodal = make_dataclass("AutoEvalColumnMultimodal", auto_eval_
53
 
54
 
55
 
56
-
57
-
58
-
59
-
60
-
61
-
62
  ##############################################################################################################
63
  # Version 3
64
  auto_eval_column_dict_mib_subgraph = []
@@ -95,10 +89,6 @@ for field in auto_eval_column_dict_mib_subgraph:
95
  print(f"Field name: {field[0]}, Display name: {field[2].name}")
96
 
97
 
98
-
99
-
100
-
101
-
102
  # Create the dataclass for MIB columns
103
  AutoEvalColumn_mib_subgraph = make_dataclass("AutoEvalColumn_mib_subgraph", auto_eval_column_dict_mib_subgraph, frozen=True)
104
 
@@ -118,15 +108,6 @@ COLS_MIB_CAUSALGRAPH = []
118
  BENCHMARK_COLS_MIB_CAUSALGRAPH = []
119
 
120
 
121
-
122
-
123
-
124
-
125
-
126
-
127
-
128
-
129
-
130
  auto_eval_column_dict_mib_causalgraph = []
131
 
132
  # Only include Method column as required
@@ -154,40 +135,6 @@ AutoEvalColumn_mib_causalgraph = make_dataclass(
154
  frozen=True
155
  )
156
 
157
-
158
-
159
-
160
-
161
-
162
-
163
- # # Column selection for display
164
- # COLS_MIB_CAUSALGRAPH = [c.name for c in fields(AutoEvalColumn_mib_causalgraph) if not c.hidden]
165
-
166
-
167
- # BENCHMARK_COLS_MIB_CAUSALGRAPH = [f"{model}_{task.value.benchmark}_{intervention}".lower()
168
- # for task in TasksMib_Causalgraph
169
- # for model in task.value.models
170
- # for intervention in task.value.interventions]
171
-
172
-
173
-
174
-
175
-
176
-
177
-
178
-
179
-
180
-
181
-
182
-
183
-
184
-
185
-
186
-
187
-
188
-
189
-
190
-
191
 
192
  ## For the queue columns in the submission tab
193
  @dataclass(frozen=True)
@@ -213,156 +160,4 @@ COLS_MULTIMODAL = [c.name for c in fields(AutoEvalColumnMultimodal) if not c.hid
213
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
214
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
215
 
216
- BENCHMARK_COLS = [t.value.col_name for t in Tasks]
217
- # BENCHMARK_COLS_MULTIMODAL = [t.value.col_name for t in TasksMultimodal]
218
-
219
- TEXT_TASKS = {
220
- "glue": ["cola", "sst2", "mrpc", "qqp", "mnli", "mnli-mm", "qnli", "rte",
221
- "boolq", "multirc", "wsc"],
222
- # Lots of BLiMP tasks – use verifier function below to see if you've included everything.
223
- "blimp": ["adjunct_island","anaphor_gender_agreement","anaphor_number_agreement","animate_subject_passive","animate_subject_trans",
224
- "causative","complex_NP_island","coordinate_structure_constraint_complex_left_branch","coordinate_structure_constraint_object_extraction","determiner_noun_agreement_1",
225
- "determiner_noun_agreement_2","determiner_noun_agreement_irregular_1","determiner_noun_agreement_irregular_2","determiner_noun_agreement_with_adjective_1",
226
- "determiner_noun_agreement_with_adj_2","determiner_noun_agreement_with_adj_irregular_1","determiner_noun_agreement_with_adj_irregular_2","distractor_agreement_relational_noun",
227
- "distractor_agreement_relative_clause","drop_argument","ellipsis_n_bar_1","ellipsis_n_bar_2",
228
- "existential_there_object_raising", "existential_there_quantifiers_1",
229
- "existential_there_quantifiers_2", "existential_there_subject_raising", "expletive_it_object_raising",
230
- "inchoative", "intransitive","irregular_past_participle_adjectives", "irregular_past_participle_verbs",
231
- "irregular_plural_subject_verb_agreement_1", "irregular_plural_subject_verb_agreement_2", "left_branch_island_echo_question", "left_branch_island_simple_question",
232
- "matrix_question_npi_licensor_present", "npi_present_1", "npi_present_2", "only_npi_licensor_present", "only_npi_scope", "passive_1", "passive_2",
233
- "principle_A_case_1", "principle_A_case_2", "principle_A_c_command", "principle_A_domain_1",
234
- "principle_A_domain_2", "principle_A_domain_3", "principle_A_reconstruction", "regular_plural_subject_verb_agreement_1",
235
- "regular_plural_subject_verb_agreement_2", "sentential_negation_npi_licensor_present", "sentential_negation_npi_scope", "sentential_subject_island",
236
- "superlative_quantifiers_1", "superlative_quantifiers_2", "tough_vs_raising_1", "tough_vs_raising_2",
237
- "transitive", "wh_island", "wh_questions_object_gap", "wh_questions_subject_gap",
238
- "wh_questions_subject_gap_long_distance", "wh_vs_that_no_gap", "wh_vs_that_no_gap_long_distance", "wh_vs_that_with_gap",
239
- "wh_vs_that_with_gap_long_distance"
240
- ],
241
- "blimp_supplement": ["hypernym", "qa_congruence_easy", "qa_congruence_tricky",
242
- "subject_aux_inversion", "turn_taking"],
243
- "ewok": ["agent-properties", "material-dynamics", "material-properties", "physical-dynamics",
244
- "physical-interactions", "physical-relations", "quantitative-properties",
245
- "social-interactions", "social-properties", "social-relations", "spatial-relations"]
246
- }
247
-
248
- VISION_TASKS = {
249
- "vqa": ["vqa"],
250
- "winoground": ["winoground"],
251
- "devbench": ["lex-viz_vocab", "gram-trog", "sem-things"]
252
- }
253
-
254
- NUM_EXPECTED_EXAMPLES = {
255
- "glue": {
256
- "cola": 522,
257
- "sst2": 436,
258
- "mrpc": 204,
259
- "qqp": 20215,
260
- "mnli": 4908,
261
- "mnli-mm": 4916,
262
- "qnli": 2732,
263
- "rte": 139,
264
- "boolq": 1635,
265
- "multirc": 2424,
266
- "wsc": 52
267
- },
268
- "blimp": {
269
- "adjunct_island": 928,
270
- "anaphor_gender_agreement": 971,
271
- "anaphor_number_agreement": 931,
272
- "animate_subject_passive": 895,
273
- "animate_subject_trans": 923,
274
- "causative": 818,
275
- "complex_NP_island": 846,
276
- "coordinate_structure_constraint_complex_left_branch": 906,
277
- "coordinate_structure_constraint_object_extraction": 949,
278
- "determiner_noun_agreement_1": 929,
279
- "determiner_noun_agreement_2": 931,
280
- "determiner_noun_agreement_irregular_1": 681,
281
- "determiner_noun_agreement_irregular_2": 820,
282
- "determiner_noun_agreement_with_adjective_1": 933,
283
- "determiner_noun_agreement_with_adj_2": 941,
284
- "determiner_noun_agreement_with_adj_irregular_1": 718,
285
- "determiner_noun_agreement_with_adj_irregular_2": 840,
286
- "distractor_agreement_relational_noun": 788,
287
- "distractor_agreement_relative_clause": 871,
288
- "drop_argument": 920,
289
- "ellipsis_n_bar_1": 802,
290
- "ellipsis_n_bar_2": 828,
291
- "existential_there_object_raising": 812,
292
- "existential_there_quantifiers_1": 930,
293
- "existential_there_quantifiers_2": 911,
294
- "existential_there_subject_raising": 924,
295
- "expletive_it_object_raising": 759,
296
- "inchoative": 855,
297
- "intransitive": 868,
298
- "irregular_past_participle_adjectives": 961,
299
- "irregular_past_participle_verbs": 942,
300
- "irregular_plural_subject_verb_agreement_1": 804,
301
- "irregular_plural_subject_verb_agreement_2": 892,
302
- "left_branch_island_echo_question": 947,
303
- "left_branch_island_simple_question": 951,
304
- "matrix_question_npi_licensor_present": 929,
305
- "npi_present_1": 909,
306
- "npi_present_2": 914,
307
- "only_npi_licensor_present": 882,
308
- "only_npi_scope": 837,
309
- "passive_1": 840,
310
- "passive_2": 903,
311
- "principle_A_case_1": 912,
312
- "principle_A_case_2": 915,
313
- "principle_A_c_command": 946,
314
- "principle_A_domain_1": 914,
315
- "principle_A_domain_2": 915,
316
- "principle_A_domain_3": 941,
317
- "principle_A_reconstruction": 967,
318
- "regular_plural_subject_verb_agreement_1": 890,
319
- "regular_plural_subject_verb_agreement_2": 945,
320
- "sentential_negation_npi_licensor_present": 919,
321
- "sentential_negation_npi_scope": 871,
322
- "sentential_subject_island": 961,
323
- "superlative_quantifiers_1": 979,
324
- "superlative_quantifiers_2": 986,
325
- "tough_vs_raising_1": 948,
326
- "tough_vs_raising_2": 920,
327
- "transitive": 868,
328
- "wh_island": 960,
329
- "wh_questions_object_gap": 859,
330
- "wh_questions_subject_gap": 898,
331
- "wh_questions_subject_gap_long_distance": 857,
332
- "wh_vs_that_no_gap": 861,
333
- "wh_vs_that_no_gap_long_distance": 875,
334
- "wh_vs_that_with_gap": 919,
335
- "wh_vs_that_with_gap_long_distance": 910
336
- },
337
- "blimp_supplement": {
338
- "hypernym": 842,
339
- "qa_congruence_easy": 64,
340
- "qa_congruence_tricky": 165,
341
- "subject_aux_inversion": 3867,
342
- "turn_taking": 280
343
- },
344
- "ewok": {
345
- "agent-properties": 2210,
346
- "material-dynamics": 770,
347
- "material-properties": 170,
348
- "physical-dynamics": 120,
349
- "physical-interactions": 556,
350
- "physical-relations": 818,
351
- "quantitative-properties": 314,
352
- "social-interactions": 294,
353
- "social-properties": 328,
354
- "social-relations": 1548,
355
- "spatial-relations": 490
356
- },
357
- "vqa": {
358
- "vqa": 25230
359
- },
360
- "winoground": {
361
- "winoground": 746
362
- },
363
- "devbench": {
364
- "lex-viz_vocab": 119,
365
- "gram-trog": 76,
366
- "sem-things": 1854
367
- }
368
- }
+ BENCHMARK_COLS = [t.value.col_name for t in Tasks]
src/leaderboard/read_evals.py CHANGED
@@ -123,11 +123,9 @@ class EvalResult_MIB_SUBGRAPH:
123
 
124
  # Initialize all possible columns with '-'
125
  expected_models = TasksMib_Subgraph.get_all_models()
126
- # expected_tasks = TasksMib_Subgraph.get_all_tasks()
127
 
128
  for task in TasksMib_Subgraph:
129
  for model in task.value.models:
130
- # print(f"task is {task}, task.value.benchmark is {task.value.benchmark}, model is {model}")
131
  data_dict[f"{task.value.benchmark}_{model}"] = '-'
132
 
133
  all_scores = []
@@ -167,11 +165,8 @@ class EvalResult_MIB_SUBGRAPH:
167
  def get_raw_eval_results_mib_subgraph(results_path: str) -> List[EvalResult_MIB_SUBGRAPH]:
168
  """From the path of the results folder root, extract all needed info for MIB results"""
169
  model_result_filepaths = []
170
-
171
- # print(f"results_path is {results_path}")
172
-
173
  for root, dirnames, files in os.walk(results_path):
174
- # print(f"root is {root}, dirnames is {dirnames}, files is {files}")
175
  # We should only have json files in model results
176
  if len(files) == 0 or any([not f.endswith(".json") for f in files]):
177
  continue
@@ -185,14 +180,11 @@ def get_raw_eval_results_mib_subgraph(results_path: str) -> List[EvalResult_MIB_
185
  for file in files:
186
  model_result_filepaths.append(os.path.join(root, file))
187
 
188
- # print(f"model_result_filepaths is {model_result_filepaths}")
189
-
190
  eval_results = []
191
  for model_result_filepath in model_result_filepaths:
192
  try:
193
  eval_result = EvalResult_MIB_SUBGRAPH("", "", {}) # Create empty instance
194
  result = eval_result.init_from_json_file(model_result_filepath)
195
- # print(f"eval_result.init_from_json_file(model_result_filepath) is {result}")
196
  # Verify the result can be converted to dict format
197
  result.to_dict()
198
  eval_results.append(result)
@@ -236,9 +228,6 @@ def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
236
  return aggregated_df
237
 
238
 
239
-
240
-
241
-
242
  def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
243
  """
244
  Creates a DataFrame where columns are model_task and cells are averaged over interventions.
 
src/submission/check_validity.py CHANGED
@@ -16,7 +16,6 @@ from huggingface_hub.hf_api import ModelInfo
16
  from transformers import AutoConfig
17
  from transformers.models.auto.tokenization_auto import AutoTokenizer
18
 
19
- from src.display.utils import TEXT_TASKS, VISION_TASKS, NUM_EXPECTED_EXAMPLES
20
  from src.envs import EVAL_REQUESTS_SUBGRAPH, EVAL_REQUESTS_CAUSALGRAPH
21
 
22
  TASKS = ["ioi", "mcqa", "arithmetic-addition", "arithmetic-subtraction", "arc-easy", "arc-challenge"]
@@ -246,72 +245,6 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
246
  return set(file_names), users_to_submission_dates
247
 
248
 
249
- def is_valid_predictions(predictions: dict) -> tuple[bool, str]:
250
- out_msg = ""
251
- for task in TEXT_TASKS:
252
- if task not in predictions:
253
- out_msg = f"Error: {task} not present"
254
- break
255
- for subtask in TEXT_TASKS[task]:
256
- if subtask not in predictions[task]:
257
- out_msg = f"Error: {subtask} not present under {task}"
258
- break
259
- if out_msg != "":
260
- break
261
- if "vqa" in predictions or "winoground" in predictions or "devbench" in predictions:
262
- for task in VISION_TASKS:
263
- if task not in predictions:
264
- out_msg = f"Error: {task} not present"
265
- break
266
- for subtask in VISION_TASKS[task]:
267
- if subtask not in predictions[task]:
268
- out_msg = f"Error: {subtask} not present under {task}"
269
- break
270
- if out_msg != "":
271
- break
272
-
273
- # Make sure all examples have predictions, and that predictions are the correct type
274
- for task in predictions:
275
- for subtask in predictions[task]:
276
- if task == "devbench":
277
- a = np.array(predictions[task][subtask]["predictions"])
278
- if subtask == "sem-things":
279
- required_shape = (1854, 1854)
280
- elif subtask == "gram-trog":
281
- required_shape = (76, 4, 1)
282
- elif subtask == "lex-viz_vocab":
283
- required_shape = (119, 4, 1)
284
- if a.shape[0] != required_shape[0] or a.shape[1] != required_shape[1]:
285
- out_msg = f"Error: Wrong shape for results for `{subtask}` in `{task}`."
286
- break
287
- if not str(a.dtype).startswith("float"):
288
- out_msg = f"Error: Results for `{subtask}` ({task}) \
289
- should be floats but aren't."
290
- break
291
- continue
292
-
293
- num_expected_examples = NUM_EXPECTED_EXAMPLES[task][subtask]
294
- if len(predictions[task][subtask]["predictions"]) != num_expected_examples:
295
- out_msg = f"Error: {subtask} has the wrong number of examples."
296
- break
297
-
298
- if task == "glue":
299
- if type(predictions[task][subtask]["predictions"][0]["pred"]) != int:
300
- out_msg = f"Error: results for `{subtask}` (`{task}`) should be integers but aren't."
301
- break
302
- else:
303
- if type(predictions[task][subtask]["predictions"][0]["pred"]) != str:
304
- out_msg = f"Error: results for `{subtask}` (`{task}`) should be strings but aren't."
305
- break
306
-
307
- if out_msg != "":
308
- break
309
-
310
- if out_msg != "":
311
- return False, out_msg
312
- return True, "Upload successful."
313
-
314
-
315
  def _format_time(earliest_time):
316
  time_left = (earliest_time.tz_convert("UTC") + timedelta(weeks=1)) - pd.Timestamp.utcnow()
317
  hours = time_left.seconds // 3600
 
src/submission/submit.py CHANGED
@@ -9,7 +9,6 @@ from src.submission.check_validity import (
9
  already_submitted_models,
10
  get_model_size,
11
  is_model_on_hub,
12
- is_valid_predictions,
13
  parse_huggingface_url
14
  )
15
  import gradio as gr
@@ -89,101 +88,6 @@ def upload_to_queue(track, hf_repo_circ, hf_repo_cg, level, method_name, contact
89
  gr.Column(visible=False)
90
  ]
91
 
92
- def add_new_eval(
93
- model_name: str,
94
- model_id: str,
95
- revision: str,
96
- track: str,
97
- predictions: dict,
98
- ):
99
- global REQUESTED_MODELS
100
- global USERS_TO_SUBMISSION_DATES
101
- if not REQUESTED_MODELS:
102
- REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
103
-
104
- out_message = ""
105
-
106
- user_name = ""
107
- model_path = model_name
108
- if "/" in model_name:
109
- user_name = model_name.split("/")[0]
110
- model_path = model_name.split("/")[1]
111
-
112
- current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
113
-
114
- if track is None:
115
- return styled_error("Please select a track.")
116
-
117
- # Does the model actually exist?
118
- if revision == "":
119
- revision = "main"
120
-
121
- out_message = ""
122
-
123
- # Is the model info correctly filled?
124
- print("Made it before 1")
125
- try:
126
- model_info = API.model_info(repo_id=model_id, revision=revision)
127
- except Exception:
128
- out_message += styled_warning("Could not get your model information. The leaderboard entry will not have a link to its HF repo.") + "<br>"
129
- print("Made it after 1")
130
-
131
- try:
132
- predictions_OK, error_msg = is_valid_predictions(predictions)
133
- if not predictions_OK:
134
- return styled_error(error_msg) + "<br>"
135
- except:
136
- return styled_error(error_msg) + "<br>"
137
-
138
- print("Made it after 3")
139
-
140
- # Seems good, creating the eval
141
- print("Adding new eval")
142
-
143
- eval_entry = {
144
- "model_name": model_name,
145
- "hf_repo": model_id,
146
- "revision": revision,
147
- "track": track,
148
- "predictions": predictions,
149
- "status": "PENDING",
150
- "submitted_time": current_time,
151
- }
152
-
153
- print("Made it after 4")
154
-
155
- # Check for duplicate submission
156
- if f"{model_name}_{revision}_{track}" in REQUESTED_MODELS:
157
- return styled_error("A model with this name has been already submitted.")
158
-
159
- print("Creating eval file")
160
- OUT_DIR = f"{EVAL_REQUESTS}/{user_name}"
161
- os.makedirs(OUT_DIR, exist_ok=True)
162
- out_path = f"{OUT_DIR}/{model_path}_{revision}_eval_request_False_{track}.json"
163
-
164
- print("Made it after 5")
165
-
166
- with open(out_path, "w") as f:
167
- f.write(json.dumps(eval_entry))
168
-
169
- print("Uploading eval file")
170
- API.upload_file(
171
- path_or_fileobj=out_path,
172
- path_in_repo=out_path.split("eval-queue/")[1],
173
- repo_id=QUEUE_REPO,
174
- repo_type="dataset",
175
- commit_message=f"Add {model_name} to eval queue",
176
- )
177
-
178
- print("Made it after 6")
179
-
180
- # Remove the local file
181
- os.remove(out_path)
182
-
183
- return styled_message(
184
- "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the request to show in the PENDING list."
185
- )
186
-
187
  def remove_submission(track: str, method_name: str, _id: str):
188
  if track is None:
189
  return gr.Textbox(f"Please select a track.", visible=True)
 