Commit 3c343e0 — Aaron Mueller committed (1 parent: c57af6c)
leaderboard update
Files changed:
- app.py +152 -116
- src/about.py +19 -42
- src/display/utils.py +8 -8
- src/envs.py +5 -5
- src/leaderboard/read_evals.py +3 -3
- src/populate.py +24 -49
- src/submission/check_validity.py +245 -22
- src/submission/submit.py +114 -3
app.py
CHANGED
@@ -1,5 +1,8 @@
 import json
 import gzip
+import os
+import shutil
+import secrets
 import gradio as gr
 from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
@@ -21,8 +24,6 @@ from src.about import (
 )
 from src.display.css_html_js import custom_css
 from src.display.utils import (
-    BENCHMARK_COLS,
-    BENCHMARK_COLS_MULTIMODAL,
     BENCHMARK_COLS_MIB_SUBGRAPH,
     COLS,
     COLS_MIB_SUBGRAPH,
@@ -34,10 +35,10 @@ from src.display.utils import (
     AutoEvalColumn_mib_causalgraph,
     fields,
 )
-from src.envs import API,
-from src.populate import get_evaluation_queue_df,
-from src.submission.submit import
+from src.envs import API, EVAL_REQUESTS_SUBGRAPH, EVAL_REQUESTS_CAUSALGRAPH, QUEUE_REPO_SUBGRAPH, QUEUE_REPO_CAUSALGRAPH, REPO_ID, TOKEN, RESULTS_REPO_MIB_SUBGRAPH, EVAL_RESULTS_MIB_SUBGRAPH_PATH, RESULTS_REPO_MIB_CAUSALGRAPH, EVAL_RESULTS_MIB_CAUSALGRAPH_PATH
+from src.populate import get_evaluation_queue_df, get_leaderboard_df_mib_subgraph, get_leaderboard_df_mib_causalgraph
+from src.submission.submit import upload_to_queue, remove_submission
+from src.submission.check_validity import verify_circuit_submission, verify_causal_variable_submission, check_rate_limit

 from src.about import TasksMib_Subgraph, TasksMib_Causalgraph
@@ -244,27 +245,35 @@ def restart_space():
-### Space initialisation
+### Space initialisation - refresh caches
 try:
+    if os.path.exists(EVAL_REQUESTS_SUBGRAPH):
+        shutil.rmtree(EVAL_REQUESTS_SUBGRAPH)
     snapshot_download(
-        repo_id=
+        repo_id=QUEUE_REPO_SUBGRAPH, local_dir=EVAL_REQUESTS_SUBGRAPH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
 except Exception:
     restart_space()
+try:
+    if os.path.exists(EVAL_REQUESTS_CAUSALGRAPH):
+        shutil.rmtree(EVAL_REQUESTS_CAUSALGRAPH)
+    snapshot_download(
+        repo_id=QUEUE_REPO_CAUSALGRAPH, local_dir=EVAL_REQUESTS_CAUSALGRAPH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+    )
+except Exception:
+    restart_space()
 try:
+    if os.path.exists(EVAL_RESULTS_MIB_SUBGRAPH_PATH):
+        shutil.rmtree(EVAL_RESULTS_MIB_SUBGRAPH_PATH)
     snapshot_download(
         repo_id=RESULTS_REPO_MIB_SUBGRAPH, local_dir=EVAL_RESULTS_MIB_SUBGRAPH_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
 except Exception:
     restart_space()
 try:
+    if os.path.exists(EVAL_RESULTS_MIB_CAUSALGRAPH_PATH):
+        shutil.rmtree(EVAL_RESULTS_MIB_CAUSALGRAPH_PATH)
     snapshot_download(
         repo_id=RESULTS_REPO_MIB_CAUSALGRAPH, local_dir=EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
@@ -277,26 +286,25 @@ def _sigmoid(x):
     except:
         return "-"

-LEADERBOARD_DF_MIB_SUBGRAPH_FPL = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH,
-LEADERBOARD_DF_MIB_SUBGRAPH_FEQ = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH,
+LEADERBOARD_DF_MIB_SUBGRAPH_FPL = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, COLS_MIB_SUBGRAPH, BENCHMARK_COLS_MIB_SUBGRAPH)
+LEADERBOARD_DF_MIB_SUBGRAPH_FEQ = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, COLS_MIB_SUBGRAPH, BENCHMARK_COLS_MIB_SUBGRAPH,
                                                                   metric_type="F=")

 # LEADERBOARD_DF_MIB_CAUSALGRAPH = get_leaderboard_df_mib_causalgraph(EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_CAUSALGRAPH, BENCHMARK_COLS_MIB_CAUSALGRAPH)
 # In app.py, modify the LEADERBOARD initialization
 LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AGGREGATED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AVERAGED = get_leaderboard_df_mib_causalgraph(
-    EVAL_RESULTS_MIB_CAUSALGRAPH_PATH
-    EVAL_REQUESTS_PATH
+    EVAL_RESULTS_MIB_CAUSALGRAPH_PATH
 )

 # LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 # LEADERBOARD_DF_MULTIMODAL = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS_MULTIMODAL, BENCHMARK_COLS_MULTIMODAL)

-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+# (
+#     finished_eval_queue_df,
+#     running_eval_queue_df,
+#     pending_eval_queue_df,
+# ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
@@ -392,10 +400,6 @@ def init_leaderboard_mib_subgraph(dataframe, track):
 def init_leaderboard_mib_causalgraph(dataframe, track):
-    # print("Debugging column issues:")
-    # print("\nActual DataFrame columns:")
-    # print(dataframe.columns.tolist())
-
     model_name_mapping = {
         "Qwen2ForCausalLM": "Qwen-2.5",
         "GPT2ForCausalLM": "GPT-2",
@@ -419,18 +423,7 @@ def init_leaderboard_mib_causalgraph(dataframe, track):
         display_name = f"{benchmark_mapping[task.value.col_name]} - {model_name_mapping[model]}"
         display_mapping[field_name] = display_name

-    # print(dataframe)
     renamed_df = dataframe.rename(columns=display_mapping)
-
-    # idx_to_method = {0: "Full Vector", 1: "DAS", 2: "DBM", 3: "PCA", 4: "SAE"}
-    # idx_to_scores = {0: [0.38, 0.36, 0.38, 0.42],
-    #                  1: [0.56, 0.62, 0.54, 0.51],
-    #                  2: [0.43, 0.41, 0.53, 0.49],
-    #                  3: [0.26, 0.20, 0.32, 0.40],
-    #                  4: ["-", "-", 0.33, "-"]}
-    # renamed_df.loc[0]["Method"] = "Full Vector"
-    # for i in range(5):
-    #     renamed_df.loc[i] = [idx_to_method[i]] + idx_to_scores[i]

     print(renamed_df)
@@ -438,11 +431,6 @@ def init_leaderboard_mib_causalgraph(dataframe, track):
     return Leaderboard(
         value=renamed_df,
         datatype=[c.type for c in fields(AutoEvalColumn_mib_causalgraph)],
-        # select_columns=SelectColumns(
-        #     default_selection=["Method"],  # Start with just Method column
-        #     cant_deselect=["Method"],  # Method column should always be visible
-        #     label="Select Columns to Display:",
-        # ),
         search_columns=["Method"],
         hide_columns=["eval_name"],
         bool_checkboxgroup_label="Hide models",
@@ -455,8 +443,6 @@ def init_leaderboard(dataframe, track):
         raise ValueError("Leaderboard DataFrame is empty or None.")
     # filter for correct track
     dataframe = dataframe.loc[dataframe["Track"] == track]
-
-    # print(f"\n\n\n dataframe is {dataframe}\n\n\n")

     return Leaderboard(
         value=dataframe,
@@ -577,17 +563,6 @@ def update_leaderboard(dataframe: pd.DataFrame, selected_task_substrings: List[s
     filtered_dataframe.loc[:, "Score"] = np.where(filtered_dataframe.eq("-").any(axis=1), "-", s_means.round(2))
     filtered_dataframe = filtered_dataframe.sort_values(by=["Average"], ascending=False, na_position='last')

-    # if show_average:
-    #     print([row for index, row in filtered_dataframe.iterrows()])
-    #     filtered_dataframe["Average"] = [round(np.mean(row.values()), 2) if "-" not in row.values() else "-" for index, row in filtered_dataframe.iterrows()]
-    #     # Sort by Average score descending
-    #     if 'Average' in dataframe.columns:
-    #         # Convert '-' to NaN for sorting purposes
-    #         df['Average'] = pd.to_numeric(['Average'], errors='coerce')
-    #         df = df.sort_values(by=['Average'], ascending=True, na_position='last')
-    #         # Convert NaN back to '-'
-    #         df['Average'] = df['Average'].fillna('-')
-
     return filtered_dataframe

 def process_url(url):
@@ -600,18 +575,6 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        # with gr.TabItem("Strict", elem_id="strict-benchmark-tab-table", id=0):
-        #     leaderboard = init_leaderboard(LEADERBOARD_DF, "strict")
-        # with gr.TabItem("Strict-small", elem_id="strict-small-benchmark-tab-table", id=1):
-        #     leaderboard = init_leaderboard(LEADERBOARD_DF, "strict-small")
-        # with gr.TabItem("Multimodal", elem_id="multimodal-benchmark-tab-table", id=2):
-        #     leaderboard = init_leaderboard(LEADERBOARD_DF_MULTIMODAL, "multimodal")
-
-        # with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=4):
-        #     gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-        # with gr.TabItem("Subgraph", elem_id="subgraph", id=0):
-        #     leaderboard = init_leaderboard_mib_subgraph(LEADERBOARD_DF_MIB_SUBGRAPH, "Subgraph")
         with gr.TabItem("Circuit Localization", elem_id="subgraph", id=0):
             with gr.Tabs() as subgraph_tabs:
                 with gr.TabItem("F+", id=0):
@@ -622,11 +585,6 @@ with demo:
                     You can combine filters to see specific task-model combinations.
                     """)
                     # CheckboxGroup for selecting substrings
-                    # substring_checkbox = gr.CheckboxGroup(
-                    #     choices=PRESET_SUBSTRINGS,
-                    #     label="Filter results:",
-                    #     value=PRESET_SUBSTRINGS,  # Default to all substrings selected
-                    # )
                     task_substring_checkbox = gr.CheckboxGroup(
                         choices=TASK_SUBSTRINGS,
                         label="View tasks:",
@@ -660,11 +618,6 @@ with demo:
                    You can combine filters to see specific task-model combinations.
                    """)
                    # CheckboxGroup for selecting substrings
-                    # substring_checkbox = gr.CheckboxGroup(
-                    #     choices=PRESET_SUBSTRINGS,
-                    #     label="Filter results:",
-                    #     value=PRESET_SUBSTRINGS,  # Default to all substrings selected
-                    # )
                    task_substring_checkbox = gr.CheckboxGroup(
                        choices=TASK_SUBSTRINGS,
                        label="View tasks:",
@@ -705,11 +658,6 @@ with demo:
                    Use the dropdown menus below to filter results by specific tasks or models.
                    You can combine filters to see specific task-model combinations.
                    """)
-                    # substring_checkbox = gr.CheckboxGroup(
-                    #     choices=PRESET_SUBSTRINGS,
-                    #     label="Filter results:",
-                    #     value=PRESET_SUBSTRINGS,  # Default to all substrings selected
-                    # )
                    task_substring_checkbox = gr.CheckboxGroup(
                        choices=TASK_SUBSTRINGS,
                        label="View tasks:",
@@ -757,11 +705,24 @@ with demo:

        with gr.Group(visible=False) as circuit_ui:
            gr.Markdown("### Circuit Localization Requirements")
+            with gr.Row():
+                hf_repo_circ = gr.Textbox(
+                    label="HuggingFace Repository URL",
+                    placeholder="https://huggingface.co/username/repo/path",
+                    info="Must be a valid HuggingFace URL pointing to folders containing either 1 importance score file per task/model, or " \
+                         "9 circuit files per task/model (.json or .pt). " \
+                         "Remove 'tree', 'resolve', and the branch name (e.g., '/tree/main/') from URL if present."
+                )
+                level = gr.Radio(
+                    choices=[
+                        "Edge",
+                        "Node (submodule)",
+                        "Node (neuron)"
+                    ],
+                    label="Level of granularity",
+                    info="Is your circuit defined by its inclusion/exclusion of certain edges (e.g., MLP1 to H10L12), of certain submodules (e.g., MLP1), or of neurons " \
+                         "within those submodules (e.g., MLP1 neuron 295)?"
+                )

        with gr.Group(visible=False) as causal_ui:
            gr.Markdown("### Causal Variable Localization Requirements")
@@ -778,15 +739,22 @@ with demo:
                    minimum=0,
                    info="Integer specifying token position"
                )
+            with gr.Row():
+                hf_repo_cg = gr.Textbox(
+                    label="HuggingFace Repository URL",
+                    placeholder="https://huggingface.co/username/repo/path",
+                    info="Must be a valid HuggingFace URL pointing to a file containing the trained featurizer (.pt). " \
+                         "Remove 'tree', 'resolve', and the branch name (e.g., '/tree/main/') from URL if present."
+                )
+                code_upload = gr.File(
+                    label="Upload Python file implementing your featurization function",
+                    file_types=[".py"],
+                )

        # Common fields
        with gr.Group():
-            gr.Markdown("###
+            gr.Markdown("### Submission Information")
+            method_name = gr.Textbox(label="Method Name")
            contact_email = gr.Textbox(label="Contact Email")

        # Dynamic UI logic
@@ -801,47 +769,115 @@ with demo:
        track.change(toggle_ui, track, [circuit_ui, causal_ui])

        # Submission handling
-        status = gr.Textbox(label="Submission Status", visible=
+        status = gr.Textbox(label="Submission Status", visible=True)

-        def handle_submission(track,
+        def handle_submission(track, hf_repo_circ, hf_repo_cg, level, layer, token_position, code_upload, method_name, contact_email):
            errors = []
+            warnings = []
+            breaking_error = False
+
+            hf_repo = hf_repo_circ if "Circuit" in track else hf_repo_cg

            # Validate common fields
-            if not
-                errors.append("
+            if not method_name.strip():
+                errors.append("Method name is required")
            if "@" not in contact_email or "." not in contact_email:
                errors.append("Valid email address is required")
+            if not level:
+                errors.append("Level of granularity is required")

+            if not hf_repo.startswith("https://huggingface.co/") and not hf_repo.startswith("http://huggingface.co/"):
+                errors.append(f"Invalid HuggingFace URL - must start with https://huggingface.co/")
+                breaking_error = True
            else:
+                repo_info = hf_repo.split("huggingface.co/")[1]
+                if len(repo_info.split("/")) < 2:
+                    errors.append("Could not read username or repo name from HF URL")
+                    breaking_error = True
+                else:
+                    user_name, repo_name = repo_info.split("/")[:2]
+                    under_rate_limit, time_left = check_rate_limit(track, user_name, contact_email)
+                    if not under_rate_limit:
+                        errors.append(f"Rate limit exceeded (max 2 submissions per week). Please try again in {time_left}. " \
+                                      "(If you're trying again after a failed validation, either remove the previous entry below or try again in about 30 minutes.")
+                        breaking_error = True
+
+            # Track-specific validation
+            if "Circuit" in track and not breaking_error:
+                submission_errors, submission_warnings = verify_circuit_submission(hf_repo, level)
+            elif not breaking_error:
                if not (isinstance(layer, int) and isinstance(token_position, int)):
                    errors.append("Layer and token position must be integers")
                if not code_upload:
                    errors.append("Code file upload is required")
+                submission_errors, submission_warnings = verify_causal_variable_submission(hf_repo, layer, token_position, code_upload)

-            if
+            if not breaking_error:
+                errors.extend(submission_errors)
+                warnings.extend(submission_warnings)
+            _id = secrets.token_urlsafe(12)
+
+            if errors:
+                return [
+                    gr.Textbox("\n".join(f"❌ {e}" for e in errors), visible=True),
+                    None, None,
+                    gr.Column(visible=False),
+                ]
+            elif warnings:
+                return [
+                    gr.Textbox("Warnings:", visible=True),
+                    gr.Markdown("\n".join(f"• {w}" for w in warnings)),
+                    (track, hf_repo_circ, hf_repo_cg, level, layer, token_position, code_upload, method_name, contact_email, _id),
+                    gr.Column(visible=True)
+                ]
+            else:
+                return upload_to_queue(track, hf_repo_circ, hf_repo_cg, level, layer, token_position, code_upload, method_name, contact_email, _id)
+
+        # New warning confirmation dialog
+        warning_modal = gr.Column(visible=False, variant="panel")
+        with warning_modal:
+            gr.Markdown("### ⚠️ Submission Warnings")
+            warning_display = gr.Markdown()
+            proceed_btn = gr.Button("Proceed Anyway", variant="primary")
+            cancel_btn = gr.Button("Cancel Submission", variant="secondary")
+
+        # Store submission data between callbacks
+        pending_submission = gr.State()

        submit_btn = gr.Button("Submit Entry", variant="primary")
        submit_btn.click(
            handle_submission,
-            inputs=[track,
-            outputs=status
+            inputs=[track, hf_repo_circ, hf_repo_cg, level, layer, token_position, code_upload, method_name, contact_email],
+            outputs=[status, warning_display, pending_submission, warning_modal]
        )
+
+        proceed_btn.click(
+            lambda x: upload_to_queue(*x),
+            inputs=pending_submission,
+            outputs=[status, warning_display, pending_submission, warning_modal]
+        )
+
+        cancel_btn.click(
+            lambda: [gr.Textbox("Submission canceled.", visible=True), None, None, gr.Column(visible=False)],
+            outputs=[status, warning_display, pending_submission, warning_modal]
+        )
+
+        with gr.Group():
+            gr.Markdown("### Remove Submission from Queue")
+            with gr.Row():
+                name_r = gr.Textbox(label="Method Name")
+                _id_r = gr.Textbox(label="Submission ID")
+
+            status_r = gr.Textbox(label="Removal Status", visible=False)
+            remove_button = gr.Button("Remove Entry")
+            remove_button.click(
+                remove_submission,
+                inputs=[track, name_r, _id_r],
+                outputs=[status_r]
+            )

        # Add info about rate limits
        gr.Markdown("""
@@ -864,4 +900,4 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.launch(share=True, ssr_mode=False)
+demo.queue(default_concurrency_limit=40).launch(share=True, ssr_mode=False)
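The new submission flow above chains three callbacks: handle_submission validates the form, and either writes errors to the status box, reveals a warning panel while stashing the form values in a gr.State, or calls upload_to_queue directly; "Proceed Anyway" then replays the stashed payload. The following is a minimal, self-contained sketch of that same pattern, not part of the commit — all component and function names here are illustrative stand-ins.

import gradio as gr

def _validate(text):
    # Stand-in for handle_submission: returns status text, warning text,
    # a payload for gr.State, and whether the warning panel should show.
    if not text.strip():
        return "❌ Submission is empty", "", None, gr.Column(visible=False)
    if len(text) < 10:
        return "Warnings found", "• Submission is very short", text, gr.Column(visible=True)
    return f"Queued: {text}", "", None, gr.Column(visible=False)

def _upload(payload):
    # Stand-in for upload_to_queue: runs only after "Proceed Anyway".
    return f"Queued after warning: {payload}", "", None, gr.Column(visible=False)

with gr.Blocks() as demo:
    entry = gr.Textbox(label="Submission")
    status = gr.Textbox(label="Submission Status")
    with gr.Column(visible=False) as warning_modal:   # hidden until warnings exist
        warning_display = gr.Markdown()
        proceed_btn = gr.Button("Proceed Anyway")
    pending = gr.State()                               # payload carried between callbacks

    outputs = [status, warning_display, pending, warning_modal]
    gr.Button("Submit").click(_validate, inputs=entry, outputs=outputs)
    proceed_btn.click(_upload, inputs=pending, outputs=outputs)

demo.launch()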
src/about.py
CHANGED
@@ -7,11 +7,6 @@ class Task:
     metric: str
     col_name: str

-
-
-# Select your tasks here
-# ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     task0 = Task("blimp", "acc", "BLiMP")
@@ -19,19 +14,6 @@ class Tasks(Enum):
     task2 = Task("glue", "acc", "(Super)GLUE")
     task3 = Task("ewok", "acc", "EWoK")

-
-class TasksMultimodal(Enum):
-    task0 = Task("blimp", "acc", "BLiMP")
-    task1 = Task("blimp_supplement", "acc", "BLiMP Supplement")
-    task2 = Task("glue", "acc", "(Super)GLUE")
-    task3 = Task("ewok", "acc", "EWoK")
-    task4 = Task("vqa", "acc", "VQA")
-    task5 = Task("winoground", "acc", "Winoground")
-    task6 = Task("devbench", "acc", "DevBench")
-
-
 @dataclass
 class TaskMIB_Subgraph:
     benchmark: str  # task name in json (ioi/arithmetic)
@@ -118,15 +100,8 @@ class TasksMib_Causalgraph(Enum):

-
-NUM_FEWSHOT = 0  # Change with your few shot
-# ---------------------------------------------------
-
-
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title"> Mechanistic Interpretability Benchmark
+TITLE = """<h1 align="center" id="space-title"> Mechanistic Interpretability Benchmark Leaderboards</h1>"""

 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
@@ -135,34 +110,36 @@ The leaderboards for each track of the 2024 Mechanistic Interpretability Benchma

 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
-This leaderboard displays scores
+This leaderboard displays scores on the private test set for the Mechanistic Interpretability Benchmark. Each track has its own tab.
 """

 EVALUATION_QUEUE_TEXT = """
 ## Circuit localization track:

-You'll need
-the circuit should contain no
+You'll need either (i) 1 circuit per task/model combination with floating-point importance scores for each edge or node, or (ii) 9 circuits per model/task with binary membership scores for each edge or node.
+If (ii), then for each critical threshold k, the circuit should contain no more than k% of edges. See [here]() for examples of each valid circuit format.
+
+Create a folder in a HuggingFace repository to hold your circuits. At the URL you provide, there should be one folder per task/model combination; these folders
+should contain your circuit(s). As long as the folders contain the model and task names, you do not need to worry about the circuit filenames.
+If you provide more circuits than needed, our evaluation script will take the first 9 lexicographically.

 For specifications about the file format for a circuit, see the README on our project GitHub: TODO

-Once your
-The evaluations are handled by the National Deep Inference Framework (NDIF).
+Once your submission has been validated and makes it to the front of the evaluation queue, we'll submit your model for evaluation on the private test set.

 ## Causal variable localization track:
+
+You'll need to provide a link to a HuggingFace repository containing your trained featurizer, the layer on which the featurizer was trained, and the code needed to load and run your featurizer.
+See TODO for an example.
 """

-CITATION_BUTTON_LABEL = "If you would like to cite these results, please cite the
+CITATION_BUTTON_LABEL = "If you would like to cite these results, please cite the MIB paper, as well as the author(s) of the method(s) whose results you cite!"
 CITATION_BUTTON_TEXT = r"""
-@article{
-    url={https://arxiv.org/abs/2412.05149},
+@article{mib-2025,
+    title = {{MIB}: A Mechanistic Interpretability Benchmark},
+    author = {Aaron Mueller and Atticus Geiger and Sarah Wiegreffe and Dana Arad and Iv{\'a}n Arcuschin and Adam Belfki and Yik Siu Chan and Jaden Fiotto-Kaufman and Tal Haklay and Michael Hanna and Jing Huang and Rohan Gupta and Yaniv Nikankin and Hadas Orgad and Nikhil Prakash and Anja Reusch and Aruna Sankaranarayanan and Shun Shao and Alessandro Stolfo and Martin Tutek and Amir Zur and David Bau and Yonatan Belinkov},
+    year = {2025},
+    note = {To appear},
+    journal = {arXiv preprint}
 }
 """
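The EVALUATION_QUEUE_TEXT above expects one folder per task/model combination, each holding either nine binary-membership circuits or a single importance-score file. A hedged sketch of preparing and pushing such a layout with huggingface_hub follows; the local folder, repo id, and filenames are placeholders, not values defined by this commit.

from huggingface_hub import HfApi

# Hypothetical local layout mirroring the description above:
#
# my_circuits/
#   ioi_gpt2/           # one folder per task/model combination
#     circuit_k01.json  # ... nine circuits (binary membership), or
#     ...
#   mcqa_llama3/
#     importances.pt    # ... a single importance-score file
#
api = HfApi()
api.upload_folder(
    folder_path="my_circuits",                  # local folder laid out as above
    repo_id="your-username/mib-circuits-demo",  # placeholder repo
    path_in_repo="circuits",
)
# The URL submitted to the leaderboard would then point at the "circuits"
# folder of that repo, with no '/tree/main/' segment in it.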
src/display/utils.py
CHANGED
@@ -3,7 +3,7 @@ from enum import Enum

 import pandas as pd

-from src.about import Tasks,
+from src.about import Tasks, TasksMib_Subgraph, TasksMib_Causalgraph

 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -28,8 +28,8 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
 auto_eval_column_dict.append(["hf_repo", ColumnContent, ColumnContent("HF Repo", "str", False)])
 auto_eval_column_dict.append(["track", ColumnContent, ColumnContent("Track", "markdown", False)])
 #Scores
-for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+# for task in Tasks:
+#     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
 auto_eval_column_dict.append(["text_average", ColumnContent, ColumnContent("Text Average", "number", True)])
 auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
@@ -38,10 +38,10 @@ auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sh
 auto_eval_column_dict_multimodal.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 auto_eval_column_dict_multimodal.append(["hf_repo", ColumnContent, ColumnContent("HF Repo", "str", False)])
 auto_eval_column_dict_multimodal.append(["track", ColumnContent, ColumnContent("Track", "markdown", False)])
-for task in TasksMultimodal:
+# for task in TasksMultimodal:
+#     auto_eval_column_dict_multimodal.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+#     if task.value.col_name in ("ewok", "EWoK"):  # make sure this appears in the right order
+#         auto_eval_column_dict_multimodal.append(["text_average", ColumnContent, ColumnContent("Text Average", "number", True)])
 auto_eval_column_dict_multimodal.append(["vision_average", ColumnContent, ColumnContent("Vision Average", "number", True)])
 auto_eval_column_dict_multimodal.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
 auto_eval_column_dict_multimodal.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
@@ -214,7 +214,7 @@ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]

 BENCHMARK_COLS = [t.value.col_name for t in Tasks]
-BENCHMARK_COLS_MULTIMODAL = [t.value.col_name for t in TasksMultimodal]
+# BENCHMARK_COLS_MULTIMODAL = [t.value.col_name for t in TasksMultimodal]

 TEXT_TASKS = {
     "glue": ["cola", "sst2", "mrpc", "qqp", "mnli", "mnli-mm", "qnli", "rte",
src/envs.py
CHANGED
@@ -6,24 +6,24 @@ from huggingface_hub import HfApi
 # ----------------------------------
 TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org

-OWNER = "
+OWNER = "mib-bench" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------

 REPO_ID = f"{OWNER}/leaderboard"
 # RESULTS_REPO = f"{OWNER}/results-mib-test"

+QUEUE_REPO_SUBGRAPH = f"{OWNER}/requests-subgraph"
+QUEUE_REPO_CAUSALGRAPH = f"{OWNER}/requests-causalgraph"
 RESULTS_REPO_MIB_SUBGRAPH = f"{OWNER}/subgraph-results"
 RESULTS_REPO_MIB_CAUSALGRAPH = f"{OWNER}/causalgraph-results"
 # RESULTS_REPO_MIB_CAUSALGRAPH = f"shunshao/causalgraph-results"

 # If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")

 # Local caches
+EVAL_REQUESTS_SUBGRAPH = os.path.join(CACHE_PATH, "eval-queue-subgraph")
+EVAL_REQUESTS_CAUSALGRAPH = os.path.join(CACHE_PATH, "eval-queue-causalgraph")
 # EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
 EVAL_RESULTS_MIB_SUBGRAPH_PATH = os.path.join(CACHE_PATH, "eval-results-mib-subgraph")
 EVAL_RESULTS_MIB_CAUSALGRAPH_PATH = os.path.join(CACHE_PATH, "eval-results-mib-causalgraph")
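Each new queue/results repo id above is paired with a local cache directory, and app.py refreshes each pair at startup by deleting the stale copy and re-downloading the dataset snapshot. A small hedged helper capturing that pairing (the function name is illustrative; only snapshot_download, os, and shutil are real APIs):

import os
import shutil
from huggingface_hub import snapshot_download

def refresh_cache(repo_id, local_dir, token=None):
    """Mirror a queue/results dataset repo into its local cache directory,
    discarding any stale copy first (the same pattern app.py uses at startup)."""
    if os.path.exists(local_dir):
        shutil.rmtree(local_dir)
    snapshot_download(repo_id=repo_id, local_dir=local_dir,
                      repo_type="dataset", etag_timeout=30, token=token)

# e.g. refresh_cache(QUEUE_REPO_SUBGRAPH, EVAL_REQUESTS_SUBGRAPH, token=TOKEN)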
src/leaderboard/read_evals.py
CHANGED
@@ -8,7 +8,7 @@ import dateutil
 import numpy as np

 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal
+from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal
 from src.submission.check_validity import is_model_on_hub
 from src.about import TasksMib_Subgraph
@@ -144,7 +144,7 @@ class EvalResult_MIB_SUBGRAPH:
         return data_dict


-def get_raw_eval_results_mib_subgraph(results_path: str
+def get_raw_eval_results_mib_subgraph(results_path: str) -> List[EvalResult_MIB_SUBGRAPH]:
     """From the path of the results folder root, extract all needed info for MIB results"""
     model_result_filepaths = []
@@ -487,7 +487,7 @@ def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
     return averaged_df


-def get_raw_eval_results_mib_causalgraph(results_path: str
+def get_raw_eval_results_mib_causalgraph(results_path: str) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
     """From the path of the results folder root, extract all needed info for MIB causal graph results"""
     model_result_filepaths = []
src/populate.py
CHANGED
@@ -8,37 +8,11 @@ from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal, EvalQueu
 from src.leaderboard.read_evals import get_raw_eval_results, get_raw_eval_results_mib_subgraph, get_raw_eval_results_mib_causalgraph
 from src.about import TasksMib_Causalgraph

-def
-    """Creates a dataframe from all the individual experiment results"""
-    # print(f"results_path is {results_path}, requests_path is {requests_path}")
-    raw_data = get_raw_eval_results(results_path, requests_path)
-    # print(f"raw_data is {raw_data}")
-    all_data_json = [v.to_dict() for v in raw_data]
-    # print(f"all_data_json is {pd.DataFrame.from_records(all_data_json)}")
-    all_data_json_filtered = []
-    for item in all_data_json:
-        item["Track"] = item["eval_name"].split("_")[-1]
-        item["ioi"] = 0
-        item["mcqa"] = 0
-        if "VQA" in benchmark_cols and "VQA" in item:
-            all_data_json_filtered.append(item)
-        if "VQA" not in benchmark_cols and "VQA" not in item:
-            all_data_json_filtered.append(item)
-
-    all_data_json = all_data_json_filtered
-
-    df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[AutoEvalColumn.text_average.name], ascending=False)
-    df = df[has_no_nan_values(df, benchmark_cols)]
-    return df
-
-
-def get_leaderboard_df_mib_subgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list,
+def get_leaderboard_df_mib_subgraph(results_path: str, cols: list, benchmark_cols: list,
                                     metric_type = "F+") -> pd.DataFrame:
     """Creates a dataframe from all the MIB experiment results"""
     # print(f"results_path is {results_path}, requests_path is {requests_path}")
-    raw_data = get_raw_eval_results_mib_subgraph(results_path
+    raw_data = get_raw_eval_results_mib_subgraph(results_path)

     all_data_json = [v.to_dict(metric_type=metric_type) for v in raw_data]
     # print(f"all_data_json is {pd.DataFrame.from_records(all_data_json)}")
@@ -122,10 +96,10 @@ def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:


-def get_leaderboard_df_mib_causalgraph(results_path: str
+def get_leaderboard_df_mib_causalgraph(results_path: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
     # print(f"results_path is {results_path}, requests_path is {requests_path}")

-    detailed_df, aggregated_df, intervention_averaged_df = get_raw_eval_results_mib_causalgraph(results_path
+    detailed_df, aggregated_df, intervention_averaged_df = get_raw_eval_results_mib_causalgraph(results_path)

     # all_data_json = [v.to_dict() for v in raw_detailed_df]
     # detailed_df = pd.DataFrame.from_records(all_data_json)
@@ -175,27 +149,28 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
             with open(file_path) as fp:
                 data = json.load(fp)

-            if "still_on_hub" in data and data["still_on_hub"]:
-                data[EvalQueueColumn.model.name] = make_clickable_model(data["hf_repo"], data["model"])
-                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-            else:
-                data[EvalQueueColumn.model.name] = data["model"]
-                data[EvalQueueColumn.revision.name] = "N/A"
+            # if "still_on_hub" in data and data["still_on_hub"]:
+            #     data[EvalQueueColumn.model.name] = make_clickable_model(data["hf_repo"], data["model"])
+            #     data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+            # else:
+            #     data[EvalQueueColumn.model.name] = data["model"]
+            #     data[EvalQueueColumn.revision.name] = "N/A"

             all_evals.append(data)

+        # elif ".md" not in entry:
+        #     # this is a folder
+        #     sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
+        #     for sub_entry in sub_entries:
+        #         file_path = os.path.join(save_path, entry, sub_entry)
+        #         with open(file_path) as fp:
+        #             data = json.load(fp)
+        #
+        #         data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+        #         data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+        #         all_evals.append(data)
+
+    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN", "PREVALIDATION"]]
     running_list = [e for e in all_evals if e["status"] == "RUNNING"]
     finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
     df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
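The queue dataframes above are built purely by splitting entries on their "status" field. A tiny standalone sketch of that split on toy records (the record values and the "FINISHED_EVAL" status string are placeholders; only the status lists visible in the diff are taken from the source):

import pandas as pd

# Toy queue entries with the status values the leaderboard recognises.
all_evals = [
    {"model": "method-a", "status": "PENDING"},
    {"model": "method-b", "status": "PREVALIDATION"},
    {"model": "method-c", "status": "RUNNING"},
    {"model": "method-d", "status": "FINISHED_EVAL"},
]
cols = ["model", "status"]

pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN", "PREVALIDATION"]]
running_list = [e for e in all_evals if e["status"] == "RUNNING"]
finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]

df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
df_running = pd.DataFrame.from_records(running_list, columns=cols)
df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
print(df_pending, df_running, df_finished, sep="\n\n")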
src/submission/check_validity.py
CHANGED
@@ -1,38 +1,24 @@
|
|
1 |
import json
|
2 |
import os
|
|
|
3 |
import re
|
4 |
import numpy as np
|
|
|
|
|
|
|
5 |
from collections import defaultdict
|
6 |
from datetime import datetime, timedelta, timezone
|
|
|
7 |
|
8 |
-
import
|
9 |
from huggingface_hub import ModelCard
|
10 |
from huggingface_hub.hf_api import ModelInfo
|
11 |
from transformers import AutoConfig
|
12 |
from transformers.models.auto.tokenization_auto import AutoTokenizer
|
13 |
|
14 |
from src.display.utils import TEXT_TASKS, VISION_TASKS, NUM_EXPECTED_EXAMPLES
|
|
|
15 |
|
16 |
-
def check_model_card(repo_id: str) -> tuple[bool, str]:
|
17 |
-
"""Checks if the model card and license exist and have been filled"""
|
18 |
-
try:
|
19 |
-
card = ModelCard.load(repo_id)
|
20 |
-
except huggingface_hub.utils.EntryNotFoundError:
|
21 |
-
return False, "Please add a model card to your model to explain how you trained/fine-tuned it."
|
22 |
-
|
23 |
-
# Enforce license metadata
|
24 |
-
if card.data.license is None:
|
25 |
-
if not ("license_name" in card.data and "license_link" in card.data):
|
26 |
-
return False, (
|
27 |
-
"License not found. Please add a license to your model card using the `license` metadata or a"
|
28 |
-
" `license_name`/`license_link` pair."
|
29 |
-
)
|
30 |
-
|
31 |
-
# Enforce card content
|
32 |
-
if len(card.text) < 200:
|
33 |
-
return False, "Please add a description to your model card, it is too short."
|
34 |
-
|
35 |
-
return True, ""
|
36 |
|
37 |
def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
|
38 |
"""Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
|
@@ -73,10 +59,12 @@ def get_model_size(model_info: ModelInfo, precision: str):
|
|
73 |
model_size = size_factor * model_size
|
74 |
return model_size
|
75 |
|
|
|
76 |
def get_model_arch(model_info: ModelInfo):
|
77 |
"""Gets the model architecture from the configuration"""
|
78 |
return model_info.config.get("architectures", "Unknown")
|
79 |
|
|
|
80 |
def already_submitted_models(requested_models_dir: str) -> set[str]:
|
81 |
"""Gather a list of already submitted models to avoid duplicates"""
|
82 |
depth = 1
|
@@ -101,6 +89,7 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
|
|
101 |
|
102 |
return set(file_names), users_to_submission_dates
|
103 |
|
|
|
104 |
def is_valid_predictions(predictions: dict) -> tuple[bool, str]:
|
105 |
out_msg = ""
|
106 |
for task in TEXT_TASKS:
|
@@ -164,4 +153,238 @@ def is_valid_predictions(predictions: dict) -> tuple[bool, str]:
|
|
164 |
|
165 |
if out_msg != "":
|
166 |
return False, out_msg
|
167 |
-
return True, "Upload successful."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import json
|
2 |
import os
|
3 |
+
import shutil
|
4 |
import re
|
5 |
import numpy as np
|
6 |
+
import pandas as pd
|
7 |
+
import gradio as gr
|
8 |
+
from urllib.parse import urlparse
|
9 |
from collections import defaultdict
|
10 |
from datetime import datetime, timedelta, timezone
|
11 |
+
from typing import Literal
|
12 |
|
13 |
+
from huggingface_hub import HfApi, HfFileSystem, hf_hub_url, get_hf_file_metadata
|
14 |
from huggingface_hub import ModelCard
|
15 |
from huggingface_hub.hf_api import ModelInfo
|
16 |
from transformers import AutoConfig
|
17 |
from transformers.models.auto.tokenization_auto import AutoTokenizer
|
18 |
|
19 |
from src.display.utils import TEXT_TASKS, VISION_TASKS, NUM_EXPECTED_EXAMPLES
|
20 |
+
from src.envs import EVAL_REQUESTS_SUBGRAPH, EVAL_REQUESTS_CAUSALGRAPH
|
21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
|
24 |
"""Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
|
|
|
59 |
model_size = size_factor * model_size
|
60 |
return model_size
|
61 |
|
62 |
+
|
63 |
def get_model_arch(model_info: ModelInfo):
|
64 |
"""Gets the model architecture from the configuration"""
|
65 |
return model_info.config.get("architectures", "Unknown")
|
66 |
|
67 |
+
|
68 |
def already_submitted_models(requested_models_dir: str) -> set[str]:
|
69 |
"""Gather a list of already submitted models to avoid duplicates"""
|
70 |
depth = 1
|
|
|
89 |
|
90 |
return set(file_names), users_to_submission_dates
|
91 |
|
92 |
+
|
93 |
def is_valid_predictions(predictions: dict) -> tuple[bool, str]:
|
94 |
out_msg = ""
|
95 |
for task in TEXT_TASKS:
|
|
|
153 |
|
154 |
if out_msg != "":
|
155 |
return False, out_msg
|
156 |
+
return True, "Upload successful."
|
157 |
+
|
158 |
+
|
159 |
+
def _format_time(earliest_time):
|
160 |
+
time_left = (earliest_time.tz_convert("UTC") + timedelta(weeks=1)) - pd.Timestamp.utcnow()
|
161 |
+
hours = time_left.seconds // 3600
|
162 |
+
minutes, seconds = divmod(time_left.seconds % 3600, 60)
|
163 |
+
time_left_formatted = f"{hours:02}:{minutes:02}:{seconds:02}"
|
164 |
+
if time_left.days > 0:
|
165 |
+
time_left_formatted = f"{time_left.days} days, {time_left_formatted}"
|
166 |
+
return time_left_formatted
|
167 |
+
|
168 |
+
|
169 |
+
def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
|
170 |
+
"""Creates the different dataframes for the evaluation queues requests"""
|
171 |
+
entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
|
172 |
+
all_evals = []
|
173 |
+
|
174 |
+
for entry in entries:
|
175 |
+
if ".json" in entry:
|
176 |
+
file_path = os.path.join(save_path, entry)
|
177 |
+
with open(file_path) as fp:
|
178 |
+
data = json.load(fp)
|
179 |
+
|
180 |
+
# if "still_on_hub" in data and data["still_on_hub"]:
|
181 |
+
# data[EvalQueueColumn.model.name] = make_clickable_model(data["hf_repo"], data["model"])
|
182 |
+
# data[EvalQueueColumn.revision.name] = data.get("revision", "main")
|
183 |
+
# else:
|
184 |
+
# data[EvalQueueColumn.model.name] = data["model"]
|
185 |
+
# data[EvalQueueColumn.revision.name] = "N/A"
|
186 |
+
|
187 |
+
all_evals.append(data)
|
188 |
+
|
189 |
+
elif ".md" not in entry:
|
190 |
+
# this is a folder
|
191 |
+
sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
|
192 |
+
for sub_entry in sub_entries:
|
193 |
+
file_path = os.path.join(save_path, entry, sub_entry)
|
194 |
+
with open(file_path) as fp:
|
195 |
+
data = json.load(fp)
|
196 |
+
all_evals.append(data)
|
197 |
+
|
198 |
+
return pd.DataFrame(all_evals)
|
199 |
+
|
200 |
+
def check_rate_limit(track, user_name, contact_email):
|
201 |
+
if "Circuit" in track:
|
202 |
+
save_path = EVAL_REQUESTS_SUBGRAPH
|
203 |
+
else:
|
204 |
+
save_path = EVAL_REQUESTS_CAUSALGRAPH
|
205 |
+
|
206 |
+
evaluation_queue = get_evaluation_queue_df(save_path, ["user_name", "contact_email"])
|
207 |
+
|
208 |
+
if evaluation_queue.empty:
|
209 |
+
return True, None
|
210 |
+
|
211 |
+
one_week_ago = pd.Timestamp.utcnow() - timedelta(weeks=1)
|
212 |
+
|
213 |
+
user_name_occurrences = evaluation_queue[evaluation_queue["user_name"] == user_name]
|
214 |
+
user_name_occurrences["submit_time"] = pd.to_datetime(user_name_occurrences["submit_time"], utc=True)
|
215 |
+
user_name_occurrences = user_name_occurrences[user_name_occurrences["submit_time"] >= one_week_ago]
|
216 |
+
email_occurrences = evaluation_queue[evaluation_queue["contact_email"] == contact_email.lower()]
|
217 |
+
email_occurrences["submit_time"] = pd.to_datetime(email_occurrences["submit_time"], utc=True)
|
218 |
+
email_occurrences = email_occurrences[email_occurrences["submit_time"] >= one_week_ago]
|
219 |
+
if user_name_occurrences.shape[0] >= 2:
|
220 |
+
earliest_time = user_name_occurrences["submit_time"].min()
|
221 |
+
time_left_formatted = _format_time(earliest_time)
|
222 |
+
return False, time_left_formatted
|
223 |
+
if email_occurrences.shape[0] >= 2:
|
224 |
+
earliest_time = email_occurrences["submit_time"].min()
|
225 |
+
time_left_formatted = _format_time(earliest_time)
|
226 |
+
return False, time_left_formatted
|
227 |
+
|
228 |
+
return True, None
|
229 |
+
|
230 |
+
def parse_huggingface_url(url: str):
    """
    Extracts repo_id and subfolder path from a Hugging Face URL.
    Returns (repo_id, folder_path).
    """
    # Handle cases where the input is already a repo_id (no URL)
    if not url.startswith(("http://", "https://")):
        return url, None

    parsed = urlparse(url)
    path_parts = parsed.path.strip("/").split("/")

    # Extract repo_id (username/repo_name)
    if len(path_parts) < 2:
        raise ValueError("Invalid Hugging Face URL: Could not extract repo_id.")
    repo_id = f"{path_parts[0]}/{path_parts[1]}"

    # Extract folder path (if in /tree/ or /blob/)
    if "tree" in path_parts or "blob" in path_parts:
        try:
            branch_idx = path_parts.index("tree") if "tree" in path_parts else path_parts.index("blob")
            folder_path = "/".join(path_parts[branch_idx + 2:])  # Skip "tree/main" or "blob/main"
        except (ValueError, IndexError):
            folder_path = None
    else:
        folder_path = None

    return repo_id, folder_path
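
# Illustrative examples of the parsing above (repo and folder names are hypothetical):
#   parse_huggingface_url("user/my-circuits")
#       -> ("user/my-circuits", None)
#   parse_huggingface_url("https://huggingface.co/user/my-circuits/tree/main/ioi_gpt2")
#       -> ("user/my-circuits", "ioi_gpt2")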


def validate_directory(fs: HfFileSystem, repo_id: str, dirname: str, curr_tm: str, circuit_level: Literal['edge', 'node', 'neuron'] = 'edge'):
    errors = []
    warnings = []

    task, model = curr_tm.split("_")
    curr_tm_display = curr_tm.replace("_", "/")

    files = fs.ls(dirname)

    # Detect whether multi-circuit or importances
    is_multiple_circuits = False
    files = [f["name"] for f in files if (f["name"].endswith(".json") or f["name"].endswith(".pt"))]
    if len(files) == 1:
        is_multiple_circuits = False
    elif len(files) > 1:
        is_multiple_circuits = True
        if len(files) < 9:
            errors.append(f"Folder for {curr_tm_display} contains multiple circuits, but not enough. If you intended to submit importances, include only one circuit in the folder. Otherwise, please add the rest of the circuits.")
    else:
        warnings.append(f"Directory present for {curr_tm_display} but is empty")

    offset = 0
    for idx, file in enumerate(files):
        file_suffix = file.split(repo_id + "/")[1]
        file_url = hf_hub_url(repo_id=repo_id, filename=file_suffix)
        file_info = get_hf_file_metadata(file_url)
        file_size_mb = file_info.size / (1024 * 1024)
        if file_size_mb > 150:
            warnings.append(f"Will skip file >150MB: {file}")
            offset -= 1
            continue

        if is_multiple_circuits and idx + offset >= 9:
            break

    return errors, warnings
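
# Summary of what the checks above imply about a valid task/model folder (inferred from
# the validation logic rather than from a separate spec):
#   - only .json and .pt files are considered;
#   - exactly one file is treated as importance scores;
#   - multiple files are treated as separate circuits, and at least 9 are expected;
#   - files larger than 150 MB are skipped with a warning.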


def verify_circuit_submission(hf_repo, level, progress=gr.Progress()):
    VALID_COMBINATIONS = [
        "ioi_gpt2", "ioi_qwen2.5", "ioi_gemma2", "ioi_llama3", "ioi_interpbench",
        "mcqa_qwen2.5", "mcqa_gemma2", "mcqa_llama3",
        "arithmetic-addition_llama3", "arithmetic-subtraction_llama3",
        "arc-easy_gemma2", "arc-easy_llama3",
        "arc-challenge_llama3"
    ]

    TASKS = ["ioi", "mcqa", "arithmetic-addition", "arithmetic-subtraction", "arc-easy", "arc-challenge"]
    MODELS = ["gpt2", "qwen2.5", "gemma2", "llama3", "interpbench"]

    errors = []
    warnings = []

    directories_present = {tm: False for tm in VALID_COMBINATIONS}
    directories_valid = {tm: False for tm in VALID_COMBINATIONS}

    fs = HfFileSystem()

    path = hf_repo
    level = level

    folder_path = path.split("huggingface.co/")[1]
    repo_id = "/".join(folder_path.split("/")[:2])
    try:
        files = fs.listdir(folder_path)
    except Exception as e:
        errors.append(f"Could not open Huggingface URL: {e}")
        return errors, warnings

    file_counts = 0
    for dirname in progress.tqdm(files, desc="Validating directories in repo"):
        file_counts += 1
        if file_counts >= 30:
            warnings.append("Folder contains many files/directories; stopped at 30.")
            break
        circuit_dir = dirname["name"]
        dirname_proc = circuit_dir.lower().split("/")[-1]
        if not fs.isdir(circuit_dir):
            continue
        curr_task = None
        curr_model = None
        # Look for task names in filename
        for task in TASKS:
            if dirname_proc.startswith(task) or f"_{task}" in dirname_proc:
                curr_task = task
        # Look for model names in filename
        for model in MODELS:
            if dirname_proc.startswith(model) or f"_{model}" in dirname_proc:
                curr_model = model
        if curr_task is not None and curr_model is not None:
            curr_tm = f"{curr_task}_{curr_model}"
            if curr_tm in VALID_COMBINATIONS:
                directories_present[curr_tm] = True
            else:
                continue
        else:
            continue

        # Parse circuits directory
        print(f"validating {circuit_dir}")
        vd_errors, vd_warnings = validate_directory(fs, repo_id, circuit_dir, curr_tm, level)
        errors.extend(vd_errors)
        warnings.extend(vd_warnings)

        if len(vd_errors) == 0:
            directories_valid[curr_tm] = True

    task_set, model_set = set(), set()
    for tm in directories_present:
        if not directories_present[tm]:
            continue
        if not directories_valid[tm]:
            warnings.append(f"Directory found for {tm.replace('_', '/')}, but circuits not valid or present")
            continue
        task, model = tm.split("_")
        task_set.add(task)
        model_set.add(model)
    if len(task_set) < 2:
        errors.append("At least 2 tasks are required")
    if len(model_set) < 2:
        errors.append("At least 2 models are required")

    no_tm_display = [tm.replace("_", "/") for tm in directories_valid if not directories_valid[tm]]
    if len(no_tm_display) > 0:
        warnings.append(f"No valid circuits or importance scores found for the following tasks/models: {*no_tm_display,}")

    return errors, warnings
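
# Illustrative call (hypothetical values), e.g. from a pre-submission check in the UI:
#   errors, warnings = verify_circuit_submission(
#       "https://huggingface.co/user/my-circuits/tree/main", "edge"
#   )
#   if errors:
#       ...  # block the submission and surface the errors
#   elif warnings:
#       ...  # ask the user to confirm despite the warnings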


def verify_causal_variable_submission(hf_repo, layer, position, code_upload):
    # Currently a no-op: causal-variable submissions are not validated client-side.
    return
src/submission/submit.py
CHANGED
@@ -1,20 +1,96 @@
import json
import os
import smtplib
from datetime import datetime, timezone

from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import API,
from src.envs import API, EVAL_REQUESTS_SUBGRAPH, EVAL_REQUESTS_CAUSALGRAPH, TOKEN, QUEUE_REPO_SUBGRAPH, QUEUE_REPO_CAUSALGRAPH
from src.submission.check_validity import (
    already_submitted_models,
-    check_model_card,
    get_model_size,
    is_model_on_hub,
    is_valid_predictions,
    parse_huggingface_url
)
import gradio as gr

REQUESTED_MODELS = None
USERS_TO_SUBMISSION_DATES = None

def upload_to_queue(track, hf_repo_circ, hf_repo_cg, level, layer, token_position, code_upload, method_name, contact_email, _id):
    errors = []
    hf_repo = hf_repo_circ if "Circuit" in track else hf_repo_cg
    try:
        repo_info = hf_repo.split("huggingface.co/")[1]
        user_name, repo_name = repo_info.split("/")[:2]
    except Exception as e:
        errors.append("Error processing HF URL: could not get username and repo name")
    try:
        commit_hash = API.list_repo_commits("/".join([user_name, repo_name]))[0].commit_id
    except Exception as e:
        errors.append("Could not get commit hash of provided Huggingface repo")
    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    if not errors:
        if "Circuit" in track:
            eval_entry = {
                "hf_repo": hf_repo,
                "user_name": user_name,
                "revision": commit_hash,
                "circuit_level": level.lower(),
                "method_name": method_name,
                "contact_email": contact_email.lower(),
                "submit_time": current_time,
                "status": "PREVALIDATION",
                "_id": _id
            }
            QUEUE_REPO = QUEUE_REPO_SUBGRAPH
            EVAL_REQUESTS = EVAL_REQUESTS_SUBGRAPH
        else:
            eval_entry = {
                "hf_repo": hf_repo,
                "user_name": user_name,
                "revision": commit_hash,
                "layer": layer,
                "token_position": token_position,
                "code_upload": code_upload,
                "method_name": method_name,
                "contact_email": contact_email.lower(),
                "submit_time": current_time,
                "status": "PREVALIDATION",
                "_id": _id
            }
            QUEUE_REPO = QUEUE_REPO_CAUSALGRAPH
            EVAL_REQUESTS = EVAL_REQUESTS_CAUSALGRAPH

        OUT_DIR = f"{EVAL_REQUESTS}/"
        os.makedirs(OUT_DIR, exist_ok=True)
        out_path = f"{OUT_DIR}/{method_name}_{_id}_{current_time}.json"
        with open(out_path, 'w') as f:
            f.write(json.dumps(eval_entry))

        try:
            API.upload_file(
                path_or_fileobj=out_path,
                path_in_repo=out_path.split("/")[-1],
                repo_id=QUEUE_REPO,
                repo_type="dataset",
                commit_message=f"Add {method_name}_{_id}_{current_time}.json to eval queue"
            )
        except Exception as e:
            errors.append(f"Could not upload entry to eval queue: {e}")

    if errors:
        status = gr.Textbox("\n".join(f"❌ {e}" for e in errors), visible=True)
    else:
        status = gr.Textbox(f"✅ Submission received! Your ID is \"{_id}\". You'll receive an email once we've validated your submission.", visible=True)
    return [
        status,
        None, None,
        gr.Column(visible=False)
    ]
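
# Sketch of how this might be wired to a Gradio submit button (component names here are
# hypothetical, not the ones defined in app.py); the four-element outputs list matches
# the [status, None, None, column] return value above:
#   submit_btn.click(
#       upload_to_queue,
#       inputs=[track, hf_repo_circ, hf_repo_cg, level, layer, token_position,
#               code_upload, method_name, contact_email, _id],
#       outputs=[status_box, hf_repo_circ, hf_repo_cg, confirm_column],
#   )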


def add_new_eval(
    model_name: str,
    model_id: str,

@@ -83,7 +159,7 @@ def add_new_eval(
        return styled_error("A model with this name has been already submitted.")

    print("Creating eval file")
-    OUT_DIR = f"{
    OUT_DIR = f"{EVAL_REQUESTS}/{user_name}"
    os.makedirs(OUT_DIR, exist_ok=True)
    out_path = f"{OUT_DIR}/{model_path}_{revision}_eval_request_False_{track}.json"

@@ -109,3 +185,38 @@ def add_new_eval(
    return styled_message(
        "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the request to show in the PENDING list."
    )


def remove_submission(track: str, method_name: str, _id: str):
    if track is None:
        return gr.Textbox("Please select a track.", visible=True)
    if "Circuit" in track:
        QUEUE_REPO = QUEUE_REPO_SUBGRAPH
        EVAL_REQUESTS = EVAL_REQUESTS_SUBGRAPH
    else:
        QUEUE_REPO = QUEUE_REPO_CAUSALGRAPH
        EVAL_REQUESTS = EVAL_REQUESTS_CAUSALGRAPH

    OUT_DIR = f"{EVAL_REQUESTS}/"
    os.makedirs(OUT_DIR, exist_ok=True)
    files = os.listdir(OUT_DIR)
    out_paths = [f for f in files if f.startswith(f"{method_name}_{_id}")]
    if out_paths:
        filename = out_paths[0]
        filepath = os.path.join(OUT_DIR, filename)
        with open(filepath, 'r') as f:
            data = json.load(f)
        hf_repo = data["hf_repo"]
        try:
            API.delete_file(
                path_in_repo=filename,
                repo_id=QUEUE_REPO,
                repo_type="dataset"
            )
        except Exception as e:
            return gr.Textbox(f"Could not delete entry from eval queue: {e}", visible=True)
        os.remove(filepath)
        status = "Submission removed from queue."
    else:
        status = "Submission not found in queue."

    return gr.Textbox(status, visible=True)
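
# Likewise, remove_submission could back a "withdraw submission" button; a minimal sketch
# with hypothetical component names:
#   remove_btn.click(
#       remove_submission,
#       inputs=[track, method_name_box, submission_id_box],
#       outputs=[removal_status_box],
#   )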