Commit 5ed4bca · Aaron Mueller committed (1 parent: 3b802b7)
update HF url handling

Files changed:
- app.py +67 -43
- src/about.py +40 -12
- src/display/utils.py +2 -3
- src/leaderboard/read_evals.py +27 -11
- src/populate.py +10 -7
- src/submission/check_validity.py +10 -6
- src/submission/submit.py +9 -7
app.py
CHANGED
@@ -17,7 +17,7 @@ from copy import deepcopy
 from src.about import (
     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
-    …
+    EVALUATION_QUEUE_TEXT_SUBGRAPH, EVALUATION_QUEUE_TEXT_CAUSALVARIABLE,
     INTRODUCTION_TEXT,
     LLM_BENCHMARKS_TEXT,
     TITLE,
@@ -38,7 +38,7 @@ from src.display.utils import (
 from src.envs import API, EVAL_REQUESTS_SUBGRAPH, EVAL_REQUESTS_CAUSALGRAPH, QUEUE_REPO_SUBGRAPH, QUEUE_REPO_CAUSALGRAPH, REPO_ID, TOKEN, RESULTS_REPO_MIB_SUBGRAPH, EVAL_RESULTS_MIB_SUBGRAPH_PATH, RESULTS_REPO_MIB_CAUSALGRAPH, EVAL_RESULTS_MIB_CAUSALGRAPH_PATH
 from src.populate import get_evaluation_queue_df, get_leaderboard_df_mib_subgraph, get_leaderboard_df_mib_causalgraph
 from src.submission.submit import upload_to_queue, remove_submission
-from src.submission.check_validity import verify_circuit_submission, verify_causal_variable_submission, check_rate_limit
+from src.submission.check_validity import verify_circuit_submission, verify_causal_variable_submission, check_rate_limit, parse_huggingface_url
 
 from src.about import TasksMib_Subgraph, TasksMib_Causalgraph
 
@@ -288,7 +288,7 @@ def _sigmoid(x):
 
 LEADERBOARD_DF_MIB_SUBGRAPH_FPL = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, COLS_MIB_SUBGRAPH, BENCHMARK_COLS_MIB_SUBGRAPH)
 LEADERBOARD_DF_MIB_SUBGRAPH_FEQ = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, COLS_MIB_SUBGRAPH, BENCHMARK_COLS_MIB_SUBGRAPH,
-                                                                  metric_type="…
+                                                                  metric_type="CMD")
 
 # LEADERBOARD_DF_MIB_CAUSALGRAPH = get_leaderboard_df_mib_causalgraph(EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_CAUSALGRAPH, BENCHMARK_COLS_MIB_CAUSALGRAPH)
 # In app.py, modify the LEADERBOARD initialization
@@ -300,14 +300,15 @@ LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AGGREGAT
 # LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 # LEADERBOARD_DF_MULTIMODAL = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS_MULTIMODAL, BENCHMARK_COLS_MULTIMODAL)
 
-…
-# ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-…
+(
+    finished_eval_queue_df_subgraph,
+    pending_eval_queue_df_subgraph,
+) = get_evaluation_queue_df(EVAL_REQUESTS_SUBGRAPH, EVAL_COLS)
+
+# (
+#     finished_eval_queue_df_causalvariable,
+#     pending_eval_queue_df_causalvariable,
+# ) = get_evaluation_queue_df(EVAL_REQUESTS_CAUSALGRAPH, EVAL_COLS)
 
 
 def init_leaderboard_mib_subgraph(dataframe, track):
@@ -577,7 +578,7 @@ with demo:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("Circuit Localization", elem_id="subgraph", id=0):
             with gr.Tabs() as subgraph_tabs:
-                with gr.TabItem("…
+                with gr.TabItem("CPR", id=0):
                     # Add description for filters
                     gr.Markdown("""
                     ### Filtering Options
@@ -610,7 +611,7 @@ with demo:
                         outputs=leaderboard
                     )
                     print(f"Leaderboard is {leaderboard}")
-                with gr.TabItem("…
+                with gr.TabItem("CMD", id=1):
                     # Add description for filters
                     gr.Markdown("""
                     ### Filtering Options
@@ -690,9 +691,7 @@ with demo:
                 "Causal Graph"
             )
 
-        with gr.TabItem("Submit", elem_id="llm-benchmark-tab-table", id=2):
-            gr.Markdown("## π Submission Portal")
-…
+        with gr.TabItem("Submit", elem_id="llm-benchmark-tab-table", id=2):
             # Track selection
             track = gr.Radio(
                 choices=[
@@ -704,28 +703,30 @@
            )
 
            with gr.Group(visible=False) as circuit_ui:
-                gr.…
-…
-                "…
-…
+                with gr.Column():
+                    with gr.Row():
+                        gr.Markdown(EVALUATION_QUEUE_TEXT_SUBGRAPH, elem_classes="markdown-text")
+
+                    with gr.Row():
+                        hf_repo_circ = gr.Textbox(
+                            label="HuggingFace Repository URL",
+                            placeholder="https://huggingface.co/username/repo/path",
+                            info="Must be a valid HuggingFace URL pointing to folders containing either 1 importance score file per task/model, or " \
+                                 "9 circuit files per task/model (.json or .pt)."
+                        )
+                        level = gr.Radio(
+                            choices=[
+                                "Edge",
+                                "Node (submodule)",
+                                "Node (neuron)"
+                            ],
+                            label="Level of granularity",
+                            info="Is your circuit defined by its inclusion/exclusion of certain edges (e.g., MLP1 to H10L12), of certain submodules (e.g., MLP1), or of neurons " \
+                                 "within those submodules (e.g., MLP1 neuron 295)?"
+                        )
 
            with gr.Group(visible=False) as causal_ui:
-                gr.Markdown(…
+                gr.Markdown(EVALUATION_QUEUE_TEXT_CAUSALVARIABLE, elem_classes="markdown-text")
                 with gr.Row():
                     layer = gr.Number(
                         label="Layer Number",
@@ -743,9 +744,7 @@
                     hf_repo_cg = gr.Textbox(
                         label="HuggingFace Repository URL",
                         placeholder="https://huggingface.co/username/repo/path",
-                        info="Must be a valid HuggingFace URL pointing to a file containing the trained featurizer (.pt). "
-                             "Remove 'tree', 'resolve', and the branch name (e.g., '/tree/main/') from URL if present."
-                    )
+                        info="Must be a valid HuggingFace URL pointing to a file containing the trained featurizer (.pt). " )
                     code_upload = gr.File(
                         label="Upload Python file implementing your featurization function",
                         file_types=[".py"],
@@ -791,12 +790,12 @@
                 errors.append(f"Invalid HuggingFace URL - must start with https://huggingface.co/")
                 breaking_error = True
             else:
-…
-                if …
+                repo_id, subfolder, revision = parse_huggingface_url(hf_repo)
+                if repo_id is None:
                     errors.append("Could not read username or repo name from HF URL")
                     breaking_error = True
                 else:
-                    user_name, repo_name = …
+                    user_name, repo_name = repo_id.split("/")
                     under_rate_limit, time_left = check_rate_limit(track, user_name, contact_email)
                     if not under_rate_limit:
                         errors.append(f"Rate limit exceeded (max 2 submissions per week). Please try again in {time_left}. " \
@@ -841,8 +840,8 @@
             with warning_modal:
                 gr.Markdown("### ⚠️ Submission Warnings")
                 warning_display = gr.Markdown()
-                proceed_btn = gr.Button("Proceed Anyway", variant="…
-                cancel_btn = gr.Button("Cancel Submission", variant="…
+                proceed_btn = gr.Button("Proceed Anyway", variant="secondary")
+                cancel_btn = gr.Button("Cancel Submission", variant="primary")
 
             # Store submission data between callbacks
             pending_submission = gr.State()
@@ -865,6 +864,31 @@
                 outputs=[status, warning_display, pending_submission, warning_modal]
             )
 
+            with gr.Column():
+                with gr.Accordion(
+                    f"✅ Finished Evaluations ({len(finished_eval_queue_df_subgraph)})",
+                    open=False,
+                ):
+                    with gr.Row():
+                        finished_eval_table = gr.components.Dataframe(
+                            value=finished_eval_queue_df_subgraph,
+                            headers=EVAL_COLS,
+                            datatype=EVAL_TYPES,
+                            row_count=5,
+                        )
+
+                with gr.Accordion(
+                    f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df_subgraph)})",
+                    open=False,
+                ):
+                    with gr.Row():
+                        pending_eval_table = gr.components.Dataframe(
+                            value=pending_eval_queue_df_subgraph,
+                            headers=EVAL_COLS,
+                            datatype=EVAL_TYPES,
+                            row_count=5,
+                        )
+
             with gr.Group():
                 gr.Markdown("### Remove Submission from Queue")
                 with gr.Row():
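Note on the two subgraph DataFrames built above: as the src/populate.py hunk later in this diff shows, CPR tables are sorted descending and CMD tables ascending, which is why the leaderboard is materialized once per metric. A toy illustration of that difference (the scores are made up; only the sort direction comes from this commit):

import pandas as pd

df = pd.DataFrame({"Method": ["A", "B"], "Average": [61.2, 48.7]})  # made-up scores
cpr_view = df.sort_values("Average", ascending=False)  # metric_type="CPR": sort descending
cmd_view = df.sort_values("Average", ascending=True)   # metric_type="CMD": sort ascending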
src/about.py
CHANGED
@@ -105,7 +105,7 @@ TITLE = """<h1 align="center" id="space-title"> Mechanistic Interpretability Ben
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-The leaderboards for each track of the …
+The leaderboards for each track of the Mechanistic Interpretability Benchmark.
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
@@ -113,24 +113,52 @@ LLM_BENCHMARKS_TEXT = f"""
 This leaderboard displays scores on the private test set for the Mechanistic Interpretability Benchmark. Each track has its own tab.
 """
 
-…
-## Circuit localization track
+EVALUATION_QUEUE_TEXT_SUBGRAPH = """
+## Circuit localization track
 
+### 1. Collect your circuits
 You'll need either (i) 1 circuit per task/model combinaton with floating-point importance scores for each edge or node, or (ii) 9 circuits per model/task with binary membership scores for each edge or node.
-…
+For specifications about the file formats we accept, see the README on [our project GitHub](https://github.com/hannamw/MIB-subgraph-track).
+
+### 2. Upload your circuits
+Create a HuggingFace repository, and create a folder in that repository that will hold all of your circuit folders.
+At the URL you provide, there should be one folder per task/model combination; these folders
+should contain your circuit(s). As long as the folder names contain the model and task names, you do not need to worry about the circuit filenames.
-If you provide more circuits than needed, our evaluation script will take the first 9 lexicographically.
+If you provide more circuits than needed, our evaluation script will take the first 9 lexicographically in a given folder. We provide examples of valid
+submissions: see [here](https://huggingface.co/mib-bench/mib-circuits-example/tree/main/importances/json) for a submission using importance scores and
+[here](https://huggingface.co/mib-bench/mib-circuits-example/tree/main/multiple_circuits/pt) for a submission uploading multiple circuits.
 
-…
+### 3. Manage your submission in the queue
+If your submission passes all checks, it will be added to the queue. You will receive a submission ID here when you do this; be sure to save it!
+This will allow you to remove your submission from the queue (e.g., if you find a bug in your circuits). This will prevent you from needing to wait until
+next week to resubmit.
 
-…
+Before your submission has been validated by our backend, it will have the "PREVALIDATION" status in the queue. Once it has been validated, it will have the "PENDING" status.
+It will keep the PENDING status until it has been run on the private test set.
+"""
 
-…
+EVALUATION_QUEUE_TEXT_CAUSALVARIABLE = """
+## Causal variable localization track
 
-…
-…
+### 1. Collect your materials
+You'll need the following:
+* A trained featurizer saved as a .pt object.
+* A python function that can load and run forward passes with your featurizer.
+* A dynamic token alignment function.
+* A hypothesized feature location.
+
+### 2. Upload your materials
+Create a HuggingFace repository, and create a folder in that repository that will hold all of your materials.
+At the URL you provide, each of the above materials should be present. We will take the first python script lexicographically
+as your featurizer function, and the first .pt file lexicographically as your featurizer.
+
+### 3. Manage your submission in the queue
+If your submission passes all checks, it will be added to the queue. You will receive a submission ID here when you do this; be sure to save it!
+This will allow you to remove your submission from the queue (e.g., if you find a bug in your circuits). This will prevent you from needing to wait until
+next week to resubmit.
+
+Before your submission has been validated by our backend, it will have the "PREVALIDATION" status in the queue. Once it has been validated, it will have the "PENDING" status.
+It will keep the PENDING status until it has been run on the private test set.
 """
 
 CITATION_BUTTON_LABEL = "If you would like to cite these results, please cite the MIB paper, as well as the author(s) of the method(s) whose results you cite!"
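The upload steps described in EVALUATION_QUEUE_TEXT_SUBGRAPH can be carried out with the huggingface_hub client. A minimal sketch, assuming a logged-in user; the repo name and local folder layout here are illustrative, not prescribed by the text above:

from huggingface_hub import HfApi

api = HfApi()  # assumes `huggingface-cli login` has been run

repo_id = "your-username/my-mib-circuits"  # illustrative name
api.create_repo(repo_id, repo_type="model", exist_ok=True)

# Local directory with one subfolder per task/model combination,
# each holding that combination's circuit file(s).
api.upload_folder(
    repo_id=repo_id,
    repo_type="model",
    folder_path="circuits/",
    path_in_repo="circuits",
)
# The URL to submit would then be:
# https://huggingface.co/your-username/my-mib-circuits/tree/main/circuits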
src/display/utils.py
CHANGED
@@ -192,10 +192,9 @@ AutoEvalColumn_mib_causalgraph = make_dataclass(
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
 class EvalQueueColumn:  # Queue column
-    …
-    …
+    method_name = ColumnContent("method_name", "str", True)
+    repo_id = ColumnContent("hf_repo", "markdown", True)
     revision = ColumnContent("revision", "str", True)
-    private = ColumnContent("private", "bool", True)
     status = ColumnContent("status", "str", True)
 
 ## All the model information that we might need
src/leaderboard/read_evals.py
CHANGED
@@ -18,11 +18,11 @@ import pandas as pd
 
 
 
-def compute_area(edge_counts, faithfulnesses…
+def compute_area(edge_counts, faithfulnesses):
     # Return None if either list is empty
     if not edge_counts or not faithfulnesses:
         return None, None, None
-    …
+
     percentages = [e / max(edge_counts) for e in edge_counts]
     area_under = 0.
     area_from_100 = 0.
@@ -72,13 +72,23 @@ class EvalResult_MIB_SUBGRAPH:
         # Keep exact scores structure from JSON
         scores = model_result.get("scores", {})
 
-        # for task in ["ioi", "mcqa", "arithmetic_addition", "arithmetic_subtraction", "arc_easy", "arc_challenge"]:
         for task in TasksMib_Subgraph.get_all_tasks():
             if task in scores:
-…
+                if "CPR" in scores[task]:
+                    results[task][model_name] = {"CPR": {}, "CMD": {}}
+                    results[task][model_name]["CPR"] = {
+                        "edge_counts": scores[task]["CPR"]["edge_counts"],
+                        "faithfulness": scores[task]["CPR"]["faithfulness"]
+                    }
+                    results[task][model_name]["CMD"] = {
+                        "edge_counts": scores[task]["CMD"]["edge_counts"],
+                        "faithfulness": scores[task]["CMD"]["faithfulness"]
+                    }
+                else:
+                    results[task][model_name] = {
+                        "edge_counts": scores[task]["edge_counts"],
+                        "faithfulness": scores[task]["faithfulness"]
+                    }
 
         return EvalResult_MIB_SUBGRAPH(
             eval_name=method_name,
@@ -100,7 +110,7 @@ class EvalResult_MIB_SUBGRAPH:
             df_transformed.loc[i] = row.apply(lambda x: self._sigmoid(x) if isinstance(x, (float, int)) else x)
         return df_transformed
 
-    def to_dict(self, metric_type="…
+    def to_dict(self, metric_type="CPR"):
         """Converts the Eval Result to a dict for dataframe display"""
         data_dict = {
             "eval_name": self.eval_name,
@@ -122,9 +132,15 @@ class EvalResult_MIB_SUBGRAPH:
             for model, metrics in task_results.items():
                 col_name = f"{task}_{model}"
 
-                if not metrics…
+                if not metrics:
                     continue
-…
+
+                if not metrics[metric_type] and (not metrics["edge_counts"] or not metrics["faithfulness"]):
+                    continue
+
+                if metric_type in metrics:
+                    metrics = metrics[metric_type]
+
                 faithfulness = metrics["faithfulness"]
                 if isinstance(faithfulness[0], list):
                     faithfulness = faithfulness[0]
@@ -134,7 +150,7 @@ class EvalResult_MIB_SUBGRAPH:
                     continue
 
                 area_under, area_from_100, _ = result
-                score = area_under if metric_type == "…
+                score = area_under if metric_type == "CPR" else area_from_100
                 data_dict[col_name] = round(score, 2)
                 all_scores.append(score)
                 transformed_scores.append(self._sigmoid(score))
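compute_area's body is only partly visible in this diff; what is visible is the empty-input guard, the normalization by max(edge_counts), and that its first two return values feed CPR and CMD respectively. A sketch of a trapezoidal version consistent with those pieces (the integration scheme and the exact meaning of area_from_100 are assumptions, not taken from the repo):

def compute_area(edge_counts, faithfulnesses):
    """Sketch: integrate faithfulness against the fraction of edges kept."""
    if not edge_counts or not faithfulnesses:
        return None, None, None
    percentages = [e / max(edge_counts) for e in edge_counts]
    area_under = 0.0     # used when metric_type == "CPR"
    area_from_100 = 0.0  # used when metric_type == "CMD"
    for i in range(1, len(percentages)):
        width = percentages[i] - percentages[i - 1]
        mid = (faithfulnesses[i] + faithfulnesses[i - 1]) / 2
        area_under += width * mid              # area under the faithfulness curve
        area_from_100 += width * abs(1 - mid)  # area between the curve and faithfulness = 1
    return area_under, area_from_100, percentages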
src/populate.py
CHANGED
@@ -7,9 +7,10 @@ from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results, get_raw_eval_results_mib_subgraph, get_raw_eval_results_mib_causalgraph
 from src.about import TasksMib_Causalgraph
+from src.submission.check_validity import parse_huggingface_url
 
 def get_leaderboard_df_mib_subgraph(results_path: str, cols: list, benchmark_cols: list,
-                                    metric_type = "…
+                                    metric_type = "CPR") -> pd.DataFrame:
     """Creates a dataframe from all the MIB experiment results"""
     # print(f"results_path is {results_path}, requests_path is {requests_path}")
     raw_data = get_raw_eval_results_mib_subgraph(results_path)
@@ -19,7 +20,7 @@ def get_leaderboard_df_mib_subgraph(results_path: str, cols: list, benchmark_col
 
     # Convert to dataframe
     df = pd.DataFrame.from_records(all_data_json)
-    ascending = False if metric_type == "…
+    ascending = False if metric_type == "CPR" else True
 
     # Sort by Average score descending
     if 'Average' in df.columns:
@@ -170,10 +171,12 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
         # data[EvalQueueColumn.revision.name] = data.get("revision", "main")
         # all_evals.append(data)
 
-    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "…
-…
+    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "PREVALIDATION"]]
+    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL" or e["status"] == "FAILED"]
+    for list in (pending_list, finished_list):
+        for item in list:
+            item["track"] = "Circuit Localization"
+            item["hf_repo"] = parse_huggingface_url(item["hf_repo"])[0]
     df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
-    df_running = pd.DataFrame.from_records(running_list, columns=cols)
     df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
-    return df_finished[cols], …
+    return df_finished[cols], df_pending[cols]
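The new status filter above matches the queue lifecycle described in src/about.py (PREVALIDATION, then PENDING, then FINISHED or FAILED), and the hf_repo column is collapsed to a bare repo_id for display. A small illustration; every field except status and hf_repo is illustrative:

entry = {
    "status": "PREVALIDATION",  # set when a submission first enters the queue
    "hf_repo": "https://huggingface.co/mib-bench/mib-circuits-example/tree/main/importances/json",
    "method_name": "my-method",  # illustrative
}
is_pending = entry["status"] in ["PENDING", "PREVALIDATION"]
is_finished = (entry["status"].startswith("FINISHED")
               or entry["status"] in ("PENDING_NEW_EVAL", "FAILED"))
# parse_huggingface_url(entry["hf_repo"])[0] -> "mib-bench/mib-circuits-example"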
src/submission/check_validity.py
CHANGED
@@ -238,23 +238,26 @@ def parse_huggingface_url(url: str):
 
     parsed = urlparse(url)
     path_parts = parsed.path.strip("/").split("/")
+    revision = "main"
 
     # Extract repo_id (username/repo_name)
     if len(path_parts) < 2:
-        …
-        …
+        return None, None, None  # Can't extract repo_id
+    else:
+        repo_id = f"{path_parts[0]}/{path_parts[1]}"
 
     # Extract folder path (if in /tree/ or /blob/)
     if "tree" in path_parts or "blob" in path_parts:
         try:
             branch_idx = path_parts.index("tree") if "tree" in path_parts else path_parts.index("blob")
             folder_path = "/".join(path_parts[branch_idx + 2:])  # Skip "tree/main" or "blob/main"
+            revision = path_parts[branch_idx + 1]
         except (ValueError, IndexError):
             folder_path = None
     else:
         folder_path = None
 
-    return repo_id, folder_path
+    return repo_id, folder_path, revision
 
 
 def validate_directory(fs: HfFileSystem, repo_id: str, dirname: str, curr_tm: str, circuit_level:Literal['edge', 'node','neuron']='edge'):
@@ -318,10 +321,11 @@ def verify_circuit_submission(hf_repo, level, progress=gr.Progress()):
     path = hf_repo
     level = level
 
-    folder_path = …
-…
+    repo_id, folder_path, revision = parse_huggingface_url(hf_repo)
+
+    folder_path = repo_id + "/" + folder_path
     try:
-        files = fs.listdir(folder_path)
+        files = fs.listdir(folder_path, revision=revision)
     except Exception as e:
         errors.append(f"Could not open Huggingface URL: {e}")
         return errors, warnings
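For reference, the URL handling this commit centralizes in parse_huggingface_url can be sketched as a standalone function. This is an approximation of the hunk above; the fallback behavior for URLs without /tree/ or /blob/ is inferred from the visible lines rather than copied from the repo:

from urllib.parse import urlparse

def parse_huggingface_url(url: str):
    """Return (repo_id, folder_path, revision) for a huggingface.co URL; sketch of the logic above."""
    parts = urlparse(url).path.strip("/").split("/")
    revision = "main"
    if len(parts) < 2:
        return None, None, None  # cannot extract username/repo_name
    repo_id = f"{parts[0]}/{parts[1]}"
    folder_path = None
    if "tree" in parts or "blob" in parts:
        idx = parts.index("tree") if "tree" in parts else parts.index("blob")
        try:
            revision = parts[idx + 1]                        # branch or commit after /tree/ or /blob/
            folder_path = "/".join(parts[idx + 2:]) or None  # remainder is the subfolder
        except IndexError:
            pass
    return repo_id, folder_path, revision

# https://huggingface.co/mib-bench/mib-circuits-example/tree/main/importances/json
#   -> ("mib-bench/mib-circuits-example", "importances/json", "main")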
src/submission/submit.py
CHANGED
@@ -20,15 +20,17 @@ USERS_TO_SUBMISSION_DATES = None
 def upload_to_queue(track, hf_repo_circ, hf_repo_cg, level, layer, token_position, code_upload, method_name, contact_email, _id):
     errors = []
     hf_repo = hf_repo_circ if "Circuit" in track else hf_repo_cg
+    repo_id, folder_path, revision = parse_huggingface_url(hf_repo)
+
     try:
-        …
-        user_name, repo_name = repo_info.split("/")[:2]
+        user_name, repo_name = repo_id.split("/")
     except Exception as e:
         errors.append("Error processing HF URL: could not get username and repo name")
-…
+    if revision is None or revision == "main":
+        try:
+            commit_hash = API.list_repo_commits(repo_id)[0].commit_id
+        except Exception as e:
+            errors.append("Could not get commit hash of provided Huggingface repo")
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
 
     if not errors:
@@ -84,7 +86,7 @@ def upload_to_queue(track, hf_repo_circ, hf_repo_cg, level, layer, token_positio
     if errors:
         status = gr.Textbox("\n".join(f"❌ {e}" for e in errors), visible=True)
     else:
-        status = gr.Textbox(f"✅ Submission received! Your ID is \"{_id}\". …
+        status = gr.Textbox(f"✅ Submission received! Your submission ID is \"{_id}\". Save this so that you can manage your submission on the queue.", visible=True)
     return [
         status,
         None, None,