Aaron Mueller committed on
Commit
5ed4bca
·
1 Parent(s): 3b802b7

update HF url handling

app.py CHANGED
@@ -17,7 +17,7 @@ from copy import deepcopy
17
  from src.about import (
18
  CITATION_BUTTON_LABEL,
19
  CITATION_BUTTON_TEXT,
20
- EVALUATION_QUEUE_TEXT,
21
  INTRODUCTION_TEXT,
22
  LLM_BENCHMARKS_TEXT,
23
  TITLE,
@@ -38,7 +38,7 @@ from src.display.utils import (
38
  from src.envs import API, EVAL_REQUESTS_SUBGRAPH, EVAL_REQUESTS_CAUSALGRAPH, QUEUE_REPO_SUBGRAPH, QUEUE_REPO_CAUSALGRAPH, REPO_ID, TOKEN, RESULTS_REPO_MIB_SUBGRAPH, EVAL_RESULTS_MIB_SUBGRAPH_PATH, RESULTS_REPO_MIB_CAUSALGRAPH, EVAL_RESULTS_MIB_CAUSALGRAPH_PATH
39
  from src.populate import get_evaluation_queue_df, get_leaderboard_df_mib_subgraph, get_leaderboard_df_mib_causalgraph
40
  from src.submission.submit import upload_to_queue, remove_submission
41
- from src.submission.check_validity import verify_circuit_submission, verify_causal_variable_submission, check_rate_limit
42
 
43
  from src.about import TasksMib_Subgraph, TasksMib_Causalgraph
44
 
@@ -288,7 +288,7 @@ def _sigmoid(x):
288
 
289
  LEADERBOARD_DF_MIB_SUBGRAPH_FPL = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, COLS_MIB_SUBGRAPH, BENCHMARK_COLS_MIB_SUBGRAPH)
290
  LEADERBOARD_DF_MIB_SUBGRAPH_FEQ = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, COLS_MIB_SUBGRAPH, BENCHMARK_COLS_MIB_SUBGRAPH,
291
- metric_type="F=")
292
 
293
  # LEADERBOARD_DF_MIB_CAUSALGRAPH = get_leaderboard_df_mib_causalgraph(EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_CAUSALGRAPH, BENCHMARK_COLS_MIB_CAUSALGRAPH)
294
  # In app.py, modify the LEADERBOARD initialization
@@ -300,14 +300,15 @@ LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AGGREGAT
300
  # LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
301
  # LEADERBOARD_DF_MULTIMODAL = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS_MULTIMODAL, BENCHMARK_COLS_MULTIMODAL)
302
 
303
- # (
304
- # finished_eval_queue_df,
305
- # running_eval_queue_df,
306
- # pending_eval_queue_df,
307
- # ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
308
-
309
-
310
311
 
312
 
313
  def init_leaderboard_mib_subgraph(dataframe, track):
@@ -577,7 +578,7 @@ with demo:
577
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
578
  with gr.TabItem("Circuit Localization", elem_id="subgraph", id=0):
579
  with gr.Tabs() as subgraph_tabs:
580
- with gr.TabItem("F+", id=0):
581
  # Add description for filters
582
  gr.Markdown("""
583
  ### Filtering Options
@@ -610,7 +611,7 @@ with demo:
610
  outputs=leaderboard
611
  )
612
  print(f"Leaderboard is {leaderboard}")
613
- with gr.TabItem("F=", id=1):
614
  # Add description for filters
615
  gr.Markdown("""
616
  ### Filtering Options
@@ -690,9 +691,7 @@ with demo:
690
  "Causal Graph"
691
  )
692
 
693
- with gr.TabItem("Submit", elem_id="llm-benchmark-tab-table", id=2):
694
- gr.Markdown("## 🏆 Submission Portal")
695
-
696
  # Track selection
697
  track = gr.Radio(
698
  choices=[
@@ -704,28 +703,30 @@ with demo:
704
  )
705
 
706
  with gr.Group(visible=False) as circuit_ui:
707
- gr.Markdown("### Circuit Localization Requirements")
708
- with gr.Row():
709
- hf_repo_circ = gr.Textbox(
710
- label="HuggingFace Repository URL",
711
- placeholder="https://huggingface.co/username/repo/path",
712
- info="Must be a valid HuggingFace URL pointing to folders containing either 1 importance score file per task/model, or " \
713
- "9 circuit files per task/model (.json or .pt). " \
714
- "Remove 'tree', 'resolve', and the branch name (e.g., '/tree/main/') from URL if present."
715
- )
716
- level = gr.Radio(
717
- choices=[
718
- "Edge",
719
- "Node (submodule)",
720
- "Node (neuron)"
721
- ],
722
- label="Level of granularity",
723
- info="Is your circuit defined by its inclusion/exclusion of certain edges (e.g., MLP1 to H10L12), of certain submodules (e.g., MLP1), or of neurons " \
724
- "within those submodules (e.g., MLP1 neuron 295)?"
725
- )
 
 
726
 
727
  with gr.Group(visible=False) as causal_ui:
728
- gr.Markdown("### Causal Variable Localization Requirements")
729
  with gr.Row():
730
  layer = gr.Number(
731
  label="Layer Number",
@@ -743,9 +744,7 @@ with demo:
743
  hf_repo_cg = gr.Textbox(
744
  label="HuggingFace Repository URL",
745
  placeholder="https://huggingface.co/username/repo/path",
746
- info="Must be a valid HuggingFace URL pointing to a file containing the trained featurizer (.pt). " \
747
- "Remove 'tree', 'resolve', and the branch name (e.g., '/tree/main/') from URL if present."
748
- )
749
  code_upload = gr.File(
750
  label="Upload Python file implementing your featurization function",
751
  file_types=[".py"],
@@ -791,12 +790,12 @@ with demo:
791
  errors.append(f"Invalid HuggingFace URL - must start with https://huggingface.co/")
792
  breaking_error = True
793
  else:
794
- repo_info = hf_repo.split("huggingface.co/")[1]
795
- if len(repo_info.split("/")) < 2:
796
  errors.append("Could not read username or repo name from HF URL")
797
  breaking_error = True
798
  else:
799
- user_name, repo_name = repo_info.split("/")[:2]
800
  under_rate_limit, time_left = check_rate_limit(track, user_name, contact_email)
801
  if not under_rate_limit:
802
  errors.append(f"Rate limit exceeded (max 2 submissions per week). Please try again in {time_left}. " \
@@ -841,8 +840,8 @@ with demo:
841
  with warning_modal:
842
  gr.Markdown("### ⚠️ Submission Warnings")
843
  warning_display = gr.Markdown()
844
- proceed_btn = gr.Button("Proceed Anyway", variant="primary")
845
- cancel_btn = gr.Button("Cancel Submission", variant="secondary")
846
 
847
  # Store submission data between callbacks
848
  pending_submission = gr.State()
@@ -865,6 +864,31 @@ with demo:
865
  outputs=[status, warning_display, pending_submission, warning_modal]
866
  )
867
 
 
 
868
  with gr.Group():
869
  gr.Markdown("### Remove Submission from Queue")
870
  with gr.Row():
 
17
  from src.about import (
18
  CITATION_BUTTON_LABEL,
19
  CITATION_BUTTON_TEXT,
20
+ EVALUATION_QUEUE_TEXT_SUBGRAPH, EVALUATION_QUEUE_TEXT_CAUSALVARIABLE,
21
  INTRODUCTION_TEXT,
22
  LLM_BENCHMARKS_TEXT,
23
  TITLE,
 
38
  from src.envs import API, EVAL_REQUESTS_SUBGRAPH, EVAL_REQUESTS_CAUSALGRAPH, QUEUE_REPO_SUBGRAPH, QUEUE_REPO_CAUSALGRAPH, REPO_ID, TOKEN, RESULTS_REPO_MIB_SUBGRAPH, EVAL_RESULTS_MIB_SUBGRAPH_PATH, RESULTS_REPO_MIB_CAUSALGRAPH, EVAL_RESULTS_MIB_CAUSALGRAPH_PATH
39
  from src.populate import get_evaluation_queue_df, get_leaderboard_df_mib_subgraph, get_leaderboard_df_mib_causalgraph
40
  from src.submission.submit import upload_to_queue, remove_submission
41
+ from src.submission.check_validity import verify_circuit_submission, verify_causal_variable_submission, check_rate_limit, parse_huggingface_url
42
 
43
  from src.about import TasksMib_Subgraph, TasksMib_Causalgraph
44
 
 
288
 
289
  LEADERBOARD_DF_MIB_SUBGRAPH_FPL = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, COLS_MIB_SUBGRAPH, BENCHMARK_COLS_MIB_SUBGRAPH)
290
  LEADERBOARD_DF_MIB_SUBGRAPH_FEQ = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, COLS_MIB_SUBGRAPH, BENCHMARK_COLS_MIB_SUBGRAPH,
291
+ metric_type="CMD")
292
 
293
  # LEADERBOARD_DF_MIB_CAUSALGRAPH = get_leaderboard_df_mib_causalgraph(EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_CAUSALGRAPH, BENCHMARK_COLS_MIB_CAUSALGRAPH)
294
  # In app.py, modify the LEADERBOARD initialization
 
300
  # LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
301
  # LEADERBOARD_DF_MULTIMODAL = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS_MULTIMODAL, BENCHMARK_COLS_MULTIMODAL)
302
 
303
+ (
304
+ finished_eval_queue_df_subgraph,
305
+ pending_eval_queue_df_subgraph,
306
+ ) = get_evaluation_queue_df(EVAL_REQUESTS_SUBGRAPH, EVAL_COLS)
 
 
 
307
 
308
+ # (
309
+ # finished_eval_queue_df_causalvariable,
310
+ # pending_eval_queue_df_causalvariable,
311
+ # ) = get_evaluation_queue_df(EVAL_REQUESTS_CAUSALGRAPH, EVAL_COLS)
312
 
313
 
314
  def init_leaderboard_mib_subgraph(dataframe, track):
 
578
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
579
  with gr.TabItem("Circuit Localization", elem_id="subgraph", id=0):
580
  with gr.Tabs() as subgraph_tabs:
581
+ with gr.TabItem("CPR", id=0):
582
  # Add description for filters
583
  gr.Markdown("""
584
  ### Filtering Options
 
611
  outputs=leaderboard
612
  )
613
  print(f"Leaderboard is {leaderboard}")
614
+ with gr.TabItem("CMD", id=1):
615
  # Add description for filters
616
  gr.Markdown("""
617
  ### Filtering Options
 
691
  "Causal Graph"
692
  )
693
 
694
+ with gr.TabItem("Submit", elem_id="llm-benchmark-tab-table", id=2):
 
 
695
  # Track selection
696
  track = gr.Radio(
697
  choices=[
 
703
  )
704
 
705
  with gr.Group(visible=False) as circuit_ui:
706
+ with gr.Column():
707
+ with gr.Row():
708
+ gr.Markdown(EVALUATION_QUEUE_TEXT_SUBGRAPH, elem_classes="markdown-text")
709
+
710
+ with gr.Row():
711
+ hf_repo_circ = gr.Textbox(
712
+ label="HuggingFace Repository URL",
713
+ placeholder="https://huggingface.co/username/repo/path",
714
+ info="Must be a valid HuggingFace URL pointing to folders containing either 1 importance score file per task/model, or " \
715
+ "9 circuit files per task/model (.json or .pt)."
716
+ )
717
+ level = gr.Radio(
718
+ choices=[
719
+ "Edge",
720
+ "Node (submodule)",
721
+ "Node (neuron)"
722
+ ],
723
+ label="Level of granularity",
724
+ info="Is your circuit defined by its inclusion/exclusion of certain edges (e.g., MLP1 to H10L12), of certain submodules (e.g., MLP1), or of neurons " \
725
+ "within those submodules (e.g., MLP1 neuron 295)?"
726
+ )
727
 
728
  with gr.Group(visible=False) as causal_ui:
729
+ gr.Markdown(EVALUATION_QUEUE_TEXT_CAUSALVARIABLE, elem_classes="markdown-text")
730
  with gr.Row():
731
  layer = gr.Number(
732
  label="Layer Number",
 
744
  hf_repo_cg = gr.Textbox(
745
  label="HuggingFace Repository URL",
746
  placeholder="https://huggingface.co/username/repo/path",
747
+ info="Must be a valid HuggingFace URL pointing to a file containing the trained featurizer (.pt). " )
 
 
748
  code_upload = gr.File(
749
  label="Upload Python file implementing your featurization function",
750
  file_types=[".py"],
 
790
  errors.append(f"Invalid HuggingFace URL - must start with https://huggingface.co/")
791
  breaking_error = True
792
  else:
793
+ repo_id, subfolder, revision = parse_huggingface_url(hf_repo)
794
+ if repo_id is None:
795
  errors.append("Could not read username or repo name from HF URL")
796
  breaking_error = True
797
  else:
798
+ user_name, repo_name = repo_id.split("/")
799
  under_rate_limit, time_left = check_rate_limit(track, user_name, contact_email)
800
  if not under_rate_limit:
801
  errors.append(f"Rate limit exceeded (max 2 submissions per week). Please try again in {time_left}. " \
 
840
  with warning_modal:
841
  gr.Markdown("### ⚠️ Submission Warnings")
842
  warning_display = gr.Markdown()
843
+ proceed_btn = gr.Button("Proceed Anyway", variant="secondary")
844
+ cancel_btn = gr.Button("Cancel Submission", variant="primary")
845
 
846
  # Store submission data between callbacks
847
  pending_submission = gr.State()
 
864
  outputs=[status, warning_display, pending_submission, warning_modal]
865
  )
866
 
867
+ with gr.Column():
868
+ with gr.Accordion(
869
+ f"✅ Finished Evaluations ({len(finished_eval_queue_df_subgraph)})",
870
+ open=False,
871
+ ):
872
+ with gr.Row():
873
+ finished_eval_table = gr.components.Dataframe(
874
+ value=finished_eval_queue_df_subgraph,
875
+ headers=EVAL_COLS,
876
+ datatype=EVAL_TYPES,
877
+ row_count=5,
878
+ )
879
+
880
+ with gr.Accordion(
881
+ f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df_subgraph)})",
882
+ open=False,
883
+ ):
884
+ with gr.Row():
885
+ pending_eval_table = gr.components.Dataframe(
886
+ value=pending_eval_queue_df_subgraph,
887
+ headers=EVAL_COLS,
888
+ datatype=EVAL_TYPES,
889
+ row_count=5,
890
+ )
891
+
892
  with gr.Group():
893
  gr.Markdown("### Remove Submission from Queue")
894
  with gr.Row():
src/about.py CHANGED
@@ -105,7 +105,7 @@ TITLE = """<h1 align="center" id="space-title"> Mechanistic Interpretability Ben
105
 
106
  # What does your leaderboard evaluate?
107
  INTRODUCTION_TEXT = """
108
- The leaderboards for each track of the 2024 Mechanistic Interpretability Benchmark.
109
  """
110
 
111
  # Which evaluations are you running? how can people reproduce what you have?
@@ -113,24 +113,52 @@ LLM_BENCHMARKS_TEXT = f"""
113
  This leaderboard displays scores on the private test set for the Mechanistic Interpretability Benchmark. Each track has its own tab.
114
  """
115
 
116
- EVALUATION_QUEUE_TEXT = """
117
- ## Circuit localization track:
118
 
 
119
  You'll need either (i) 1 circuit per task/model combination with floating-point importance scores for each edge or node, or (ii) 9 circuits per model/task with binary membership scores for each edge or node.
120
- If (ii), then for each critical threshold k, the circuit should contain no more than k% of edges. See [here]() for examples of each valid circuit format.
 
 
121
 
122
- Create a folder in a HuggingFace repository to hold your circuits. At the URL you provide, there should be one folder per task/model combination; these folders
123
- should contain your circuit(s). As long as the folders contain the model and task names, you do not need to worry about the circuit filenames.
124
- If you provide more circuits than needed, our evaluation script will take the first 9 lexicographically.
125
 
126
- For specifications about the file format for a circuit, see the README on our project GitHub: TODO
 
 
 
 
 
127
 
128
- Once your submission has been validated and makes it to the front of the evaluation queue, we'll submit your model for evaluation on the private test set.
 
 
 
129
 
130
- ## Causal variable localization track:
 
 
 
131
 
132
- You'll need to provide a link to a HuggingFace repository containing your trained featurizer, the layer on which the featurizer was trained, and the code needed to load and run your featurizer.
133
- See TODO for an example.
134
  """
135
 
136
  CITATION_BUTTON_LABEL = "If you would like to cite these results, please cite the MIB paper, as well as the author(s) of the method(s) whose results you cite!"
 
105
 
106
  # What does your leaderboard evaluate?
107
  INTRODUCTION_TEXT = """
108
+ The leaderboards for each track of the Mechanistic Interpretability Benchmark.
109
  """
110
 
111
  # Which evaluations are you running? how can people reproduce what you have?
 
113
  This leaderboard displays scores on the private test set for the Mechanistic Interpretability Benchmark. Each track has its own tab.
114
  """
115
 
116
+ EVALUATION_QUEUE_TEXT_SUBGRAPH = """
117
+ ## Circuit localization track
118
 
119
+ ### 1. Collect your circuits
120
  You'll need either (i) 1 circuit per task/model combination with floating-point importance scores for each edge or node, or (ii) 9 circuits per model/task with binary membership scores for each edge or node.
121
+ For specifications about the file formats we accept, see the README on [our project GitHub](https://github.com/hannamw/MIB-subgraph-track).
122
+
123
+ ### 2. Upload your circuits
124
+ Create a HuggingFace repository, and create a folder in that repository that will hold all of your circuit folders.
125
+ At the URL you provide, there should be one folder per task/model combination; these folders
126
+ should contain your circuit(s). As long as the folder names contain the model and task names, you do not need to worry about the circuit filenames.
127
+ If you provide more circuits than needed, our evaluation script will take the first 9 lexicographically in a given folder. We provide examples of valid
128
+ submissions: see [here](https://huggingface.co/mib-bench/mib-circuits-example/tree/main/importances/json) for a submission using importance scores and
129
+ [here](https://huggingface.co/mib-bench/mib-circuits-example/tree/main/multiple_circuits/pt) for a submission uploading multiple circuits.
130
+
131
+ ### 3. Manage your submission in the queue
132
+ If your submission passes all checks, it will be added to the queue. You will receive a submission ID when this happens; be sure to save it!
133
+ This will allow you to remove your submission from the queue (e.g., if you find a bug in your circuits). This will prevent you from needing to wait until
134
+ next week to resubmit.
135
+
136
+ Before your submission has been validated by our backend, it will have the "PREVALIDATION" status in the queue. Once it has been validated, it will have the "PENDING" status.
137
+ It will keep the PENDING status until it has been run on the private test set.
138
+ """
139
 
140
+ EVALUATION_QUEUE_TEXT_CAUSALVARIABLE = """
141
+ ## Causal variable localization track
 
142
 
143
+ ### 1. Collect your materials
144
+ You'll need the following:
145
+ * A trained featurizer saved as a .pt object.
146
+ * A python function that can load and run forward passes with your featurizer.
147
+ * A dynamic token alignment function.
148
+ * A hypothesized feature location.
149
 
150
+ ### 2. Upload your materials
151
+ Create a HuggingFace repository, and create a folder in that repository that will hold all of your materials.
152
+ At the URL you provide, each of the above materials should be present. We will take the first Python script lexicographically
153
+ as your featurizer function, and the first .pt file lexicographically as your featurizer.
154
 
155
+ ### 3. Manage your submission in the queue
156
+ If your submission passes all checks, it will be added to the queue. You will receive a submission ID here when you do this; be sure to save it!
157
+ This will allow you to remove your submission from the queue (e.g., if you find a bug in your circuits). This will prevent you from needing to wait until
158
+ next week to resubmit.
159
 
160
+ Before your submission has been validated by our backend, it will have the "PREVALIDATION" status in the queue. Once it has been validated, it will have the "PENDING" status.
161
+ It will keep the PENDING status until it has been run on the private test set.
162
  """
163
 
164
  CITATION_BUTTON_LABEL = "If you would like to cite these results, please cite the MIB paper, as well as the author(s) of the method(s) whose results you cite!"
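For readers who want to see what a valid circuit-submission layout looks like, the example repository linked in the new EVALUATION_QUEUE_TEXT_SUBGRAPH can be listed programmatically. This is only an illustrative sketch, assuming the public mib-bench/mib-circuits-example repo keeps the layout referenced above and that huggingface_hub is installed:

```python
# Peek at the example circuit submission referenced in the instructions above.
# Illustrative only: your own folder names just need to contain the task and
# model names; circuit filenames are flexible.
from huggingface_hub import HfFileSystem

fs = HfFileSystem()
# One subfolder per task/model combination, each holding the circuit file(s).
for path in fs.ls("mib-bench/mib-circuits-example/importances/json", detail=False):
    print(path)
```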
src/display/utils.py CHANGED
@@ -192,10 +192,9 @@ AutoEvalColumn_mib_causalgraph = make_dataclass(
192
  ## For the queue columns in the submission tab
193
  @dataclass(frozen=True)
194
  class EvalQueueColumn: # Queue column
195
- model = ColumnContent("model", "markdown", True)
196
- track = ColumnContent("track", "str", True)
197
  revision = ColumnContent("revision", "str", True)
198
- private = ColumnContent("private", "bool", True)
199
  status = ColumnContent("status", "str", True)
200
 
201
  ## All the model information that we might need
 
192
  ## For the queue columns in the submission tab
193
  @dataclass(frozen=True)
194
  class EvalQueueColumn: # Queue column
195
+ method_name = ColumnContent("method_name", "str", True)
196
+ repo_id = ColumnContent("hf_repo", "markdown", True)
197
  revision = ColumnContent("revision", "str", True)
 
198
  status = ColumnContent("status", "str", True)
199
 
200
  ## All the model information that we might need
src/leaderboard/read_evals.py CHANGED
@@ -18,11 +18,11 @@ import pandas as pd
18
 
19
 
20
 
21
- def compute_area(edge_counts, faithfulnesses, log_scale=True):
22
  # Return None if either list is empty
23
  if not edge_counts or not faithfulnesses:
24
  return None, None, None
25
-
26
  percentages = [e / max(edge_counts) for e in edge_counts]
27
  area_under = 0.
28
  area_from_100 = 0.
@@ -72,13 +72,23 @@ class EvalResult_MIB_SUBGRAPH:
72
  # Keep exact scores structure from JSON
73
  scores = model_result.get("scores", {})
74
 
75
- # for task in ["ioi", "mcqa", "arithmetic_addition", "arithmetic_subtraction", "arc_easy", "arc_challenge"]:
76
  for task in TasksMib_Subgraph.get_all_tasks():
77
  if task in scores:
78
- results[task][model_name] = {
79
- "edge_counts": scores[task]["edge_counts"],
80
- "faithfulness": scores[task]["faithfulness"]
81
- }
 
 
82
 
83
  return EvalResult_MIB_SUBGRAPH(
84
  eval_name=method_name,
@@ -100,7 +110,7 @@ class EvalResult_MIB_SUBGRAPH:
100
  df_transformed.loc[i] = row.apply(lambda x: self._sigmoid(x) if isinstance(x, (float, int)) else x)
101
  return df_transformed
102
 
103
- def to_dict(self, metric_type="F+"):
104
  """Converts the Eval Result to a dict for dataframe display"""
105
  data_dict = {
106
  "eval_name": self.eval_name,
@@ -122,9 +132,15 @@ class EvalResult_MIB_SUBGRAPH:
122
  for model, metrics in task_results.items():
123
  col_name = f"{task}_{model}"
124
 
125
- if not metrics or not metrics["edge_counts"] or not metrics["faithfulness"]:
126
  continue
127
-
 
 
 
 
 
 
128
  faithfulness = metrics["faithfulness"]
129
  if isinstance(faithfulness[0], list):
130
  faithfulness = faithfulness[0]
@@ -134,7 +150,7 @@ class EvalResult_MIB_SUBGRAPH:
134
  continue
135
 
136
  area_under, area_from_100, _ = result
137
- score = area_under if metric_type == "F+" else area_from_100
138
  data_dict[col_name] = round(score, 2)
139
  all_scores.append(score)
140
  transformed_scores.append(self._sigmoid(score))
 
18
 
19
 
20
 
21
+ def compute_area(edge_counts, faithfulnesses):
22
  # Return None if either list is empty
23
  if not edge_counts or not faithfulnesses:
24
  return None, None, None
25
+
26
  percentages = [e / max(edge_counts) for e in edge_counts]
27
  area_under = 0.
28
  area_from_100 = 0.
 
72
  # Keep exact scores structure from JSON
73
  scores = model_result.get("scores", {})
74
 
 
75
  for task in TasksMib_Subgraph.get_all_tasks():
76
  if task in scores:
77
+ if "CPR" in scores[task]:
78
+ results[task][model_name] = {"CPR": {}, "CMD": {}}
79
+ results[task][model_name]["CPR"] = {
80
+ "edge_counts": scores[task]["CPR"]["edge_counts"],
81
+ "faithfulness": scores[task]["CPR"]["faithfulness"]
82
+ }
83
+ results[task][model_name]["CMD"] = {
84
+ "edge_counts": scores[task]["CMD"]["edge_counts"],
85
+ "faithfulness": scores[task]["CMD"]["faithfulness"]
86
+ }
87
+ else:
88
+ results[task][model_name] = {
89
+ "edge_counts": scores[task]["edge_counts"],
90
+ "faithfulness": scores[task]["faithfulness"]
91
+ }
92
 
93
  return EvalResult_MIB_SUBGRAPH(
94
  eval_name=method_name,
 
110
  df_transformed.loc[i] = row.apply(lambda x: self._sigmoid(x) if isinstance(x, (float, int)) else x)
111
  return df_transformed
112
 
113
+ def to_dict(self, metric_type="CPR"):
114
  """Converts the Eval Result to a dict for dataframe display"""
115
  data_dict = {
116
  "eval_name": self.eval_name,
 
132
  for model, metrics in task_results.items():
133
  col_name = f"{task}_{model}"
134
 
135
+ if not metrics:
136
  continue
137
+
138
+ if not metrics[metric_type] and (not metrics["edge_counts"] or not metrics["faithfulness"]):
139
+ continue
140
+
141
+ if metric_type in metrics:
142
+ metrics = metrics[metric_type]
143
+
144
  faithfulness = metrics["faithfulness"]
145
  if isinstance(faithfulness[0], list):
146
  faithfulness = faithfulness[0]
 
150
  continue
151
 
152
  area_under, area_from_100, _ = result
153
+ score = area_under if metric_type == "CPR" else area_from_100
154
  data_dict[col_name] = round(score, 2)
155
  all_scores.append(score)
156
  transformed_scores.append(self._sigmoid(score))
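The updated from_dict accepts two result layouts: the legacy flat per-task entry and the new entry keyed by metric. A minimal sketch of both shapes, with hypothetical numbers, using the field names from the diff:

```python
# Hypothetical examples of the per-task "scores" entries handled above.
legacy_scores = {
    "ioi": {  # old layout: one faithfulness curve per task/model
        "edge_counts": [100, 1000, 10000],
        "faithfulness": [0.41, 0.78, 0.95],
    }
}

nested_scores = {
    "ioi": {  # new layout: separate CPR and CMD curves
        "CPR": {"edge_counts": [100, 1000, 10000], "faithfulness": [0.41, 0.78, 0.95]},
        "CMD": {"edge_counts": [100, 1000, 10000], "faithfulness": [0.12, 0.36, 0.88]},
    }
}

# to_dict(metric_type="CPR") reports the area under the curve,
# while metric_type="CMD" reports the area measured from 100%.
```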
src/populate.py CHANGED
@@ -7,9 +7,10 @@ from src.display.formatting import has_no_nan_values, make_clickable_model
7
  from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal, EvalQueueColumn
8
  from src.leaderboard.read_evals import get_raw_eval_results, get_raw_eval_results_mib_subgraph, get_raw_eval_results_mib_causalgraph
9
  from src.about import TasksMib_Causalgraph
 
10
 
11
  def get_leaderboard_df_mib_subgraph(results_path: str, cols: list, benchmark_cols: list,
12
- metric_type = "F+") -> pd.DataFrame:
13
  """Creates a dataframe from all the MIB experiment results"""
14
  # print(f"results_path is {results_path}, requests_path is {requests_path}")
15
  raw_data = get_raw_eval_results_mib_subgraph(results_path)
@@ -19,7 +20,7 @@ def get_leaderboard_df_mib_subgraph(results_path: str, cols: list, benchmark_col
19
 
20
  # Convert to dataframe
21
  df = pd.DataFrame.from_records(all_data_json)
22
- ascending = False if metric_type == "F+" else True
23
 
24
  # Sort by Average score descending
25
  if 'Average' in df.columns:
@@ -170,10 +171,12 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
170
  # data[EvalQueueColumn.revision.name] = data.get("revision", "main")
171
  # all_evals.append(data)
172
 
173
- pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN", "PREVALIDATION"]]
174
- running_list = [e for e in all_evals if e["status"] == "RUNNING"]
175
- finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
 
 
 
176
  df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
177
- df_running = pd.DataFrame.from_records(running_list, columns=cols)
178
  df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
179
- return df_finished[cols], df_running[cols], df_pending[cols]
 
7
  from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal, EvalQueueColumn
8
  from src.leaderboard.read_evals import get_raw_eval_results, get_raw_eval_results_mib_subgraph, get_raw_eval_results_mib_causalgraph
9
  from src.about import TasksMib_Causalgraph
10
+ from src.submission.check_validity import parse_huggingface_url
11
 
12
  def get_leaderboard_df_mib_subgraph(results_path: str, cols: list, benchmark_cols: list,
13
+ metric_type = "CPR") -> pd.DataFrame:
14
  """Creates a dataframe from all the MIB experiment results"""
15
  # print(f"results_path is {results_path}, requests_path is {requests_path}")
16
  raw_data = get_raw_eval_results_mib_subgraph(results_path)
 
20
 
21
  # Convert to dataframe
22
  df = pd.DataFrame.from_records(all_data_json)
23
+ ascending = False if metric_type == "CPR" else True
24
 
25
  # Sort by Average score descending
26
  if 'Average' in df.columns:
 
171
  # data[EvalQueueColumn.revision.name] = data.get("revision", "main")
172
  # all_evals.append(data)
173
 
174
+ pending_list = [e for e in all_evals if e["status"] in ["PENDING", "PREVALIDATION"]]
175
+ finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL" or e["status"] == "FAILED"]
176
+ for list in (pending_list, finished_list):
177
+ for item in list:
178
+ item["track"] = "Circuit Localization"
179
+ item["hf_repo"] = parse_huggingface_url(item["hf_repo"])[0]
180
  df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
 
181
  df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
182
+ return df_finished[cols], df_pending[cols]
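A small sketch of the per-row normalization added above, applied to a made-up queue record; it assumes parse_huggingface_url is importable as in the new import line:

```python
from src.submission.check_validity import parse_huggingface_url

# Hypothetical queue entry as stored in the requests repo.
item = {
    "status": "PENDING",
    "hf_repo": "https://huggingface.co/username/repo/tree/main/circuits",
    "revision": "main",
}

# Same enrichment as in get_evaluation_queue_df: label the track and
# collapse the full URL down to its repo_id for display.
item["track"] = "Circuit Localization"
item["hf_repo"] = parse_huggingface_url(item["hf_repo"])[0]  # -> "username/repo"
```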
src/submission/check_validity.py CHANGED
@@ -238,23 +238,26 @@ def parse_huggingface_url(url: str):
238
 
239
  parsed = urlparse(url)
240
  path_parts = parsed.path.strip("/").split("/")
 
241
 
242
  # Extract repo_id (username/repo_name)
243
  if len(path_parts) < 2:
244
- raise ValueError("Invalid Hugging Face URL: Could not extract repo_id.")
245
- repo_id = f"{path_parts[0]}/{path_parts[1]}"
 
246
 
247
  # Extract folder path (if in /tree/ or /blob/)
248
  if "tree" in path_parts or "blob" in path_parts:
249
  try:
250
  branch_idx = path_parts.index("tree") if "tree" in path_parts else path_parts.index("blob")
251
  folder_path = "/".join(path_parts[branch_idx + 2:]) # Skip "tree/main" or "blob/main"
 
252
  except (ValueError, IndexError):
253
  folder_path = None
254
  else:
255
  folder_path = None
256
 
257
- return repo_id, folder_path
258
 
259
 
260
  def validate_directory(fs: HfFileSystem, repo_id: str, dirname: str, curr_tm: str, circuit_level:Literal['edge', 'node','neuron']='edge'):
@@ -318,10 +321,11 @@ def verify_circuit_submission(hf_repo, level, progress=gr.Progress()):
318
  path = hf_repo
319
  level = level
320
 
321
- folder_path = path.split("huggingface.co/")[1]
322
- repo_id = "/".join(folder_path.split("/")[:2])
 
323
  try:
324
- files = fs.listdir(folder_path)
325
  except Exception as e:
326
  errors.append(f"Could not open Huggingface URL: {e}")
327
  return errors, warnings
 
238
 
239
  parsed = urlparse(url)
240
  path_parts = parsed.path.strip("/").split("/")
241
+ revision = "main"
242
 
243
  # Extract repo_id (username/repo_name)
244
  if len(path_parts) < 2:
245
+ return None, None, None # Can't extract repo_id
246
+ else:
247
+ repo_id = f"{path_parts[0]}/{path_parts[1]}"
248
 
249
  # Extract folder path (if in /tree/ or /blob/)
250
  if "tree" in path_parts or "blob" in path_parts:
251
  try:
252
  branch_idx = path_parts.index("tree") if "tree" in path_parts else path_parts.index("blob")
253
  folder_path = "/".join(path_parts[branch_idx + 2:]) # Skip "tree/main" or "blob/main"
254
+ revision = path_parts[branch_idx + 1]
255
  except (ValueError, IndexError):
256
  folder_path = None
257
  else:
258
  folder_path = None
259
 
260
+ return repo_id, folder_path, revision
261
 
262
 
263
  def validate_directory(fs: HfFileSystem, repo_id: str, dirname: str, curr_tm: str, circuit_level:Literal['edge', 'node','neuron']='edge'):
 
321
  path = hf_repo
322
  level = level
323
 
324
+ repo_id, folder_path, revision = parse_huggingface_url(hf_repo)
325
+
326
+ folder_path = repo_id + "/" + folder_path
327
  try:
328
+ files = fs.listdir(folder_path, revision=revision)
329
  except Exception as e:
330
  errors.append(f"Could not open Huggingface URL: {e}")
331
  return errors, warnings
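Based on the parsing logic shown above, parse_huggingface_url now returns a (repo_id, folder_path, revision) triple. The calls below are a sketch with made-up URLs, assuming the function is importable from the Space root; the expected outputs follow the code in this diff:

```python
from src.submission.check_validity import parse_huggingface_url

# Plain repo URL: no subfolder, default branch.
print(parse_huggingface_url("https://huggingface.co/username/repo"))
# ('username/repo', None, 'main')

# /tree/<branch>/<path>: both the subfolder and the branch are recovered,
# so users no longer need to strip '/tree/main/' from their URLs by hand.
print(parse_huggingface_url("https://huggingface.co/username/repo/tree/dev/circuits/ioi"))
# ('username/repo', 'circuits/ioi', 'dev')

# URL without a username/repo pair: nothing can be extracted.
print(parse_huggingface_url("https://huggingface.co/username"))
# (None, None, None)
```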
src/submission/submit.py CHANGED
@@ -20,15 +20,17 @@ USERS_TO_SUBMISSION_DATES = None
20
  def upload_to_queue(track, hf_repo_circ, hf_repo_cg, level, layer, token_position, code_upload, method_name, contact_email, _id):
21
  errors = []
22
  hf_repo = hf_repo_circ if "Circuit" in track else hf_repo_cg
 
 
23
  try:
24
- repo_info = hf_repo.split("huggingface.co/")[1]
25
- user_name, repo_name = repo_info.split("/")[:2]
26
  except Exception as e:
27
  errors.append("Error processing HF URL: could not get username and repo name")
28
- try:
29
- commit_hash = API.list_repo_commits("/".join([user_name, repo_name]))[0].commit_id
30
- except Exception as e:
31
- errors.append("Could not get commit hash of provided Huggingface repo")
 
32
  current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
33
 
34
  if not errors:
@@ -84,7 +86,7 @@ def upload_to_queue(track, hf_repo_circ, hf_repo_cg, level, layer, token_positio
84
  if errors:
85
  status = gr.Textbox("\n".join(f"❌ {e}" for e in errors), visible=True)
86
  else:
87
- status = gr.Textbox(f"✅ Submission received! Your ID is \"{_id}\". You'll receive an email once we've validated your submission.", visible=True)
88
  return [
89
  status,
90
  None, None,
 
20
  def upload_to_queue(track, hf_repo_circ, hf_repo_cg, level, layer, token_position, code_upload, method_name, contact_email, _id):
21
  errors = []
22
  hf_repo = hf_repo_circ if "Circuit" in track else hf_repo_cg
23
+ repo_id, folder_path, revision = parse_huggingface_url(hf_repo)
24
+
25
  try:
26
+ user_name, repo_name = repo_id.split("/")
 
27
  except Exception as e:
28
  errors.append("Error processing HF URL: could not get username and repo name")
29
+ if revision is None or revision == "main":
30
+ try:
31
+ commit_hash = API.list_repo_commits(repo_id)[0].commit_id
32
+ except Exception as e:
33
+ errors.append("Could not get commit hash of provided Huggingface repo")
34
  current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
35
 
36
  if not errors:
 
86
  if errors:
87
  status = gr.Textbox("\n".join(f"❌ {e}" for e in errors), visible=True)
88
  else:
89
+ status = gr.Textbox(f"✅ Submission received! Your submission ID is \"{_id}\". Save this so that you can manage your submission on the queue.", visible=True)
90
  return [
91
  status,
92
  None, None,