Aaron Mueller committed on
Commit 3c343e0 · 1 Parent(s): c57af6c

leaderboard update

app.py CHANGED
@@ -1,5 +1,8 @@
1
  import json
2
  import gzip
3
  import gradio as gr
4
  from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
5
  import pandas as pd
@@ -21,8 +24,6 @@ from src.about import (
21
  )
22
  from src.display.css_html_js import custom_css
23
  from src.display.utils import (
24
- BENCHMARK_COLS,
25
- BENCHMARK_COLS_MULTIMODAL,
26
  BENCHMARK_COLS_MIB_SUBGRAPH,
27
  COLS,
28
  COLS_MIB_SUBGRAPH,
@@ -34,10 +35,10 @@ from src.display.utils import (
34
  AutoEvalColumn_mib_causalgraph,
35
  fields,
36
  )
37
- from src.envs import API, EVAL_REQUESTS_PATH, QUEUE_REPO, REPO_ID, TOKEN, RESULTS_REPO_MIB_SUBGRAPH, EVAL_RESULTS_MIB_SUBGRAPH_PATH, RESULTS_REPO_MIB_CAUSALGRAPH, EVAL_RESULTS_MIB_CAUSALGRAPH_PATH
38
- from src.populate import get_evaluation_queue_df, get_leaderboard_df, get_leaderboard_df_mib_subgraph, get_leaderboard_df_mib_causalgraph
39
- from src.submission.submit import add_new_eval
40
-
41
 
42
  from src.about import TasksMib_Subgraph, TasksMib_Causalgraph
43
 
@@ -244,27 +245,35 @@ def restart_space():
244
 
245
 
246
 
247
- ### Space initialisation
248
  try:
249
- # print(EVAL_REQUESTS_PATH)
 
250
  snapshot_download(
251
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
252
  )
253
  except Exception:
254
  restart_space()
255
-
256
 
257
  try:
258
- # print(RESULTS_REPO_MIB_SUBGRAPH)
 
259
  snapshot_download(
260
  repo_id=RESULTS_REPO_MIB_SUBGRAPH, local_dir=EVAL_RESULTS_MIB_SUBGRAPH_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
261
  )
262
  except Exception:
263
  restart_space()
264
-
265
-
266
  try:
267
- # print(RESULTS_REPO_MIB_CAUSALGRAPH)
 
268
  snapshot_download(
269
  repo_id=RESULTS_REPO_MIB_CAUSALGRAPH, local_dir=EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
270
  )
@@ -277,26 +286,25 @@ def _sigmoid(x):
277
  except:
278
  return "-"
279
 
280
- LEADERBOARD_DF_MIB_SUBGRAPH_FPL = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_SUBGRAPH, BENCHMARK_COLS_MIB_SUBGRAPH)
281
- LEADERBOARD_DF_MIB_SUBGRAPH_FEQ = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_SUBGRAPH, BENCHMARK_COLS_MIB_SUBGRAPH,
282
  metric_type="F=")
283
 
284
  # LEADERBOARD_DF_MIB_CAUSALGRAPH = get_leaderboard_df_mib_causalgraph(EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_CAUSALGRAPH, BENCHMARK_COLS_MIB_CAUSALGRAPH)
285
  # In app.py, modify the LEADERBOARD initialization
286
  LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AGGREGATED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AVERAGED = get_leaderboard_df_mib_causalgraph(
287
- EVAL_RESULTS_MIB_CAUSALGRAPH_PATH,
288
- EVAL_REQUESTS_PATH
289
  )
290
 
291
 
292
  # LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
293
  # LEADERBOARD_DF_MULTIMODAL = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS_MULTIMODAL, BENCHMARK_COLS_MULTIMODAL)
294
 
295
- (
296
- finished_eval_queue_df,
297
- running_eval_queue_df,
298
- pending_eval_queue_df,
299
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
300
 
301
 
302
 
@@ -392,10 +400,6 @@ def init_leaderboard_mib_subgraph(dataframe, track):
392
 
393
 
394
  def init_leaderboard_mib_causalgraph(dataframe, track):
395
- # print("Debugging column issues:")
396
- # print("\nActual DataFrame columns:")
397
- # print(dataframe.columns.tolist())
398
-
399
  model_name_mapping = {
400
  "Qwen2ForCausalLM": "Qwen-2.5",
401
  "GPT2ForCausalLM": "GPT-2",
@@ -419,18 +423,7 @@ def init_leaderboard_mib_causalgraph(dataframe, track):
419
  display_name = f"{benchmark_mapping[task.value.col_name]} - {model_name_mapping[model]}"
420
  display_mapping[field_name] = display_name
421
 
422
- # print(dataframe)
423
  renamed_df = dataframe.rename(columns=display_mapping)
424
-
425
- # idx_to_method = {0: "Full Vector", 1: "DAS", 2: "DBM", 3: "PCA", 4: "SAE"}
426
- # idx_to_scores = {0: [0.38, 0.36, 0.38, 0.42],
427
- # 1: [0.56, 0.62, 0.54, 0.51],
428
- # 2: [0.43, 0.41, 0.53, 0.49],
429
- # 3: [0.26, 0.20, 0.32, 0.40],
430
- # 4: ["-", "-", 0.33, "-"]}
431
- # renamed_df.loc[0]["Method"] = "Full Vector"
432
- # for i in range(5):
433
- # renamed_df.loc[i] = [idx_to_method[i]] + idx_to_scores[i]
434
 
435
  print(renamed_df)
436
 
@@ -438,11 +431,6 @@ def init_leaderboard_mib_causalgraph(dataframe, track):
438
  return Leaderboard(
439
  value=renamed_df,
440
  datatype=[c.type for c in fields(AutoEvalColumn_mib_causalgraph)],
441
- # select_columns=SelectColumns(
442
- # default_selection=["Method"], # Start with just Method column
443
- # cant_deselect=["Method"], # Method column should always be visible
444
- # label="Select Columns to Display:",
445
- # ),
446
  search_columns=["Method"],
447
  hide_columns=["eval_name"],
448
  bool_checkboxgroup_label="Hide models",
@@ -455,8 +443,6 @@ def init_leaderboard(dataframe, track):
455
  raise ValueError("Leaderboard DataFrame is empty or None.")
456
  # filter for correct track
457
  dataframe = dataframe.loc[dataframe["Track"] == track]
458
-
459
- # print(f"\n\n\n dataframe is {dataframe}\n\n\n")
460
 
461
  return Leaderboard(
462
  value=dataframe,
@@ -577,17 +563,6 @@ def update_leaderboard(dataframe: pd.DataFrame, selected_task_substrings: List[s
577
  filtered_dataframe.loc[:, "Score"] = np.where(filtered_dataframe.eq("-").any(axis=1), "-", s_means.round(2))
578
  filtered_dataframe = filtered_dataframe.sort_values(by=["Average"], ascending=False, na_position='last')
579
 
580
- # if show_average:
581
- # print([row for index, row in filtered_dataframe.iterrows()])
582
- # filtered_dataframe["Average"] = [round(np.mean(row.values()), 2) if "-" not in row.values() else "-" for index, row in filtered_dataframe.iterrows()]
583
- # # Sort by Average score descending
584
- # if 'Average' in dataframe.columns:
585
- # # Convert '-' to NaN for sorting purposes
586
- # df['Average'] = pd.to_numeric(['Average'], errors='coerce')
587
- # df = df.sort_values(by=['Average'], ascending=True, na_position='last')
588
- # # Convert NaN back to '-'
589
- # df['Average'] = df['Average'].fillna('-')
590
-
591
  return filtered_dataframe
592
 
593
  def process_url(url):
@@ -600,18 +575,6 @@ with demo:
600
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
601
 
602
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
603
- # with gr.TabItem("Strict", elem_id="strict-benchmark-tab-table", id=0):
604
- # leaderboard = init_leaderboard(LEADERBOARD_DF, "strict")
605
- # with gr.TabItem("Strict-small", elem_id="strict-small-benchmark-tab-table", id=1):
606
- # leaderboard = init_leaderboard(LEADERBOARD_DF, "strict-small")
607
- # with gr.TabItem("Multimodal", elem_id="multimodal-benchmark-tab-table", id=2):
608
- # leaderboard = init_leaderboard(LEADERBOARD_DF_MULTIMODAL, "multimodal")
609
-
610
- # with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=4):
611
- # gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
612
-
613
- # with gr.TabItem("Subgraph", elem_id="subgraph", id=0):
614
- # leaderboard = init_leaderboard_mib_subgraph(LEADERBOARD_DF_MIB_SUBGRAPH, "Subgraph")
615
  with gr.TabItem("Circuit Localization", elem_id="subgraph", id=0):
616
  with gr.Tabs() as subgraph_tabs:
617
  with gr.TabItem("F+", id=0):
@@ -622,11 +585,6 @@ with demo:
622
  You can combine filters to see specific task-model combinations.
623
  """)
624
  # CheckboxGroup for selecting substrings
625
- # substring_checkbox = gr.CheckboxGroup(
626
- # choices=PRESET_SUBSTRINGS,
627
- # label="Filter results:",
628
- # value=PRESET_SUBSTRINGS, # Default to all substrings selected
629
- # )
630
  task_substring_checkbox = gr.CheckboxGroup(
631
  choices=TASK_SUBSTRINGS,
632
  label="View tasks:",
@@ -660,11 +618,6 @@ with demo:
660
  You can combine filters to see specific task-model combinations.
661
  """)
662
  # CheckboxGroup for selecting substrings
663
- # substring_checkbox = gr.CheckboxGroup(
664
- # choices=PRESET_SUBSTRINGS,
665
- # label="Filter results:",
666
- # value=PRESET_SUBSTRINGS, # Default to all substrings selected
667
- # )
668
  task_substring_checkbox = gr.CheckboxGroup(
669
  choices=TASK_SUBSTRINGS,
670
  label="View tasks:",
@@ -705,11 +658,6 @@ with demo:
705
  Use the dropdown menus below to filter results by specific tasks or models.
706
  You can combine filters to see specific task-model combinations.
707
  """)
708
- # substring_checkbox = gr.CheckboxGroup(
709
- # choices=PRESET_SUBSTRINGS,
710
- # label="Filter results:",
711
- # value=PRESET_SUBSTRINGS, # Default to all substrings selected
712
- # )
713
  task_substring_checkbox = gr.CheckboxGroup(
714
  choices=TASK_SUBSTRINGS,
715
  label="View tasks:",
@@ -757,11 +705,24 @@ with demo:
757
 
758
  with gr.Group(visible=False) as circuit_ui:
759
  gr.Markdown("### Circuit Localization Requirements")
760
- hf_repo = gr.Textbox(
761
- label="HuggingFace Repository URL",
762
- placeholder="https://huggingface.co/username/repo/tree/main/path",
763
- info="Must be a valid HuggingFace URL pointing to a folder with 10 circuit files (.json or .pt)"
764
- )
765
 
766
  with gr.Group(visible=False) as causal_ui:
767
  gr.Markdown("### Causal Variable Localization Requirements")
@@ -778,15 +739,22 @@ with demo:
778
  minimum=0,
779
  info="Integer specifying token position"
780
  )
781
- code_upload = gr.File(
782
- label="Upload Python file implementing your featurization function",
783
- file_types=[".py"],
784
- )
785
 
786
  # Common fields
787
  with gr.Group():
788
- gr.Markdown("### Team Information")
789
- team_name = gr.Textbox(label="Team Name")
790
  contact_email = gr.Textbox(label="Contact Email")
791
 
792
  # Dynamic UI logic
@@ -801,47 +769,115 @@ with demo:
801
  track.change(toggle_ui, track, [circuit_ui, causal_ui])
802
 
803
  # Submission handling
804
- status = gr.Textbox(label="Submission Status", visible=False)
805
 
806
- def handle_submission(track, hf_repo, layer, token_position, code_upload, team_name, contact_email):
807
  errors = []
 
808
 
 
 
 
 
809
  # Validate common fields
810
- if not team_name.strip():
811
- errors.append("Team name is required")
812
  if "@" not in contact_email or "." not in contact_email:
813
  errors.append("Valid email address is required")
 
 
814
 
815
- # Track-specific validation
816
- if "Circuit" in track:
817
- if not hf_repo.startswith("https://huggingface.co/"):
818
- errors.append("Invalid HuggingFace URL - must start with https://huggingface.co/")
 
 
 
 
819
  else:
820
- # Check rate limit only for valid HF submissions
821
- username = get_hf_username(hf_repo)
822
- rate = 0 # TODO: check submissions queue for rates
823
- rate_limit = 2
824
- if rate > rate_limit:
825
- errors.append("Rate limit exceeded (max 2 submissions per week per HF account)")
 
 
 
 
826
 
827
- else:
828
  if not (isinstance(layer, int) and isinstance(token_position, int)):
829
  errors.append("Layer and token position must be integers")
830
  if not code_upload:
831
  errors.append("Code file upload is required")
 
 
832
 
833
- if errors:
834
- return gr.Textbox("\n".join(f"❌ {e}" for e in errors), visible=True)
 
 
835
 
836
- # Process valid submission
837
- return gr.Textbox("✅ Submission received! Thank you for your entry.", visible=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
838
 
839
  submit_btn = gr.Button("Submit Entry", variant="primary")
840
  submit_btn.click(
841
  handle_submission,
842
- inputs=[track, hf_repo, layer, token_position, code_upload, team_name, contact_email],
843
- outputs=status
844
  )
845
 
846
  # Add info about rate limits
847
  gr.Markdown("""
@@ -864,4 +900,4 @@ with demo:
864
  scheduler = BackgroundScheduler()
865
  scheduler.add_job(restart_space, "interval", seconds=1800)
866
  scheduler.start()
867
- demo.launch(share=True, ssr_mode=False)
 
1
  import json
2
  import gzip
3
+ import os
4
+ import shutil
5
+ import secrets
6
  import gradio as gr
7
  from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
8
  import pandas as pd
 
24
  )
25
  from src.display.css_html_js import custom_css
26
  from src.display.utils import (
 
 
27
  BENCHMARK_COLS_MIB_SUBGRAPH,
28
  COLS,
29
  COLS_MIB_SUBGRAPH,
 
35
  AutoEvalColumn_mib_causalgraph,
36
  fields,
37
  )
38
+ from src.envs import API, EVAL_REQUESTS_SUBGRAPH, EVAL_REQUESTS_CAUSALGRAPH, QUEUE_REPO_SUBGRAPH, QUEUE_REPO_CAUSALGRAPH, REPO_ID, TOKEN, RESULTS_REPO_MIB_SUBGRAPH, EVAL_RESULTS_MIB_SUBGRAPH_PATH, RESULTS_REPO_MIB_CAUSALGRAPH, EVAL_RESULTS_MIB_CAUSALGRAPH_PATH
39
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df_mib_subgraph, get_leaderboard_df_mib_causalgraph
40
+ from src.submission.submit import upload_to_queue, remove_submission
41
+ from src.submission.check_validity import verify_circuit_submission, verify_causal_variable_submission, check_rate_limit
42
 
43
  from src.about import TasksMib_Subgraph, TasksMib_Causalgraph
44
 
 
245
 
246
 
247
 
248
+ ### Space initialisation - refresh caches
249
  try:
250
+ if os.path.exists(EVAL_REQUESTS_SUBGRAPH):
251
+ shutil.rmtree(EVAL_REQUESTS_SUBGRAPH)
252
  snapshot_download(
253
+ repo_id=QUEUE_REPO_SUBGRAPH, local_dir=EVAL_REQUESTS_SUBGRAPH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
254
+ )
255
+ except Exception:
256
+ restart_space()
257
+ try:
258
+ if os.path.exists(EVAL_REQUESTS_CAUSALGRAPH):
259
+ shutil.rmtree(EVAL_REQUESTS_CAUSALGRAPH)
260
+ snapshot_download(
261
+ repo_id=QUEUE_REPO_CAUSALGRAPH, local_dir=EVAL_REQUESTS_CAUSALGRAPH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
262
  )
263
  except Exception:
264
  restart_space()
 
265
 
266
  try:
267
+ if os.path.exists(EVAL_RESULTS_MIB_SUBGRAPH_PATH):
268
+ shutil.rmtree(EVAL_RESULTS_MIB_SUBGRAPH_PATH)
269
  snapshot_download(
270
  repo_id=RESULTS_REPO_MIB_SUBGRAPH, local_dir=EVAL_RESULTS_MIB_SUBGRAPH_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
271
  )
272
  except Exception:
273
  restart_space()
 
 
274
  try:
275
+ if os.path.exists(EVAL_RESULTS_MIB_CAUSALGRAPH_PATH):
276
+ shutil.rmtree(EVAL_RESULTS_MIB_CAUSALGRAPH_PATH)
277
  snapshot_download(
278
  repo_id=RESULTS_REPO_MIB_CAUSALGRAPH, local_dir=EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
279
  )
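
The four cache refreshes above share one pattern: drop any stale local copy, then re-pull the dataset snapshot. A minimal sketch of that pattern as a helper, using the same huggingface_hub arguments as this commit (the helper name is illustrative):

```python
import os
import shutil

from huggingface_hub import snapshot_download


def refresh_dataset_cache(repo_id: str, local_dir: str, token: str = None) -> None:
    """Delete a stale local cache, then download a fresh snapshot of a Hub dataset."""
    if os.path.exists(local_dir):
        shutil.rmtree(local_dir)
    snapshot_download(
        repo_id=repo_id,
        local_dir=local_dir,
        repo_type="dataset",
        tqdm_class=None,
        etag_timeout=30,
        token=token,
    )
```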
 
286
  except:
287
  return "-"
288
 
289
+ LEADERBOARD_DF_MIB_SUBGRAPH_FPL = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, COLS_MIB_SUBGRAPH, BENCHMARK_COLS_MIB_SUBGRAPH)
290
+ LEADERBOARD_DF_MIB_SUBGRAPH_FEQ = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, COLS_MIB_SUBGRAPH, BENCHMARK_COLS_MIB_SUBGRAPH,
291
  metric_type="F=")
292
 
293
  # LEADERBOARD_DF_MIB_CAUSALGRAPH = get_leaderboard_df_mib_causalgraph(EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_CAUSALGRAPH, BENCHMARK_COLS_MIB_CAUSALGRAPH)
294
  # In app.py, modify the LEADERBOARD initialization
295
  LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AGGREGATED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AVERAGED = get_leaderboard_df_mib_causalgraph(
296
+ EVAL_RESULTS_MIB_CAUSALGRAPH_PATH
 
297
  )
298
 
299
 
300
  # LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
301
  # LEADERBOARD_DF_MULTIMODAL = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS_MULTIMODAL, BENCHMARK_COLS_MULTIMODAL)
302
 
303
+ # (
304
+ # finished_eval_queue_df,
305
+ # running_eval_queue_df,
306
+ # pending_eval_queue_df,
307
+ # ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
308
 
309
 
310
 
 
400
 
401
 
402
  def init_leaderboard_mib_causalgraph(dataframe, track):
 
 
 
 
403
  model_name_mapping = {
404
  "Qwen2ForCausalLM": "Qwen-2.5",
405
  "GPT2ForCausalLM": "GPT-2",
 
423
  display_name = f"{benchmark_mapping[task.value.col_name]} - {model_name_mapping[model]}"
424
  display_mapping[field_name] = display_name
425
 
 
426
  renamed_df = dataframe.rename(columns=display_mapping)
427
 
428
  print(renamed_df)
429
 
 
431
  return Leaderboard(
432
  value=renamed_df,
433
  datatype=[c.type for c in fields(AutoEvalColumn_mib_causalgraph)],
 
 
 
 
 
434
  search_columns=["Method"],
435
  hide_columns=["eval_name"],
436
  bool_checkboxgroup_label="Hide models",
 
443
  raise ValueError("Leaderboard DataFrame is empty or None.")
444
  # filter for correct track
445
  dataframe = dataframe.loc[dataframe["Track"] == track]
 
 
446
 
447
  return Leaderboard(
448
  value=dataframe,
 
563
  filtered_dataframe.loc[:, "Score"] = np.where(filtered_dataframe.eq("-").any(axis=1), "-", s_means.round(2))
564
  filtered_dataframe = filtered_dataframe.sort_values(by=["Average"], ascending=False, na_position='last')
566
  return filtered_dataframe
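
A toy illustration of the "-" masking used for the Score column above (column names and values are made up):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"IOI - GPT-2": [0.71, "-"], "MCQA - Llama-3.1": [0.64, 0.58]})
row_means = df.apply(pd.to_numeric, errors="coerce").mean(axis=1)
# Rows containing any "-" keep "-"; the rest get their rounded mean.
df["Score"] = np.where(df.eq("-").any(axis=1), "-", row_means.round(2))
```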
567
 
568
  def process_url(url):
 
575
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
576
 
577
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
578
  with gr.TabItem("Circuit Localization", elem_id="subgraph", id=0):
579
  with gr.Tabs() as subgraph_tabs:
580
  with gr.TabItem("F+", id=0):
 
585
  You can combine filters to see specific task-model combinations.
586
  """)
587
  # CheckboxGroup for selecting substrings
 
 
 
 
 
588
  task_substring_checkbox = gr.CheckboxGroup(
589
  choices=TASK_SUBSTRINGS,
590
  label="View tasks:",
 
618
  You can combine filters to see specific task-model combinations.
619
  """)
620
  # CheckboxGroup for selecting substrings
 
 
 
 
 
621
  task_substring_checkbox = gr.CheckboxGroup(
622
  choices=TASK_SUBSTRINGS,
623
  label="View tasks:",
 
658
  Use the dropdown menus below to filter results by specific tasks or models.
659
  You can combine filters to see specific task-model combinations.
660
  """)
 
 
 
 
 
661
  task_substring_checkbox = gr.CheckboxGroup(
662
  choices=TASK_SUBSTRINGS,
663
  label="View tasks:",
 
705
 
706
  with gr.Group(visible=False) as circuit_ui:
707
  gr.Markdown("### Circuit Localization Requirements")
708
+ with gr.Row():
709
+ hf_repo_circ = gr.Textbox(
710
+ label="HuggingFace Repository URL",
711
+ placeholder="https://huggingface.co/username/repo/path",
712
+ info="Must be a valid HuggingFace URL pointing to folders containing either 1 importance score file per task/model, or " \
713
+ "9 circuit files per task/model (.json or .pt). " \
714
+ "Remove 'tree', 'resolve', and the branch name (e.g., '/tree/main/') from URL if present."
715
+ )
716
+ level = gr.Radio(
717
+ choices=[
718
+ "Edge",
719
+ "Node (submodule)",
720
+ "Node (neuron)"
721
+ ],
722
+ label="Level of granularity",
723
+ info="Is your circuit defined by its inclusion/exclusion of certain edges (e.g., MLP1 to H10L12), of certain submodules (e.g., MLP1), or of neurons " \
724
+ "within those submodules (e.g., MLP1 neuron 295)?"
725
+ )
726
 
727
  with gr.Group(visible=False) as causal_ui:
728
  gr.Markdown("### Causal Variable Localization Requirements")
 
739
  minimum=0,
740
  info="Integer specifying token position"
741
  )
742
+ with gr.Row():
743
+ hf_repo_cg = gr.Textbox(
744
+ label="HuggingFace Repository URL",
745
+ placeholder="https://huggingface.co/username/repo/path",
746
+ info="Must be a valid HuggingFace URL pointing to a file containing the trained featurizer (.pt). " \
747
+ "Remove 'tree', 'resolve', and the branch name (e.g., '/tree/main/') from URL if present."
748
+ )
749
+ code_upload = gr.File(
750
+ label="Upload Python file implementing your featurization function",
751
+ file_types=[".py"],
752
+ )
753
 
754
  # Common fields
755
  with gr.Group():
756
+ gr.Markdown("### Submission Information")
757
+ method_name = gr.Textbox(label="Method Name")
758
  contact_email = gr.Textbox(label="Contact Email")
759
 
760
  # Dynamic UI logic
 
769
  track.change(toggle_ui, track, [circuit_ui, causal_ui])
770
 
771
  # Submission handling
772
+ status = gr.Textbox(label="Submission Status", visible=True)
773
 
774
+ def handle_submission(track, hf_repo_circ, hf_repo_cg, level, layer, token_position, code_upload, method_name, contact_email):
775
  errors = []
776
+ warnings = []
777
 
778
+ breaking_error = False
779
+
780
+ hf_repo = hf_repo_circ if "Circuit" in track else hf_repo_cg
781
+
782
  # Validate common fields
783
+ if not method_name.strip():
784
+ errors.append("Method name is required")
785
  if "@" not in contact_email or "." not in contact_email:
786
  errors.append("Valid email address is required")
787
+ if not level:
788
+ errors.append("Level of granularity is required")
789
 
790
+ if not hf_repo.startswith("https://huggingface.co/") and not hf_repo.startswith("http://huggingface.co/"):
791
+ errors.append(f"Invalid HuggingFace URL - must start with https://huggingface.co/")
792
+ breaking_error = True
793
+ else:
794
+ repo_info = hf_repo.split("huggingface.co/")[1]
795
+ if len(repo_info.split("/")) < 2:
796
+ errors.append("Could not read username or repo name from HF URL")
797
+ breaking_error = True
798
  else:
799
+ user_name, repo_name = repo_info.split("/")[:2]
800
+ under_rate_limit, time_left = check_rate_limit(track, user_name, contact_email)
801
+ if not under_rate_limit:
802
+ errors.append(f"Rate limit exceeded (max 2 submissions per week). Please try again in {time_left}. " \
803
+ "(If you're trying again after a failed validation, either remove the previous entry below or try again in about 30 minutes.")
804
+ breaking_error = True
805
+
806
+ # Track-specific validation
807
+ if "Circuit" in track and not breaking_error:
808
+ submission_errors, submission_warnings = verify_circuit_submission(hf_repo, level)
809
 
810
+ elif not breaking_error:
811
  if not (isinstance(layer, int) and isinstance(token_position, int)):
812
  errors.append("Layer and token position must be integers")
813
  if not code_upload:
814
  errors.append("Code file upload is required")
815
+
816
+ submission_errors, submission_warnings = verify_causal_variable_submission(hf_repo, layer, token_position, code_upload)
817
 
818
+ if not breaking_error:
819
+ errors.extend(submission_errors)
820
+ warnings.extend(submission_warnings)
821
+ _id = secrets.token_urlsafe(12)
822
 
823
+ if errors:
824
+ return [
825
+ gr.Textbox("\n".join(f"❌ {e}" for e in errors), visible=True),
826
+ None, None,
827
+ gr.Column(visible=False),
828
+ ]
829
+ elif warnings:
830
+ return [
831
+ gr.Textbox("Warnings:", visible=True),
832
+ gr.Markdown("\n".join(f"• {w}" for w in warnings)),
833
+ (track, hf_repo_circ, hf_repo_cg, level, layer, token_position, code_upload, method_name, contact_email, _id),
834
+ gr.Column(visible=True)
835
+ ]
836
+ else:
837
+ return upload_to_queue(track, hf_repo_circ, hf_repo_cg, level, layer, token_position, code_upload, method_name, contact_email, _id)
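
For reference, the username/repo parsing that feeds the rate-limit check above behaves like this (values are illustrative):

```python
hf_repo = "https://huggingface.co/someuser/my-circuits/ioi_gpt2"
repo_info = hf_repo.split("huggingface.co/")[1]   # "someuser/my-circuits/ioi_gpt2"
user_name, repo_name = repo_info.split("/")[:2]   # "someuser", "my-circuits"
```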
838
+
839
+ # New warning confirmation dialog
840
+ warning_modal = gr.Column(visible=False, variant="panel")
841
+ with warning_modal:
842
+ gr.Markdown("### ⚠️ Submission Warnings")
843
+ warning_display = gr.Markdown()
844
+ proceed_btn = gr.Button("Proceed Anyway", variant="primary")
845
+ cancel_btn = gr.Button("Cancel Submission", variant="secondary")
846
+
847
+ # Store submission data between callbacks
848
+ pending_submission = gr.State()
849
 
850
  submit_btn = gr.Button("Submit Entry", variant="primary")
851
  submit_btn.click(
852
  handle_submission,
853
+ inputs=[track, hf_repo_circ, hf_repo_cg, level, layer, token_position, code_upload, method_name, contact_email],
854
+ outputs=[status, warning_display, pending_submission, warning_modal]
855
+ )
856
+
857
+ proceed_btn.click(
858
+ lambda x: upload_to_queue(*x),
859
+ inputs=pending_submission,
860
+ outputs=[status, warning_display, pending_submission, warning_modal]
861
+ )
862
+
863
+ cancel_btn.click(
864
+ lambda: [gr.Textbox("Submission canceled.", visible=True), None, None, gr.Column(visible=False)],
865
+ outputs=[status, warning_display, pending_submission, warning_modal]
866
  )
867
+
868
+ with gr.Group():
869
+ gr.Markdown("### Remove Submission from Queue")
870
+ with gr.Row():
871
+ name_r = gr.Textbox(label="Method Name")
872
+ _id_r = gr.Textbox(label = "Submission ID")
873
+
874
+ status_r = gr.Textbox(label="Removal Status", visible=False)
875
+ remove_button = gr.Button("Remove Entry")
876
+ remove_button.click(
877
+ remove_submission,
878
+ inputs=[track, name_r, _id_r],
879
+ outputs=[status_r]
880
+ )
881
 
882
  # Add info about rate limits
883
  gr.Markdown("""
 
900
  scheduler = BackgroundScheduler()
901
  scheduler.add_job(restart_space, "interval", seconds=1800)
902
  scheduler.start()
903
+ demo.queue(default_concurrency_limit=40).launch(share=True, ssr_mode=False)
src/about.py CHANGED
@@ -7,11 +7,6 @@ class Task:
7
  metric: str
8
  col_name: str
9
 
10
-
11
-
12
-
13
- # Select your tasks here
14
- # ---------------------------------------------------
15
  class Tasks(Enum):
16
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
17
  task0 = Task("blimp", "acc", "BLiMP")
@@ -19,19 +14,6 @@ class Tasks(Enum):
19
  task2 = Task("glue", "acc", "(Super)GLUE")
20
  task3 = Task("ewok", "acc", "EWoK")
21
 
22
-
23
-
24
- class TasksMultimodal(Enum):
25
- task0 = Task("blimp", "acc", "BLiMP")
26
- task1 = Task("blimp_supplement", "acc", "BLiMP Supplement")
27
- task2 = Task("glue", "acc", "(Super)GLUE")
28
- task3 = Task("ewok", "acc", "EWoK")
29
- task4 = Task("vqa", "acc", "VQA")
30
- task5 = Task("winoground", "acc", "Winoground")
31
- task6 = Task("devbench", "acc", "DevBench")
32
-
33
-
34
-
35
  @dataclass
36
  class TaskMIB_Subgraph:
37
  benchmark: str # task name in json (ioi/arithmetic)
@@ -118,15 +100,8 @@ class TasksMib_Causalgraph(Enum):
118
 
119
 
120
 
121
-
122
-
123
- NUM_FEWSHOT = 0 # Change with your few shot
124
- # ---------------------------------------------------
125
-
126
-
127
-
128
  # Your leaderboard name
129
- TITLE = """<h1 align="center" id="space-title"> Mechanistic Interpretability Benchmark 2024 Leaderboards</h1>"""
130
 
131
  # What does your leaderboard evaluate?
132
  INTRODUCTION_TEXT = """
@@ -135,34 +110,36 @@ The leaderboards for each track of the 2024 Mechanistic Interpretability Benchma
135
 
136
  # Which evaluations are you running? how can people reproduce what you have?
137
  LLM_BENCHMARKS_TEXT = f"""
138
- This leaderboard displays scores from the 2024 BabyLM Challenge. Each track has its own tab.
139
  """
140
 
141
  EVALUATION_QUEUE_TEXT = """
142
  ## Circuit localization track:
143
 
144
- You'll need 10 circuits per task/model combination. For each critical threshold k and previous threshold k_-1,
145
- the circuit should contain no fewer than k_-1% of components, and no more than k% of components. Create a HuggingFace
146
- dataset or model repository; this will house your circuits. Make a folder where the circuits (and *only* the circuits)
147
- are contained. Do not worry about the ordering of the files; our evaluation script will read the circuits and sort them
148
- by size. Provide a link to this folder below.
 
149
 
150
  For specifications about the file format for a circuit, see the README on our project GitHub: TODO
151
 
152
- Once your model makes it to the front of the evaluation queue, we'll submit your model for evaluation on the private test set.
153
- The evaluations are handled by the National Deep Inference Framework (NDIF).
154
 
155
  ## Causal variable localization track:
 
 
 
156
  """
157
 
158
- CITATION_BUTTON_LABEL = "If you would like to cite these results, please cite the 2024 BabyLM Findings paper, as well as the authors of the model(s) whose results you cite!"
159
  CITATION_BUTTON_TEXT = r"""
160
- @article{hu2024findingssecondbabylmchallenge,
161
- title={Findings of the Second BabyLM Challenge: Sample-Efficient Pretraining on Developmentally Plausible Corpora},
162
- author={Michael Y. Hu and Aaron Mueller and Candace Ross and Adina Williams and Tal Linzen and Chengxu Zhuang and Ryan Cotterell and Leshem Choshen and Alex Warstadt and Ethan Gotlieb Wilcox},
163
- year={2024},
164
- journal={Computing Research Repository},
165
- volume={arXiv:2412.05149},
166
- url={https://arxiv.org/abs/2412.05149},
167
  }
168
  """
 
7
  metric: str
8
  col_name: str
9
 
 
 
 
 
 
10
  class Tasks(Enum):
11
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
12
  task0 = Task("blimp", "acc", "BLiMP")
 
14
  task2 = Task("glue", "acc", "(Super)GLUE")
15
  task3 = Task("ewok", "acc", "EWoK")
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  @dataclass
18
  class TaskMIB_Subgraph:
19
  benchmark: str # task name in json (ioi/arithmetic)
 
100
 
101
 
102
 
 
 
 
 
 
 
 
103
  # Your leaderboard name
104
+ TITLE = """<h1 align="center" id="space-title"> Mechanistic Interpretability Benchmark Leaderboards</h1>"""
105
 
106
  # What does your leaderboard evaluate?
107
  INTRODUCTION_TEXT = """
 
110
 
111
  # Which evaluations are you running? how can people reproduce what you have?
112
  LLM_BENCHMARKS_TEXT = f"""
113
+ This leaderboard displays scores on the private test set for the Mechanistic Interpretability Benchmark. Each track has its own tab.
114
  """
115
 
116
  EVALUATION_QUEUE_TEXT = """
117
  ## Circuit localization track:
118
 
119
+ You'll need either (i) 1 circuit per task/model combination with floating-point importance scores for each edge or node, or (ii) 9 circuits per model/task with binary membership scores for each edge or node.
120
+ If (ii), then for each critical threshold k, the circuit should contain no more than k% of edges. See [here]() for examples of each valid circuit format.
121
+
122
+ Create a folder in a HuggingFace repository to hold your circuits. At the URL you provide, there should be one folder per task/model combination; these folders
123
+ should contain your circuit(s). As long as the folder names contain the task and model names, you do not need to worry about the circuit filenames.
124
+ If you provide more circuits than needed, our evaluation script will take the first 9 lexicographically.
125
 
126
  For specifications about the file format for a circuit, see the README on our project GitHub: TODO
127
 
128
+ Once your submission has been validated and makes it to the front of the evaluation queue, we'll submit your model for evaluation on the private test set.
 
129
 
130
  ## Causal variable localization track:
131
+
132
+ You'll need to provide a link to a HuggingFace repository containing your trained featurizer, the layer on which the featurizer was trained, and the code needed to load and run your featurizer.
133
+ See TODO for an example.
134
  """
135
 
136
+ CITATION_BUTTON_LABEL = "If you would like to cite these results, please cite the MIB paper, as well as the author(s) of the method(s) whose results you cite!"
137
  CITATION_BUTTON_TEXT = r"""
138
+ @article{mib-2025,
139
+ title = {{MIB}: A Mechanistic Interpretability Benchmark},
140
+ author = {Aaron Mueller and Atticus Geiger and Sarah Wiegreffe and Dana Arad and Iv{\'a}n Arcuschin and Adam Belfki and Yik Siu Chan and Jaden Fiotto-Kaufman and Tal Haklay and Michael Hanna and Jing Huang and Rohan Gupta and Yaniv Nikankin and Hadas Orgad and Nikhil Prakash and Anja Reusch and Aruna Sankaranarayanan and Shun Shao and Alessandro Stolfo and Martin Tutek and Amir Zur and David Bau and Yonatan Belinkov},
141
+ year = {2025},
142
+ note = {To appear},
143
+ journal = {arXiv preprint}
 
144
  }
145
  """
src/display/utils.py CHANGED
@@ -3,7 +3,7 @@ from enum import Enum
3
 
4
  import pandas as pd
5
 
6
- from src.about import Tasks, TasksMultimodal, TasksMib_Subgraph, TasksMib_Causalgraph
7
 
8
  def fields(raw_class):
9
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -28,8 +28,8 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
28
  auto_eval_column_dict.append(["hf_repo", ColumnContent, ColumnContent("HF Repo", "str", False)])
29
  auto_eval_column_dict.append(["track", ColumnContent, ColumnContent("Track", "markdown", False)])
30
  #Scores
31
- for task in Tasks:
32
- auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
33
  # Model information
34
  auto_eval_column_dict.append(["text_average", ColumnContent, ColumnContent("Text Average", "number", True)])
35
  auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
@@ -38,10 +38,10 @@ auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sh
38
  auto_eval_column_dict_multimodal.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
39
  auto_eval_column_dict_multimodal.append(["hf_repo", ColumnContent, ColumnContent("HF Repo", "str", False)])
40
  auto_eval_column_dict_multimodal.append(["track", ColumnContent, ColumnContent("Track", "markdown", False)])
41
- for task in TasksMultimodal:
42
- auto_eval_column_dict_multimodal.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
43
- if task.value.col_name in ("ewok", "EWoK"): # make sure this appears in the right order
44
- auto_eval_column_dict_multimodal.append(["text_average", ColumnContent, ColumnContent("Text Average", "number", True)])
45
  auto_eval_column_dict_multimodal.append(["vision_average", ColumnContent, ColumnContent("Vision Average", "number", True)])
46
  auto_eval_column_dict_multimodal.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
47
  auto_eval_column_dict_multimodal.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
@@ -214,7 +214,7 @@ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
214
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
215
 
216
  BENCHMARK_COLS = [t.value.col_name for t in Tasks]
217
- BENCHMARK_COLS_MULTIMODAL = [t.value.col_name for t in TasksMultimodal]
218
 
219
  TEXT_TASKS = {
220
  "glue": ["cola", "sst2", "mrpc", "qqp", "mnli", "mnli-mm", "qnli", "rte",
 
3
 
4
  import pandas as pd
5
 
6
+ from src.about import Tasks, TasksMib_Subgraph, TasksMib_Causalgraph
7
 
8
  def fields(raw_class):
9
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 
28
  auto_eval_column_dict.append(["hf_repo", ColumnContent, ColumnContent("HF Repo", "str", False)])
29
  auto_eval_column_dict.append(["track", ColumnContent, ColumnContent("Track", "markdown", False)])
30
  #Scores
31
+ # for task in Tasks:
32
+ # auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
33
  # Model information
34
  auto_eval_column_dict.append(["text_average", ColumnContent, ColumnContent("Text Average", "number", True)])
35
  auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
 
38
  auto_eval_column_dict_multimodal.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
39
  auto_eval_column_dict_multimodal.append(["hf_repo", ColumnContent, ColumnContent("HF Repo", "str", False)])
40
  auto_eval_column_dict_multimodal.append(["track", ColumnContent, ColumnContent("Track", "markdown", False)])
41
+ # for task in TasksMultimodal:
42
+ # auto_eval_column_dict_multimodal.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
43
+ # if task.value.col_name in ("ewok", "EWoK"): # make sure this appears in the right order
44
+ # auto_eval_column_dict_multimodal.append(["text_average", ColumnContent, ColumnContent("Text Average", "number", True)])
45
  auto_eval_column_dict_multimodal.append(["vision_average", ColumnContent, ColumnContent("Vision Average", "number", True)])
46
  auto_eval_column_dict_multimodal.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
47
  auto_eval_column_dict_multimodal.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
214
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
215
 
216
  BENCHMARK_COLS = [t.value.col_name for t in Tasks]
217
+ # BENCHMARK_COLS_MULTIMODAL = [t.value.col_name for t in TasksMultimodal]
218
 
219
  TEXT_TASKS = {
220
  "glue": ["cola", "sst2", "mrpc", "qqp", "mnli", "mnli-mm", "qnli", "rte",
src/envs.py CHANGED
@@ -6,24 +6,24 @@ from huggingface_hub import HfApi
6
  # ----------------------------------
7
  TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
8
 
9
- OWNER = "mech-interp-bench" # Change to your org - don't forget to create a results and request dataset, with the correct format!
10
  # ----------------------------------
11
 
12
  REPO_ID = f"{OWNER}/leaderboard"
13
  # RESULTS_REPO = f"{OWNER}/results-mib-test"
14
 
15
- # QUEUE_REPO = f"{OWNER}/requests"
16
- QUEUE_REPO = f"shunshao/requests-mib-test"
17
  RESULTS_REPO_MIB_SUBGRAPH = f"{OWNER}/subgraph-results"
18
  RESULTS_REPO_MIB_CAUSALGRAPH = f"{OWNER}/causalgraph-results"
19
  # RESULTS_REPO_MIB_CAUSALGRAPH = f"shunshao/causalgraph-results"
20
 
21
-
22
  # If you setup a cache later, just change HF_HOME
23
  CACHE_PATH=os.getenv("HF_HOME", ".")
24
 
25
  # Local caches
26
- EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
 
27
  # EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
28
  EVAL_RESULTS_MIB_SUBGRAPH_PATH = os.path.join(CACHE_PATH, "eval-results-mib-subgraph")
29
  EVAL_RESULTS_MIB_CAUSALGRAPH_PATH = os.path.join(CACHE_PATH, "eval-results-mib-causalgraph")
 
6
  # ----------------------------------
7
  TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
8
 
9
+ OWNER = "mib-bench" # Change to your org - don't forget to create a results and request dataset, with the correct format!
10
  # ----------------------------------
11
 
12
  REPO_ID = f"{OWNER}/leaderboard"
13
  # RESULTS_REPO = f"{OWNER}/results-mib-test"
14
 
15
+ QUEUE_REPO_SUBGRAPH = f"{OWNER}/requests-subgraph"
16
+ QUEUE_REPO_CAUSALGRAPH = f"{OWNER}/requests-causalgraph"
17
  RESULTS_REPO_MIB_SUBGRAPH = f"{OWNER}/subgraph-results"
18
  RESULTS_REPO_MIB_CAUSALGRAPH = f"{OWNER}/causalgraph-results"
19
  # RESULTS_REPO_MIB_CAUSALGRAPH = f"shunshao/causalgraph-results"
20
 
 
21
  # If you setup a cache later, just change HF_HOME
22
  CACHE_PATH=os.getenv("HF_HOME", ".")
23
 
24
  # Local caches
25
+ EVAL_REQUESTS_SUBGRAPH = os.path.join(CACHE_PATH, "eval-queue-subgraph")
26
+ EVAL_REQUESTS_CAUSALGRAPH = os.path.join(CACHE_PATH, "eval-queue-causalgraph")
27
  # EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
28
  EVAL_RESULTS_MIB_SUBGRAPH_PATH = os.path.join(CACHE_PATH, "eval-results-mib-subgraph")
29
  EVAL_RESULTS_MIB_CAUSALGRAPH_PATH = os.path.join(CACHE_PATH, "eval-results-mib-causalgraph")
src/leaderboard/read_evals.py CHANGED
@@ -8,7 +8,7 @@ import dateutil
8
  import numpy as np
9
 
10
  from src.display.formatting import make_clickable_model
11
- from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal, Tasks, TasksMultimodal
12
  from src.submission.check_validity import is_model_on_hub
13
  from src.about import TasksMib_Subgraph
14
 
@@ -144,7 +144,7 @@ class EvalResult_MIB_SUBGRAPH:
144
  return data_dict
145
 
146
 
147
- def get_raw_eval_results_mib_subgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_SUBGRAPH]:
148
  """From the path of the results folder root, extract all needed info for MIB results"""
149
  model_result_filepaths = []
150
 
@@ -487,7 +487,7 @@ def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
487
  return averaged_df
488
 
489
 
490
- def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
491
  """From the path of the results folder root, extract all needed info for MIB causal graph results"""
492
  model_result_filepaths = []
493
 
 
8
  import numpy as np
9
 
10
  from src.display.formatting import make_clickable_model
11
+ from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal
12
  from src.submission.check_validity import is_model_on_hub
13
  from src.about import TasksMib_Subgraph
14
 
 
144
  return data_dict
145
 
146
 
147
+ def get_raw_eval_results_mib_subgraph(results_path: str) -> List[EvalResult_MIB_SUBGRAPH]:
148
  """From the path of the results folder root, extract all needed info for MIB results"""
149
  model_result_filepaths = []
150
 
 
487
  return averaged_df
488
 
489
 
490
+ def get_raw_eval_results_mib_causalgraph(results_path: str) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
491
  """From the path of the results folder root, extract all needed info for MIB causal graph results"""
492
  model_result_filepaths = []
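
With requests_path dropped from both readers, call sites reduce to a single results path (sketch, using the cache paths from src/envs.py):

```python
from src.envs import EVAL_RESULTS_MIB_SUBGRAPH_PATH, EVAL_RESULTS_MIB_CAUSALGRAPH_PATH
from src.leaderboard.read_evals import (
    get_raw_eval_results_mib_subgraph,
    get_raw_eval_results_mib_causalgraph,
)

raw_subgraph = get_raw_eval_results_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH)
detailed_df, aggregated_df, averaged_df = get_raw_eval_results_mib_causalgraph(
    EVAL_RESULTS_MIB_CAUSALGRAPH_PATH
)
```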
493
 
src/populate.py CHANGED
@@ -8,37 +8,11 @@ from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal, EvalQueu
8
  from src.leaderboard.read_evals import get_raw_eval_results, get_raw_eval_results_mib_subgraph, get_raw_eval_results_mib_causalgraph
9
  from src.about import TasksMib_Causalgraph
10
 
11
- def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
- """Creates a dataframe from all the individual experiment results"""
13
- # print(f"results_path is {results_path}, requests_path is {requests_path}")
14
- raw_data = get_raw_eval_results(results_path, requests_path)
15
- # print(f"raw_data is {raw_data}")
16
- all_data_json = [v.to_dict() for v in raw_data]
17
- # print(f"all_data_json is {pd.DataFrame.from_records(all_data_json)}")
18
- all_data_json_filtered = []
19
- for item in all_data_json:
20
- item["Track"] = item["eval_name"].split("_")[-1]
21
- item["ioi"] = 0
22
- item["mcqa"] = 0
23
- if "VQA" in benchmark_cols and "VQA" in item:
24
- all_data_json_filtered.append(item)
25
- if "VQA" not in benchmark_cols and "VQA" not in item:
26
- all_data_json_filtered.append(item)
27
-
28
- all_data_json = all_data_json_filtered
29
-
30
- df = pd.DataFrame.from_records(all_data_json)
31
- df = df.sort_values(by=[AutoEvalColumn.text_average.name], ascending=False)
32
- df = df[has_no_nan_values(df, benchmark_cols)]
33
- return df
34
-
35
-
36
-
37
- def get_leaderboard_df_mib_subgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list,
38
  metric_type = "F+") -> pd.DataFrame:
39
  """Creates a dataframe from all the MIB experiment results"""
40
  # print(f"results_path is {results_path}, requests_path is {requests_path}")
41
- raw_data = get_raw_eval_results_mib_subgraph(results_path, requests_path)
42
 
43
  all_data_json = [v.to_dict(metric_type=metric_type) for v in raw_data]
44
  # print(f"all_data_json is {pd.DataFrame.from_records(all_data_json)}")
@@ -122,10 +96,10 @@ def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
122
 
123
 
124
 
125
- def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
126
  # print(f"results_path is {results_path}, requests_path is {requests_path}")
127
 
128
- detailed_df, aggregated_df, intervention_averaged_df = get_raw_eval_results_mib_causalgraph(results_path, requests_path)
129
 
130
  # all_data_json = [v.to_dict() for v in raw_detailed_df]
131
  # detailed_df = pd.DataFrame.from_records(all_data_json)
@@ -175,27 +149,28 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
175
  with open(file_path) as fp:
176
  data = json.load(fp)
177
 
178
- if "still_on_hub" in data and data["still_on_hub"]:
179
- data[EvalQueueColumn.model.name] = make_clickable_model(data["hf_repo"], data["model"])
180
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
181
- else:
182
- data[EvalQueueColumn.model.name] = data["model"]
183
- data[EvalQueueColumn.revision.name] = "N/A"
184
 
185
  all_evals.append(data)
186
- elif ".md" not in entry:
187
- # this is a folder
188
- sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
189
- for sub_entry in sub_entries:
190
- file_path = os.path.join(save_path, entry, sub_entry)
191
- with open(file_path) as fp:
192
- data = json.load(fp)
193
-
194
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
195
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
196
- all_evals.append(data)
197
-
198
- pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
 
199
  running_list = [e for e in all_evals if e["status"] == "RUNNING"]
200
  finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
201
  df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
 
8
  from src.leaderboard.read_evals import get_raw_eval_results, get_raw_eval_results_mib_subgraph, get_raw_eval_results_mib_causalgraph
9
  from src.about import TasksMib_Causalgraph
10
 
11
+ def get_leaderboard_df_mib_subgraph(results_path: str, cols: list, benchmark_cols: list,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  metric_type = "F+") -> pd.DataFrame:
13
  """Creates a dataframe from all the MIB experiment results"""
14
  # print(f"results_path is {results_path}, requests_path is {requests_path}")
15
+ raw_data = get_raw_eval_results_mib_subgraph(results_path)
16
 
17
  all_data_json = [v.to_dict(metric_type=metric_type) for v in raw_data]
18
  # print(f"all_data_json is {pd.DataFrame.from_records(all_data_json)}")
 
96
 
97
 
98
 
99
+ def get_leaderboard_df_mib_causalgraph(results_path: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
100
  # print(f"results_path is {results_path}, requests_path is {requests_path}")
101
 
102
+ detailed_df, aggregated_df, intervention_averaged_df = get_raw_eval_results_mib_causalgraph(results_path)
103
 
104
  # all_data_json = [v.to_dict() for v in raw_detailed_df]
105
  # detailed_df = pd.DataFrame.from_records(all_data_json)
 
149
  with open(file_path) as fp:
150
  data = json.load(fp)
151
 
152
+ # if "still_on_hub" in data and data["still_on_hub"]:
153
+ # data[EvalQueueColumn.model.name] = make_clickable_model(data["hf_repo"], data["model"])
154
+ # data[EvalQueueColumn.revision.name] = data.get("revision", "main")
155
+ # else:
156
+ # data[EvalQueueColumn.model.name] = data["model"]
157
+ # data[EvalQueueColumn.revision.name] = "N/A"
158
 
159
  all_evals.append(data)
160
+
161
+ # elif ".md" not in entry:
162
+ # # this is a folder
163
+ # sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
164
+ # for sub_entry in sub_entries:
165
+ # file_path = os.path.join(save_path, entry, sub_entry)
166
+ # with open(file_path) as fp:
167
+ # data = json.load(fp)
168
+
169
+ # data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
170
+ # data[EvalQueueColumn.revision.name] = data.get("revision", "main")
171
+ # all_evals.append(data)
172
+
173
+ pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN", "PREVALIDATION"]]
174
  running_list = [e for e in all_evals if e["status"] == "RUNNING"]
175
  finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
176
  df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
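
A toy illustration of the status buckets above, now that PREVALIDATION counts as pending (entries are made up):

```python
all_evals = [
    {"method_name": "A", "status": "PREVALIDATION"},
    {"method_name": "B", "status": "RUNNING"},
    {"method_name": "C", "status": "FINISHED"},
]
pending = [e for e in all_evals if e["status"] in ["PENDING", "RERUN", "PREVALIDATION"]]  # -> A
running = [e for e in all_evals if e["status"] == "RUNNING"]                              # -> B
finished = [e for e in all_evals if e["status"].startswith("FINISHED")]                   # -> C
```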
src/submission/check_validity.py CHANGED
@@ -1,38 +1,24 @@
1
  import json
2
  import os
 
3
  import re
4
  import numpy as np
 
 
 
5
  from collections import defaultdict
6
  from datetime import datetime, timedelta, timezone
 
7
 
8
- import huggingface_hub
9
  from huggingface_hub import ModelCard
10
  from huggingface_hub.hf_api import ModelInfo
11
  from transformers import AutoConfig
12
  from transformers.models.auto.tokenization_auto import AutoTokenizer
13
 
14
  from src.display.utils import TEXT_TASKS, VISION_TASKS, NUM_EXPECTED_EXAMPLES
 
15
 
16
- def check_model_card(repo_id: str) -> tuple[bool, str]:
17
- """Checks if the model card and license exist and have been filled"""
18
- try:
19
- card = ModelCard.load(repo_id)
20
- except huggingface_hub.utils.EntryNotFoundError:
21
- return False, "Please add a model card to your model to explain how you trained/fine-tuned it."
22
-
23
- # Enforce license metadata
24
- if card.data.license is None:
25
- if not ("license_name" in card.data and "license_link" in card.data):
26
- return False, (
27
- "License not found. Please add a license to your model card using the `license` metadata or a"
28
- " `license_name`/`license_link` pair."
29
- )
30
-
31
- # Enforce card content
32
- if len(card.text) < 200:
33
- return False, "Please add a description to your model card, it is too short."
34
-
35
- return True, ""
36
 
37
  def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
38
  """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
@@ -73,10 +59,12 @@ def get_model_size(model_info: ModelInfo, precision: str):
73
  model_size = size_factor * model_size
74
  return model_size
75
 
 
76
  def get_model_arch(model_info: ModelInfo):
77
  """Gets the model architecture from the configuration"""
78
  return model_info.config.get("architectures", "Unknown")
79
 
 
80
  def already_submitted_models(requested_models_dir: str) -> set[str]:
81
  """Gather a list of already submitted models to avoid duplicates"""
82
  depth = 1
@@ -101,6 +89,7 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
101
 
102
  return set(file_names), users_to_submission_dates
103
 
 
104
  def is_valid_predictions(predictions: dict) -> tuple[bool, str]:
105
  out_msg = ""
106
  for task in TEXT_TASKS:
@@ -164,4 +153,238 @@ def is_valid_predictions(predictions: dict) -> tuple[bool, str]:
164
 
165
  if out_msg != "":
166
  return False, out_msg
167
- return True, "Upload successful."
1
  import json
2
  import os
3
+ import shutil
4
  import re
5
  import numpy as np
6
+ import pandas as pd
7
+ import gradio as gr
8
+ from urllib.parse import urlparse
9
  from collections import defaultdict
10
  from datetime import datetime, timedelta, timezone
11
+ from typing import Literal
12
 
13
+ from huggingface_hub import HfApi, HfFileSystem, hf_hub_url, get_hf_file_metadata
14
  from huggingface_hub import ModelCard
15
  from huggingface_hub.hf_api import ModelInfo
16
  from transformers import AutoConfig
17
  from transformers.models.auto.tokenization_auto import AutoTokenizer
18
 
19
  from src.display.utils import TEXT_TASKS, VISION_TASKS, NUM_EXPECTED_EXAMPLES
20
+ from src.envs import EVAL_REQUESTS_SUBGRAPH, EVAL_REQUESTS_CAUSALGRAPH
22
 
23
  def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
24
  """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
 
59
  model_size = size_factor * model_size
60
  return model_size
61
 
62
+
63
  def get_model_arch(model_info: ModelInfo):
64
  """Gets the model architecture from the configuration"""
65
  return model_info.config.get("architectures", "Unknown")
66
 
67
+
68
  def already_submitted_models(requested_models_dir: str) -> set[str]:
69
  """Gather a list of already submitted models to avoid duplicates"""
70
  depth = 1
 
89
 
90
  return set(file_names), users_to_submission_dates
91
 
92
+
93
  def is_valid_predictions(predictions: dict) -> tuple[bool, str]:
94
  out_msg = ""
95
  for task in TEXT_TASKS:
 
153
 
154
  if out_msg != "":
155
  return False, out_msg
156
+ return True, "Upload successful."
157
+
158
+
159
+ def _format_time(earliest_time):
160
+ time_left = (earliest_time.tz_convert("UTC") + timedelta(weeks=1)) - pd.Timestamp.utcnow()
161
+ hours = time_left.seconds // 3600
162
+ minutes, seconds = divmod(time_left.seconds % 3600, 60)
163
+ time_left_formatted = f"{hours:02}:{minutes:02}:{seconds:02}"
164
+ if time_left.days > 0:
165
+ time_left_formatted = f"{time_left.days} days, {time_left_formatted}"
166
+ return time_left_formatted
167
+
168
+
169
+ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
170
+ """Creates the different dataframes for the evaluation queues requests"""
171
+ entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
172
+ all_evals = []
173
+
174
+ for entry in entries:
175
+ if ".json" in entry:
176
+ file_path = os.path.join(save_path, entry)
177
+ with open(file_path) as fp:
178
+ data = json.load(fp)
179
+
180
+ # if "still_on_hub" in data and data["still_on_hub"]:
181
+ # data[EvalQueueColumn.model.name] = make_clickable_model(data["hf_repo"], data["model"])
182
+ # data[EvalQueueColumn.revision.name] = data.get("revision", "main")
183
+ # else:
184
+ # data[EvalQueueColumn.model.name] = data["model"]
185
+ # data[EvalQueueColumn.revision.name] = "N/A"
186
+
187
+ all_evals.append(data)
188
+
189
+ elif ".md" not in entry:
190
+ # this is a folder
191
+ sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(os.path.join(save_path, entry, e)) and not e.startswith(".")]
192
+ for sub_entry in sub_entries:
193
+ file_path = os.path.join(save_path, entry, sub_entry)
194
+ with open(file_path) as fp:
195
+ data = json.load(fp)
196
+ all_evals.append(data)
197
+
198
+ return pd.DataFrame(all_evals)
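
A hedged example of a queue entry this reader ingests; the field names are inferred from how entries are filtered elsewhere in this commit, and the values are illustrative:

```python
example_entry = {
    "method_name": "my-method",
    "user_name": "someuser",
    "contact_email": "someone@example.com",
    "submit_time": "2025-01-01T12:00:00+00:00",
    "status": "PREVALIDATION",
    "_id": "abc123XYZ",
}
```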
199
+
200
+ def check_rate_limit(track, user_name, contact_email):
201
+ if "Circuit" in track:
202
+ save_path = EVAL_REQUESTS_SUBGRAPH
203
+ else:
204
+ save_path = EVAL_REQUESTS_CAUSALGRAPH
205
+
206
+ evaluation_queue = get_evaluation_queue_df(save_path, ["user_name", "contact_email"])
207
+
208
+ if evaluation_queue.empty:
209
+ return True, None
210
+
211
+ one_week_ago = pd.Timestamp.utcnow() - timedelta(weeks=1)
212
+
213
+ user_name_occurrences = evaluation_queue[evaluation_queue["user_name"] == user_name]
214
+ user_name_occurrences["submit_time"] = pd.to_datetime(user_name_occurrences["submit_time"], utc=True)
215
+ user_name_occurrences = user_name_occurrences[user_name_occurrences["submit_time"] >= one_week_ago]
216
+ email_occurrences = evaluation_queue[evaluation_queue["contact_email"] == contact_email.lower()]
217
+ email_occurrences["submit_time"] = pd.to_datetime(email_occurrences["submit_time"], utc=True)
218
+ email_occurrences = email_occurrences[email_occurrences["submit_time"] >= one_week_ago]
219
+ if user_name_occurrences.shape[0] >= 2:
220
+ earliest_time = user_name_occurrences["submit_time"].min()
221
+ time_left_formatted = _format_time(earliest_time)
222
+ return False, time_left_formatted
223
+ if email_occurrences.shape[0] >= 2:
224
+ earliest_time = email_occurrences["submit_time"].min()
225
+ time_left_formatted = _format_time(earliest_time)
226
+ return False, time_left_formatted
227
+
228
+ return True, None
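
Hedged usage sketch of the rate-limit check above (argument values are illustrative):

```python
under_limit, wait_time = check_rate_limit(
    "Circuit Localization", user_name="someuser", contact_email="someone@example.com"
)
if not under_limit:
    print(f"Rate limit exceeded; try again in {wait_time}")
```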
229
+
230
+ def parse_huggingface_url(url: str):
231
+ """
232
+ Extracts repo_id and subfolder path from a Hugging Face URL.
233
+ Returns (repo_id, folder_path).
234
+ """
235
+ # Handle cases where the input is already a repo_id (no URL)
236
+ if not url.startswith(("http://", "https://")):
237
+ return url, None
238
+
239
+ parsed = urlparse(url)
240
+ path_parts = parsed.path.strip("/").split("/")
241
+
242
+ # Extract repo_id (username/repo_name)
243
+ if len(path_parts) < 2:
244
+ raise ValueError("Invalid Hugging Face URL: Could not extract repo_id.")
245
+ repo_id = f"{path_parts[0]}/{path_parts[1]}"
246
+
247
+ # Extract folder path (if in /tree/ or /blob/)
248
+ if "tree" in path_parts or "blob" in path_parts:
249
+ try:
250
+ branch_idx = path_parts.index("tree") if "tree" in path_parts else path_parts.index("blob")
251
+ folder_path = "/".join(path_parts[branch_idx + 2:]) # Skip "tree/main" or "blob/main"
252
+ except (ValueError, IndexError):
253
+ folder_path = None
254
+ else:
255
+ folder_path = None
256
+
257
+ return repo_id, folder_path
258
+
259
+
260
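For clarity, the intended behaviour of `parse_huggingface_url` on a few representative inputs (the repo and folder names below are made up):

```python
# Hypothetical inputs -> expected outputs of parse_huggingface_url:
parse_huggingface_url("someuser/my-circuits")
# -> ("someuser/my-circuits", None)          # bare repo_id passes through
parse_huggingface_url("https://huggingface.co/someuser/my-circuits")
# -> ("someuser/my-circuits", None)          # no /tree/ or /blob/ segment
parse_huggingface_url("https://huggingface.co/someuser/my-circuits/tree/main/ioi_gpt2")
# -> ("someuser/my-circuits", "ioi_gpt2")    # folder path after the branch name
```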
+ def validate_directory(fs: HfFileSystem, repo_id: str, dirname: str, curr_tm: str, circuit_level: Literal['edge', 'node', 'neuron'] = 'edge'):
261
+ errors = []
262
+ warnings = []
263
+
264
+ task, model = curr_tm.split("_")
265
+ curr_tm_display = curr_tm.replace("_", "/")
266
+
267
+ files = fs.ls(dirname)
268
+
269
+ # Detect whether multi-circuit or importances
270
+ is_multiple_circuits = False
271
+ files = [f["name"] for f in files if (f["name"].endswith(".json") or f["name"].endswith(".pt"))]
272
+ if len(files) == 1:
273
+ is_multiple_circuits = False
274
+ elif len(files) > 1:
275
+ is_multiple_circuits = True
276
+ if len(files) < 9:
277
+ errors.append(f"Folder for {curr_tm_display} contains multiple circuits, but not enough. If you intended to submit importances, include only one circuit in the folder. Otherwise, please add the rest of the circuits.")
278
+ else:
279
+ warnings.append(f"Directory present for {curr_tm_display} but is empty")
280
+
281
+ offset = 0
282
+ for idx, file in enumerate(files):
283
+ file_suffix = file.split(repo_id + "/")[1]
284
+ file_url = hf_hub_url(repo_id=repo_id, filename=file_suffix)
285
+ file_info = get_hf_file_metadata(file_url)
286
+ file_size_mb = file_info.size / (1024 * 1024)
287
+ if file_size_mb > 150:
288
+ warnings.append(f"Will skip file >150MB: {file}")
289
+ offset -= 1
290
+ continue
291
+
292
+ if is_multiple_circuits and idx + offset >= 9:
293
+ break
294
+
295
+ return errors, warnings
296
+
297
+
298
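A sketch of validating a single task/model folder with the helper above; the repo and directory names are hypothetical:

```python
# Illustrative only: check one hypothetical task/model folder for valid circuits.
fs = HfFileSystem()
errors, warnings = validate_directory(
    fs,
    repo_id="someuser/my-circuits",            # hypothetical submission repo
    dirname="someuser/my-circuits/ioi_gpt2",   # hypothetical folder of .json/.pt circuits
    curr_tm="ioi_gpt2",
    circuit_level="edge",
)
print("errors:", errors)
print("warnings:", warnings)
```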
+ def verify_circuit_submission(hf_repo, level, progress=gr.Progress()):
299
+ VALID_COMBINATIONS = [
300
+ "ioi_gpt2", "ioi_qwen2.5", "ioi_gemma2", "ioi_llama3", "ioi_interpbench",
301
+ "mcqa_qwen2.5", "mcqa_gemma2", "mcqa_llama3",
302
+ "arithmetic-addition_llama3", "arithmetic-subtraction_llama3",
303
+ "arc-easy_gemma2", "arc-easy_llama3",
304
+ "arc-challenge_llama3"
305
+ ]
306
+
307
+ TASKS = ["ioi", "mcqa", "arithmetic-addition", "arithmetic-subtraction", "arc-easy", "arc-challenge"]
308
+ MODELS = ["gpt2", "qwen2.5", "gemma2", "llama3", "interpbench"]
309
+
310
+ errors = []
311
+ warnings = []
312
+
313
+ directories_present = {tm: False for tm in VALID_COMBINATIONS}
314
+ directories_valid = {tm: False for tm in VALID_COMBINATIONS}
315
+
316
+ fs = HfFileSystem()
317
+
318
+ path = hf_repo
319
+ level = level
320
+
321
+ folder_path = path.split("huggingface.co/")[1]
322
+ repo_id = "/".join(folder_path.split("/")[:2])
323
+ try:
324
+ files = fs.listdir(folder_path)
325
+ except Exception as e:
326
+ errors.append(f"Could not open Huggingface URL: {e}")
327
+ return errors, warnings
328
+
329
+ file_counts = 0
330
+ for dirname in progress.tqdm(files, desc="Validating directories in repo"):
331
+ file_counts += 1
332
+ if file_counts >= 30:
333
+ warnings.append("Folder contains many files/directories; stopped at 30.")
334
+ break
335
+ circuit_dir = dirname["name"]
336
+ dirname_proc = circuit_dir.lower().split("/")[-1]
337
+ if not fs.isdir(circuit_dir):
338
+ continue
339
+ curr_task = None
340
+ curr_model = None
341
+ # Look for task names in filename
342
+ for task in TASKS:
343
+ if dirname_proc.startswith(task) or f"_{task}" in dirname_proc:
344
+ curr_task = task
345
+ # Look for model names in filename
346
+ for model in MODELS:
347
+ if dirname_proc.startswith(model) or f"_{model}" in dirname_proc:
348
+ curr_model = model
349
+ if curr_task is not None and curr_model is not None:
350
+ curr_tm = f"{curr_task}_{curr_model}"
351
+ if curr_tm in VALID_COMBINATIONS:
352
+ directories_present[curr_tm] = True
353
+ else:
354
+ continue
355
+ else:
356
+ continue
357
+
358
+ # Parse circuits directory
359
+ print(f"validating {circuit_dir}")
360
+ vd_errors, vd_warnings = validate_directory(fs, repo_id, circuit_dir, curr_tm, level)
361
+ errors.extend(vd_errors)
362
+ warnings.extend(vd_warnings)
363
+
364
+ if len(vd_errors) == 0:
365
+ directories_valid[curr_tm] = True
366
+
367
+ task_set, model_set = set(), set()
368
+ for tm in directories_present:
369
+ if not directories_present[tm]:
370
+ continue
371
+ if not directories_valid[tm]:
372
+ warnings.append(f"Directory found for {tm.replace('_', '/')}, but circuits not valid or present")
373
+ continue
374
+ task, model = tm.split("_")
375
+ task_set.add(task)
376
+ model_set.add(model)
377
+ if len(task_set) < 2:
378
+ errors.append("At least 2 tasks are required")
379
+ if len(model_set) < 2:
380
+ errors.append("At least 2 models are required")
381
+
382
+ no_tm_display = [tm.replace("_", "/") for tm in directories_valid if not directories_valid[tm]]
383
+ if len(no_tm_display) > 0:
384
+ warnings.append(f"No valid circuits or importance scores found for the following tasks/models: {*no_tm_display,}")
385
+
386
+ return errors, warnings
387
+
388
+
389
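A hedged sketch of a Gradio handler that runs the circuit pre-validation above and formats its output; the handler name and message format are illustrative, not the app's actual wiring:

```python
# Illustrative handler only: run pre-validation and present errors/warnings to the user.
# Assumes it is attached to a button in the submission tab so that gr.Progress() is tracked.
def prevalidate_circuits(hf_repo_circ, level, progress=gr.Progress()):
    errors, warnings = verify_circuit_submission(hf_repo_circ, level, progress=progress)
    lines = [f"❌ {e}" for e in errors] + [f"⚠️ {w}" for w in warnings]
    if not errors:
        lines.append("✅ Pre-validation passed; the submission can be queued.")
    return gr.Textbox("\n".join(lines), visible=True)
```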
+ def verify_causal_variable_submission(hf_repo, layer, position, code_upload):
390
+ return
src/submission/submit.py CHANGED
@@ -1,20 +1,96 @@
1
  import json
2
  import os
 
3
  from datetime import datetime, timezone
4
 
5
  from src.display.formatting import styled_error, styled_message, styled_warning
6
- from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
7
  from src.submission.check_validity import (
8
  already_submitted_models,
9
- check_model_card,
10
  get_model_size,
11
  is_model_on_hub,
12
  is_valid_predictions,
 
13
  )
 
14
 
15
  REQUESTED_MODELS = None
16
  USERS_TO_SUBMISSION_DATES = None
17
 
18
  def add_new_eval(
19
  model_name: str,
20
  model_id: str,
@@ -83,7 +159,7 @@ def add_new_eval(
83
  return styled_error("A model with this name has been already submitted.")
84
 
85
  print("Creating eval file")
86
- OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
87
  os.makedirs(OUT_DIR, exist_ok=True)
88
  out_path = f"{OUT_DIR}/{model_path}_{revision}_eval_request_False_{track}.json"
89
 
@@ -109,3 +185,38 @@ def add_new_eval(
109
  return styled_message(
110
  "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the request to show in the PENDING list."
111
  )
1
  import json
2
  import os
3
+ import smtplib
4
  from datetime import datetime, timezone
5
 
6
  from src.display.formatting import styled_error, styled_message, styled_warning
7
+ from src.envs import API, EVAL_REQUESTS_SUBGRAPH, EVAL_REQUESTS_CAUSALGRAPH, TOKEN, QUEUE_REPO_SUBGRAPH, QUEUE_REPO_CAUSALGRAPH
8
  from src.submission.check_validity import (
9
  already_submitted_models,
 
10
  get_model_size,
11
  is_model_on_hub,
12
  is_valid_predictions,
13
+ parse_huggingface_url
14
  )
15
+ import gradio as gr
16
 
17
  REQUESTED_MODELS = None
18
  USERS_TO_SUBMISSION_DATES = None
19
 
20
+ def upload_to_queue(track, hf_repo_circ, hf_repo_cg, level, layer, token_position, code_upload, method_name, contact_email, _id):
21
+ errors = []
22
+ hf_repo = hf_repo_circ if "Circuit" in track else hf_repo_cg
23
+ try:
24
+ repo_info = hf_repo.split("huggingface.co/")[1]
25
+ user_name, repo_name = repo_info.split("/")[:2]
26
+ except Exception as e:
27
+ errors.append("Error processing HF URL: could not get username and repo name")
28
+ try:
29
+ commit_hash = API.list_repo_commits("/".join([user_name, repo_name]))[0].commit_id
30
+ except Exception as e:
31
+ errors.append("Could not get commit hash of provided Huggingface repo")
32
+ current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
33
+
34
+ if not errors:
35
+ if "Circuit" in track:
36
+ eval_entry = {
37
+ "hf_repo": hf_repo,
38
+ "user_name": user_name,
39
+ "revision": commit_hash,
40
+ "circuit_level": level.lower(),
41
+ "method_name": method_name,
42
+ "contact_email": contact_email.lower(),
43
+ "submit_time": current_time,
44
+ "status": "PREVALIDATION",
45
+ "_id": _id
46
+ }
47
+ QUEUE_REPO = QUEUE_REPO_SUBGRAPH
48
+ EVAL_REQUESTS = EVAL_REQUESTS_SUBGRAPH
49
+ else:
50
+ eval_entry = {
51
+ "hf_repo": hf_repo,
52
+ "user_name": user_name,
53
+ "revision": commit_hash,
54
+ "layer": layer,
55
+ "token_position": token_position,
56
+ "code_upload": code_upload,
57
+ "method_name": method_name,
58
+ "contact_email": contact_email.lower(),
59
+ "submit_time": current_time,
60
+ "status": "PREVALIDATION",
61
+ "_id": _id
62
+ }
63
+ QUEUE_REPO = QUEUE_REPO_CAUSALGRAPH
64
+ EVAL_REQUESTS = EVAL_REQUESTS_CAUSALGRAPH
65
+
66
+
67
+ OUT_DIR = f"{EVAL_REQUESTS}/"
68
+ os.makedirs(OUT_DIR, exist_ok=True)
69
+ out_path = f"{OUT_DIR}/{method_name}_{_id}_{current_time}.json"
70
+ with open(out_path, 'w') as f:
71
+ f.write(json.dumps(eval_entry))
72
+
73
+ try:
74
+ API.upload_file(
75
+ path_or_fileobj=out_path,
76
+ path_in_repo=out_path.split("/")[-1],
77
+ repo_id=QUEUE_REPO,
78
+ repo_type="dataset",
79
+ commit_message=f"Add {method_name}_{_id}_{current_time}.json to eval queue"
80
+ )
81
+ except Exception as e:
82
+ errors.append(f"Could not upload entry to eval queue: {e}")
83
+
84
+ if errors:
85
+ status = gr.Textbox("\n".join(f"❌ {e}" for e in errors), visible=True)
86
+ else:
87
+ status = gr.Textbox(f"✅ Submission received! Your ID is \"{_id}\". You'll receive an email once we've validated your submission.", visible=True)
88
+ return [
89
+ status,
90
+ None, None,
91
+ gr.Column(visible=False)
92
+ ]
93
+
94
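For context, a hedged sketch of how `upload_to_queue` could be wired to a submit button in `app.py`; every component variable below is a placeholder, and the output list simply mirrors the four values the function returns:

```python
# Illustrative wiring only; the Gradio components are placeholders, not the app's real ones.
submit_button.click(
    upload_to_queue,
    inputs=[track_choice, hf_repo_circ_box, hf_repo_cg_box, level_dropdown,
            layer_box, token_position_box, code_upload_file,
            method_name_box, contact_email_box, submission_id_state],
    outputs=[status_box, hf_repo_circ_box, hf_repo_cg_box, confirm_column],
)
```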
  def add_new_eval(
95
  model_name: str,
96
  model_id: str,
 
159
  return styled_error("A model with this name has been already submitted.")
160
 
161
  print("Creating eval file")
162
+ OUT_DIR = f"{EVAL_REQUESTS}/{user_name}"
163
  os.makedirs(OUT_DIR, exist_ok=True)
164
  out_path = f"{OUT_DIR}/{model_path}_{revision}_eval_request_False_{track}.json"
165
 
 
185
  return styled_message(
186
  "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the request to show in the PENDING list."
187
  )
188
+
189
+ def remove_submission(track: str, method_name: str, _id: str):
190
+ if track is None:
191
+ return gr.Textbox(f"Please select a track.", visible=True)
192
+ if "Circuit" in track:
193
+ QUEUE_REPO = QUEUE_REPO_SUBGRAPH
194
+ EVAL_REQUESTS = EVAL_REQUESTS_SUBGRAPH
195
+ else:
196
+ QUEUE_REPO = QUEUE_REPO_CAUSALGRAPH
197
+ EVAL_REQUESTS = EVAL_REQUESTS_CAUSALGRAPH
198
+
199
+ OUT_DIR = f"{EVAL_REQUESTS}/"
200
+ os.makedirs(OUT_DIR, exist_ok=True)
201
+ files = os.listdir(OUT_DIR)
202
+ out_paths = [f for f in files if f.startswith(f"{method_name}_{_id}")]
203
+ if out_paths:
204
+ filename = out_paths[0]
205
+ filepath = os.path.join(OUT_DIR, filename)
206
+ with open(filepath, 'r') as f:
207
+ data = json.load(f)
208
+ hf_repo = data["hf_repo"]
209
+ try:
210
+ API.delete_file(
211
+ path_in_repo=filename,
212
+ repo_id=QUEUE_REPO,
213
+ repo_type="dataset"
214
+ )
215
+ except Exception as e:
216
+ return gr.Textbox(f"Could not delete entry from eval queue: {e}", visible=True)
217
+ os.remove(filepath)
218
+ status = "Submission removed from queue."
219
+ else:
220
+ status = "Submission not found in queue."
221
+
222
+ return gr.Textbox(status, visible=True)
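Finally, a hedged example of calling the removal helper above directly; the track label, method name, and ID are placeholders:

```python
# Illustrative only: remove a previously queued submission by method name and ID.
status_box = remove_submission("Circuit localization (placeholder)", "MyMethod", "abc123")
```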