Commit 3c343e0 — Aaron Mueller committed (1 parent: c57af6c)
leaderboard update
Files changed:
- app.py +152 -116
- src/about.py +19 -42
- src/display/utils.py +8 -8
- src/envs.py +5 -5
- src/leaderboard/read_evals.py +3 -3
- src/populate.py +24 -49
- src/submission/check_validity.py +245 -22
- src/submission/submit.py +114 -3
app.py
CHANGED
@@ -1,5 +1,8 @@
 import json
 import gzip
+import os
+import shutil
+import secrets
 import gradio as gr
 from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
@@ -21,8 +24,6 @@ from src.about import (
 )
 from src.display.css_html_js import custom_css
 from src.display.utils import (
-    BENCHMARK_COLS,
-    BENCHMARK_COLS_MULTIMODAL,
     BENCHMARK_COLS_MIB_SUBGRAPH,
     COLS,
     COLS_MIB_SUBGRAPH,
@@ -34,10 +35,10 @@ from src.display.utils import (
     AutoEvalColumn_mib_causalgraph,
     fields,
 )
-from src.envs import API,
-from src.populate import get_evaluation_queue_df,
-from src.submission.submit import
+from src.envs import API, EVAL_REQUESTS_SUBGRAPH, EVAL_REQUESTS_CAUSALGRAPH, QUEUE_REPO_SUBGRAPH, QUEUE_REPO_CAUSALGRAPH, REPO_ID, TOKEN, RESULTS_REPO_MIB_SUBGRAPH, EVAL_RESULTS_MIB_SUBGRAPH_PATH, RESULTS_REPO_MIB_CAUSALGRAPH, EVAL_RESULTS_MIB_CAUSALGRAPH_PATH
+from src.populate import get_evaluation_queue_df, get_leaderboard_df_mib_subgraph, get_leaderboard_df_mib_causalgraph
+from src.submission.submit import upload_to_queue, remove_submission
+from src.submission.check_validity import verify_circuit_submission, verify_causal_variable_submission, check_rate_limit

 from src.about import TasksMib_Subgraph, TasksMib_Causalgraph
@@ -244,27 +245,35 @@ def restart_space():
-### Space initialisation
+### Space initialisation - refresh caches
 try:
+    if os.path.exists(EVAL_REQUESTS_SUBGRAPH):
+        shutil.rmtree(EVAL_REQUESTS_SUBGRAPH)
     snapshot_download(
-        repo_id=
+        repo_id=QUEUE_REPO_SUBGRAPH, local_dir=EVAL_REQUESTS_SUBGRAPH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
 except Exception:
     restart_space()
+try:
+    if os.path.exists(EVAL_REQUESTS_CAUSALGRAPH):
+        shutil.rmtree(EVAL_REQUESTS_CAUSALGRAPH)
+    snapshot_download(
+        repo_id=QUEUE_REPO_CAUSALGRAPH, local_dir=EVAL_REQUESTS_CAUSALGRAPH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+    )
+except Exception:
+    restart_space()
 try:
+    if os.path.exists(EVAL_RESULTS_MIB_SUBGRAPH_PATH):
+        shutil.rmtree(EVAL_RESULTS_MIB_SUBGRAPH_PATH)
     snapshot_download(
         repo_id=RESULTS_REPO_MIB_SUBGRAPH, local_dir=EVAL_RESULTS_MIB_SUBGRAPH_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
 except Exception:
     restart_space()
 try:
+    if os.path.exists(EVAL_RESULTS_MIB_CAUSALGRAPH_PATH):
+        shutil.rmtree(EVAL_RESULTS_MIB_CAUSALGRAPH_PATH)
     snapshot_download(
         repo_id=RESULTS_REPO_MIB_CAUSALGRAPH, local_dir=EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
@@ -277,26 +286,25 @@ def _sigmoid(x):
     except:
         return "-"

-LEADERBOARD_DF_MIB_SUBGRAPH_FPL = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH,
-LEADERBOARD_DF_MIB_SUBGRAPH_FEQ = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH,
+LEADERBOARD_DF_MIB_SUBGRAPH_FPL = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, COLS_MIB_SUBGRAPH, BENCHMARK_COLS_MIB_SUBGRAPH)
+LEADERBOARD_DF_MIB_SUBGRAPH_FEQ = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, COLS_MIB_SUBGRAPH, BENCHMARK_COLS_MIB_SUBGRAPH,
                                                                   metric_type="F=")

 # LEADERBOARD_DF_MIB_CAUSALGRAPH = get_leaderboard_df_mib_causalgraph(EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_CAUSALGRAPH, BENCHMARK_COLS_MIB_CAUSALGRAPH)
 # In app.py, modify the LEADERBOARD initialization
 LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AGGREGATED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AVERAGED = get_leaderboard_df_mib_causalgraph(
-    EVAL_RESULTS_MIB_CAUSALGRAPH_PATH
-    EVAL_REQUESTS_PATH
+    EVAL_RESULTS_MIB_CAUSALGRAPH_PATH
 )

 # LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 # LEADERBOARD_DF_MULTIMODAL = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS_MULTIMODAL, BENCHMARK_COLS_MULTIMODAL)

-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+# (
+#     finished_eval_queue_df,
+#     running_eval_queue_df,
+#     pending_eval_queue_df,
+# ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
@@ -392,10 +400,6 @@ def init_leaderboard_mib_subgraph(dataframe, track):
 def init_leaderboard_mib_causalgraph(dataframe, track):
-    # print("Debugging column issues:")
-    # print("\nActual DataFrame columns:")
-    # print(dataframe.columns.tolist())
-
     model_name_mapping = {
         "Qwen2ForCausalLM": "Qwen-2.5",
         "GPT2ForCausalLM": "GPT-2",
@@ -419,18 +423,7 @@ def init_leaderboard_mib_causalgraph(dataframe, track):
         display_name = f"{benchmark_mapping[task.value.col_name]} - {model_name_mapping[model]}"
         display_mapping[field_name] = display_name

-    # print(dataframe)
     renamed_df = dataframe.rename(columns=display_mapping)
-
-    # idx_to_method = {0: "Full Vector", 1: "DAS", 2: "DBM", 3: "PCA", 4: "SAE"}
-    # idx_to_scores = {0: [0.38, 0.36, 0.38, 0.42],
-    #                  1: [0.56, 0.62, 0.54, 0.51],
-    #                  2: [0.43, 0.41, 0.53, 0.49],
-    #                  3: [0.26, 0.20, 0.32, 0.40],
-    #                  4: ["-", "-", 0.33, "-"]}
-    # renamed_df.loc[0]["Method"] = "Full Vector"
-    # for i in range(5):
-    #     renamed_df.loc[i] = [idx_to_method[i]] + idx_to_scores[i]

     print(renamed_df)
@@ -438,11 +431,6 @@ def init_leaderboard_mib_causalgraph(dataframe, track):
     return Leaderboard(
         value=renamed_df,
         datatype=[c.type for c in fields(AutoEvalColumn_mib_causalgraph)],
-        # select_columns=SelectColumns(
-        #     default_selection=["Method"],  # Start with just Method column
-        #     cant_deselect=["Method"],  # Method column should always be visible
-        #     label="Select Columns to Display:",
-        # ),
         search_columns=["Method"],
         hide_columns=["eval_name"],
         bool_checkboxgroup_label="Hide models",
@@ -455,8 +443,6 @@ def init_leaderboard(dataframe, track):
         raise ValueError("Leaderboard DataFrame is empty or None.")
     # filter for correct track
     dataframe = dataframe.loc[dataframe["Track"] == track]
-
-    # print(f"\n\n\n dataframe is {dataframe}\n\n\n")

     return Leaderboard(
         value=dataframe,
@@ -577,17 +563,6 @@ def update_leaderboard(dataframe: pd.DataFrame, selected_task_substrings: List[s
     filtered_dataframe.loc[:, "Score"] = np.where(filtered_dataframe.eq("-").any(axis=1), "-", s_means.round(2))
     filtered_dataframe = filtered_dataframe.sort_values(by=["Average"], ascending=False, na_position='last')

-    # if show_average:
-    #     print([row for index, row in filtered_dataframe.iterrows()])
-    #     filtered_dataframe["Average"] = [round(np.mean(row.values()), 2) if "-" not in row.values() else "-" for index, row in filtered_dataframe.iterrows()]
-    #     # Sort by Average score descending
-    #     if 'Average' in dataframe.columns:
-    #         # Convert '-' to NaN for sorting purposes
-    #         df['Average'] = pd.to_numeric(['Average'], errors='coerce')
-    #         df = df.sort_values(by=['Average'], ascending=True, na_position='last')
-    #         # Convert NaN back to '-'
-    #         df['Average'] = df['Average'].fillna('-')
-
     return filtered_dataframe

 def process_url(url):
@@ -600,18 +575,6 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        # with gr.TabItem("Strict", elem_id="strict-benchmark-tab-table", id=0):
-        #     leaderboard = init_leaderboard(LEADERBOARD_DF, "strict")
-        # with gr.TabItem("Strict-small", elem_id="strict-small-benchmark-tab-table", id=1):
-        #     leaderboard = init_leaderboard(LEADERBOARD_DF, "strict-small")
-        # with gr.TabItem("Multimodal", elem_id="multimodal-benchmark-tab-table", id=2):
-        #     leaderboard = init_leaderboard(LEADERBOARD_DF_MULTIMODAL, "multimodal")
-
-        # with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=4):
-        #     gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-        # with gr.TabItem("Subgraph", elem_id="subgraph", id=0):
-        #     leaderboard = init_leaderboard_mib_subgraph(LEADERBOARD_DF_MIB_SUBGRAPH, "Subgraph")
         with gr.TabItem("Circuit Localization", elem_id="subgraph", id=0):
             with gr.Tabs() as subgraph_tabs:
                 with gr.TabItem("F+", id=0):
@@ -622,11 +585,6 @@ with demo:
                     You can combine filters to see specific task-model combinations.
                     """)
                     # CheckboxGroup for selecting substrings
-                    # substring_checkbox = gr.CheckboxGroup(
-                    #     choices=PRESET_SUBSTRINGS,
-                    #     label="Filter results:",
-                    #     value=PRESET_SUBSTRINGS,  # Default to all substrings selected
-                    # )
                     task_substring_checkbox = gr.CheckboxGroup(
                         choices=TASK_SUBSTRINGS,
                         label="View tasks:",
@@ -660,11 +618,6 @@ with demo:
                    You can combine filters to see specific task-model combinations.
                    """)
                    # CheckboxGroup for selecting substrings
-                    # substring_checkbox = gr.CheckboxGroup(
-                    #     choices=PRESET_SUBSTRINGS,
-                    #     label="Filter results:",
-                    #     value=PRESET_SUBSTRINGS,  # Default to all substrings selected
-                    # )
                    task_substring_checkbox = gr.CheckboxGroup(
                        choices=TASK_SUBSTRINGS,
                        label="View tasks:",
@@ -705,11 +658,6 @@ with demo:
                    Use the dropdown menus below to filter results by specific tasks or models.
                    You can combine filters to see specific task-model combinations.
                    """)
-                    # substring_checkbox = gr.CheckboxGroup(
-                    #     choices=PRESET_SUBSTRINGS,
-                    #     label="Filter results:",
-                    #     value=PRESET_SUBSTRINGS,  # Default to all substrings selected
-                    # )
                    task_substring_checkbox = gr.CheckboxGroup(
                        choices=TASK_SUBSTRINGS,
                        label="View tasks:",
@@ -757,11 +705,24 @@ with demo:

        with gr.Group(visible=False) as circuit_ui:
            gr.Markdown("### Circuit Localization Requirements")
+            with gr.Row():
+                hf_repo_circ = gr.Textbox(
+                    label="HuggingFace Repository URL",
+                    placeholder="https://huggingface.co/username/repo/path",
+                    info="Must be a valid HuggingFace URL pointing to folders containing either 1 importance score file per task/model, or " \
+                         "9 circuit files per task/model (.json or .pt). " \
+                         "Remove 'tree', 'resolve', and the branch name (e.g., '/tree/main/') from URL if present."
+                )
+                level = gr.Radio(
+                    choices=[
+                        "Edge",
+                        "Node (submodule)",
+                        "Node (neuron)"
+                    ],
+                    label="Level of granularity",
+                    info="Is your circuit defined by its inclusion/exclusion of certain edges (e.g., MLP1 to H10L12), of certain submodules (e.g., MLP1), or of neurons " \
+                         "within those submodules (e.g., MLP1 neuron 295)?"
+                )

        with gr.Group(visible=False) as causal_ui:
            gr.Markdown("### Causal Variable Localization Requirements")
@@ -778,15 +739,22 @@ with demo:
                    minimum=0,
                    info="Integer specifying token position"
                )
+            with gr.Row():
+                hf_repo_cg = gr.Textbox(
+                    label="HuggingFace Repository URL",
+                    placeholder="https://huggingface.co/username/repo/path",
+                    info="Must be a valid HuggingFace URL pointing to a file containing the trained featurizer (.pt). " \
+                         "Remove 'tree', 'resolve', and the branch name (e.g., '/tree/main/') from URL if present."
+                )
+                code_upload = gr.File(
+                    label="Upload Python file implementing your featurization function",
+                    file_types=[".py"],
+                )

        # Common fields
        with gr.Group():
-            gr.Markdown("###
+            gr.Markdown("### Submission Information")
+            method_name = gr.Textbox(label="Method Name")
            contact_email = gr.Textbox(label="Contact Email")

        # Dynamic UI logic
@@ -801,47 +769,115 @@ with demo:
        track.change(toggle_ui, track, [circuit_ui, causal_ui])

        # Submission handling
-        status = gr.Textbox(label="Submission Status", visible=
+        status = gr.Textbox(label="Submission Status", visible=True)

-        def handle_submission(track,
+        def handle_submission(track, hf_repo_circ, hf_repo_cg, level, layer, token_position, code_upload, method_name, contact_email):
            errors = []
+            warnings = []
+            breaking_error = False
+
+            hf_repo = hf_repo_circ if "Circuit" in track else hf_repo_cg

            # Validate common fields
-            if not
-                errors.append("
+            if not method_name.strip():
+                errors.append("Method name is required")
            if "@" not in contact_email or "." not in contact_email:
                errors.append("Valid email address is required")
+            if not level:
+                errors.append("Level of granularity is required")

+            if not hf_repo.startswith("https://huggingface.co/") and not hf_repo.startswith("http://huggingface.co/"):
+                errors.append(f"Invalid HuggingFace URL - must start with https://huggingface.co/")
+                breaking_error = True
            else:
+                repo_info = hf_repo.split("huggingface.co/")[1]
+                if len(repo_info.split("/")) < 2:
+                    errors.append("Could not read username or repo name from HF URL")
+                    breaking_error = True
+                else:
+                    user_name, repo_name = repo_info.split("/")[:2]
+                    under_rate_limit, time_left = check_rate_limit(track, user_name, contact_email)
+                    if not under_rate_limit:
+                        errors.append(f"Rate limit exceeded (max 2 submissions per week). Please try again in {time_left}. " \
+                                      "(If you're trying again after a failed validation, either remove the previous entry below or try again in about 30 minutes.")
+                        breaking_error = True
+
+            # Track-specific validation
+            if "Circuit" in track and not breaking_error:
+                submission_errors, submission_warnings = verify_circuit_submission(hf_repo, level)
+            elif not breaking_error:
                if not (isinstance(layer, int) and isinstance(token_position, int)):
                    errors.append("Layer and token position must be integers")
                if not code_upload:
                    errors.append("Code file upload is required")
+                submission_errors, submission_warnings = verify_causal_variable_submission(hf_repo, layer, token_position, code_upload)

-            if
+            if not breaking_error:
+                errors.extend(submission_errors)
+                warnings.extend(submission_warnings)
+            _id = secrets.token_urlsafe(12)
+
+            if errors:
+                return [
+                    gr.Textbox("\n".join(f"❌ {e}" for e in errors), visible=True),
+                    None, None,
+                    gr.Column(visible=False),
+                ]
+            elif warnings:
+                return [
+                    gr.Textbox("Warnings:", visible=True),
+                    gr.Markdown("\n".join(f"• {w}" for w in warnings)),
+                    (track, hf_repo_circ, hf_repo_cg, level, layer, token_position, code_upload, method_name, contact_email, _id),
+                    gr.Column(visible=True)
+                ]
+            else:
+                return upload_to_queue(track, hf_repo_circ, hf_repo_cg, level, layer, token_position, code_upload, method_name, contact_email, _id)
+
+        # New warning confirmation dialog
+        warning_modal = gr.Column(visible=False, variant="panel")
+        with warning_modal:
+            gr.Markdown("### ⚠️ Submission Warnings")
+            warning_display = gr.Markdown()
+            proceed_btn = gr.Button("Proceed Anyway", variant="primary")
+            cancel_btn = gr.Button("Cancel Submission", variant="secondary")
+
+        # Store submission data between callbacks
+        pending_submission = gr.State()

        submit_btn = gr.Button("Submit Entry", variant="primary")
        submit_btn.click(
            handle_submission,
-            inputs=[track,
-            outputs=status
+            inputs=[track, hf_repo_circ, hf_repo_cg, level, layer, token_position, code_upload, method_name, contact_email],
+            outputs=[status, warning_display, pending_submission, warning_modal]
        )
+
+        proceed_btn.click(
+            lambda x: upload_to_queue(*x),
+            inputs=pending_submission,
+            outputs=[status, warning_display, pending_submission, warning_modal]
+        )
+
+        cancel_btn.click(
+            lambda: [gr.Textbox("Submission canceled.", visible=True), None, None, gr.Column(visible=False)],
+            outputs=[status, warning_display, pending_submission, warning_modal]
+        )
+
+        with gr.Group():
+            gr.Markdown("### Remove Submission from Queue")
+            with gr.Row():
+                name_r = gr.Textbox(label="Method Name")
+                _id_r = gr.Textbox(label="Submission ID")
+
+            status_r = gr.Textbox(label="Removal Status", visible=False)
+            remove_button = gr.Button("Remove Entry")
+            remove_button.click(
+                remove_submission,
+                inputs=[track, name_r, _id_r],
+                outputs=[status_r]
+            )

        # Add info about rate limits
        gr.Markdown("""
@@ -864,4 +900,4 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.launch(share=True, ssr_mode=False)
+demo.queue(default_concurrency_limit=40).launch(share=True, ssr_mode=False)
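The new submission flow above chains three callbacks: handle_submission validates the form, and either writes errors to the status box, reveals a warning panel while stashing the form values in a gr.State, or calls upload_to_queue directly; "Proceed Anyway" then replays the stashed payload. The following is a minimal, self-contained sketch of that same pattern, not part of the commit — all component and function names here are illustrative stand-ins.

import gradio as gr

def _validate(text):
    # Stand-in for handle_submission: returns status text, warning text,
    # a payload for gr.State, and whether the warning panel should show.
    if not text.strip():
        return "❌ Submission is empty", "", None, gr.Column(visible=False)
    if len(text) < 10:
        return "Warnings found", "• Submission is very short", text, gr.Column(visible=True)
    return f"Queued: {text}", "", None, gr.Column(visible=False)

def _upload(payload):
    # Stand-in for upload_to_queue: runs only after "Proceed Anyway".
    return f"Queued after warning: {payload}", "", None, gr.Column(visible=False)

with gr.Blocks() as demo:
    entry = gr.Textbox(label="Submission")
    status = gr.Textbox(label="Submission Status")
    with gr.Column(visible=False) as warning_modal:   # hidden until warnings exist
        warning_display = gr.Markdown()
        proceed_btn = gr.Button("Proceed Anyway")
    pending = gr.State()                               # payload carried between callbacks

    outputs = [status, warning_display, pending, warning_modal]
    gr.Button("Submit").click(_validate, inputs=entry, outputs=outputs)
    proceed_btn.click(_upload, inputs=pending, outputs=outputs)

demo.launch()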
src/about.py
CHANGED
@@ -7,11 +7,6 @@ class Task:
     metric: str
     col_name: str

-
-
-# Select your tasks here
-# ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     task0 = Task("blimp", "acc", "BLiMP")
@@ -19,19 +14,6 @@ class Tasks(Enum):
     task2 = Task("glue", "acc", "(Super)GLUE")
     task3 = Task("ewok", "acc", "EWoK")

-
-class TasksMultimodal(Enum):
-    task0 = Task("blimp", "acc", "BLiMP")
-    task1 = Task("blimp_supplement", "acc", "BLiMP Supplement")
-    task2 = Task("glue", "acc", "(Super)GLUE")
-    task3 = Task("ewok", "acc", "EWoK")
-    task4 = Task("vqa", "acc", "VQA")
-    task5 = Task("winoground", "acc", "Winoground")
-    task6 = Task("devbench", "acc", "DevBench")
-
-
 @dataclass
 class TaskMIB_Subgraph:
     benchmark: str  # task name in json (ioi/arithmetic)
@@ -118,15 +100,8 @@ class TasksMib_Causalgraph(Enum):

-
-NUM_FEWSHOT = 0  # Change with your few shot
-# ---------------------------------------------------
-
-
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title"> Mechanistic Interpretability Benchmark
+TITLE = """<h1 align="center" id="space-title"> Mechanistic Interpretability Benchmark Leaderboards</h1>"""

 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
@@ -135,34 +110,36 @@ The leaderboards for each track of the 2024 Mechanistic Interpretability Benchma

 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
-This leaderboard displays scores
+This leaderboard displays scores on the private test set for the Mechanistic Interpretability Benchmark. Each track has its own tab.
 """

 EVALUATION_QUEUE_TEXT = """
 ## Circuit localization track:

-You'll need
-the circuit should contain no
+You'll need either (i) 1 circuit per task/model combination with floating-point importance scores for each edge or node, or (ii) 9 circuits per model/task with binary membership scores for each edge or node.
+If (ii), then for each critical threshold k, the circuit should contain no more than k% of edges. See [here]() for examples of each valid circuit format.
+
+Create a folder in a HuggingFace repository to hold your circuits. At the URL you provide, there should be one folder per task/model combination; these folders
+should contain your circuit(s). As long as the folders contain the model and task names, you do not need to worry about the circuit filenames.
+If you provide more circuits than needed, our evaluation script will take the first 9 lexicographically.

 For specifications about the file format for a circuit, see the README on our project GitHub: TODO

-Once your
-The evaluations are handled by the National Deep Inference Framework (NDIF).
+Once your submission has been validated and makes it to the front of the evaluation queue, we'll submit your model for evaluation on the private test set.

 ## Causal variable localization track:
+
+You'll need to provide a link to a HuggingFace repository containing your trained featurizer, the layer on which the featurizer was trained, and the code needed to load and run your featurizer.
+See TODO for an example.
 """

-CITATION_BUTTON_LABEL = "If you would like to cite these results, please cite the
+CITATION_BUTTON_LABEL = "If you would like to cite these results, please cite the MIB paper, as well as the author(s) of the method(s) whose results you cite!"
 CITATION_BUTTON_TEXT = r"""
-@article{
-    url={https://arxiv.org/abs/2412.05149},
+@article{mib-2025,
+    title = {{MIB}: A Mechanistic Interpretability Benchmark},
+    author = {Aaron Mueller and Atticus Geiger and Sarah Wiegreffe and Dana Arad and Iv{\'a}n Arcuschin and Adam Belfki and Yik Siu Chan and Jaden Fiotto-Kaufman and Tal Haklay and Michael Hanna and Jing Huang and Rohan Gupta and Yaniv Nikankin and Hadas Orgad and Nikhil Prakash and Anja Reusch and Aruna Sankaranarayanan and Shun Shao and Alessandro Stolfo and Martin Tutek and Amir Zur and David Bau and Yonatan Belinkov},
+    year = {2025},
+    note = {To appear},
+    journal = {arXiv preprint}
 }
 """
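The EVALUATION_QUEUE_TEXT above expects one folder per task/model combination, each holding either nine binary-membership circuits or a single importance-score file. A hedged sketch of preparing and pushing such a layout with huggingface_hub follows; the local folder, repo id, and filenames are placeholders, not values defined by this commit.

from huggingface_hub import HfApi

# Hypothetical local layout mirroring the description above:
#
# my_circuits/
#   ioi_gpt2/           # one folder per task/model combination
#     circuit_k01.json  # ... nine circuits (binary membership), or
#     ...
#   mcqa_llama3/
#     importances.pt    # ... a single importance-score file
#
api = HfApi()
api.upload_folder(
    folder_path="my_circuits",                  # local folder laid out as above
    repo_id="your-username/mib-circuits-demo",  # placeholder repo
    path_in_repo="circuits",
)
# The URL submitted to the leaderboard would then point at the "circuits"
# folder of that repo, with no '/tree/main/' segment in it.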
src/display/utils.py
CHANGED
@@ -3,7 +3,7 @@ from enum import Enum

 import pandas as pd

-from src.about import Tasks,
+from src.about import Tasks, TasksMib_Subgraph, TasksMib_Causalgraph

 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -28,8 +28,8 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
 auto_eval_column_dict.append(["hf_repo", ColumnContent, ColumnContent("HF Repo", "str", False)])
 auto_eval_column_dict.append(["track", ColumnContent, ColumnContent("Track", "markdown", False)])
 #Scores
-for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+# for task in Tasks:
+#     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
 auto_eval_column_dict.append(["text_average", ColumnContent, ColumnContent("Text Average", "number", True)])
 auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
@@ -38,10 +38,10 @@ auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sh
 auto_eval_column_dict_multimodal.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 auto_eval_column_dict_multimodal.append(["hf_repo", ColumnContent, ColumnContent("HF Repo", "str", False)])
 auto_eval_column_dict_multimodal.append(["track", ColumnContent, ColumnContent("Track", "markdown", False)])
-for task in TasksMultimodal:
+# for task in TasksMultimodal:
+#     auto_eval_column_dict_multimodal.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+#     if task.value.col_name in ("ewok", "EWoK"):  # make sure this appears in the right order
+#         auto_eval_column_dict_multimodal.append(["text_average", ColumnContent, ColumnContent("Text Average", "number", True)])
 auto_eval_column_dict_multimodal.append(["vision_average", ColumnContent, ColumnContent("Vision Average", "number", True)])
 auto_eval_column_dict_multimodal.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
 auto_eval_column_dict_multimodal.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
@@ -214,7 +214,7 @@ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]

 BENCHMARK_COLS = [t.value.col_name for t in Tasks]
-BENCHMARK_COLS_MULTIMODAL = [t.value.col_name for t in TasksMultimodal]
+# BENCHMARK_COLS_MULTIMODAL = [t.value.col_name for t in TasksMultimodal]

 TEXT_TASKS = {
     "glue": ["cola", "sst2", "mrpc", "qqp", "mnli", "mnli-mm", "qnli", "rte",
src/envs.py
CHANGED
@@ -6,24 +6,24 @@ from huggingface_hub import HfApi
 # ----------------------------------
 TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org

-OWNER = "
+OWNER = "mib-bench" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------

 REPO_ID = f"{OWNER}/leaderboard"
 # RESULTS_REPO = f"{OWNER}/results-mib-test"

+QUEUE_REPO_SUBGRAPH = f"{OWNER}/requests-subgraph"
+QUEUE_REPO_CAUSALGRAPH = f"{OWNER}/requests-causalgraph"
 RESULTS_REPO_MIB_SUBGRAPH = f"{OWNER}/subgraph-results"
 RESULTS_REPO_MIB_CAUSALGRAPH = f"{OWNER}/causalgraph-results"
 # RESULTS_REPO_MIB_CAUSALGRAPH = f"shunshao/causalgraph-results"

 # If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")

 # Local caches
+EVAL_REQUESTS_SUBGRAPH = os.path.join(CACHE_PATH, "eval-queue-subgraph")
+EVAL_REQUESTS_CAUSALGRAPH = os.path.join(CACHE_PATH, "eval-queue-causalgraph")
 # EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
 EVAL_RESULTS_MIB_SUBGRAPH_PATH = os.path.join(CACHE_PATH, "eval-results-mib-subgraph")
 EVAL_RESULTS_MIB_CAUSALGRAPH_PATH = os.path.join(CACHE_PATH, "eval-results-mib-causalgraph")
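Each new queue/results repo id above is paired with a local cache directory, and app.py refreshes each pair at startup by deleting the stale copy and re-downloading the dataset snapshot. A small hedged helper capturing that pairing (the function name is illustrative; only snapshot_download, os, and shutil are real APIs):

import os
import shutil
from huggingface_hub import snapshot_download

def refresh_cache(repo_id, local_dir, token=None):
    """Mirror a queue/results dataset repo into its local cache directory,
    discarding any stale copy first (the same pattern app.py uses at startup)."""
    if os.path.exists(local_dir):
        shutil.rmtree(local_dir)
    snapshot_download(repo_id=repo_id, local_dir=local_dir,
                      repo_type="dataset", etag_timeout=30, token=token)

# e.g. refresh_cache(QUEUE_REPO_SUBGRAPH, EVAL_REQUESTS_SUBGRAPH, token=TOKEN)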
src/leaderboard/read_evals.py
CHANGED
@@ -8,7 +8,7 @@ import dateutil
 import numpy as np

 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal
+from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal
 from src.submission.check_validity import is_model_on_hub
 from src.about import TasksMib_Subgraph
@@ -144,7 +144,7 @@ class EvalResult_MIB_SUBGRAPH:
         return data_dict


-def get_raw_eval_results_mib_subgraph(results_path: str
+def get_raw_eval_results_mib_subgraph(results_path: str) -> List[EvalResult_MIB_SUBGRAPH]:
     """From the path of the results folder root, extract all needed info for MIB results"""
     model_result_filepaths = []
@@ -487,7 +487,7 @@ def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
     return averaged_df


-def get_raw_eval_results_mib_causalgraph(results_path: str
+def get_raw_eval_results_mib_causalgraph(results_path: str) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
     """From the path of the results folder root, extract all needed info for MIB causal graph results"""
     model_result_filepaths = []
src/populate.py
CHANGED
@@ -8,37 +8,11 @@ from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal, EvalQueu
 from src.leaderboard.read_evals import get_raw_eval_results, get_raw_eval_results_mib_subgraph, get_raw_eval_results_mib_causalgraph
 from src.about import TasksMib_Causalgraph

-def
-    """Creates a dataframe from all the individual experiment results"""
-    # print(f"results_path is {results_path}, requests_path is {requests_path}")
-    raw_data = get_raw_eval_results(results_path, requests_path)
-    # print(f"raw_data is {raw_data}")
-    all_data_json = [v.to_dict() for v in raw_data]
-    # print(f"all_data_json is {pd.DataFrame.from_records(all_data_json)}")
-    all_data_json_filtered = []
-    for item in all_data_json:
-        item["Track"] = item["eval_name"].split("_")[-1]
-        item["ioi"] = 0
-        item["mcqa"] = 0
-        if "VQA" in benchmark_cols and "VQA" in item:
-            all_data_json_filtered.append(item)
-        if "VQA" not in benchmark_cols and "VQA" not in item:
-            all_data_json_filtered.append(item)
-
-    all_data_json = all_data_json_filtered
-
-    df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[AutoEvalColumn.text_average.name], ascending=False)
-    df = df[has_no_nan_values(df, benchmark_cols)]
-    return df
-
-
-def get_leaderboard_df_mib_subgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list,
+def get_leaderboard_df_mib_subgraph(results_path: str, cols: list, benchmark_cols: list,
                                     metric_type = "F+") -> pd.DataFrame:
     """Creates a dataframe from all the MIB experiment results"""
     # print(f"results_path is {results_path}, requests_path is {requests_path}")
-    raw_data = get_raw_eval_results_mib_subgraph(results_path
+    raw_data = get_raw_eval_results_mib_subgraph(results_path)

     all_data_json = [v.to_dict(metric_type=metric_type) for v in raw_data]
     # print(f"all_data_json is {pd.DataFrame.from_records(all_data_json)}")
@@ -122,10 +96,10 @@ def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:


-def get_leaderboard_df_mib_causalgraph(results_path: str
+def get_leaderboard_df_mib_causalgraph(results_path: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
     # print(f"results_path is {results_path}, requests_path is {requests_path}")

-    detailed_df, aggregated_df, intervention_averaged_df = get_raw_eval_results_mib_causalgraph(results_path
+    detailed_df, aggregated_df, intervention_averaged_df = get_raw_eval_results_mib_causalgraph(results_path)

     # all_data_json = [v.to_dict() for v in raw_detailed_df]
     # detailed_df = pd.DataFrame.from_records(all_data_json)
@@ -175,27 +149,28 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
             with open(file_path) as fp:
                 data = json.load(fp)

-            if "still_on_hub" in data and data["still_on_hub"]:
-                data[EvalQueueColumn.model.name] = make_clickable_model(data["hf_repo"], data["model"])
-                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-            else:
-                data[EvalQueueColumn.model.name] = data["model"]
-                data[EvalQueueColumn.revision.name] = "N/A"
+            # if "still_on_hub" in data and data["still_on_hub"]:
+            #     data[EvalQueueColumn.model.name] = make_clickable_model(data["hf_repo"], data["model"])
+            #     data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+            # else:
+            #     data[EvalQueueColumn.model.name] = data["model"]
+            #     data[EvalQueueColumn.revision.name] = "N/A"

             all_evals.append(data)

+        # elif ".md" not in entry:
+        #     # this is a folder
+        #     sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
+        #     for sub_entry in sub_entries:
+        #         file_path = os.path.join(save_path, entry, sub_entry)
+        #         with open(file_path) as fp:
+        #             data = json.load(fp)
+        #
+        #         data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+        #         data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+        #         all_evals.append(data)
+
+    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN", "PREVALIDATION"]]
     running_list = [e for e in all_evals if e["status"] == "RUNNING"]
     finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
     df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
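The queue dataframes above are built purely by splitting entries on their "status" field. A tiny standalone sketch of that split on toy records (the record values and the "FINISHED_EVAL" status string are placeholders; only the status lists visible in the diff are taken from the source):

import pandas as pd

# Toy queue entries with the status values the leaderboard recognises.
all_evals = [
    {"model": "method-a", "status": "PENDING"},
    {"model": "method-b", "status": "PREVALIDATION"},
    {"model": "method-c", "status": "RUNNING"},
    {"model": "method-d", "status": "FINISHED_EVAL"},
]
cols = ["model", "status"]

pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN", "PREVALIDATION"]]
running_list = [e for e in all_evals if e["status"] == "RUNNING"]
finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]

df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
df_running = pd.DataFrame.from_records(running_list, columns=cols)
df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
print(df_pending, df_running, df_finished, sep="\n\n")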
src/submission/check_validity.py
CHANGED
@@ -1,38 +1,24 @@
|
|
1 |
import json
|
2 |
import os
|
|
|
3 |
import re
|
4 |
import numpy as np
|
|
|
|
|
|
|
5 |
from collections import defaultdict
|
6 |
from datetime import datetime, timedelta, timezone
|
|
|
7 |
|
8 |
-
import
|
9 |
from huggingface_hub import ModelCard
|
10 |
from huggingface_hub.hf_api import ModelInfo
|
11 |
from transformers import AutoConfig
|
12 |
from transformers.models.auto.tokenization_auto import AutoTokenizer
|
13 |
|
14 |
from src.display.utils import TEXT_TASKS, VISION_TASKS, NUM_EXPECTED_EXAMPLES
|
|
|
15 |
|
16 |
-
def check_model_card(repo_id: str) -> tuple[bool, str]:
|
17 |
-
"""Checks if the model card and license exist and have been filled"""
|
18 |
-
try:
|
19 |
-
card = ModelCard.load(repo_id)
|
20 |
-
except huggingface_hub.utils.EntryNotFoundError:
|
21 |
-
return False, "Please add a model card to your model to explain how you trained/fine-tuned it."
|
22 |
-
|
23 |
-
# Enforce license metadata
|
24 |
-
if card.data.license is None:
|
25 |
-
if not ("license_name" in card.data and "license_link" in card.data):
|
26 |
-
return False, (
|
27 |
-
"License not found. Please add a license to your model card using the `license` metadata or a"
|
28 |
-
" `license_name`/`license_link` pair."
|
29 |
-
)
|
30 |
-
|
31 |
-
# Enforce card content
|
32 |
-
if len(card.text) < 200:
|
33 |
-
return False, "Please add a description to your model card, it is too short."
|
34 |
-
|
35 |
-
return True, ""
|
36 |
|
37 |
def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
|
38 |
"""Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
|
@@ -73,10 +59,12 @@ def get_model_size(model_info: ModelInfo, precision: str):
|
|
73 |
model_size = size_factor * model_size
|
74 |
return model_size
|
75 |
|
|
|
76 |
def get_model_arch(model_info: ModelInfo):
|
77 |
"""Gets the model architecture from the configuration"""
|
78 |
return model_info.config.get("architectures", "Unknown")
|
79 |
|
|
|
80 |
def already_submitted_models(requested_models_dir: str) -> set[str]:
|
81 |
"""Gather a list of already submitted models to avoid duplicates"""
|
82 |
depth = 1
|
@@ -101,6 +89,7 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
|
|
101 |
|
102 |
return set(file_names), users_to_submission_dates
|
103 |
|
|
|
104 |
def is_valid_predictions(predictions: dict) -> tuple[bool, str]:
|
105 |
out_msg = ""
|
106 |
for task in TEXT_TASKS:
|
@@ -164,4 +153,238 @@ def is_valid_predictions(predictions: dict) -> tuple[bool, str]:
|
|
164 |
|
165 |
if out_msg != "":
|
166 |
return False, out_msg
|
167 |
-
return True, "Upload successful."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import json
|
2 |
import os
|
3 |
+
import shutil
|
4 |
import re
|
5 |
import numpy as np
|
6 |
+
import pandas as pd
|
7 |
+
import gradio as gr
|
8 |
+
from urllib.parse import urlparse
|
9 |
from collections import defaultdict
|
10 |
from datetime import datetime, timedelta, timezone
|
11 |
+
from typing import Literal
|
12 |
|
13 |
+
from huggingface_hub import HfApi, HfFileSystem, hf_hub_url, get_hf_file_metadata
|
14 |
from huggingface_hub import ModelCard
|
15 |
from huggingface_hub.hf_api import ModelInfo
|
16 |
from transformers import AutoConfig
|
17 |
from transformers.models.auto.tokenization_auto import AutoTokenizer
|
18 |
|
19 |
from src.display.utils import TEXT_TASKS, VISION_TASKS, NUM_EXPECTED_EXAMPLES
|
20 |
+
from src.envs import EVAL_REQUESTS_SUBGRAPH, EVAL_REQUESTS_CAUSALGRAPH
|
21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
|
24 |
"""Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
|
|
|
59 |
model_size = size_factor * model_size
|
60 |
return model_size
|
61 |
|
62 |
+
|
63 |
def get_model_arch(model_info: ModelInfo):
|
64 |
"""Gets the model architecture from the configuration"""
|
65 |
return model_info.config.get("architectures", "Unknown")
|
66 |
|
67 |
+
|
68 |
def already_submitted_models(requested_models_dir: str) -> set[str]:
|
69 |
"""Gather a list of already submitted models to avoid duplicates"""
|
70 |
depth = 1
|
|
|
89 |
|
90 |
return set(file_names), users_to_submission_dates
|
91 |
|
92 |
+
|
93 |
def is_valid_predictions(predictions: dict) -> tuple[bool, str]:
|
94 |
out_msg = ""
|
95 |
for task in TEXT_TASKS:
|
|
|
153 |
|
154 |
if out_msg != "":
|
155 |
return False, out_msg
|
156 |
+
return True, "Upload successful."
|
157 |
+
|
158 |
+
|
159 |
+
def _format_time(earliest_time):
|
160 |
+
time_left = (earliest_time.tz_convert("UTC") + timedelta(weeks=1)) - pd.Timestamp.utcnow()
|
161 |
+
hours = time_left.seconds // 3600
|
162 |
+
minutes, seconds = divmod(time_left.seconds % 3600, 60)
|
163 |
+
time_left_formatted = f"{hours:02}:{minutes:02}:{seconds:02}"
|
164 |
+
if time_left.days > 0:
|
165 |
+
time_left_formatted = f"{time_left.days} days, {time_left_formatted}"
|
166 |
+
return time_left_formatted
|
167 |
+
|
168 |
+
|
169 |
+
def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
|
170 |
+
"""Creates the different dataframes for the evaluation queues requests"""
|
171 |
+
entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
|
172 |
+
all_evals = []
|
173 |
+
|
174 |
+
for entry in entries:
|
175 |
+
if ".json" in entry:
|
176 |
+
file_path = os.path.join(save_path, entry)
|
177 |
+
with open(file_path) as fp:
|
178 |
+
data = json.load(fp)
|
179 |
+
|
180 |
+
# if "still_on_hub" in data and data["still_on_hub"]:
|
181 |
+
# data[EvalQueueColumn.model.name] = make_clickable_model(data["hf_repo"], data["model"])
|
182 |
+
# data[EvalQueueColumn.revision.name] = data.get("revision", "main")
|
183 |
+
# else:
|
184 |
+
# data[EvalQueueColumn.model.name] = data["model"]
|
185 |
+
# data[EvalQueueColumn.revision.name] = "N/A"
|
186 |
+
|
187 |
+
all_evals.append(data)
|
188 |
+
|
189 |
+
elif ".md" not in entry:
|
190 |
+
# this is a folder
|
191 |
+
sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
|
192 |
+
for sub_entry in sub_entries:
|
193 |
+
file_path = os.path.join(save_path, entry, sub_entry)
|
194 |
+
with open(file_path) as fp:
|
195 |
+
data = json.load(fp)
|
196 |
+
all_evals.append(data)
|
197 |
+
|
198 |
+
return pd.DataFrame(all_evals)
|
199 |
+
|
200 |
+
def check_rate_limit(track, user_name, contact_email):
|
201 |
+
if "Circuit" in track:
|
202 |
+
save_path = EVAL_REQUESTS_SUBGRAPH
|
203 |
+
else:
|
204 |
+
save_path = EVAL_REQUESTS_CAUSALGRAPH
|
205 |
+
|
206 |
+
evaluation_queue = get_evaluation_queue_df(save_path, ["user_name", "contact_email"])
|
207 |
+
|
208 |
+
if evaluation_queue.empty:
|
209 |
+
return True, None
|
210 |
+
|
211 |
+
one_week_ago = pd.Timestamp.utcnow() - timedelta(weeks=1)
|
212 |
+
|
213 |
+
user_name_occurrences = evaluation_queue[evaluation_queue["user_name"] == user_name]
|
214 |
+
user_name_occurrences["submit_time"] = pd.to_datetime(user_name_occurrences["submit_time"], utc=True)
|
215 |
+
user_name_occurrences = user_name_occurrences[user_name_occurrences["submit_time"] >= one_week_ago]
|
216 |
+
email_occurrences = evaluation_queue[evaluation_queue["contact_email"] == contact_email.lower()]
|
217 |
+
email_occurrences["submit_time"] = pd.to_datetime(email_occurrences["submit_time"], utc=True)
|
218 |
+
email_occurrences = email_occurrences[email_occurrences["submit_time"] >= one_week_ago]
|
219 |
+
if user_name_occurrences.shape[0] >= 2:
|
220 |
+
earliest_time = user_name_occurrences["submit_time"].min()
|
221 |
+
time_left_formatted = _format_time(earliest_time)
|
222 |
+
return False, time_left_formatted
|
223 |
+
if email_occurrences.shape[0] >= 2:
|
224 |
+
earliest_time = email_occurrences["submit_time"].min()
|
225 |
+
time_left_formatted = _format_time(earliest_time)
|
226 |
+
return False, time_left_formatted
|
227 |
+
|
228 |
+
return True, None
|
229 |
+
|
230 |
+
def parse_huggingface_url(url: str):
    """
    Extracts repo_id and subfolder path from a Hugging Face URL.
    Returns (repo_id, folder_path).
    """
    # Handle cases where the input is already a repo_id (no URL)
    if not url.startswith(("http://", "https://")):
        return url, None

    parsed = urlparse(url)
    path_parts = parsed.path.strip("/").split("/")

    # Extract repo_id (username/repo_name)
    if len(path_parts) < 2:
        raise ValueError("Invalid Hugging Face URL: Could not extract repo_id.")
    repo_id = f"{path_parts[0]}/{path_parts[1]}"

    # Extract folder path (if in /tree/ or /blob/)
    if "tree" in path_parts or "blob" in path_parts:
        try:
            branch_idx = path_parts.index("tree") if "tree" in path_parts else path_parts.index("blob")
            folder_path = "/".join(path_parts[branch_idx + 2:])  # Skip "tree/main" or "blob/main"
        except (ValueError, IndexError):
            folder_path = None
    else:
        folder_path = None

    return repo_id, folder_path
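
# Illustrative examples of the parsing above (repo and folder names are hypothetical):
#   parse_huggingface_url("user/my-circuits")
#       -> ("user/my-circuits", None)
#   parse_huggingface_url("https://huggingface.co/user/my-circuits/tree/main/ioi_gpt2")
#       -> ("user/my-circuits", "ioi_gpt2")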


def validate_directory(fs: HfFileSystem, repo_id: str, dirname: str, curr_tm: str, circuit_level: Literal['edge', 'node', 'neuron'] = 'edge'):
    errors = []
    warnings = []

    task, model = curr_tm.split("_")
    curr_tm_display = curr_tm.replace("_", "/")

    files = fs.ls(dirname)

    # Detect whether multi-circuit or importances
    is_multiple_circuits = False
    files = [f["name"] for f in files if (f["name"].endswith(".json") or f["name"].endswith(".pt"))]
    if len(files) == 1:
        is_multiple_circuits = False
    elif len(files) > 1:
        is_multiple_circuits = True
        if len(files) < 9:
            errors.append(f"Folder for {curr_tm_display} contains multiple circuits, but not enough. If you intended to submit importances, include only one circuit in the folder. Otherwise, please add the rest of the circuits.")
    else:
        warnings.append(f"Directory present for {curr_tm_display} but is empty")

    offset = 0
    for idx, file in enumerate(files):
        file_suffix = file.split(repo_id + "/")[1]
        file_url = hf_hub_url(repo_id=repo_id, filename=file_suffix)
        file_info = get_hf_file_metadata(file_url)
        file_size_mb = file_info.size / (1024 * 1024)
        if file_size_mb > 150:
            warnings.append(f"Will skip file >150MB: {file}")
            offset -= 1
            continue

        if is_multiple_circuits and idx + offset >= 9:
            break

    return errors, warnings
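
# Summary of what the checks above imply about a valid task/model folder (inferred from
# the validation logic rather than from a separate spec):
#   - only .json and .pt files are considered;
#   - exactly one file is treated as importance scores;
#   - multiple files are treated as separate circuits, and at least 9 are expected;
#   - files larger than 150 MB are skipped with a warning.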


def verify_circuit_submission(hf_repo, level, progress=gr.Progress()):
    VALID_COMBINATIONS = [
        "ioi_gpt2", "ioi_qwen2.5", "ioi_gemma2", "ioi_llama3", "ioi_interpbench",
        "mcqa_qwen2.5", "mcqa_gemma2", "mcqa_llama3",
        "arithmetic-addition_llama3", "arithmetic-subtraction_llama3",
        "arc-easy_gemma2", "arc-easy_llama3",
        "arc-challenge_llama3"
    ]

    TASKS = ["ioi", "mcqa", "arithmetic-addition", "arithmetic-subtraction", "arc-easy", "arc-challenge"]
    MODELS = ["gpt2", "qwen2.5", "gemma2", "llama3", "interpbench"]

    errors = []
    warnings = []

    directories_present = {tm: False for tm in VALID_COMBINATIONS}
    directories_valid = {tm: False for tm in VALID_COMBINATIONS}

    fs = HfFileSystem()

    path = hf_repo
    level = level

    folder_path = path.split("huggingface.co/")[1]
    repo_id = "/".join(folder_path.split("/")[:2])
    try:
        files = fs.listdir(folder_path)
    except Exception as e:
        errors.append(f"Could not open Huggingface URL: {e}")
        return errors, warnings

    file_counts = 0
    for dirname in progress.tqdm(files, desc="Validating directories in repo"):
        file_counts += 1
        if file_counts >= 30:
            warnings.append("Folder contains many files/directories; stopped at 30.")
            break
        circuit_dir = dirname["name"]
        dirname_proc = circuit_dir.lower().split("/")[-1]
        if not fs.isdir(circuit_dir):
            continue
        curr_task = None
        curr_model = None
        # Look for task names in filename
        for task in TASKS:
            if dirname_proc.startswith(task) or f"_{task}" in dirname_proc:
                curr_task = task
        # Look for model names in filename
        for model in MODELS:
            if dirname_proc.startswith(model) or f"_{model}" in dirname_proc:
                curr_model = model
        if curr_task is not None and curr_model is not None:
            curr_tm = f"{curr_task}_{curr_model}"
            if curr_tm in VALID_COMBINATIONS:
                directories_present[curr_tm] = True
            else:
                continue
        else:
            continue

        # Parse circuits directory
        print(f"validating {circuit_dir}")
        vd_errors, vd_warnings = validate_directory(fs, repo_id, circuit_dir, curr_tm, level)
        errors.extend(vd_errors)
        warnings.extend(vd_warnings)

        if len(vd_errors) == 0:
            directories_valid[curr_tm] = True

    task_set, model_set = set(), set()
    for tm in directories_present:
        if not directories_present[tm]:
            continue
        if not directories_valid[tm]:
            warnings.append(f"Directory found for {tm.replace('_', '/')}, but circuits not valid or present")
            continue
        task, model = tm.split("_")
        task_set.add(task)
        model_set.add(model)
    if len(task_set) < 2:
        errors.append("At least 2 tasks are required")
    if len(model_set) < 2:
        errors.append("At least 2 models are required")

    no_tm_display = [tm.replace("_", "/") for tm in directories_valid if not directories_valid[tm]]
    if len(no_tm_display) > 0:
        warnings.append(f"No valid circuits or importance scores found for the following tasks/models: {*no_tm_display,}")

    return errors, warnings
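
# Illustrative call (hypothetical values), e.g. from a pre-submission check in the UI:
#   errors, warnings = verify_circuit_submission(
#       "https://huggingface.co/user/my-circuits/tree/main", "edge"
#   )
#   if errors:
#       ...  # block the submission and surface the errors
#   elif warnings:
#       ...  # ask the user to confirm despite the warnings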


def verify_causal_variable_submission(hf_repo, layer, position, code_upload):
    # Currently a no-op: causal-variable submissions are not validated client-side.
    return
src/submission/submit.py
CHANGED
@@ -1,20 +1,96 @@
import json
import os
import smtplib
from datetime import datetime, timezone

from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import API,
from src.envs import API, EVAL_REQUESTS_SUBGRAPH, EVAL_REQUESTS_CAUSALGRAPH, TOKEN, QUEUE_REPO_SUBGRAPH, QUEUE_REPO_CAUSALGRAPH
from src.submission.check_validity import (
    already_submitted_models,
-    check_model_card,
    get_model_size,
    is_model_on_hub,
    is_valid_predictions,
    parse_huggingface_url
)
import gradio as gr

REQUESTED_MODELS = None
USERS_TO_SUBMISSION_DATES = None

def upload_to_queue(track, hf_repo_circ, hf_repo_cg, level, layer, token_position, code_upload, method_name, contact_email, _id):
    errors = []
    hf_repo = hf_repo_circ if "Circuit" in track else hf_repo_cg
    try:
        repo_info = hf_repo.split("huggingface.co/")[1]
        user_name, repo_name = repo_info.split("/")[:2]
    except Exception as e:
        errors.append("Error processing HF URL: could not get username and repo name")
    try:
        commit_hash = API.list_repo_commits("/".join([user_name, repo_name]))[0].commit_id
    except Exception as e:
        errors.append("Could not get commit hash of provided Huggingface repo")
    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    if not errors:
        if "Circuit" in track:
            eval_entry = {
                "hf_repo": hf_repo,
                "user_name": user_name,
                "revision": commit_hash,
                "circuit_level": level.lower(),
                "method_name": method_name,
                "contact_email": contact_email.lower(),
                "submit_time": current_time,
                "status": "PREVALIDATION",
                "_id": _id
            }
            QUEUE_REPO = QUEUE_REPO_SUBGRAPH
            EVAL_REQUESTS = EVAL_REQUESTS_SUBGRAPH
        else:
            eval_entry = {
                "hf_repo": hf_repo,
                "user_name": user_name,
                "revision": commit_hash,
                "layer": layer,
                "token_position": token_position,
                "code_upload": code_upload,
                "method_name": method_name,
                "contact_email": contact_email.lower(),
                "submit_time": current_time,
                "status": "PREVALIDATION",
                "_id": _id
            }
            QUEUE_REPO = QUEUE_REPO_CAUSALGRAPH
            EVAL_REQUESTS = EVAL_REQUESTS_CAUSALGRAPH

        OUT_DIR = f"{EVAL_REQUESTS}/"
        os.makedirs(OUT_DIR, exist_ok=True)
        out_path = f"{OUT_DIR}/{method_name}_{_id}_{current_time}.json"
        with open(out_path, 'w') as f:
            f.write(json.dumps(eval_entry))

        try:
            API.upload_file(
                path_or_fileobj=out_path,
                path_in_repo=out_path.split("/")[-1],
                repo_id=QUEUE_REPO,
                repo_type="dataset",
                commit_message=f"Add {method_name}_{_id}_{current_time}.json to eval queue"
            )
        except Exception as e:
            errors.append(f"Could not upload entry to eval queue: {e}")

    if errors:
        status = gr.Textbox("\n".join(f"❌ {e}" for e in errors), visible=True)
    else:
        status = gr.Textbox(f"✅ Submission received! Your ID is \"{_id}\". You'll receive an email once we've validated your submission.", visible=True)
    return [
        status,
        None, None,
        gr.Column(visible=False)
    ]
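
# Sketch of how this might be wired to a Gradio submit button (component names here are
# hypothetical, not the ones defined in app.py); the four-element outputs list matches
# the [status, None, None, column] return value above:
#   submit_btn.click(
#       upload_to_queue,
#       inputs=[track, hf_repo_circ, hf_repo_cg, level, layer, token_position,
#               code_upload, method_name, contact_email, _id],
#       outputs=[status_box, hf_repo_circ, hf_repo_cg, confirm_column],
#   )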


def add_new_eval(
    model_name: str,
    model_id: str,

@@ -83,7 +159,7 @@ def add_new_eval(
        return styled_error("A model with this name has been already submitted.")

    print("Creating eval file")
-    OUT_DIR = f"{
    OUT_DIR = f"{EVAL_REQUESTS}/{user_name}"
    os.makedirs(OUT_DIR, exist_ok=True)
    out_path = f"{OUT_DIR}/{model_path}_{revision}_eval_request_False_{track}.json"

@@ -109,3 +185,38 @@ def add_new_eval(
    return styled_message(
        "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the request to show in the PENDING list."
    )


def remove_submission(track: str, method_name: str, _id: str):
    if track is None:
        return gr.Textbox("Please select a track.", visible=True)
    if "Circuit" in track:
        QUEUE_REPO = QUEUE_REPO_SUBGRAPH
        EVAL_REQUESTS = EVAL_REQUESTS_SUBGRAPH
    else:
        QUEUE_REPO = QUEUE_REPO_CAUSALGRAPH
        EVAL_REQUESTS = EVAL_REQUESTS_CAUSALGRAPH

    OUT_DIR = f"{EVAL_REQUESTS}/"
    os.makedirs(OUT_DIR, exist_ok=True)
    files = os.listdir(OUT_DIR)
    out_paths = [f for f in files if f.startswith(f"{method_name}_{_id}")]
    if out_paths:
        filename = out_paths[0]
        filepath = os.path.join(OUT_DIR, filename)
        with open(filepath, 'r') as f:
            data = json.load(f)
        hf_repo = data["hf_repo"]
        try:
            API.delete_file(
                path_in_repo=filename,
                repo_id=QUEUE_REPO,
                repo_type="dataset"
            )
        except Exception as e:
            return gr.Textbox(f"Could not delete entry from eval queue: {e}", visible=True)
        os.remove(filepath)
        status = "Submission removed from queue."
    else:
        status = "Submission not found in queue."

    return gr.Textbox(status, visible=True)
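
# Likewise, remove_submission could back a "withdraw submission" button; a minimal sketch
# with hypothetical component names:
#   remove_btn.click(
#       remove_submission,
#       inputs=[track, method_name_box, submission_id_box],
#       outputs=[removal_status_box],
#   )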