Aaron Mueller committed · commit 11e2149 · 1 parent: f8ca5d3

code cleanup

Files changed:
- app.py: +10 -168
- src/about.py: +7 -7
- src/display/utils.py: +1 -206
- src/leaderboard/read_evals.py: +1 -12
- src/submission/check_validity.py: +0 -67
- src/submission/submit.py: +0 -96
app.py
CHANGED
@@ -42,83 +42,6 @@ from src.submission.check_validity import verify_circuit_submission, verify_caus
 
 from src.about import TasksMib_Subgraph, TasksMib_Causalgraph
 
-# class SmartSelectColumns(SelectColumns):
-#     """
-#     Enhanced SelectColumns component with basic filtering functionality.
-#     """
-#     def __init__(
-#         self,
-#         benchmark_keywords: Optional[List[str]] = None,
-#         model_keywords: Optional[List[str]] = None,
-#         initial_selected: Optional[List[str]] = None,
-#         **kwargs
-#     ):
-#         """
-#         Initialize SmartSelectColumns with minimal configuration.
-
-#         Args:
-#             benchmark_keywords: List of benchmark names to filter by
-#             model_keywords: List of model names to filter by
-#             initial_selected: List of columns to show initially
-#         """
-#         super().__init__(**kwargs)
-#         self.benchmark_keywords = benchmark_keywords or []
-#         self.model_keywords = model_keywords or []
-#         self.initial_selected = initial_selected or []
-
-#     def get_filtered_groups(self, df: pd.DataFrame) -> Dict[str, List[str]]:
-#         """
-#         Create column groups based on simple substring matching.
-#         """
-#         filtered_groups = {}
-
-#         # Create benchmark groups
-#         for benchmark in self.benchmark_keywords:
-#             matching_cols = [
-#                 col for col in df.columns
-#                 if benchmark in col.lower()
-#             ]
-#             if matching_cols:
-#                 group_name = f"Benchmark group for {benchmark}"
-#                 filtered_groups[group_name] = matching_cols
-
-#         # Create model groups
-#         for model in self.model_keywords:
-#             matching_cols = [
-#                 col for col in df.columns
-#                 if model in col.lower()
-#             ]
-#             if matching_cols:
-#                 group_name = f"Model group for {model}"
-#                 filtered_groups[group_name] = matching_cols
-
-#         return filtered_groups
-
-#     def update(
-#         self,
-#         value: Union[pd.DataFrame, Dict[str, List[str]], Any]
-#     ) -> Dict:
-#         """Update component with new values."""
-#         if isinstance(value, pd.DataFrame):
-#             choices = list(value.columns)
-#             selected = self.initial_selected if self.initial_selected else choices
-#             filtered_cols = self.get_filtered_groups(value)
-
-#             return {
-#                 "choices": choices,
-#                 "value": selected,
-#                 "filtered_cols": filtered_cols
-#             }
-
-#         if hasattr(value, '__dataclass_fields__'):
-#             field_names = [field.name for field in fields(value)]
-#             return {
-#                 "choices": field_names,
-#                 "value": self.initial_selected if self.initial_selected else field_names
-#             }
-
-#         return super().update(value)
-
 from gradio_leaderboard import SelectColumns, Leaderboard
 import pandas as pd
 from typing import List, Dict, Optional

@@ -290,16 +213,11 @@ LEADERBOARD_DF_MIB_SUBGRAPH_FPL = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_M
 LEADERBOARD_DF_MIB_SUBGRAPH_FEQ = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, COLS_MIB_SUBGRAPH, BENCHMARK_COLS_MIB_SUBGRAPH,
                                                                   metric_type="CMD")
 
-# LEADERBOARD_DF_MIB_CAUSALGRAPH = get_leaderboard_df_mib_causalgraph(EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_CAUSALGRAPH, BENCHMARK_COLS_MIB_CAUSALGRAPH)
 # In app.py, modify the LEADERBOARD initialization
 LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AGGREGATED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AVERAGED = get_leaderboard_df_mib_causalgraph(
     EVAL_RESULTS_MIB_CAUSALGRAPH_PATH
 )
 
-
-# LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-# LEADERBOARD_DF_MULTIMODAL = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS_MULTIMODAL, BENCHMARK_COLS_MULTIMODAL)
-
 (
     finished_eval_queue_df_subgraph,
     pending_eval_queue_df_subgraph,

@@ -320,8 +238,6 @@ def init_leaderboard_mib_subgraph(dataframe, track):
 
     print("\nDebugging DataFrame columns:", dataframe.columns.tolist())
 
-    # First, create our display name mapping
-    # This is like creating a translation dictionary between internal names and display names
     model_name_mapping = {
         "qwen2_5": "Qwen-2.5",
         "gpt2": "GPT-2",

@@ -377,12 +293,8 @@ def init_leaderboard_mib_subgraph(dataframe, track):
     # Combine all groups using display names
     all_groups = benchmark_groups + model_groups
     all_columns = [col for group in all_groups for col in group]
-
-    # Important: We need to rename our DataFrame columns to match display names
 
     renamed_df = dataframe.rename(columns=display_mapping)
-    # all_columns = [c.name for c in fields(AutoEvalColumn_mib_subgraph) if c.displayed_by_default]
-    # all_columns = [c.name for c in fields(AutoEvalColumn_mib_subgraph)]
     all_columns = renamed_df.columns.tolist()
 
 

@@ -390,45 +302,10 @@ def init_leaderboard_mib_subgraph(dataframe, track):
     return Leaderboard(
         value=renamed_df,  # Use DataFrame with display names
         datatype=[c.type for c in fields(AutoEvalColumn_mib_subgraph)],
-        # select_columns=SelectColumns(
-        #     default_selection=all_columns,  # Now contains display names
-        #     label="Filter Results:",
-        # ),
         search_columns=["Method"],
         hide_columns=["eval_name"],
         interactive=False,
     ), renamed_df
-
-
-
-# @dataclass
-# class TaskMIB_Causalgraph:
-#     benchmark: str        # task name in json (ioi/arithmetic)
-#     models: list[str]     # list of models to show as sub-columns
-#     col_name: str         # display name in leaderboard
-#     metrics: list[str]    # metrics to store (average_score)
-
-# class TasksMib_Causalgraph(Enum):
-#     task0 = TaskMIB_Subgraph("ioi", ["GPT2ForCausalLM"], "ioi_task", ["average_score"])
-#     task1 = TaskMIB_Subgraph("mcqa", ["Qwen2ForCausalLM", "Gemma2ForCausalLM", "LlamaForCausalLM"], "4_answer_MCQA", ["average_score"])
-#     task2 = TaskMIB_Subgraph("arithmetic_addition", ["Gemma2ForCausalLM", "LlamaForCausalLM"], "arithmetic_addition", ["average_score"])
-#     task3 = TaskMIB_Subgraph("arc_easy", ["Gemma2ForCausalLM", "LlamaForCausalLM"], "arc_easy", ["average_score"])
-
-#     @classmethod
-#     def get_all_tasks(cls):
-#         """Returns a list of all task benchmarks"""
-#         return [task.value.benchmark for task in cls]
-
-#     @classmethod
-#     def get_all_models(cls):
-#         """Returns a list of all unique models across all tasks"""
-#         models = set()
-#         for task in cls:
-#             models.update(task.value.models)
-#         return sorted(list(models))
-
-# ioi_task
-# 4_answer_MCQA
 
 
 def init_leaderboard_mib_causalgraph(dataframe, track):

@@ -694,11 +571,6 @@ with demo:
     # Then modify the Causal Graph tab section
     with gr.TabItem("Causal Variable Localization", elem_id="causalgraph", id=1):
         with gr.Tabs() as causalgraph_tabs:
-            # with gr.TabItem("Detailed View", id=0):
-            #     leaderboard_detailed, data = init_leaderboard_mib_causalgraph(
-            #         LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED,
-            #         "Causal Graph"
-            #     )
             with gr.TabItem("Highest View", id=0):
                 gr.Markdown("""
                 ### Filtering Options

@@ -759,11 +631,6 @@
                     inputs=[original_leaderboard, task_substring_checkbox, model_substring_checkbox],
                     outputs=leaderboard_averaged
                 )
-
-                # leaderboard_averaged, data = init_leaderboard_mib_causalgraph(
-                #     LEADERBOARD_DF_MIB_CAUSALGRAPH_AVERAGED,
-                #     "Causal Graph"
-                # )
 
     with gr.TabItem("Submit", elem_id="llm-benchmark-tab-table", id=2):
         # Track selection

@@ -776,7 +643,6 @@
             elem_id="track_selector"
         )
 
-        # with gr.Group(visible=False) as circuit_ui:
         with gr.Column(visible=False, elem_id="bordered-column") as circuit_ui:
             with gr.Row():
                 gr.Markdown(EVALUATION_QUEUE_TEXT_SUBGRAPH, elem_classes="markdown-text")

@@ -799,33 +665,13 @@
                 "within those submodules (e.g., MLP1 neuron 295)?"
             )
 
-        # with gr.Group(visible=False) as causal_ui:
         with gr.Column(visible=False, elem_id="bordered-column") as causal_ui:
             gr.Markdown(EVALUATION_QUEUE_TEXT_CAUSALVARIABLE, elem_classes="markdown-text")
-            """
-            with gr.Row():
-                layer = gr.Number(
-                    label="Layer Number",
-                    precision=0,
-                    minimum=0,
-                    info="Integer specifying the model layer"
-                )
-                token_position = gr.Number(
-                    label="Token Position",
-                    precision=0,
-                    minimum=0,
-                    info="Integer specifying token position"
-                )
-            """
             with gr.Row():
                 hf_repo_cg = gr.Textbox(
                     label="HuggingFace Repository URL",
                     placeholder="https://huggingface.co/username/repo/path",
                     info="Must be a valid HuggingFace URL pointing to a file containing the trained featurizer (.pt). " )
-                # code_upload = gr.File(
-                #     label="Upload Python file implementing your featurization function",
-                #     file_types=[".py"],
-                # )
 
             # Common fields
             with gr.Group():

@@ -884,11 +730,6 @@
                 submission_errors, submission_warnings = verify_circuit_submission(hf_repo, level)
 
             elif not breaking_error:
-                # if not (isinstance(layer, int) and isinstance(token_position, int)):
-                #     errors.append("Layer and token position must be integers")
-                # if not code_upload:
-                #     errors.append("Code file upload is required")
-
                 submission_errors, submission_warnings = verify_causal_variable_submission(hf_repo)
 
             if not breaking_error:

@@ -986,17 +827,18 @@
             - Maximum 2 valid submissions per HuggingFace account per week
             - Invalid submissions don't count toward your limit
            - Rate limit tracked on a rolling basis: a submission no longer counts toward the limit as soon as 7 days have passed since the submission time
+            - The queues can take up to an hour to update; don't fret if your submission doesn't show up immediately!
             """)
 
-
-
-
-
-
-
-
-
-
+    with gr.Row():
+        with gr.Accordion("📙 Citation", open=False):
+            citation_button = gr.Textbox(
+                value=CITATION_BUTTON_TEXT,
+                label=CITATION_BUTTON_LABEL,
+                lines=10,
+                elem_id="citation-button",
+                show_copy_button=True,
+            )
 
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
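The submission notes in the last hunk describe a rolling rate limit: two valid submissions per account, with each submission aging out of the count after seven days. A minimal sketch of how such a window can be checked (a hypothetical helper for illustration, not the Space's actual implementation):

    from datetime import datetime, timedelta, timezone

    MAX_SUBMISSIONS = 2            # valid submissions allowed per account
    WINDOW = timedelta(days=7)     # rolling window from the rate-limit note

    def remaining_quota(submission_times: list[datetime]) -> int:
        """Return how many more submissions a user may make right now,
        given the UTC timestamps of their past valid submissions."""
        cutoff = datetime.now(timezone.utc) - WINDOW
        recent = [t for t in submission_times if t > cutoff]
        return max(0, MAX_SUBMISSIONS - len(recent))

Once the oldest timestamp falls behind the cutoff it simply stops being counted, which matches the "no longer counts toward the limit as soon as 7 days have passed" behavior described above.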
src/about.py
CHANGED
@@ -166,12 +166,12 @@ It will keep the PENDING status until it has been run on the private test set.
 """
 
 CITATION_BUTTON_LABEL = "If you would like to cite these results, please cite the MIB paper, as well as the author(s) of the method(s) whose results you cite!"
-CITATION_BUTTON_TEXT = r"""
-
-
-
-
-
-
+CITATION_BUTTON_TEXT = r"""@article{mib-2025,
+      title = {{MIB}: A Mechanistic Interpretability Benchmark},
+      author = {Aaron Mueller and Atticus Geiger and Sarah Wiegreffe and Dana Arad and Iv{\'a}n Arcuschin and Adam Belfki and Yik Siu Chan and Jaden Fiotto-Kaufman and Tal Haklay and Michael Hanna and Jing Huang and Rohan Gupta and Yaniv Nikankin and Hadas Orgad and Nikhil Prakash and Anja Reusch and Aruna Sankaranarayanan and Shun Shao and Alessandro Stolfo and Martin Tutek and Amir Zur and David Bau and Yonatan Belinkov},
+      year = {2025},
+      journal = {CoRR},
+      volume = {arXiv:2504.13151},
+      url = {https://arxiv.org/abs/2504.13151v1}
 }
 """
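One detail worth noting in the new CITATION_BUTTON_TEXT: the r-string prefix. BibTeX accent commands such as Iv{\'a}n in the author field contain backslashes, and in a plain Python string \' collapses to a bare quote, silently dropping the backslash. A quick standalone illustration of the escaping behavior:

    plain = "author = {Iv{\'a}n}"   # \' is an escaped quote: backslash is lost
    raw = r"author = {Iv{\'a}n}"    # raw string: backslash kept verbatim
    print(plain)  # author = {Iv{'a}n}   (invalid BibTeX accent)
    print(raw)    # author = {Iv{\'a}n}  (what BibTeX expects)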
src/display/utils.py
CHANGED
@@ -53,12 +53,6 @@ AutoEvalColumnMultimodal = make_dataclass("AutoEvalColumnMultimodal", auto_eval_
 
 
 
-
-
-
-
-
-
 ##############################################################################################################
 # Version 3
 auto_eval_column_dict_mib_subgraph = []

@@ -95,10 +89,6 @@ for field in auto_eval_column_dict_mib_subgraph:
     print(f"Field name: {field[0]}, Display name: {field[2].name}")
 
 
-
-
-
-
 # Create the dataclass for MIB columns
 AutoEvalColumn_mib_subgraph = make_dataclass("AutoEvalColumn_mib_subgraph", auto_eval_column_dict_mib_subgraph, frozen=True)
 

@@ -118,15 +108,6 @@ COLS_MIB_CAUSALGRAPH = []
 BENCHMARK_COLS_MIB_CAUSALGRAPH = []
 
 
-
-
-
-
-
-
-
-
-
 auto_eval_column_dict_mib_causalgraph = []
 
 # Only include Method column as required

@@ -154,40 +135,6 @@ AutoEvalColumn_mib_causalgraph = make_dataclass(
     frozen=True
 )
 
-
-
-
-
-
-
-# # Column selection for display
-# COLS_MIB_CAUSALGRAPH = [c.name for c in fields(AutoEvalColumn_mib_causalgraph) if not c.hidden]
-
-
-# BENCHMARK_COLS_MIB_CAUSALGRAPH = [f"{model}_{task.value.benchmark}_{intervention}".lower()
-#                                   for task in TasksMib_Causalgraph
-#                                   for model in task.value.models
-#                                   for intervention in task.value.interventions]
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
 
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)

@@ -213,156 +160,4 @@ COLS_MULTIMODAL = [c.name for c in fields(AutoEvalColumnMultimodal) if not c.hid
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
-BENCHMARK_COLS = [t.value.col_name for t in Tasks]
-# BENCHMARK_COLS_MULTIMODAL = [t.value.col_name for t in TasksMultimodal]
-
-TEXT_TASKS = {
-    "glue": ["cola", "sst2", "mrpc", "qqp", "mnli", "mnli-mm", "qnli", "rte",
-             "boolq", "multirc", "wsc"],
-    # Lots of BLiMP tasks – use verifier function below to see if you've included everything.
-    "blimp": ["adjunct_island","anaphor_gender_agreement","anaphor_number_agreement","animate_subject_passive","animate_subject_trans",
-              "causative","complex_NP_island","coordinate_structure_constraint_complex_left_branch","coordinate_structure_constraint_object_extraction","determiner_noun_agreement_1",
-              "determiner_noun_agreement_2","determiner_noun_agreement_irregular_1","determiner_noun_agreement_irregular_2","determiner_noun_agreement_with_adjective_1",
-              "determiner_noun_agreement_with_adj_2","determiner_noun_agreement_with_adj_irregular_1","determiner_noun_agreement_with_adj_irregular_2","distractor_agreement_relational_noun",
-              "distractor_agreement_relative_clause","drop_argument","ellipsis_n_bar_1","ellipsis_n_bar_2",
-              "existential_there_object_raising", "existential_there_quantifiers_1",
-              "existential_there_quantifiers_2", "existential_there_subject_raising", "expletive_it_object_raising",
-              "inchoative", "intransitive","irregular_past_participle_adjectives", "irregular_past_participle_verbs",
-              "irregular_plural_subject_verb_agreement_1", "irregular_plural_subject_verb_agreement_2", "left_branch_island_echo_question", "left_branch_island_simple_question",
-              "matrix_question_npi_licensor_present", "npi_present_1", "npi_present_2", "only_npi_licensor_present", "only_npi_scope", "passive_1", "passive_2",
-              "principle_A_case_1", "principle_A_case_2", "principle_A_c_command", "principle_A_domain_1",
-              "principle_A_domain_2", "principle_A_domain_3", "principle_A_reconstruction", "regular_plural_subject_verb_agreement_1",
-              "regular_plural_subject_verb_agreement_2", "sentential_negation_npi_licensor_present", "sentential_negation_npi_scope", "sentential_subject_island",
-              "superlative_quantifiers_1", "superlative_quantifiers_2", "tough_vs_raising_1", "tough_vs_raising_2",
-              "transitive", "wh_island", "wh_questions_object_gap", "wh_questions_subject_gap",
-              "wh_questions_subject_gap_long_distance", "wh_vs_that_no_gap", "wh_vs_that_no_gap_long_distance", "wh_vs_that_with_gap",
-              "wh_vs_that_with_gap_long_distance"
-    ],
-    "blimp_supplement": ["hypernym", "qa_congruence_easy", "qa_congruence_tricky",
-                         "subject_aux_inversion", "turn_taking"],
-    "ewok": ["agent-properties", "material-dynamics", "material-properties", "physical-dynamics",
-             "physical-interactions", "physical-relations", "quantitative-properties",
-             "social-interactions", "social-properties", "social-relations", "spatial-relations"]
-}
-
-VISION_TASKS = {
-    "vqa": ["vqa"],
-    "winoground": ["winoground"],
-    "devbench": ["lex-viz_vocab", "gram-trog", "sem-things"]
-}
-
-NUM_EXPECTED_EXAMPLES = {
-    "glue": {
-        "cola": 522,
-        "sst2": 436,
-        "mrpc": 204,
-        "qqp": 20215,
-        "mnli": 4908,
-        "mnli-mm": 4916,
-        "qnli": 2732,
-        "rte": 139,
-        "boolq": 1635,
-        "multirc": 2424,
-        "wsc": 52
-    },
-    "blimp": {
-        "adjunct_island": 928,
-        "anaphor_gender_agreement": 971,
-        "anaphor_number_agreement": 931,
-        "animate_subject_passive": 895,
-        "animate_subject_trans": 923,
-        "causative": 818,
-        "complex_NP_island": 846,
-        "coordinate_structure_constraint_complex_left_branch": 906,
-        "coordinate_structure_constraint_object_extraction": 949,
-        "determiner_noun_agreement_1": 929,
-        "determiner_noun_agreement_2": 931,
-        "determiner_noun_agreement_irregular_1": 681,
-        "determiner_noun_agreement_irregular_2": 820,
-        "determiner_noun_agreement_with_adjective_1": 933,
-        "determiner_noun_agreement_with_adj_2": 941,
-        "determiner_noun_agreement_with_adj_irregular_1": 718,
-        "determiner_noun_agreement_with_adj_irregular_2": 840,
-        "distractor_agreement_relational_noun": 788,
-        "distractor_agreement_relative_clause": 871,
-        "drop_argument": 920,
-        "ellipsis_n_bar_1": 802,
-        "ellipsis_n_bar_2": 828,
-        "existential_there_object_raising": 812,
-        "existential_there_quantifiers_1": 930,
-        "existential_there_quantifiers_2": 911,
-        "existential_there_subject_raising": 924,
-        "expletive_it_object_raising": 759,
-        "inchoative": 855,
-        "intransitive": 868,
-        "irregular_past_participle_adjectives": 961,
-        "irregular_past_participle_verbs": 942,
-        "irregular_plural_subject_verb_agreement_1": 804,
-        "irregular_plural_subject_verb_agreement_2": 892,
-        "left_branch_island_echo_question": 947,
-        "left_branch_island_simple_question": 951,
-        "matrix_question_npi_licensor_present": 929,
-        "npi_present_1": 909,
-        "npi_present_2": 914,
-        "only_npi_licensor_present": 882,
-        "only_npi_scope": 837,
-        "passive_1": 840,
-        "passive_2": 903,
-        "principle_A_case_1": 912,
-        "principle_A_case_2": 915,
-        "principle_A_c_command": 946,
-        "principle_A_domain_1": 914,
-        "principle_A_domain_2": 915,
-        "principle_A_domain_3": 941,
-        "principle_A_reconstruction": 967,
-        "regular_plural_subject_verb_agreement_1": 890,
-        "regular_plural_subject_verb_agreement_2": 945,
-        "sentential_negation_npi_licensor_present": 919,
-        "sentential_negation_npi_scope": 871,
-        "sentential_subject_island": 961,
-        "superlative_quantifiers_1": 979,
-        "superlative_quantifiers_2": 986,
-        "tough_vs_raising_1": 948,
-        "tough_vs_raising_2": 920,
-        "transitive": 868,
-        "wh_island": 960,
-        "wh_questions_object_gap": 859,
-        "wh_questions_subject_gap": 898,
-        "wh_questions_subject_gap_long_distance": 857,
-        "wh_vs_that_no_gap": 861,
-        "wh_vs_that_no_gap_long_distance": 875,
-        "wh_vs_that_with_gap": 919,
-        "wh_vs_that_with_gap_long_distance": 910
-    },
-    "blimp_supplement": {
-        "hypernym": 842,
-        "qa_congruence_easy": 64,
-        "qa_congruence_tricky": 165,
-        "subject_aux_inversion": 3867,
-        "turn_taking": 280
-    },
-    "ewok": {
-        "agent-properties": 2210,
-        "material-dynamics": 770,
-        "material-properties": 170,
-        "physical-dynamics": 120,
-        "physical-interactions": 556,
-        "physical-relations": 818,
-        "quantitative-properties": 314,
-        "social-interactions": 294,
-        "social-properties": 328,
-        "social-relations": 1548,
-        "spatial-relations": 490
-    },
-    "vqa": {
-        "vqa": 25230
-    },
-    "winoground": {
-        "winoground": 746
-    },
-    "devbench": {
-        "lex-viz_vocab": 119,
-        "gram-trog": 76,
-        "sem-things": 1854
-    }
-}
+BENCHMARK_COLS = [t.value.col_name for t in Tasks]
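For readers unfamiliar with the pattern kept in this file, the AutoEvalColumn_* classes are built dynamically: a list of (attribute name, type, field(...)) tuples is assembled and handed to dataclasses.make_dataclass. A minimal sketch of the same idiom (illustrative column names; the leaderboard's real ColumnContent class has more attributes):

    from dataclasses import make_dataclass, field, fields

    # Stand-in for the leaderboard's ColumnContent helper class.
    ColumnContent = make_dataclass(
        "ColumnContent", [("name", str), ("type", str), ("displayed_by_default", bool)]
    )

    # Entries mirror auto_eval_column_dict_mib_subgraph: (attr, type, field(...)).
    # default_factory is required because ColumnContent instances are mutable
    # and dataclasses forbid mutable defaults.
    column_dict = [
        ("method", ColumnContent,
         field(default_factory=lambda: ColumnContent("Method", "str", True))),
    ]
    AutoEvalColumn = make_dataclass("AutoEvalColumn", column_dict, frozen=True)

    # fields() works on the generated class, which is how the COLS / EVAL_COLS
    # list comprehensions in this file extract names and types.
    print([f.name for f in fields(AutoEvalColumn)])  # ['method']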
src/leaderboard/read_evals.py
CHANGED
@@ -123,11 +123,9 @@ class EvalResult_MIB_SUBGRAPH:
 
         # Initialize all possible columns with '-'
         expected_models = TasksMib_Subgraph.get_all_models()
-        # expected_tasks = TasksMib_Subgraph.get_all_tasks()
 
         for task in TasksMib_Subgraph:
             for model in task.value.models:
-                # print(f"task is {task}, task.value.benchmark is {task.value.benchmark}, model is {model}")
                 data_dict[f"{task.value.benchmark}_{model}"] = '-'
 
         all_scores = []

@@ -167,11 +165,8 @@
 def get_raw_eval_results_mib_subgraph(results_path: str) -> List[EvalResult_MIB_SUBGRAPH]:
     """From the path of the results folder root, extract all needed info for MIB results"""
     model_result_filepaths = []
-
-    # print(f"results_path is {results_path}")
-
+
     for root, dirnames, files in os.walk(results_path):
-        # print(f"root is {root}, dirnames is {dirnames}, files is {files}")
         # We should only have json files in model results
         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
             continue

@@ -185,14 +180,11 @@ def get_raw_eval_results_mib_subgraph(results_path: str) -> List[EvalResult_MIB_
         for file in files:
             model_result_filepaths.append(os.path.join(root, file))
 
-    # print(f"model_result_filepaths is {model_result_filepaths}")
-
     eval_results = []
     for model_result_filepath in model_result_filepaths:
         try:
             eval_result = EvalResult_MIB_SUBGRAPH("", "", {})  # Create empty instance
             result = eval_result.init_from_json_file(model_result_filepath)
-            # print(f"eval_result.init_from_json_file(model_result_filepath) is {result}")
             # Verify the result can be converted to dict format
             result.to_dict()
             eval_results.append(result)

@@ -236,9 +228,6 @@ def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
     return aggregated_df
 
 
-
-
-
 def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
     """
     Creates a DataFrame where columns are model_task and cells are averaged over interventions.
src/submission/check_validity.py
CHANGED
@@ -16,7 +16,6 @@ from huggingface_hub.hf_api import ModelInfo
 from transformers import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
 
-from src.display.utils import TEXT_TASKS, VISION_TASKS, NUM_EXPECTED_EXAMPLES
 from src.envs import EVAL_REQUESTS_SUBGRAPH, EVAL_REQUESTS_CAUSALGRAPH
 
 TASKS = ["ioi", "mcqa", "arithmetic-addition", "arithmetic-subtraction", "arc-easy", "arc-challenge"]

@@ -246,72 +245,6 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
     return set(file_names), users_to_submission_dates
 
 
-def is_valid_predictions(predictions: dict) -> tuple[bool, str]:
-    out_msg = ""
-    for task in TEXT_TASKS:
-        if task not in predictions:
-            out_msg = f"Error: {task} not present"
-            break
-        for subtask in TEXT_TASKS[task]:
-            if subtask not in predictions[task]:
-                out_msg = f"Error: {subtask} not present under {task}"
-                break
-        if out_msg != "":
-            break
-    if "vqa" in predictions or "winoground" in predictions or "devbench" in predictions:
-        for task in VISION_TASKS:
-            if task not in predictions:
-                out_msg = f"Error: {task} not present"
-                break
-            for subtask in VISION_TASKS[task]:
-                if subtask not in predictions[task]:
-                    out_msg = f"Error: {subtask} not present under {task}"
-                    break
-            if out_msg != "":
-                break
-
-    # Make sure all examples have predictions, and that predictions are the correct type
-    for task in predictions:
-        for subtask in predictions[task]:
-            if task == "devbench":
-                a = np.array(predictions[task][subtask]["predictions"])
-                if subtask == "sem-things":
-                    required_shape = (1854, 1854)
-                elif subtask == "gram-trog":
-                    required_shape = (76, 4, 1)
-                elif subtask == "lex-viz_vocab":
-                    required_shape = (119, 4, 1)
-                if a.shape[0] != required_shape[0] or a.shape[1] != required_shape[1]:
-                    out_msg = f"Error: Wrong shape for results for `{subtask}` in `{task}`."
-                    break
-                if not str(a.dtype).startswith("float"):
-                    out_msg = f"Error: Results for `{subtask}` ({task}) \
-                        should be floats but aren't."
-                    break
-                continue
-
-            num_expected_examples = NUM_EXPECTED_EXAMPLES[task][subtask]
-            if len(predictions[task][subtask]["predictions"]) != num_expected_examples:
-                out_msg = f"Error: {subtask} has the wrong number of examples."
-                break
-
-            if task == "glue":
-                if type(predictions[task][subtask]["predictions"][0]["pred"]) != int:
-                    out_msg = f"Error: results for `{subtask}` (`{task}`) should be integers but aren't."
-                    break
-            else:
-                if type(predictions[task][subtask]["predictions"][0]["pred"]) != str:
-                    out_msg = f"Error: results for `{subtask}` (`{task}`) should be strings but aren't."
-                    break
-
-        if out_msg != "":
-            break
-
-    if out_msg != "":
-        return False, out_msg
-    return True, "Upload successful."
-
-
 def _format_time(earliest_time):
     time_left = (earliest_time.tz_convert("UTC") + timedelta(weeks=1)) - pd.Timestamp.utcnow()
     hours = time_left.seconds // 3600
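The kept _format_time helper computes how long until the oldest submission ages out of the 7-day window. One pandas subtlety it has to respect: Timedelta.seconds is only the sub-day remainder, not the total duration, so days and hours must be read separately. A standalone illustration (made-up timestamps, independent of the Space's queue data):

    import pandas as pd
    from datetime import timedelta

    earliest = pd.Timestamp.utcnow() - timedelta(days=2, hours=5)
    time_left = (earliest + timedelta(weeks=1)) - pd.Timestamp.utcnow()
    # .seconds is the remainder below one day; .days carries the rest
    print(f"{time_left.days} days, {time_left.seconds // 3600} hours left")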
src/submission/submit.py
CHANGED
@@ -9,7 +9,6 @@ from src.submission.check_validity import (
     already_submitted_models,
     get_model_size,
     is_model_on_hub,
-    is_valid_predictions,
     parse_huggingface_url
 )
 import gradio as gr

@@ -89,101 +88,6 @@ def upload_to_queue(track, hf_repo_circ, hf_repo_cg, level, method_name, contact
         gr.Column(visible=False)
     ]
 
-def add_new_eval(
-    model_name: str,
-    model_id: str,
-    revision: str,
-    track: str,
-    predictions: dict,
-):
-    global REQUESTED_MODELS
-    global USERS_TO_SUBMISSION_DATES
-    if not REQUESTED_MODELS:
-        REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
-
-    out_message = ""
-
-    user_name = ""
-    model_path = model_name
-    if "/" in model_name:
-        user_name = model_name.split("/")[0]
-        model_path = model_name.split("/")[1]
-
-    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-
-    if track is None:
-        return styled_error("Please select a track.")
-
-    # Does the model actually exist?
-    if revision == "":
-        revision = "main"
-
-    out_message = ""
-
-    # Is the model info correctly filled?
-    print("Made it before 1")
-    try:
-        model_info = API.model_info(repo_id=model_id, revision=revision)
-    except Exception:
-        out_message += styled_warning("Could not get your model information. The leaderboard entry will not have a link to its HF repo.") + "<br>"
-    print("Made it after 1")
-
-    try:
-        predictions_OK, error_msg = is_valid_predictions(predictions)
-        if not predictions_OK:
-            return styled_error(error_msg) + "<br>"
-    except:
-        return styled_error(error_msg) + "<br>"
-
-    print("Made it after 3")
-
-    # Seems good, creating the eval
-    print("Adding new eval")
-
-    eval_entry = {
-        "model_name": model_name,
-        "hf_repo": model_id,
-        "revision": revision,
-        "track": track,
-        "predictions": predictions,
-        "status": "PENDING",
-        "submitted_time": current_time,
-    }
-
-    print("Made it after 4")
-
-    # Check for duplicate submission
-    if f"{model_name}_{revision}_{track}" in REQUESTED_MODELS:
-        return styled_error("A model with this name has been already submitted.")
-
-    print("Creating eval file")
-    OUT_DIR = f"{EVAL_REQUESTS}/{user_name}"
-    os.makedirs(OUT_DIR, exist_ok=True)
-    out_path = f"{OUT_DIR}/{model_path}_{revision}_eval_request_False_{track}.json"
-
-    print("Made it after 5")
-
-    with open(out_path, "w") as f:
-        f.write(json.dumps(eval_entry))
-
-    print("Uploading eval file")
-    API.upload_file(
-        path_or_fileobj=out_path,
-        path_in_repo=out_path.split("eval-queue/")[1],
-        repo_id=QUEUE_REPO,
-        repo_type="dataset",
-        commit_message=f"Add {model_name} to eval queue",
-    )
-
-    print("Made it after 6")
-
-    # Remove the local file
-    os.remove(out_path)
-
-    return styled_message(
-        "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the request to show in the PENDING list."
-    )
-
 def remove_submission(track: str, method_name: str, _id: str):
     if track is None:
         return gr.Textbox(f"Please select a track.", visible=True)