Merge branch 'gal-beniamini/cleanup'
Files changed:
- README.md +3 -3
- app.py +67 -126
- scripts/upload_f1_dataset.py +20 -5
- src/about.py +7 -18
- src/datamodel/data.py +21 -10
- src/display/__init__.py +0 -0
- src/display/css_html_js.py +1 -1
- src/display/formatting.py +0 -17
- src/display/utils.py +8 -70
- src/leaderboard/read_evals.py +0 -196
- src/logger.py +5 -1
- src/populate.py +23 -48
- src/submission/check_validity.py +0 -102
- src/submission/submit.py +42 -69
- src/validation/__init__.py +0 -0
- src/validation/validate.py +89 -0
README.md
CHANGED
@@ -41,9 +41,9 @@ If you encounter problem on the space, don't hesitate to restart it to remove th
 # Code logic for more complex edits

 You'll find
--
--
--
+- The main table' columns names and properties in `src/display/utils.py`
+- The logic to read all results and request files, then convert them in dataframe lines, in `src/populate.py`
+- The logic to allow or filter submissions in `src/submission/submit.py`.


 # Setting up the environment
app.py
CHANGED
@@ -1,45 +1,34 @@
-import random
-
 import gradio as gr
-from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
+from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns

-
-
-from src.about import (
-    CITATION_BUTTON_LABEL,
-    CITATION_BUTTON_TEXT,
-    EVALUATION_QUEUE_TEXT,
-    INTRODUCTION_TEXT,
-    LLM_BENCHMARKS_TEXT,
-    TITLE,
-)
+from display.formatting import styled_error
+from src.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT, TITLE
 from src.datamodel.data import F1Data
-
 from src.display.css_html_js import custom_css
-
-from src.
-    # BENCHMARK_COLS,
-    COLS,
-    EVAL_COLS,
-    EVAL_TYPES,
-    AutoEvalColumn,
-    ModelType,
-    fields,
-    WeightType,
-    Precision,
-)
-from src.envs import API, REPO_ID, TOKEN, CODE_PROBLEMS_REPO, SUBMISSIONS_REPO, RESULTS_REPO
+from src.display.utils import AutoEvalColumn, ModelType, fields
+from src.envs import API, CODE_PROBLEMS_REPO, REPO_ID, RESULTS_REPO, SUBMISSIONS_REPO
 from src.logger import get_logger
-
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.populate import get_leaderboard_df
 from src.submission.submit import add_new_solutions
+from src.validation.validate import MAX_INPUT_LENGTH, MIN_INPUT_LENGTH, is_submission_file_valid, is_valid

 logger = get_logger(__name__)

+ENSURE_ALL_PRESENT = False  # TODO: Switch to True.
 SPLIT = "warmup"  # TODO temp
-
+
+lbdb = F1Data(
+    cp_ds_name=CODE_PROBLEMS_REPO,
+    sub_ds_name=SUBMISSIONS_REPO,
+    res_ds_name=RESULTS_REPO,
+    split=SPLIT,
+)
+
+leaderboard_df = None
+
+logger.info("Initialized LBDB")


 def restart_space():
@@ -65,9 +54,11 @@ def refresh_leaderboard_data():
     return None


-def init_leaderboard(dataframe):
+def init_leaderboard(dataframe: pd.DataFrame):
+
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
+
     return Leaderboard(
         value=dataframe,
         datatype=[c.type for c in fields(AutoEvalColumn)],
@@ -80,39 +71,14 @@ def init_leaderboard(dataframe):
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[
             ColumnFilter(AutoEvalColumn.system_type.name, type="checkboxgroup", label="Model types"),
-            # ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            # ColumnFilter(
-            #     AutoEvalColumn.params.name,
-            #     type="slider",
-            #     min=0.01,
-            #     max=150,
-            #     label="Select the number of parameters (B)",
-            # ),
-            # ColumnFilter(AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True),
         ],
         bool_checkboxgroup_label="Hide models",
         interactive=False,
     )


-
-
-leaderboard_df = None
-
-logger.info("Initialized LBDB")
-
-# (
-#     finished_eval_queue_df,
-#     running_eval_queue_df,
-#     pending_eval_queue_df,
-# ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-
-
-# Display image using Markdown
-# banner = ""
-
-demo = gr.Blocks(css=custom_css)
-with demo:
+blocks = gr.Blocks(css=custom_css)
+with blocks:
     gr.Image(
         "assets/banner.png",
         interactive=False,
@@ -121,7 +87,6 @@ with demo:
         container=False,
     )

-    # gr.Markdown(banner)
    gr.HTML(
        """
        <style>
@@ -149,53 +114,15 @@ with demo:
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 FormulaOne Leaderboard", elem_id="formulaone-leaderboar-tab-table", id=0):
            refresh_leaderboard_data()  # updates leaderboard_df
+            assert leaderboard_df is not None
            leaderboard_component = init_leaderboard(leaderboard_df)

-        # with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=1):
-        #     logger.info("Tab about")
-        #     gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=2):
            logger.info("Tab submission")
            with gr.Column():
                with gr.Row():
                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

-                # with gr.Column():
-                #     with gr.Accordion(
-                #         f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                #         open=False,
-                #     ):
-                #         with gr.Row():
-                #             finished_eval_table = gr.components.Dataframe(
-                #                 value=finished_eval_queue_df,
-                #                 headers=EVAL_COLS,
-                #                 datatype=EVAL_TYPES,
-                #                 row_count=5,
-                #             )
-                #     with gr.Accordion(
-                #         f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                #         open=False,
-                #     ):
-                #         with gr.Row():
-                #             running_eval_table = gr.components.Dataframe(
-                #                 value=running_eval_queue_df,
-                #                 headers=EVAL_COLS,
-                #                 datatype=EVAL_TYPES,
-                #                 row_count=5,
-                #             )
-
-                #     with gr.Accordion(
-                #         f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                #         open=False,
-                #     ):
-                #         with gr.Row():
-                #             pending_eval_table = gr.components.Dataframe(
-                #                 value=pending_eval_queue_df,
-                #                 headers=EVAL_COLS,
-                #                 datatype=EVAL_TYPES,
-                #                 row_count=5,
-                #             )
                with gr.Row():
                    gr.Markdown("# ✉️✨ Submit your solutions here!", elem_classes="markdown-text")

@@ -203,7 +130,6 @@ with demo:
                with gr.Column():
                    system_name_textbox = gr.Textbox(label=AutoEvalColumn.system.name)
                    org_textbox = gr.Textbox(label=AutoEvalColumn.organization.name)
-                    # revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                    sys_type_dropdown = gr.Dropdown(
                        choices=[t.to_str(" ") for t in ModelType],
                        label=AutoEvalColumn.system_type.name,
@@ -212,31 +138,53 @@ with demo:
                        interactive=True,
                    )

-                    # with gr.Column():
                    submission_file = gr.File(label="JSONL solutions file", file_types=[".jsonl"])
-                    #     precision = gr.Dropdown(
-                    #         choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                    #         label="Precision",
-                    #         multiselect=False,
-                    #         value="float16",
-                    #         interactive=True,
-                    #     )
-                    #     weight_type = gr.Dropdown(
-                    #         choices=[i.value.name for i in WeightType],
-                    #         label="Weights type",
-                    #         multiselect=False,
-                    #         value="Original",
-                    #         interactive=True,
-                    #     )
-                    #     base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

            logger.info("Submit button")
            submit_button = gr.Button("Submit")
            submission_result = gr.Markdown()

-            def add_solution_cbk(
+            def add_solution_cbk(
+                system_name: str,
+                org: str,
+                sys_type: str,
+                submission_path: str,
+            ):
+
+                try:
+                    # Validating the submission file.
+                    if len(submission_path) == 0:
+                        return styled_error("Please upload JSONL submission file.")
+
+                    if not is_submission_file_valid(submission_path):
+                        return styled_error("Failed to read JSONL submission file. Please try again later.")
+
+                    # Validating all user-supplied arguments.
+                    for val, val_name in [
+                        (system_name, "System name"),
+                        (org, "Organisation name"),
+                        (sys_type, "System type"),
+                    ]:
+                        if len(val) == 0:
+                            return styled_error(f"Please fill in the '{val_name}' field.")
+
+                        if not is_valid(val):
+                            return styled_error(
+                                f"{val_name} is invalid! Must only contain characters [a-zA-Z0-9], spaces, "
+                                + "or the special characters '-' and '.', and be of length between "
+                                + f"{MIN_INPUT_LENGTH} and {MAX_INPUT_LENGTH}."
+                            )
+                except Exception:
+                    logger.warning("Failed to process user submission", exc_info=True)
+                    return styled_error("An error occurred. Please try again later.")  # Intentionally vague.
+
                return add_new_solutions(
-                    lbdb,
+                    lbdb,
+                    system_name,
+                    org,
+                    sys_type,
+                    submission_path,
+                    ensure_all_present=ENSURE_ALL_PRESENT,
                )

            submit_button.click(
@@ -257,16 +205,9 @@ with demo:
                value=CITATION_BUTTON_TEXT.strip(),
                elem_id="citation-block",
            )
-            # citation_button = gr.Textbox(
-            #     value=CITATION_BUTTON_TEXT,
-            #     # label=CITATION_BUTTON_LABEL,
-            #     lines=20,
-            #     elem_id="citation-button",
-            #     show_copy_button=True,
-            # )

    # UI refresh triggers latest data swap. The work already happened in the background - refresh_leaderboard_data().
-
+    blocks.load(lambda: leaderboard_df, inputs=[], outputs=[leaderboard_component])


 logger.info("Scheduler")
@@ -275,5 +216,5 @@ scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.add_job(refresh_leaderboard_data, "interval", seconds=120)
 scheduler.start()
 logger.info("Launch")
-
+blocks.queue(default_concurrency_limit=40).launch()
 logger.info("Done")
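Note: the new src/validation/validate.py added by this commit is not rendered in the hunks above. A minimal sketch consistent with the names app.py now imports (MIN_INPUT_LENGTH, MAX_INPUT_LENGTH, is_valid, is_submission_file_valid) might look like the following; the concrete bounds, the regex, and the JSONL check are assumptions, not the committed code:

    import json
    import re

    # Assumed bounds; the committed values may differ.
    MIN_INPUT_LENGTH = 3
    MAX_INPUT_LENGTH = 64

    # Letters, digits, spaces, '-' and '.', mirroring the error message in app.py.
    _ALLOWED = re.compile(r"[a-zA-Z0-9 .\-]+")


    def is_valid(value: str) -> bool:
        # Length bounds plus character whitelist.
        return (
            MIN_INPUT_LENGTH <= len(value) <= MAX_INPUT_LENGTH
            and _ALLOWED.fullmatch(value) is not None
        )


    def is_submission_file_valid(path: str) -> bool:
        # Best-effort check that the uploaded file parses as JSON Lines.
        try:
            with open(path, "r", encoding="utf-8") as fh:
                for line in fh:
                    if line.strip():
                        json.loads(line)
            return True
        except (OSError, ValueError):
            return False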
scripts/upload_f1_dataset.py
CHANGED
@@ -2,6 +2,7 @@ import argparse
 import fnmatch
 import json
 import os
+from typing import Iterator

 from datasets import Dataset

@@ -13,9 +14,23 @@ logger = get_logger(__name__)

 def get_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument(
-
-
+    parser.add_argument(
+        "--input_dir",
+        type=str,
+        help="Dir with .json files",
+        required=True,
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=f"{CODE_PROBLEMS_REPO}",
+    )
+    parser.add_argument(
+        "--split",
+        type=str,
+        choices=["hard", "warmup"],
+        default="hard",
+    )
     return parser.parse_args()


@@ -26,7 +41,7 @@ def main(args: argparse.Namespace) -> None:
         raise ValueError(f"No .json files in input dir {args.input_dir}")
     logger.info("Found %d code problems in %s", len(input_files), args.input_dir)

-    def ds_generator():
+    def ds_generator() -> Iterator[dict]:
         for fname in sorted(input_files):
             formula_name = os.path.splitext(fname)[0]
             cp_path = os.path.join(args.input_dir, fname)
@@ -35,7 +50,7 @@ def main(args: argparse.Namespace) -> None:
             logger.info("Read code problem for formula %s from %s", formula_name, cp_path)
             yield dict(id=code_problem["id"], code_problem=code_problem)

-    ds = Dataset.from_generator(ds_generator)
+    ds: Dataset = Dataset.from_generator(ds_generator)  # type: ignore
     logger.info("Created dataset")

     ds.push_to_hub(args.dataset_name, split=args.split, private=True)
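With the argparse flags above, a typical invocation of the uploader would look something like this (the input directory is a placeholder; --dataset_name defaults to CODE_PROBLEMS_REPO and --split defaults to "hard", so both can be omitted):

    python scripts/upload_f1_dataset.py --input_dir ./code_problems --split warmup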
src/about.py
CHANGED
@@ -9,20 +9,11 @@ class Task:
     col_name: str


-# Select your tasks here
-# ---------------------------------------------------
 class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     task0 = Task("FormulaOne", "success_rate", "Success Rate (%)")
-    # task1 = Task("logiqa", "acc_norm", "LogiQA")


-NUM_FEWSHOT = 0
-# ---------------------------------------------------
-
-
-# Your leaderboard name
-# TITLE = """<h1 align="center" id="space-title">AAI FormulaOne Leaderboard</h1>"""
+NUM_FEWSHOT = 0

 TITLE = """
 <h1 id="space-title" style="
@@ -39,19 +30,17 @@ TITLE = """
 </h1>
 """

-# What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
 Welcome to the official leaderboard for the paper:

 **FormulaOne: Measuring the Depth of Algorithmic Reasoning Beyond Competitive Programming** <br>
-*Gal Beniamini, Yuval Dor, Alon Vinnikov, Shir Granot Peled, Or Weinstein, Or Sharir, Noam Wies, Tomer Nussbaum, Ido Ben Shaul, Tomer Zekharya, Yoav Levine, Shai Shalev-Shwartz, Amnon Shashua* <br>
+*Gal Beniamini, Yuval Dor, Alon Vinnikov, Shir Granot Peled, Or Weinstein, Or Sharir, Noam Wies, Tomer Nussbaum, Nadav Schweiger, Ido Ben Shaul, Tomer Zekharya, Yoav Levine, Shai Shalev-Shwartz, Amnon Shashua* <br>
 **AAI, July 2025**

 FormulaOne is a new benchmark designed to challenge frontier AI models. The benchmark is constructed from a vast and conceptually diverse family of dynamic programming problems derived from Monadic Second-Order (MSO) logic on graphs, a framework with profound connections to theoretical computer science.
 """

-
-LLM_BENCHMARKS_TEXT = f"""
+LLM_BENCHMARKS_TEXT = """
 ## How it works

 ## Reproducibility
@@ -95,7 +84,7 @@ Submissions must:
 - **Organization**
 - **System Type**
 - Click **Submit**.
-
+
 ### ⏱️ After Submission

 Submissions are validated and evaluated within ~24 hours. Results will appear on the leaderboard once processed.
@@ -105,12 +94,12 @@ Submissions are validated and evaluated within ~24 hours. Results will appear on
 CITATION_BUTTON_LABEL = """📚 How to cite FormulaOne"""
 CITATION_BUTTON_TEXT = r"""
 @misc{beniamini2025formulaonemeasuringdepthalgorithmic,
-      title={FormulaOne: Measuring the Depth of Algorithmic Reasoning Beyond Competitive Programming},
-      author={Gal Beniamini and Yuval Dor and Alon Vinnikov and Shir Granot Peled and Or Weinstein and Or Sharir and Noam Wies and Tomer Nussbaum and Ido Ben Shaul and Tomer Zekharya and Yoav Levine and Shai Shalev-Shwartz and Amnon Shashua},
+      title={FormulaOne: Measuring the Depth of Algorithmic Reasoning Beyond Competitive Programming},
+      author={Gal Beniamini and Yuval Dor and Alon Vinnikov and Shir Granot Peled and Or Weinstein and Or Sharir and Noam Wies and Tomer Nussbaum and Nadav Schweiger and Ido Ben Shaul and Tomer Zekharya and Yoav Levine and Shai Shalev-Shwartz and Amnon Shashua},
       year={2025},
       eprint={2507.13337},
       archivePrefix={arXiv},
       primaryClass={cs.AI},
-      url={https://arxiv.org/abs/2507.13337},
+      url={https://arxiv.org/abs/2507.13337},
 }
 """
src/datamodel/data.py
CHANGED
@@ -3,14 +3,20 @@ import time

 from datasets import load_dataset

-from src.envs import
+from src.envs import CODE_PROBLEMS_REPO, RESULTS_REPO, SUBMISSIONS_REPO, TOKEN
 from src.logger import get_logger

 logger = get_logger(__name__)


 class F1Data:
-    def __init__(
+    def __init__(
+        self,
+        cp_ds_name: str,  # Name of the dataset. Fixed.
+        sub_ds_name: str,  # Name of subdataset. Fixed.
+        res_ds_name: str,  # Name of results repository. Fixed.
+        split: str = "hard",  # Split is either 'hard' or 'easy'.
+    ):
         self.cp_dataset_name = cp_ds_name
         self.submissions_dataset_name = sub_ds_name
         self.results_dataset_name = res_ds_name
@@ -19,16 +25,16 @@ class F1Data:
         self._initialize()

     def _initialize(self):
-        logger.info("Initialize F1Data TOKEN='
+        logger.info(f"Initialize F1Data TOKEN='{TOKEN}'")
         start_time = time.monotonic()
-        cp_ds = load_dataset(
-        logger.info(
-            "Loaded code-problems dataset from %s in %f sec",
+        cp_ds = load_dataset(
             self.cp_dataset_name,
-
+            split=self.split,
+            token=TOKEN,
         )
-
-
+        logger.info(f"Loaded code-problems dataset from {self.cp_dataset_name} in {time.monotonic() - start_time} sec")
+        self.code_problems = {r["id"]: r["code_problem"] for r in cp_ds}  # id string -> code problem.
+        logger.info(f"Loaded {len(self.code_problems)} code problems")

     @functools.cached_property
     def code_problem_ids(self) -> set[str]:
@@ -37,6 +43,11 @@ class F1Data:

 if __name__ == "__main__":
     split = "hard"
-    f1_data = F1Data(
+    f1_data = F1Data(
+        cp_ds_name=CODE_PROBLEMS_REPO,
+        sub_ds_name=SUBMISSIONS_REPO,
+        res_ds_name=RESULTS_REPO,
+        split=split,
+    )

     print(f"Found {len(f1_data.code_problem_ids)} code problems in {split} split of {f1_data.cp_dataset_name}")
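The body of the code_problem_ids cached property sits outside the hunks above; given that _initialize() now builds self.code_problems as an id-to-problem dict, a plausible implementation (an assumption, not necessarily the committed code) would simply be:

    @functools.cached_property
    def code_problem_ids(self) -> set[str]:
        # Assumed body: the keys of the id -> code problem mapping built in _initialize().
        return set(self.code_problems.keys())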
src/display/__init__.py
ADDED
(new empty file)
src/display/css_html_js.py
CHANGED
@@ -33,7 +33,7 @@ custom_css = """
     background: none;
     border: none;
 }
-
+
 #search-bar {
     padding: 0px;
 }
src/display/formatting.py
CHANGED
@@ -1,12 +1,3 @@
-def model_hyperlink(link, model_name):
-    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
-
-
-def make_clickable_model(model_name):
-    link = f"https://huggingface.co/{model_name}"
-    return model_hyperlink(link, model_name)
-
-
 def styled_error(error):
     return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"

@@ -17,11 +8,3 @@ def styled_warning(warn):

 def styled_message(message):
     return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
-
-
-def has_no_nan_values(df, columns):
-    return df[columns].notna().all(axis=1)
-
-
-def has_nan_values(df, columns):
-    return df[columns].isna().any(axis=1)
src/display/utils.py
CHANGED
@@ -1,19 +1,15 @@
-from dataclasses import dataclass
-from typing import ClassVar
+from dataclasses import dataclass
 from enum import Enum

-import pandas as pd
-
-from src.about import Tasks
-

 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]


-# These classes are for user facing column names,
-#
-
+# These classes are for user facing column names, to avoid having to change them
+# all around the code when a modification is needed.
+
+
 @dataclass
 class ColumnContent:
     name: str
@@ -23,41 +19,6 @@ class ColumnContent:
     never_hidden: bool = False


-## Leaderboard columns
-# auto_eval_column_fields = []
-# # Init
-# auto_eval_column_fields.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
-# auto_eval_column_fields.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-# # Scores
-# auto_eval_column_fields.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
-# for task in Tasks:
-#     auto_eval_column_fields.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
-# # Model information
-# auto_eval_column_fields.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-# auto_eval_column_fields.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-# auto_eval_column_fields.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-# auto_eval_column_fields.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-# auto_eval_column_fields.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-# auto_eval_column_fields.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-# auto_eval_column_fields.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-# auto_eval_column_fields.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-# auto_eval_column_fields.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
-#
-#
-#
-# def make_classvar_dataclass(name: str, spec: list):
-#     ns = {"__annotations__": {}}
-#     for field_name, field_type, default in spec:
-#         # Mark as ClassVar so dataclass doesn't treat it as an instance field
-#         ns["__annotations__"][field_name] = ClassVar[field_type]
-#         ns[field_name] = default
-#     # No instance fields; just class-level descriptors
-#     return make_dataclass(name, [], frozen=True, namespace=ns)
-#
-# # We use make dataclass to dynamically fill the scores from Tasks
-# AutoEvalColumn = make_classvar_dataclass("AutoEvalColumn", auto_eval_column_fields)
-
-
 @dataclass(frozen=True)
 class AutoEvalColumn:
     system = ColumnContent("System Name", "markdown", True, never_hidden=True)
@@ -68,18 +29,18 @@ class AutoEvalColumn:
     submitted_on = ColumnContent("Submitted On", "datetime", True)


-
+# For the queue columns in the submission tab
 @dataclass(frozen=True)
 class EvalQueueColumn:  # Queue column
     model = ColumnContent("model", "markdown", True)
     revision = ColumnContent("revision", "str", True)
     private = ColumnContent("private", "bool", True)
     precision = ColumnContent("precision", "str", True)
-    weight_type = ColumnContent("weight_type", "str",
+    weight_type = ColumnContent("weight_type", "str", True)
     status = ColumnContent("status", "str", True)


-
+# All the model information that we might need
 @dataclass
 class ModelDetails:
     name: str
@@ -90,8 +51,6 @@ class ModelDetails:
 class ModelType(Enum):
     LLM = ModelDetails(name="LLM", symbol="🟢")
     AgenticLLM = ModelDetails(name="AgenticLLM", symbol="🔶")
-    # IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
-    # RL = ModelDetails(name="RL-tuned", symbol="🟦")
     Other = ModelDetails(name="Other", symbol="?")

     def to_str(self, separator=" "):
@@ -103,36 +62,15 @@ class ModelType(Enum):
             return ModelType.AgenticLLM
         if "LLM" in type or "🟢" in type:
             return ModelType.LLM
-        # if "RL-tuned" in type or "🟦" in type:
-        #     return ModelType.RL
-        # if "instruction-tuned" in type or "⭕" in type:
-        #     return ModelType.IFT
         return ModelType.Other


-class WeightType(Enum):
-    Adapter = ModelDetails("Adapter")
-    Original = ModelDetails("Original")
-    Delta = ModelDetails("Delta")
-
-
 class Precision(Enum):
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
     Unknown = ModelDetails("?")

-    def from_str(precision):
-        if precision in ["torch.float16", "float16"]:
-            return Precision.float16
-        if precision in ["torch.bfloat16", "bfloat16"]:
-            return Precision.bfloat16
-        return Precision.Unknown
-

-# Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
-
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
-
-# BENCHMARK_COLS = [t.value.col_name for t in Tasks]
src/leaderboard/read_evals.py
DELETED
@@ -1,196 +0,0 @@
-import glob
-import json
-import math
-import os
-from dataclasses import dataclass
-
-import dateutil
-import numpy as np
-
-from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
-from src.submission.check_validity import is_model_on_hub
-
-
-@dataclass
-class EvalResult:
-    """Represents one full evaluation. Built from a combination of the result and request file for a given run.
-    """
-    eval_name: str  # org_model_precision (uid)
-    full_model: str  # org/model (path on hub)
-    org: str
-    model: str
-    revision: str  # commit hash, "" if main
-    results: dict
-    precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.LLM  # Pretrained, fine tuned, ...
-    weight_type: WeightType = WeightType.Original  # Original or Adapter
-    architecture: str = "Unknown"
-    license: str = "?"
-    likes: int = 0
-    num_params: int = 0
-    date: str = ""  # submission date of request file
-    still_on_hub: bool = False
-
-    @classmethod
-    def init_from_json_file(self, json_filepath):
-        """Inits the result from the specific model result file"""
-        with open(json_filepath) as fp:
-            data = json.load(fp)
-
-        config = data.get("config")
-
-        # Precision
-        precision = Precision.from_str(config.get("model_dtype"))
-
-        # Get model and org
-        org_and_model = config.get("model_name", config.get("model_args", None))
-        org_and_model = org_and_model.split("/", 1)
-
-        if len(org_and_model) == 1:
-            org = None
-            model = org_and_model[0]
-            result_key = f"{model}_{precision.value.name}"
-        else:
-            org = org_and_model[0]
-            model = org_and_model[1]
-            result_key = f"{org}_{model}_{precision.value.name}"
-        full_model = "/".join(org_and_model)
-
-        still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
-        )
-        architecture = "?"
-        if model_config is not None:
-            architectures = getattr(model_config, "architectures", None)
-            if architectures:
-                architecture = ";".join(architectures)
-
-        # Extract results available in this file (some results are split in several files)
-        results = {}
-        for task in Tasks:
-            task = task.value
-
-            # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
-            if accs.size == 0 or any([acc is None for acc in accs]):
-                continue
-
-            mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
-
-        return self(
-            eval_name=result_key,
-            full_model=full_model,
-            org=org,
-            model=model,
-            results=results,
-            precision=precision,
-            revision= config.get("model_sha", ""),
-            still_on_hub=still_on_hub,
-            architecture=architecture
-        )
-
-    def update_with_request_file(self, requests_path):
-        """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-
-        try:
-            with open(request_file, "r") as f:
-                request = json.load(f)
-            self.model_type = ModelType.from_str(request.get("model_type", ""))
-            self.weight_type = WeightType[request.get("weight_type", "Original")]
-            self.license = request.get("license", "?")
-            self.likes = request.get("likes", 0)
-            self.num_params = request.get("params", 0)
-            self.date = request.get("submitted_time", "")
-        except Exception:
-            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
-
-    def to_dict(self):
-        """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
-        data_dict = {
-            "eval_name": self.eval_name,  # not a column, just a save name,
-            AutoEvalColumn.precision.name: self.precision.value.name,
-            AutoEvalColumn.model_type.name: self.model_type.value.name,
-            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-            AutoEvalColumn.architecture.name: self.architecture,
-            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-            AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.average.name: average,
-            AutoEvalColumn.license.name: self.license,
-            AutoEvalColumn.likes.name: self.likes,
-            AutoEvalColumn.params.name: self.num_params,
-            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
-        }
-
-        for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
-
-        return data_dict
-
-
-def get_request_file_for_model(requests_path, model_name, precision):
-    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
-    request_files = os.path.join(
-        requests_path,
-        f"{model_name}_eval_request_*.json",
-    )
-    request_files = glob.glob(request_files)
-
-    # Select correct request file (precision)
-    request_file = ""
-    request_files = sorted(request_files, reverse=True)
-    for tmp_request_file in request_files:
-        with open(tmp_request_file, "r") as f:
-            req_content = json.load(f)
-            if (
-                req_content["status"] in ["FINISHED"]
-                and req_content["precision"] == precision.split(".")[-1]
-            ):
-                request_file = tmp_request_file
-    return request_file
-
-
-def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
-    """From the path of the results folder root, extract all needed info for results"""
-    model_result_filepaths = []
-
-    for root, _, files in os.walk(results_path):
-        # We should only have json files in model results
-        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
-            continue
-
-        # Sort the files by date
-        try:
-            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-        except dateutil.parser._parser.ParserError:
-            files = [files[-1]]
-
-        for file in files:
-            model_result_filepaths.append(os.path.join(root, file))
-
-    eval_results = {}
-    for model_result_filepath in model_result_filepaths:
-        # Creation of result
-        eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_result.update_with_request_file(requests_path)
-
-        # Store results of same eval together
-        eval_name = eval_result.eval_name
-        if eval_name in eval_results.keys():
-            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
-        else:
-            eval_results[eval_name] = eval_result
-
-    results = []
-    for v in eval_results.values():
-        try:
-            v.to_dict()  # we test if the dict version is complete
-            results.append(v)
-        except KeyError:  # not all eval values present
-            continue
-
-    return results
src/logger.py
CHANGED
@@ -1,7 +1,11 @@
 import logging
 import sys

-
+
+def get_logger(
+    filename: str,
+    level=logging.INFO,
+) -> logging.Logger:
     new_logger = logging.getLogger(filename)
     fmt = logging.Formatter(fmt="%(asctime)s - %(name)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s")
     handler = logging.StreamHandler(sys.stderr)
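A quick usage sketch of the updated helper; the DEBUG override is only an illustration of the new level keyword, not something this commit changes elsewhere:

    import logging

    from src.logger import get_logger

    logger = get_logger(__name__, level=logging.DEBUG)
    logger.debug("leaderboard refresh started")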
src/populate.py
CHANGED
@@ -1,27 +1,29 @@
-import json
-import os
-
 import pandas as pd
-from datasets import
+from datasets import get_dataset_config_names, load_dataset
 from datasets.exceptions import DatasetNotFoundError
 from tqdm.auto import tqdm

-from src.display.
-from src.display.utils import AutoEvalColumn, EvalQueueColumn
+from src.display.utils import AutoEvalColumn
 from src.envs import TOKEN
-from src.leaderboard.read_evals import get_raw_eval_results
 from src.logger import get_logger

 logger = get_logger(__name__)


 def get_leaderboard_df(results_dataset_name: str) -> pd.DataFrame:
-    """
+    """
+    @brief Creates a dataframe from all the individual experiment results.
+    """

     try:
-        configs = get_dataset_config_names(
+        configs = get_dataset_config_names(
+            results_dataset_name,
+            token=TOKEN,
+        )
     except (DatasetNotFoundError, FileNotFoundError):
+
         # Return an empty DataFrame with expected columns
+        logger.warning("Failed to load configuration", exc_info=True)
         return pd.DataFrame(
             columns=[
                 "System Name",
@@ -34,8 +36,17 @@ def get_leaderboard_df(results_dataset_name: str) -> pd.DataFrame:
         )

     rows = []
-    for submission_id in tqdm(
-
+    for submission_id in tqdm(
+        configs,
+        total=len(configs),
+        desc="Processing Submission Results",
+    ):
+        submission_ds = load_dataset(
+            results_dataset_name,
+            submission_id,
+            split="train",
+            token=TOKEN,
+        )
         submission_df = pd.DataFrame(submission_ds)

         if submission_df.empty or "did_pass" not in submission_df.columns or submission_df.did_pass.isna().any():
@@ -59,7 +70,7 @@ def get_leaderboard_df(results_dataset_name: str) -> pd.DataFrame:

     full_df = pd.DataFrame(rows)

-    # TODO:
+    # TODO: Forbid multiple submissions under the same name?
     # Keep only the latest entry per unique (System Name, System Type, Organization) triplet
     final_df = (
         full_df.sort_values("Submitted On", ascending=False)
@@ -72,39 +83,3 @@ def get_leaderboard_df(results_dataset_name: str) -> pd.DataFrame:
     final_df[cols_to_round] = final_df[cols_to_round].round(decimals=2)

     return final_df
-
-
-def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
-    """Creates the different dataframes for the evaluation queues requestes"""
-    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
-    all_evals = []
-
-    for entry in entries:
-        if ".json" in entry:
-            file_path = os.path.join(save_path, entry)
-            with open(file_path) as fp:
-                data = json.load(fp)
-
-            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-
-            all_evals.append(data)
-        elif ".md" not in entry:
-            # this is a folder
-            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
-            for sub_entry in sub_entries:
-                file_path = os.path.join(save_path, entry, sub_entry)
-                with open(file_path) as fp:
-                    data = json.load(fp)
-
-                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-                all_evals.append(data)
-
-    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
-    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
-    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
-    df_running = pd.DataFrame.from_records(running_list, columns=cols)
-    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
-    return df_finished[cols], df_running[cols], df_pending[cols]
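For context, the leaderboard refresh in app.py obtains its dataframe by calling this function on the results dataset; a minimal sketch of that call, assuming RESULTS_REPO from src/envs.py is the dataset name passed in:

    from src.envs import RESULTS_REPO
    from src.populate import get_leaderboard_df

    leaderboard_df = get_leaderboard_df(RESULTS_REPO)
    # Per the TODO above, only the latest entry per (System Name, System Type, Organization) triplet is kept.
    print(leaderboard_df.head())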
src/submission/check_validity.py
DELETED
@@ -1,102 +0,0 @@
-import json
-import os
-import re
-from collections import defaultdict
-from datetime import datetime, timedelta, timezone
-
-from datasets import get_dataset_config_names
-import huggingface_hub
-from huggingface_hub import ModelCard
-from huggingface_hub.hf_api import ModelInfo
-from transformers import AutoConfig
-from transformers.models.auto.tokenization_auto import AutoTokenizer
-
-from src.envs import SUBMISSIONS_REPO
-
-def check_model_card(repo_id: str) -> tuple[bool, str]:
-    """Checks if the model card and license exist and have been filled"""
-    try:
-        card = ModelCard.load(repo_id)
-    except huggingface_hub.utils.EntryNotFoundError:
-        return False, "Please add a model card to your model to explain how you trained/fine-tuned it."
-
-    # Enforce license metadata
-    if card.data.license is None:
-        if not ("license_name" in card.data and "license_link" in card.data):
-            return False, (
-                "License not found. Please add a license to your model card using the `license` metadata or a"
-                " `license_name`/`license_link` pair."
-            )
-
-    # Enforce card content
-    if len(card.text) < 200:
-        return False, "Please add a description to your model card, it is too short."
-
-    return True, ""
-
-def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
-    """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
-    try:
-        config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
-        if test_tokenizer:
-            try:
-                tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
-            except ValueError as e:
-                return (
-                    False,
-                    f"uses a tokenizer which is not in a transformers release: {e}",
-                    None
-                )
-            except Exception as e:
-                return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
-        return True, None, config
-
-    except ValueError:
-        return (
-            False,
-            "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
-            None
-        )
-
-    except Exception as e:
-        return False, "was not found on hub!", None
-
-
-def get_model_size(model_info: ModelInfo, precision: str):
-    """Gets the model size from the configuration, or the model name if the configuration does not contain the information."""
-    try:
-        model_size = round(model_info.safetensors["total"] / 1e9, 3)
-    except (AttributeError, TypeError):
-        return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
-
-    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
-    model_size = size_factor * model_size
-    return model_size
-
-def get_model_arch(model_info: ModelInfo):
-    """Gets the model architecture from the configuration"""
-    return model_info.config.get("architectures", "Unknown")
-
-def already_submitted_models(requested_models_dir: str) -> set[str]:
-    """Gather a list of already submitted models to avoid duplicates"""
-    depth = 1
-    file_names = []
-    users_to_submission_dates = defaultdict(list)
-
-    for root, _, files in os.walk(requested_models_dir):
-        current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
-        if current_depth == depth:
-            for file in files:
-                if not file.endswith(".json"):
-                    continue
-                with open(os.path.join(root, file), "r") as f:
-                    info = json.load(f)
-                    file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
-
-                    # Select organisation
-                    if info["model"].count("/") == 0 or "submitted_time" not in info:
-                        continue
-                    organisation, _ = info["model"].split("/")
-                    users_to_submission_dates[organisation].append(info["submitted_time"])
-
-    return set(file_names), users_to_submission_dates
src/submission/submit.py
CHANGED
@@ -1,50 +1,44 @@
-import json
-import os
-from datetime import datetime, timezone
 import time
+from datetime import datetime, timezone
 
-from datasets import Dataset, DatasetDict
 import pandas as pd
-from
+from datasets import Dataset
+from pandas.api.types import is_integer_dtype
 
+from app import is_valid
 from src.datamodel.data import F1Data
-from src.display.formatting import styled_error, styled_message
+from src.display.formatting import styled_error, styled_message
 from src.display.utils import ModelType
-from src.envs import
+from src.envs import SUBMISSIONS_REPO
 from src.logger import get_logger
-
-# from src.submission.check_validity import (
-#     already_submitted_models,
-#     check_model_card,
-#     get_model_size,
-#     is_model_on_hub,
-# )
+from src.validation.validate import is_submission_file_valid
 
 logger = get_logger(__name__)
 
 
-def
-
+def _validate_all_submissions_present(
+    lbdb: F1Data,
+    pd_ds: pd.DataFrame,
+):
+    logger.info(f"Validating DS size {len(pd_ds)} columns {pd_ds.columns} set {set(pd_ds.columns)}")
     expected_cols = ["problem_id", "solution"]
 
     if set(pd_ds.columns) != set(expected_cols):
-        return f"Expected attributes: {expected_cols}, Got: {pd_ds.columns.tolist()}"
+        raise ValueError(f"Expected attributes: {expected_cols}, Got: {pd_ds.columns.tolist()}")
 
     if not is_integer_dtype(pd_ds["problem_id"]):
-        return "problem_id must be str convertible to int"
+        raise ValueError("problem_id must be str convertible to int")
 
-    if any(type(v)
-        return "solution must be of type str"
+    if any(type(v) is not str for v in pd_ds["solution"]):
+        raise ValueError("solution must be of type str")
 
     submitted_ids = set(pd_ds.problem_id.astype(str))
    if submitted_ids != lbdb.code_problem_ids:
        missing = lbdb.code_problem_ids - submitted_ids
        unknown = submitted_ids - lbdb.code_problem_ids
-
+        raise ValueError(f"Mismatched problem IDs: {len(missing)} missing, {len(unknown)} unknown")
     if len(pd_ds) > len(lbdb.code_problem_ids):
-        return "Duplicate problem IDs exist in uploaded file"
-
-    return None
 
 
 def add_new_solutions(
@@ -53,36 +47,33 @@ def add_new_solutions(
     org: str,
     sys_type: str,
     submission_path: str,
-
+    ensure_all_present: bool = False,
 ):
-    logger.info(
-
-
+    logger.info(
+        f"Adding new submission! {system_name=}, {org=}, {sys_type=} and {submission_path=}",
+    )
 
-
-
+    # Double-checking.
+    for val in [system_name, org, sys_type]:
+        assert is_valid(val)
+    assert is_submission_file_valid(submission_path)
 
-    if not sys_type:
-        return styled_error("Please select system type")
     sys_type = ModelType.from_str(sys_type).name
 
-    if not submission_path:
-        return styled_error("Please upload JSONL solutions file")
-
     try:
         submission_df = pd.read_json(submission_path, lines=True)
-
-
-
-
-
-
-
+        if ensure_all_present:
+            _validate_all_submissions_present(lbdb=lbdb, pd_ds=submission_df)
+    except Exception:
+        logger.warning("Failed to parse submission DF!", exc_info=True)
+        return styled_error(
+            "An error occurred. Please try again later."
+        )  # Use same message as external error. Avoid infoleak.
 
     submission_id = f"{system_name}_{org}_{sys_type}_{datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')}"
 
-    # Seems good, creating the eval
-
+    # Seems good, creating the eval.
+    logger.info(f"Adding new submission: {submission_id}")
     submission_ts = time.time_ns()
 
     def add_info(row):
@@ -96,31 +87,13 @@ def add_new_solutions(
     }
 
     ds = Dataset.from_pandas(submission_df).map(add_info)
-
-
-
-
-
-    # print("Creating eval file")
-    # OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
-    # os.makedirs(OUT_DIR, exist_ok=True)
-    # out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
-
-    # with open(out_path, "w") as f:
-    #     f.write(json.dumps(eval_entry))
-
-    # print("Uploading eval file")
-    # API.upload_file(
-    #     path_or_fileobj=out_path,
-    #     path_in_repo=out_path.split("eval-queue/")[1],
-    #     repo_id=QUEUE_REPO,
-    #     repo_type="dataset",
-    #     commit_message=f"Add {model} to eval queue",
-    # )
-
-    # # Remove the local file
-    # os.remove(out_path)
+    ds.push_to_hub(
+        SUBMISSIONS_REPO,
+        submission_id,
+        private=True,
+    )
 
     return styled_message(
-        "Your request has been submitted to the evaluation queue!\
+        "Your request has been submitted to the evaluation queue!\n"
+        + "Results may take up to 24 hours to be processed and shown in the leaderboard."
     )
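
For reference, the flow above expects the uploaded solutions file to be JSON Lines with exactly the two attributes that `_validate_all_submissions_present` checks (`problem_id` and `solution`). A minimal sketch of producing such a file with the standard library; the file name, problem IDs and solution strings below are invented for illustration:

import json

# Hypothetical rows; a real submission must cover the leaderboard's actual problem IDs.
rows = [
    {"problem_id": 0, "solution": "def solve(x):\n    return x + 1\n"},
    {"problem_id": 1, "solution": "def solve(x):\n    return x * 2\n"},
]

with open("submission.jsonl", "w") as f:
    for row in rows:
        # One JSON object per line, with exactly the columns submit.py validates.
        f.write(json.dumps(row) + "\n")
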
src/validation/__init__.py
ADDED
File without changes

src/validation/validate.py
ADDED
@@ -0,0 +1,89 @@
+import json
+import os
+import string
+
+DATASET_SIZE = 120
+
+MIN_INPUT_LENGTH = 2
+MAX_INPUT_LENGTH = 20
+
+MIN_SUBMISSION_SIZE = 1
+MAX_SUBMISSION_SIZE = 1024 * 1024 * 120  # 120 MB.
+MAX_SINGLE_SUBMISSION_SIZE = 1024 * 1024  # 1 MB.
+MAX_SUBMISSION_LINES = DATASET_SIZE + 1  # Allow empty line.
+
+
+def is_valid(
+    s: str,
+    min_length: int = MIN_INPUT_LENGTH,
+    max_length: int = MAX_INPUT_LENGTH,
+) -> bool:
+    """
+    @brief Checks whether the given string is valid.
+    @param s The string to validate.
+    @return True iff all characters are in [a-zA-Z0-9], spaces, or '.' and '-', and the length is between
+            min length and max length.
+    """
+
+    characters = [c for c in s]  # Not using the length from len(.) as that includes unicode characters.
+    if len(characters) < min_length or len(characters) > max_length:
+        return False
+
+    # Very important: We delimit using underscores. So these _CANNOT_ be allowed in sanitised strings.
+    ALLOWED = (
+        [c for c in string.ascii_lowercase]
+        + [c for c in string.ascii_uppercase]
+        + [c for c in string.digits]
+        + [" ", ".", "-"]
+    )
+    for c in s:
+        if c not in ALLOWED:
+            return False
+    return True
+
+
+def is_submission_file_valid(submission_path: str) -> bool:
+    """
+    @brief Checks whether the given submission file is valid.
+    @param submission_path The path to the submission file.
+    @return True iff the file is within the size constraints, a JSONL, and every line is no longer than
+            the fixed maximum bound.
+    """
+
+    if not os.path.exists(submission_path):
+        return False
+
+    submission_size = os.stat(submission_path).st_size
+    if submission_size < MIN_SUBMISSION_SIZE or submission_size > MAX_SUBMISSION_SIZE:
+        return False
+
+    with open(submission_path, "r") as f:
+
+        # Not using readlines() to avoid consuming a large buffer at once.
+        n_lines = 0
+        seen_ids = set()
+        while len(line := f.readline(MAX_SINGLE_SUBMISSION_SIZE)) > 0:
+            n_lines += 1
+            if n_lines > MAX_SUBMISSION_LINES:
+                return False
+
+            if not line.strip().startswith("{") or not line.strip().endswith("}"):
+                return False
+
+            d = json.loads(line)
+            if set(d.keys()) != set(["problem_id", "solution"]):
+                return False
+
+            if not ((type(d["problem_id"]) is str or type(d["problem_id"]) is int) and type(d["solution"]) is str):
+                return False
+            if not str(d["problem_id"]).isdigit():
+                return False
+            problem_id = int(d["problem_id"])
+            if problem_id < 0 or problem_id >= DATASET_SIZE:
+                return False
+
+            if problem_id in seen_ids:
+                return False  # Duplicate submission.
+            seen_ids.add(problem_id)
+
+    return True
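
A quick usage sketch of the new validators (assuming the module resolves as `src.validation.validate`; the names and file below are illustrative only):

from src.validation.validate import is_submission_file_valid, is_valid

# System/organisation names: 2-20 characters drawn from letters, digits, spaces, '.' and '-'.
print(is_valid("My System-1.0"))  # True
print(is_valid("bad_name"))       # False: underscores are reserved as submission-id delimiters.

# Submission files: an existing JSONL within the size and line limits, with one
# {"problem_id": ..., "solution": ...} object per line and no duplicate IDs.
print(is_submission_file_valid("submission.jsonl"))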