galb-dai committed on
Commit 791ff8a · 2 Parent(s): fe968be 814f111

Merge branch 'gal-beniamini/cleanup'

README.md CHANGED
@@ -41,9 +41,9 @@ If you encounter problem on the space, don't hesitate to restart it to remove th
41
  # Code logic for more complex edits
42
 
43
  You'll find
44
- - the main table' columns names and properties in `src/display/utils.py`
45
- - the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
46
- - the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
47
 
48
 
49
  # Setting up the environment
 
41
  # Code logic for more complex edits
42
 
43
  You'll find
44
+ - The main table's column names and properties in `src/display/utils.py`
45
+ - The logic to read all results and request files, then convert them into dataframe rows, in `src/populate.py`
46
+ - The logic to allow or filter submissions in `src/submission/submit.py`
47
 
48
 
49
  # Setting up the environment
app.py CHANGED
@@ -1,45 +1,34 @@
1
- import random
2
-
3
  import gradio as gr
4
- from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
5
  import pandas as pd
6
  from apscheduler.schedulers.background import BackgroundScheduler
 
7
 
8
- # from huggingface_hub import snapshot_download
9
-
10
- from src.about import (
11
- CITATION_BUTTON_LABEL,
12
- CITATION_BUTTON_TEXT,
13
- EVALUATION_QUEUE_TEXT,
14
- INTRODUCTION_TEXT,
15
- LLM_BENCHMARKS_TEXT,
16
- TITLE,
17
- )
18
  from src.datamodel.data import F1Data
19
-
20
  from src.display.css_html_js import custom_css
21
-
22
- from src.display.utils import (
23
- # BENCHMARK_COLS,
24
- COLS,
25
- EVAL_COLS,
26
- EVAL_TYPES,
27
- AutoEvalColumn,
28
- ModelType,
29
- fields,
30
- WeightType,
31
- Precision,
32
- )
33
- from src.envs import API, REPO_ID, TOKEN, CODE_PROBLEMS_REPO, SUBMISSIONS_REPO, RESULTS_REPO
34
  from src.logger import get_logger
35
-
36
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
37
  from src.submission.submit import add_new_solutions
 
38
 
39
  logger = get_logger(__name__)
40
 
 
41
  SPLIT = "warmup" # TODO temp
42
- SKIP_VALIDATION = True # TODO temp
 
 
43
 
44
 
45
  def restart_space():
@@ -65,9 +54,11 @@ def refresh_leaderboard_data():
65
  return None
66
 
67
 
68
- def init_leaderboard(dataframe):
 
69
  if dataframe is None or dataframe.empty:
70
  raise ValueError("Leaderboard DataFrame is empty or None.")
 
71
  return Leaderboard(
72
  value=dataframe,
73
  datatype=[c.type for c in fields(AutoEvalColumn)],
@@ -80,39 +71,14 @@ def init_leaderboard(dataframe):
80
  hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
81
  filter_columns=[
82
  ColumnFilter(AutoEvalColumn.system_type.name, type="checkboxgroup", label="Model types"),
83
- # ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
84
- # ColumnFilter(
85
- # AutoEvalColumn.params.name,
86
- # type="slider",
87
- # min=0.01,
88
- # max=150,
89
- # label="Select the number of parameters (B)",
90
- # ),
91
- # ColumnFilter(AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True),
92
  ],
93
  bool_checkboxgroup_label="Hide models",
94
  interactive=False,
95
  )
96
 
97
 
98
- lbdb = F1Data(cp_ds_name=CODE_PROBLEMS_REPO, sub_ds_name=SUBMISSIONS_REPO, res_ds_name=RESULTS_REPO, split=SPLIT)
99
-
100
- leaderboard_df = None
101
-
102
- logger.info("Initialized LBDB")
103
-
104
- # (
105
- # finished_eval_queue_df,
106
- # running_eval_queue_df,
107
- # pending_eval_queue_df,
108
- # ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
109
-
110
-
111
- # Display image using Markdown
112
- # banner = "![Leaderboard Banner](file/assets/banner.png)"
113
-
114
- demo = gr.Blocks(css=custom_css)
115
- with demo:
116
  gr.Image(
117
  "assets/banner.png",
118
  interactive=False,
@@ -121,7 +87,6 @@ with demo:
121
  container=False,
122
  )
123
 
124
- # gr.Markdown(banner)
125
  gr.HTML(
126
  """
127
  <style>
@@ -149,53 +114,15 @@ with demo:
149
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
150
  with gr.TabItem("🏅 FormulaOne Leaderboard", elem_id="formulaone-leaderboar-tab-table", id=0):
151
  refresh_leaderboard_data() # updates leaderboard_df
 
152
  leaderboard_component = init_leaderboard(leaderboard_df)
153
 
154
- # with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=1):
155
- # logger.info("Tab about")
156
- # gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
157
-
158
  with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=2):
159
  logger.info("Tab submission")
160
  with gr.Column():
161
  with gr.Row():
162
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
163
 
164
- # with gr.Column():
165
- # with gr.Accordion(
166
- # f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
167
- # open=False,
168
- # ):
169
- # with gr.Row():
170
- # finished_eval_table = gr.components.Dataframe(
171
- # value=finished_eval_queue_df,
172
- # headers=EVAL_COLS,
173
- # datatype=EVAL_TYPES,
174
- # row_count=5,
175
- # )
176
- # with gr.Accordion(
177
- # f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
178
- # open=False,
179
- # ):
180
- # with gr.Row():
181
- # running_eval_table = gr.components.Dataframe(
182
- # value=running_eval_queue_df,
183
- # headers=EVAL_COLS,
184
- # datatype=EVAL_TYPES,
185
- # row_count=5,
186
- # )
187
-
188
- # with gr.Accordion(
189
- # f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
190
- # open=False,
191
- # ):
192
- # with gr.Row():
193
- # pending_eval_table = gr.components.Dataframe(
194
- # value=pending_eval_queue_df,
195
- # headers=EVAL_COLS,
196
- # datatype=EVAL_TYPES,
197
- # row_count=5,
198
- # )
199
  with gr.Row():
200
  gr.Markdown("# ✉️✨ Submit your solutions here!", elem_classes="markdown-text")
201
 
@@ -203,7 +130,6 @@ with demo:
203
  with gr.Column():
204
  system_name_textbox = gr.Textbox(label=AutoEvalColumn.system.name)
205
  org_textbox = gr.Textbox(label=AutoEvalColumn.organization.name)
206
- # revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
207
  sys_type_dropdown = gr.Dropdown(
208
  choices=[t.to_str(" ") for t in ModelType],
209
  label=AutoEvalColumn.system_type.name,
@@ -212,31 +138,53 @@ with demo:
212
  interactive=True,
213
  )
214
 
215
- # with gr.Column():
216
  submission_file = gr.File(label="JSONL solutions file", file_types=[".jsonl"])
217
- # precision = gr.Dropdown(
218
- # choices=[i.value.name for i in Precision if i != Precision.Unknown],
219
- # label="Precision",
220
- # multiselect=False,
221
- # value="float16",
222
- # interactive=True,
223
- # )
224
- # weight_type = gr.Dropdown(
225
- # choices=[i.value.name for i in WeightType],
226
- # label="Weights type",
227
- # multiselect=False,
228
- # value="Original",
229
- # interactive=True,
230
- # )
231
- # base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
232
 
233
  logger.info("Submit button")
234
  submit_button = gr.Button("Submit")
235
  submission_result = gr.Markdown()
236
 
237
- def add_solution_cbk(system_name, org, sys_type, submission_path):
 
 
238
  return add_new_solutions(
239
- lbdb, system_name, org, sys_type, submission_path, skip_validation=SKIP_VALIDATION
 
 
240
  )
241
 
242
  submit_button.click(
@@ -257,16 +205,9 @@ with demo:
257
  value=CITATION_BUTTON_TEXT.strip(),
258
  elem_id="citation-block",
259
  )
260
- # citation_button = gr.Textbox(
261
- # value=CITATION_BUTTON_TEXT,
262
- # # label=CITATION_BUTTON_LABEL,
263
- # lines=20,
264
- # elem_id="citation-button",
265
- # show_copy_button=True,
266
- # )
267
 
268
  # UI refresh triggers latest data swap. The work already happened in the background - refresh_leaderboard_data().
269
- demo.load(lambda: leaderboard_df, inputs=[], outputs=[leaderboard_component])
270
 
271
 
272
  logger.info("Scheduler")
@@ -275,5 +216,5 @@ scheduler.add_job(restart_space, "interval", seconds=1800)
275
  scheduler.add_job(refresh_leaderboard_data, "interval", seconds=120)
276
  scheduler.start()
277
  logger.info("Launch")
278
- demo.queue(default_concurrency_limit=40).launch()
279
  logger.info("Done")
 
 
 
1
  import gradio as gr
 
2
  import pandas as pd
3
  from apscheduler.schedulers.background import BackgroundScheduler
4
+ from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
5
 
6
+ from src.display.formatting import styled_error
7
+ from src.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT, TITLE
 
 
8
  from src.datamodel.data import F1Data
 
9
  from src.display.css_html_js import custom_css
10
+ from src.display.utils import AutoEvalColumn, ModelType, fields
11
+ from src.envs import API, CODE_PROBLEMS_REPO, REPO_ID, RESULTS_REPO, SUBMISSIONS_REPO
 
 
12
  from src.logger import get_logger
13
+ from src.populate import get_leaderboard_df
 
14
  from src.submission.submit import add_new_solutions
15
+ from src.validation.validate import MAX_INPUT_LENGTH, MIN_INPUT_LENGTH, is_submission_file_valid, is_valid
16
 
17
  logger = get_logger(__name__)
18
 
19
+ ENSURE_ALL_PRESENT = False # TODO: Switch to True.
20
  SPLIT = "warmup" # TODO temp
21
+
22
+ lbdb = F1Data(
23
+ cp_ds_name=CODE_PROBLEMS_REPO,
24
+ sub_ds_name=SUBMISSIONS_REPO,
25
+ res_ds_name=RESULTS_REPO,
26
+ split=SPLIT,
27
+ )
28
+
29
+ leaderboard_df = None
30
+
31
+ logger.info("Initialized LBDB")
32
 
33
 
34
  def restart_space():
 
54
  return None
55
 
56
 
57
+ def init_leaderboard(dataframe: pd.DataFrame):
58
+
59
  if dataframe is None or dataframe.empty:
60
  raise ValueError("Leaderboard DataFrame is empty or None.")
61
+
62
  return Leaderboard(
63
  value=dataframe,
64
  datatype=[c.type for c in fields(AutoEvalColumn)],
 
71
  hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
72
  filter_columns=[
73
  ColumnFilter(AutoEvalColumn.system_type.name, type="checkboxgroup", label="Model types"),
 
 
74
  ],
75
  bool_checkboxgroup_label="Hide models",
76
  interactive=False,
77
  )
78
 
79
 
80
+ blocks = gr.Blocks(css=custom_css)
81
+ with blocks:
 
 
82
  gr.Image(
83
  "assets/banner.png",
84
  interactive=False,
 
87
  container=False,
88
  )
89
 
 
90
  gr.HTML(
91
  """
92
  <style>
 
114
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
115
  with gr.TabItem("🏅 FormulaOne Leaderboard", elem_id="formulaone-leaderboar-tab-table", id=0):
116
  refresh_leaderboard_data() # updates leaderboard_df
117
+ assert leaderboard_df is not None
118
  leaderboard_component = init_leaderboard(leaderboard_df)
119
 
 
 
 
 
120
  with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=2):
121
  logger.info("Tab submission")
122
  with gr.Column():
123
  with gr.Row():
124
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
125
 
 
 
126
  with gr.Row():
127
  gr.Markdown("# ✉️✨ Submit your solutions here!", elem_classes="markdown-text")
128
 
 
130
  with gr.Column():
131
  system_name_textbox = gr.Textbox(label=AutoEvalColumn.system.name)
132
  org_textbox = gr.Textbox(label=AutoEvalColumn.organization.name)
 
133
  sys_type_dropdown = gr.Dropdown(
134
  choices=[t.to_str(" ") for t in ModelType],
135
  label=AutoEvalColumn.system_type.name,
 
138
  interactive=True,
139
  )
140
 
 
141
  submission_file = gr.File(label="JSONL solutions file", file_types=[".jsonl"])
 
 
142
 
143
  logger.info("Submit button")
144
  submit_button = gr.Button("Submit")
145
  submission_result = gr.Markdown()
146
 
147
+ def add_solution_cbk(
148
+ system_name: str,
149
+ org: str,
150
+ sys_type: str,
151
+ submission_path: str,
152
+ ):
153
+
154
+ try:
155
+ # Validating the submission file.
156
+ if len(submission_path) == 0:
157
+ return styled_error("Please upload JSONL submission file.")
158
+
159
+ if not is_submission_file_valid(submission_path):
160
+ return styled_error("Failed to read JSONL submission file. Please try again later.")
161
+
162
+ # Validating all user-supplied arguments.
163
+ for val, val_name in [
164
+ (system_name, "System name"),
165
+ (org, "Organisation name"),
166
+ (sys_type, "System type"),
167
+ ]:
168
+ if len(val) == 0:
169
+ return styled_error(f"Please fill in the '{val_name}' field.")
170
+
171
+ if not is_valid(val):
172
+ return styled_error(
173
+ f"{val_name} is invalid! Must only contain characters [a-zA-Z0-9], spaces, "
174
+ + "or the special characters '-' and '.', and be of length between "
175
+ + f"{MIN_INPUT_LENGTH} and {MAX_INPUT_LENGTH}."
176
+ )
177
+ except Exception:
178
+ logger.warning("Failed to process user submission", exc_info=True)
179
+ return styled_error("An error occurred. Please try again later.") # Intentionally vague.
180
+
181
  return add_new_solutions(
182
+ lbdb,
183
+ system_name,
184
+ org,
185
+ sys_type,
186
+ submission_path,
187
+ ensure_all_present=ENSURE_ALL_PRESENT,
188
  )
189
 
190
  submit_button.click(
 
205
  value=CITATION_BUTTON_TEXT.strip(),
206
  elem_id="citation-block",
207
  )
 
 
208
 
209
  # UI refresh triggers latest data swap. The work already happened in the background - refresh_leaderboard_data().
210
+ blocks.load(lambda: leaderboard_df, inputs=[], outputs=[leaderboard_component])
211
 
212
 
213
  logger.info("Scheduler")
 
216
  scheduler.add_job(refresh_leaderboard_data, "interval", seconds=120)
217
  scheduler.start()
218
  logger.info("Launch")
219
+ blocks.queue(default_concurrency_limit=40).launch()
220
  logger.info("Done")
scripts/upload_f1_dataset.py CHANGED
@@ -2,6 +2,7 @@ import argparse
2
  import fnmatch
3
  import json
4
  import os
 
5
 
6
  from datasets import Dataset
7
 
@@ -13,9 +14,23 @@ logger = get_logger(__name__)
13
 
14
  def get_args() -> argparse.Namespace:
15
  parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
16
- parser.add_argument("--input_dir", type=str, help="Dir with .json files", required=True)
17
- parser.add_argument("--dataset_name", type=str, default=f"{CODE_PROBLEMS_REPO}")
18
- parser.add_argument("--split", type=str, choices=["hard", "warmup"], default="hard")
 
 
 
19
  return parser.parse_args()
20
 
21
 
@@ -26,7 +41,7 @@ def main(args: argparse.Namespace) -> None:
26
  raise ValueError(f"No .json files in input dir {args.input_dir}")
27
  logger.info("Found %d code problems in %s", len(input_files), args.input_dir)
28
 
29
- def ds_generator():
30
  for fname in sorted(input_files):
31
  formula_name = os.path.splitext(fname)[0]
32
  cp_path = os.path.join(args.input_dir, fname)
@@ -35,7 +50,7 @@ def main(args: argparse.Namespace) -> None:
35
  logger.info("Read code problem for formula %s from %s", formula_name, cp_path)
36
  yield dict(id=code_problem["id"], code_problem=code_problem)
37
 
38
- ds = Dataset.from_generator(ds_generator)
39
  logger.info("Created dataset")
40
 
41
  ds.push_to_hub(args.dataset_name, split=args.split, private=True)
 
2
  import fnmatch
3
  import json
4
  import os
5
+ from typing import Iterator
6
 
7
  from datasets import Dataset
8
 
 
14
 
15
  def get_args() -> argparse.Namespace:
16
  parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
17
+ parser.add_argument(
18
+ "--input_dir",
19
+ type=str,
20
+ help="Dir with .json files",
21
+ required=True,
22
+ )
23
+ parser.add_argument(
24
+ "--dataset_name",
25
+ type=str,
26
+ default=f"{CODE_PROBLEMS_REPO}",
27
+ )
28
+ parser.add_argument(
29
+ "--split",
30
+ type=str,
31
+ choices=["hard", "warmup"],
32
+ default="hard",
33
+ )
34
  return parser.parse_args()
35
 
36
 
 
41
  raise ValueError(f"No .json files in input dir {args.input_dir}")
42
  logger.info("Found %d code problems in %s", len(input_files), args.input_dir)
43
 
44
+ def ds_generator() -> Iterator[dict]:
45
  for fname in sorted(input_files):
46
  formula_name = os.path.splitext(fname)[0]
47
  cp_path = os.path.join(args.input_dir, fname)
 
50
  logger.info("Read code problem for formula %s from %s", formula_name, cp_path)
51
  yield dict(id=code_problem["id"], code_problem=code_problem)
52
 
53
+ ds: Dataset = Dataset.from_generator(ds_generator) # type: ignore
54
  logger.info("Created dataset")
55
 
56
  ds.push_to_hub(args.dataset_name, split=args.split, private=True)
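Reviewer note (not part of this diff): a minimal sketch, assuming the repository root as working directory, of preparing an `--input_dir` that `scripts/upload_f1_dataset.py` can consume. The script only relies on the `id` field of each JSON file; every other field and name below is illustrative.

```python
import json
import os

input_dir = "code_problems"  # hypothetical directory, later passed as --input_dir
os.makedirs(input_dir, exist_ok=True)

# One .json file per formula; the filename stem only shows up as formula_name in the logs,
# while the whole object is stored under the "code_problem" column of the pushed dataset.
example_problem = {"id": "0", "statement": "placeholder"}  # fields beyond "id" are assumed

with open(os.path.join(input_dir, "example_formula.json"), "w") as f:
    json.dump(example_problem, f)

# Roughly: python scripts/upload_f1_dataset.py --input_dir code_problems --split warmup
```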
src/about.py CHANGED
@@ -9,20 +9,11 @@ class Task:
9
  col_name: str
10
 
11
 
12
- # Select your tasks here
13
- # ---------------------------------------------------
14
  class Tasks(Enum):
15
- # task_key in the json file, metric_key in the json file, name to display in the leaderboard
16
  task0 = Task("FormulaOne", "success_rate", "Success Rate (%)")
17
- # task1 = Task("logiqa", "acc_norm", "LogiQA")
18
 
19
 
20
- NUM_FEWSHOT = 0 # Change with your few shot
21
- # ---------------------------------------------------
22
-
23
-
24
- # Your leaderboard name
25
- # TITLE = """<h1 align="center" id="space-title">AAI FormulaOne Leaderboard</h1>"""
26
 
27
  TITLE = """
28
  <h1 id="space-title" style="
@@ -39,19 +30,17 @@ TITLE = """
39
  </h1>
40
  """
41
 
42
- # What does your leaderboard evaluate?
43
  INTRODUCTION_TEXT = """
44
  Welcome to the official leaderboard for the paper:
45
 
46
  **FormulaOne: Measuring the Depth of Algorithmic Reasoning Beyond Competitive Programming** <br>
47
- *Gal Beniamini, Yuval Dor, Alon Vinnikov, Shir Granot Peled, Or Weinstein, Or Sharir, Noam Wies, Tomer Nussbaum, Ido Ben Shaul, Tomer Zekharya, Yoav Levine, Shai Shalev-Shwartz, Amnon Shashua* <br>
48
  **AAI, July 2025**
49
 
50
  FormulaOne is a new benchmark designed to challenge frontier AI models. The benchmark is constructed from a vast and conceptually diverse family of dynamic programming problems derived from Monadic Second-Order (MSO) logic on graphs, a framework with profound connections to theoretical computer science.
51
  """
52
 
53
- # Which evaluations are you running? how can people reproduce what you have?
54
- LLM_BENCHMARKS_TEXT = f"""
55
  ## How it works
56
 
57
  ## Reproducibility
@@ -95,7 +84,7 @@ Submissions must:
95
  - **Organization**
96
  - **System Type**
97
  - Click **Submit**.
98
-
99
  ### ⏱️ After Submission
100
 
101
  Submissions are validated and evaluated within ~24 hours. Results will appear on the leaderboard once processed.
@@ -105,12 +94,12 @@ Submissions are validated and evaluated within ~24 hours. Results will appear on
105
  CITATION_BUTTON_LABEL = """📚 How to cite FormulaOne"""
106
  CITATION_BUTTON_TEXT = r"""
107
  @misc{beniamini2025formulaonemeasuringdepthalgorithmic,
108
- title={FormulaOne: Measuring the Depth of Algorithmic Reasoning Beyond Competitive Programming},
109
- author={Gal Beniamini and Yuval Dor and Alon Vinnikov and Shir Granot Peled and Or Weinstein and Or Sharir and Noam Wies and Tomer Nussbaum and Ido Ben Shaul and Tomer Zekharya and Yoav Levine and Shai Shalev-Shwartz and Amnon Shashua},
110
  year={2025},
111
  eprint={2507.13337},
112
  archivePrefix={arXiv},
113
  primaryClass={cs.AI},
114
- url={https://arxiv.org/abs/2507.13337},
115
  }
116
  """
 
9
  col_name: str
10
 
11
 
 
 
12
  class Tasks(Enum):
 
13
  task0 = Task("FormulaOne", "success_rate", "Success Rate (%)")
 
14
 
15
 
16
+ NUM_FEWSHOT = 0
 
 
 
 
 
17
 
18
  TITLE = """
19
  <h1 id="space-title" style="
 
30
  </h1>
31
  """
32
 
 
33
  INTRODUCTION_TEXT = """
34
  Welcome to the official leaderboard for the paper:
35
 
36
  **FormulaOne: Measuring the Depth of Algorithmic Reasoning Beyond Competitive Programming** <br>
37
+ *Gal Beniamini, Yuval Dor, Alon Vinnikov, Shir Granot Peled, Or Weinstein, Or Sharir, Noam Wies, Tomer Nussbaum, Nadav Schweiger, Ido Ben Shaul, Tomer Zekharya, Yoav Levine, Shai Shalev-Shwartz, Amnon Shashua* <br>
38
  **AAI, July 2025**
39
 
40
  FormulaOne is a new benchmark designed to challenge frontier AI models. The benchmark is constructed from a vast and conceptually diverse family of dynamic programming problems derived from Monadic Second-Order (MSO) logic on graphs, a framework with profound connections to theoretical computer science.
41
  """
42
 
43
+ LLM_BENCHMARKS_TEXT = """
 
44
  ## How it works
45
 
46
  ## Reproducibility
 
84
  - **Organization**
85
  - **System Type**
86
  - Click **Submit**.
87
+
88
  ### ⏱️ After Submission
89
 
90
  Submissions are validated and evaluated within ~24 hours. Results will appear on the leaderboard once processed.
 
94
  CITATION_BUTTON_LABEL = """📚 How to cite FormulaOne"""
95
  CITATION_BUTTON_TEXT = r"""
96
  @misc{beniamini2025formulaonemeasuringdepthalgorithmic,
97
+ title={FormulaOne: Measuring the Depth of Algorithmic Reasoning Beyond Competitive Programming},
98
+ author={Gal Beniamini and Yuval Dor and Alon Vinnikov and Shir Granot Peled and Or Weinstein and Or Sharir and Noam Wies and Tomer Nussbaum and Nadav Schweiger and Ido Ben Shaul and Tomer Zekharya and Yoav Levine and Shai Shalev-Shwartz and Amnon Shashua},
99
  year={2025},
100
  eprint={2507.13337},
101
  archivePrefix={arXiv},
102
  primaryClass={cs.AI},
103
+ url={https://arxiv.org/abs/2507.13337},
104
  }
105
  """
src/datamodel/data.py CHANGED
@@ -3,14 +3,20 @@ import time
3
 
4
  from datasets import load_dataset
5
 
6
- from src.envs import TOKEN, CODE_PROBLEMS_REPO, RESULTS_REPO, SUBMISSIONS_REPO
7
  from src.logger import get_logger
8
 
9
  logger = get_logger(__name__)
10
 
11
 
12
  class F1Data:
13
- def __init__(self, cp_ds_name: str, sub_ds_name: str, res_ds_name: str, split: str = "hard"):
 
 
14
  self.cp_dataset_name = cp_ds_name
15
  self.submissions_dataset_name = sub_ds_name
16
  self.results_dataset_name = res_ds_name
@@ -19,16 +25,16 @@ class F1Data:
19
  self._initialize()
20
 
21
  def _initialize(self):
22
- logger.info("Initialize F1Data TOKEN='%s'", TOKEN)
23
  start_time = time.monotonic()
24
- cp_ds = load_dataset(self.cp_dataset_name, split=self.split, token=TOKEN)
25
- logger.info(
26
- "Loaded code-problems dataset from %s in %f sec",
27
  self.cp_dataset_name,
28
- time.monotonic() - start_time,
 
29
  )
30
- self.code_problems: dict[str, str] = {r["id"]: r["code_problem"] for r in cp_ds}
31
- logger.info(f"Loaded %d code problems {len(self.code_problems)}")
 
32
 
33
  @functools.cached_property
34
  def code_problem_ids(self) -> set[str]:
@@ -37,6 +43,11 @@ class F1Data:
37
 
38
  if __name__ == "__main__":
39
  split = "hard"
40
- f1_data = F1Data(cp_ds_name=CODE_PROBLEMS_REPO, sub_ds_name=SUBMISSIONS_REPO, res_ds_name=RESULTS_REPO, split=split)
 
 
41
 
42
  print(f"Found {len(f1_data.code_problem_ids)} code problems in {split} split of {f1_data.cp_dataset_name}")
 
3
 
4
  from datasets import load_dataset
5
 
6
+ from src.envs import CODE_PROBLEMS_REPO, RESULTS_REPO, SUBMISSIONS_REPO, TOKEN
7
  from src.logger import get_logger
8
 
9
  logger = get_logger(__name__)
10
 
11
 
12
  class F1Data:
13
+ def __init__(
14
+ self,
15
+ cp_ds_name: str, # Name of the dataset. Fixed.
16
+ sub_ds_name: str, # Name of subdataset. Fixed.
17
+ res_ds_name: str, # Name of results repository. Fixed.
18
+ split: str = "hard", # Split is either 'hard' or 'easy'.
19
+ ):
20
  self.cp_dataset_name = cp_ds_name
21
  self.submissions_dataset_name = sub_ds_name
22
  self.results_dataset_name = res_ds_name
 
25
  self._initialize()
26
 
27
  def _initialize(self):
28
+ logger.info(f"Initialize F1Data TOKEN='{TOKEN}'")
29
  start_time = time.monotonic()
30
+ cp_ds = load_dataset(
 
 
31
  self.cp_dataset_name,
32
+ split=self.split,
33
+ token=TOKEN,
34
  )
35
+ logger.info(f"Loaded code-problems dataset from {self.cp_dataset_name} in {time.monotonic() - start_time} sec")
36
+ self.code_problems = {r["id"]: r["code_problem"] for r in cp_ds} # id string -> code problem.
37
+ logger.info(f"Loaded {len(self.code_problems)} code problems")
38
 
39
  @functools.cached_property
40
  def code_problem_ids(self) -> set[str]:
 
43
 
44
  if __name__ == "__main__":
45
  split = "hard"
46
+ f1_data = F1Data(
47
+ cp_ds_name=CODE_PROBLEMS_REPO,
48
+ sub_ds_name=SUBMISSIONS_REPO,
49
+ res_ds_name=RESULTS_REPO,
50
+ split=split,
51
+ )
52
 
53
  print(f"Found {len(f1_data.code_problem_ids)} code problems in {split} split of {f1_data.cp_dataset_name}")
src/display/__init__.py ADDED
File without changes
src/display/css_html_js.py CHANGED
@@ -33,7 +33,7 @@ custom_css = """
33
  background: none;
34
  border: none;
35
  }
36
-
37
  #search-bar {
38
  padding: 0px;
39
  }
 
33
  background: none;
34
  border: none;
35
  }
36
+
37
  #search-bar {
38
  padding: 0px;
39
  }
src/display/formatting.py CHANGED
@@ -1,12 +1,3 @@
1
- def model_hyperlink(link, model_name):
2
- return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
3
-
4
-
5
- def make_clickable_model(model_name):
6
- link = f"https://huggingface.co/{model_name}"
7
- return model_hyperlink(link, model_name)
8
-
9
-
10
  def styled_error(error):
11
  return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
12
 
@@ -17,11 +8,3 @@ def styled_warning(warn):
17
 
18
  def styled_message(message):
19
  return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
20
-
21
-
22
- def has_no_nan_values(df, columns):
23
- return df[columns].notna().all(axis=1)
24
-
25
-
26
- def has_nan_values(df, columns):
27
- return df[columns].isna().any(axis=1)
 
 
1
  def styled_error(error):
2
  return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
3
 
 
8
 
9
  def styled_message(message):
10
  return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
 
 
src/display/utils.py CHANGED
@@ -1,19 +1,15 @@
1
- from dataclasses import dataclass, field, make_dataclass
2
- from typing import ClassVar
3
  from enum import Enum
4
 
5
- import pandas as pd
6
-
7
- from src.about import Tasks
8
-
9
 
10
  def fields(raw_class):
11
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
12
 
13
 
14
- # These classes are for user facing column names,
15
- # to avoid having to change them all around the code
16
- # when a modif is needed
 
17
  @dataclass
18
  class ColumnContent:
19
  name: str
@@ -23,41 +19,6 @@ class ColumnContent:
23
  never_hidden: bool = False
24
 
25
 
26
- ## Leaderboard columns
27
- # auto_eval_column_fields = []
28
- # # Init
29
- # auto_eval_column_fields.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
30
- # auto_eval_column_fields.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
31
- # # Scores
32
- # auto_eval_column_fields.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
33
- # for task in Tasks:
34
- # auto_eval_column_fields.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
35
- # # Model information
36
- # auto_eval_column_fields.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
37
- # auto_eval_column_fields.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
38
- # auto_eval_column_fields.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
39
- # auto_eval_column_fields.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
40
- # auto_eval_column_fields.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
41
- # auto_eval_column_fields.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
42
- # auto_eval_column_fields.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
43
- # auto_eval_column_fields.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
44
- # auto_eval_column_fields.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
45
- #
46
- #
47
- #
48
- # def make_classvar_dataclass(name: str, spec: list):
49
- # ns = {"__annotations__": {}}
50
- # for field_name, field_type, default in spec:
51
- # # Mark as ClassVar so dataclass doesn't treat it as an instance field
52
- # ns["__annotations__"][field_name] = ClassVar[field_type]
53
- # ns[field_name] = default
54
- # # No instance fields; just class-level descriptors
55
- # return make_dataclass(name, [], frozen=True, namespace=ns)
56
- #
57
- # # We use make dataclass to dynamically fill the scores from Tasks
58
- # AutoEvalColumn = make_classvar_dataclass("AutoEvalColumn", auto_eval_column_fields)
59
-
60
-
61
  @dataclass(frozen=True)
62
  class AutoEvalColumn:
63
  system = ColumnContent("System Name", "markdown", True, never_hidden=True)
@@ -68,18 +29,18 @@ class AutoEvalColumn:
68
  submitted_on = ColumnContent("Submitted On", "datetime", True)
69
 
70
 
71
- ## For the queue columns in the submission tab
72
  @dataclass(frozen=True)
73
  class EvalQueueColumn: # Queue column
74
  model = ColumnContent("model", "markdown", True)
75
  revision = ColumnContent("revision", "str", True)
76
  private = ColumnContent("private", "bool", True)
77
  precision = ColumnContent("precision", "str", True)
78
- weight_type = ColumnContent("weight_type", "str", "Original")
79
  status = ColumnContent("status", "str", True)
80
 
81
 
82
- ## All the model information that we might need
83
  @dataclass
84
  class ModelDetails:
85
  name: str
@@ -90,8 +51,6 @@ class ModelDetails:
90
  class ModelType(Enum):
91
  LLM = ModelDetails(name="LLM", symbol="🟢")
92
  AgenticLLM = ModelDetails(name="AgenticLLM", symbol="🔶")
93
- # IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
94
- # RL = ModelDetails(name="RL-tuned", symbol="🟦")
95
  Other = ModelDetails(name="Other", symbol="?")
96
 
97
  def to_str(self, separator=" "):
@@ -103,36 +62,15 @@ class ModelType(Enum):
103
  return ModelType.AgenticLLM
104
  if "LLM" in type or "🟢" in type:
105
  return ModelType.LLM
106
- # if "RL-tuned" in type or "🟦" in type:
107
- # return ModelType.RL
108
- # if "instruction-tuned" in type or "⭕" in type:
109
- # return ModelType.IFT
110
  return ModelType.Other
111
 
112
 
113
- class WeightType(Enum):
114
- Adapter = ModelDetails("Adapter")
115
- Original = ModelDetails("Original")
116
- Delta = ModelDetails("Delta")
117
-
118
-
119
  class Precision(Enum):
120
  float16 = ModelDetails("float16")
121
  bfloat16 = ModelDetails("bfloat16")
122
  Unknown = ModelDetails("?")
123
 
124
- def from_str(precision):
125
- if precision in ["torch.float16", "float16"]:
126
- return Precision.float16
127
- if precision in ["torch.bfloat16", "bfloat16"]:
128
- return Precision.bfloat16
129
- return Precision.Unknown
130
-
131
 
132
- # Column selection
133
  COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
134
-
135
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
136
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
137
-
138
- # BENCHMARK_COLS = [t.value.col_name for t in Tasks]
 
1
+ from dataclasses import dataclass
 
2
  from enum import Enum
3
 
 
 
 
 
4
 
5
  def fields(raw_class):
6
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
7
 
8
 
9
+ # These classes are for user facing column names, to avoid having to change them
10
+ # all around the code when a modification is needed.
11
+
12
+
13
  @dataclass
14
  class ColumnContent:
15
  name: str
 
19
  never_hidden: bool = False
20
 
21
 
 
 
 
22
  @dataclass(frozen=True)
23
  class AutoEvalColumn:
24
  system = ColumnContent("System Name", "markdown", True, never_hidden=True)
 
29
  submitted_on = ColumnContent("Submitted On", "datetime", True)
30
 
31
 
32
+ # For the queue columns in the submission tab
33
  @dataclass(frozen=True)
34
  class EvalQueueColumn: # Queue column
35
  model = ColumnContent("model", "markdown", True)
36
  revision = ColumnContent("revision", "str", True)
37
  private = ColumnContent("private", "bool", True)
38
  precision = ColumnContent("precision", "str", True)
39
+ weight_type = ColumnContent("weight_type", "str", True)
40
  status = ColumnContent("status", "str", True)
41
 
42
 
43
+ # All the model information that we might need
44
  @dataclass
45
  class ModelDetails:
46
  name: str
 
51
  class ModelType(Enum):
52
  LLM = ModelDetails(name="LLM", symbol="🟢")
53
  AgenticLLM = ModelDetails(name="AgenticLLM", symbol="🔶")
 
 
54
  Other = ModelDetails(name="Other", symbol="?")
55
 
56
  def to_str(self, separator=" "):
 
62
  return ModelType.AgenticLLM
63
  if "LLM" in type or "🟢" in type:
64
  return ModelType.LLM
 
 
 
 
65
  return ModelType.Other
66
 
67
 
 
 
 
 
 
 
68
  class Precision(Enum):
69
  float16 = ModelDetails("float16")
70
  bfloat16 = ModelDetails("bfloat16")
71
  Unknown = ModelDetails("?")
72
 
 
 
 
 
 
 
 
73
 
 
74
  COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 
75
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
76
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
 
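Reviewer note (not part of this diff): a short sketch of how the trimmed-down helpers in `src/display/utils.py` are typically consumed, mirroring `init_leaderboard()` in `app.py`; it assumes the repository root is on `PYTHONPATH`.

```python
from src.display.utils import COLS, EVAL_COLS, AutoEvalColumn, fields

print(COLS)                                      # visible, user-facing leaderboard columns
print([c.type for c in fields(AutoEvalColumn)])  # matching datatypes handed to gradio_leaderboard
print(EVAL_COLS)                                 # queue columns kept for the submission tab
```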
src/leaderboard/read_evals.py DELETED
@@ -1,196 +0,0 @@
1
- import glob
2
- import json
3
- import math
4
- import os
5
- from dataclasses import dataclass
6
-
7
- import dateutil
8
- import numpy as np
9
-
10
- from src.display.formatting import make_clickable_model
11
- from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
12
- from src.submission.check_validity import is_model_on_hub
13
-
14
-
15
- @dataclass
16
- class EvalResult:
17
- """Represents one full evaluation. Built from a combination of the result and request file for a given run.
18
- """
19
- eval_name: str # org_model_precision (uid)
20
- full_model: str # org/model (path on hub)
21
- org: str
22
- model: str
23
- revision: str # commit hash, "" if main
24
- results: dict
25
- precision: Precision = Precision.Unknown
26
- model_type: ModelType = ModelType.LLM # Pretrained, fine tuned, ...
27
- weight_type: WeightType = WeightType.Original # Original or Adapter
28
- architecture: str = "Unknown"
29
- license: str = "?"
30
- likes: int = 0
31
- num_params: int = 0
32
- date: str = "" # submission date of request file
33
- still_on_hub: bool = False
34
-
35
- @classmethod
36
- def init_from_json_file(self, json_filepath):
37
- """Inits the result from the specific model result file"""
38
- with open(json_filepath) as fp:
39
- data = json.load(fp)
40
-
41
- config = data.get("config")
42
-
43
- # Precision
44
- precision = Precision.from_str(config.get("model_dtype"))
45
-
46
- # Get model and org
47
- org_and_model = config.get("model_name", config.get("model_args", None))
48
- org_and_model = org_and_model.split("/", 1)
49
-
50
- if len(org_and_model) == 1:
51
- org = None
52
- model = org_and_model[0]
53
- result_key = f"{model}_{precision.value.name}"
54
- else:
55
- org = org_and_model[0]
56
- model = org_and_model[1]
57
- result_key = f"{org}_{model}_{precision.value.name}"
58
- full_model = "/".join(org_and_model)
59
-
60
- still_on_hub, _, model_config = is_model_on_hub(
61
- full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
62
- )
63
- architecture = "?"
64
- if model_config is not None:
65
- architectures = getattr(model_config, "architectures", None)
66
- if architectures:
67
- architecture = ";".join(architectures)
68
-
69
- # Extract results available in this file (some results are split in several files)
70
- results = {}
71
- for task in Tasks:
72
- task = task.value
73
-
74
- # We average all scores of a given metric (not all metrics are present in all files)
75
- accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
76
- if accs.size == 0 or any([acc is None for acc in accs]):
77
- continue
78
-
79
- mean_acc = np.mean(accs) * 100.0
80
- results[task.benchmark] = mean_acc
81
-
82
- return self(
83
- eval_name=result_key,
84
- full_model=full_model,
85
- org=org,
86
- model=model,
87
- results=results,
88
- precision=precision,
89
- revision= config.get("model_sha", ""),
90
- still_on_hub=still_on_hub,
91
- architecture=architecture
92
- )
93
-
94
- def update_with_request_file(self, requests_path):
95
- """Finds the relevant request file for the current model and updates info with it"""
96
- request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
97
-
98
- try:
99
- with open(request_file, "r") as f:
100
- request = json.load(f)
101
- self.model_type = ModelType.from_str(request.get("model_type", ""))
102
- self.weight_type = WeightType[request.get("weight_type", "Original")]
103
- self.license = request.get("license", "?")
104
- self.likes = request.get("likes", 0)
105
- self.num_params = request.get("params", 0)
106
- self.date = request.get("submitted_time", "")
107
- except Exception:
108
- print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
109
-
110
- def to_dict(self):
111
- """Converts the Eval Result to a dict compatible with our dataframe display"""
112
- average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
113
- data_dict = {
114
- "eval_name": self.eval_name, # not a column, just a save name,
115
- AutoEvalColumn.precision.name: self.precision.value.name,
116
- AutoEvalColumn.model_type.name: self.model_type.value.name,
117
- AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
118
- AutoEvalColumn.weight_type.name: self.weight_type.value.name,
119
- AutoEvalColumn.architecture.name: self.architecture,
120
- AutoEvalColumn.model.name: make_clickable_model(self.full_model),
121
- AutoEvalColumn.revision.name: self.revision,
122
- AutoEvalColumn.average.name: average,
123
- AutoEvalColumn.license.name: self.license,
124
- AutoEvalColumn.likes.name: self.likes,
125
- AutoEvalColumn.params.name: self.num_params,
126
- AutoEvalColumn.still_on_hub.name: self.still_on_hub,
127
- }
128
-
129
- for task in Tasks:
130
- data_dict[task.value.col_name] = self.results[task.value.benchmark]
131
-
132
- return data_dict
133
-
134
-
135
- def get_request_file_for_model(requests_path, model_name, precision):
136
- """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
137
- request_files = os.path.join(
138
- requests_path,
139
- f"{model_name}_eval_request_*.json",
140
- )
141
- request_files = glob.glob(request_files)
142
-
143
- # Select correct request file (precision)
144
- request_file = ""
145
- request_files = sorted(request_files, reverse=True)
146
- for tmp_request_file in request_files:
147
- with open(tmp_request_file, "r") as f:
148
- req_content = json.load(f)
149
- if (
150
- req_content["status"] in ["FINISHED"]
151
- and req_content["precision"] == precision.split(".")[-1]
152
- ):
153
- request_file = tmp_request_file
154
- return request_file
155
-
156
-
157
- def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
158
- """From the path of the results folder root, extract all needed info for results"""
159
- model_result_filepaths = []
160
-
161
- for root, _, files in os.walk(results_path):
162
- # We should only have json files in model results
163
- if len(files) == 0 or any([not f.endswith(".json") for f in files]):
164
- continue
165
-
166
- # Sort the files by date
167
- try:
168
- files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
169
- except dateutil.parser._parser.ParserError:
170
- files = [files[-1]]
171
-
172
- for file in files:
173
- model_result_filepaths.append(os.path.join(root, file))
174
-
175
- eval_results = {}
176
- for model_result_filepath in model_result_filepaths:
177
- # Creation of result
178
- eval_result = EvalResult.init_from_json_file(model_result_filepath)
179
- eval_result.update_with_request_file(requests_path)
180
-
181
- # Store results of same eval together
182
- eval_name = eval_result.eval_name
183
- if eval_name in eval_results.keys():
184
- eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
185
- else:
186
- eval_results[eval_name] = eval_result
187
-
188
- results = []
189
- for v in eval_results.values():
190
- try:
191
- v.to_dict() # we test if the dict version is complete
192
- results.append(v)
193
- except KeyError: # not all eval values present
194
- continue
195
-
196
- return results
 
 
 
src/logger.py CHANGED
@@ -1,7 +1,11 @@
1
  import logging
2
  import sys
3
 
4
- def get_logger(filename: str, level=logging.INFO) -> logging.Logger:
 
 
 
 
5
  new_logger = logging.getLogger(filename)
6
  fmt = logging.Formatter(fmt="%(asctime)s - %(name)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s")
7
  handler = logging.StreamHandler(sys.stderr)
 
1
  import logging
2
  import sys
3
 
4
+
5
+ def get_logger(
6
+ filename: str,
7
+ level=logging.INFO,
8
+ ) -> logging.Logger:
9
  new_logger = logging.getLogger(filename)
10
  fmt = logging.Formatter(fmt="%(asctime)s - %(name)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s")
11
  handler = logging.StreamHandler(sys.stderr)
src/populate.py CHANGED
@@ -1,27 +1,29 @@
1
- import json
2
- import os
3
-
4
  import pandas as pd
5
- from datasets import load_dataset, get_dataset_config_names
6
  from datasets.exceptions import DatasetNotFoundError
7
  from tqdm.auto import tqdm
8
 
9
- from src.display.formatting import has_no_nan_values, make_clickable_model
10
- from src.display.utils import AutoEvalColumn, EvalQueueColumn
11
  from src.envs import TOKEN
12
- from src.leaderboard.read_evals import get_raw_eval_results
13
  from src.logger import get_logger
14
 
15
  logger = get_logger(__name__)
16
 
17
 
18
  def get_leaderboard_df(results_dataset_name: str) -> pd.DataFrame:
19
- """Creates a dataframe from all the individual experiment results"""
 
 
20
 
21
  try:
22
- configs = get_dataset_config_names(results_dataset_name, token=TOKEN)
 
 
 
23
  except (DatasetNotFoundError, FileNotFoundError):
 
24
  # Return an empty DataFrame with expected columns
 
25
  return pd.DataFrame(
26
  columns=[
27
  "System Name",
@@ -34,8 +36,17 @@ def get_leaderboard_df(results_dataset_name: str) -> pd.DataFrame:
34
  )
35
 
36
  rows = []
37
- for submission_id in tqdm(configs, total=len(configs), desc="Processing Submission Results"):
38
- submission_ds = load_dataset(results_dataset_name, submission_id, split="train", token=TOKEN)
 
 
 
 
 
 
 
 
 
39
  submission_df = pd.DataFrame(submission_ds)
40
 
41
  if submission_df.empty or "did_pass" not in submission_df.columns or submission_df.did_pass.isna().any():
@@ -59,7 +70,7 @@ def get_leaderboard_df(results_dataset_name: str) -> pd.DataFrame:
59
 
60
  full_df = pd.DataFrame(rows)
61
 
62
- # TODO: forbid multiple submissions under the same name?
63
  # Keep only the latest entry per unique (System Name, System Type, Organization) triplet
64
  final_df = (
65
  full_df.sort_values("Submitted On", ascending=False)
@@ -72,39 +83,3 @@ def get_leaderboard_df(results_dataset_name: str) -> pd.DataFrame:
72
  final_df[cols_to_round] = final_df[cols_to_round].round(decimals=2)
73
 
74
  return final_df
75
-
76
-
77
- def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
78
- """Creates the different dataframes for the evaluation queues requestes"""
79
- entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
80
- all_evals = []
81
-
82
- for entry in entries:
83
- if ".json" in entry:
84
- file_path = os.path.join(save_path, entry)
85
- with open(file_path) as fp:
86
- data = json.load(fp)
87
-
88
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
89
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
90
-
91
- all_evals.append(data)
92
- elif ".md" not in entry:
93
- # this is a folder
94
- sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
95
- for sub_entry in sub_entries:
96
- file_path = os.path.join(save_path, entry, sub_entry)
97
- with open(file_path) as fp:
98
- data = json.load(fp)
99
-
100
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
101
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
102
- all_evals.append(data)
103
-
104
- pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
105
- running_list = [e for e in all_evals if e["status"] == "RUNNING"]
106
- finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
107
- df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
108
- df_running = pd.DataFrame.from_records(running_list, columns=cols)
109
- df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
110
- return df_finished[cols], df_running[cols], df_pending[cols]
 
 
 
 
1
  import pandas as pd
2
+ from datasets import get_dataset_config_names, load_dataset
3
  from datasets.exceptions import DatasetNotFoundError
4
  from tqdm.auto import tqdm
5
 
6
+ from src.display.utils import AutoEvalColumn
 
7
  from src.envs import TOKEN
 
8
  from src.logger import get_logger
9
 
10
  logger = get_logger(__name__)
11
 
12
 
13
  def get_leaderboard_df(results_dataset_name: str) -> pd.DataFrame:
14
+ """
15
+ @brief Creates a dataframe from all the individual experiment results.
16
+ """
17
 
18
  try:
19
+ configs = get_dataset_config_names(
20
+ results_dataset_name,
21
+ token=TOKEN,
22
+ )
23
  except (DatasetNotFoundError, FileNotFoundError):
24
+
25
  # Return an empty DataFrame with expected columns
26
+ logger.warning("Failed to load configuration", exc_info=True)
27
  return pd.DataFrame(
28
  columns=[
29
  "System Name",
 
36
  )
37
 
38
  rows = []
39
+ for submission_id in tqdm(
40
+ configs,
41
+ total=len(configs),
42
+ desc="Processing Submission Results",
43
+ ):
44
+ submission_ds = load_dataset(
45
+ results_dataset_name,
46
+ submission_id,
47
+ split="train",
48
+ token=TOKEN,
49
+ )
50
  submission_df = pd.DataFrame(submission_ds)
51
 
52
  if submission_df.empty or "did_pass" not in submission_df.columns or submission_df.did_pass.isna().any():
 
70
 
71
  full_df = pd.DataFrame(rows)
72
 
73
+ # TODO: Forbid multiple submissions under the same name?
74
  # Keep only the latest entry per unique (System Name, System Type, Organization) triplet
75
  final_df = (
76
  full_df.sort_values("Submitted On", ascending=False)
 
83
  final_df[cols_to_round] = final_df[cols_to_round].round(decimals=2)
84
 
85
  return final_df
 
 
src/submission/check_validity.py DELETED
@@ -1,102 +0,0 @@
1
- import json
2
- import os
3
- import re
4
- from collections import defaultdict
5
- from datetime import datetime, timedelta, timezone
6
-
7
- from datasets import get_dataset_config_names
8
- import huggingface_hub
9
- from huggingface_hub import ModelCard
10
- from huggingface_hub.hf_api import ModelInfo
11
- from transformers import AutoConfig
12
- from transformers.models.auto.tokenization_auto import AutoTokenizer
13
-
14
- from src.envs import SUBMISSIONS_REPO
15
-
16
- def check_model_card(repo_id: str) -> tuple[bool, str]:
17
- """Checks if the model card and license exist and have been filled"""
18
- try:
19
- card = ModelCard.load(repo_id)
20
- except huggingface_hub.utils.EntryNotFoundError:
21
- return False, "Please add a model card to your model to explain how you trained/fine-tuned it."
22
-
23
- # Enforce license metadata
24
- if card.data.license is None:
25
- if not ("license_name" in card.data and "license_link" in card.data):
26
- return False, (
27
- "License not found. Please add a license to your model card using the `license` metadata or a"
28
- " `license_name`/`license_link` pair."
29
- )
30
-
31
- # Enforce card content
32
- if len(card.text) < 200:
33
- return False, "Please add a description to your model card, it is too short."
34
-
35
- return True, ""
36
-
37
- def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
38
- """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
39
- try:
40
- config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
41
- if test_tokenizer:
42
- try:
43
- tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
44
- except ValueError as e:
45
- return (
46
- False,
47
- f"uses a tokenizer which is not in a transformers release: {e}",
48
- None
49
- )
50
- except Exception as e:
51
- return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
52
- return True, None, config
53
-
54
- except ValueError:
55
- return (
56
- False,
57
- "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
58
- None
59
- )
60
-
61
- except Exception as e:
62
- return False, "was not found on hub!", None
63
-
64
-
65
- def get_model_size(model_info: ModelInfo, precision: str):
66
- """Gets the model size from the configuration, or the model name if the configuration does not contain the information."""
67
- try:
68
- model_size = round(model_info.safetensors["total"] / 1e9, 3)
69
- except (AttributeError, TypeError):
70
- return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
71
-
72
- size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
73
- model_size = size_factor * model_size
74
- return model_size
75
-
76
- def get_model_arch(model_info: ModelInfo):
77
- """Gets the model architecture from the configuration"""
78
- return model_info.config.get("architectures", "Unknown")
79
-
80
- def already_submitted_models(requested_models_dir: str) -> set[str]:
81
- """Gather a list of already submitted models to avoid duplicates"""
82
- depth = 1
83
- file_names = []
84
- users_to_submission_dates = defaultdict(list)
85
-
86
- for root, _, files in os.walk(requested_models_dir):
87
- current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
88
- if current_depth == depth:
89
- for file in files:
90
- if not file.endswith(".json"):
91
- continue
92
- with open(os.path.join(root, file), "r") as f:
93
- info = json.load(f)
94
- file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
95
-
96
- # Select organisation
97
- if info["model"].count("/") == 0 or "submitted_time" not in info:
98
- continue
99
- organisation, _ = info["model"].split("/")
100
- users_to_submission_dates[organisation].append(info["submitted_time"])
101
-
102
- return set(file_names), users_to_submission_dates
 
 
src/submission/submit.py CHANGED
@@ -1,50 +1,44 @@
1
- import json
2
- import os
3
- from datetime import datetime, timezone
4
  import time
 
5
 
6
- from datasets import Dataset, DatasetDict
7
  import pandas as pd
8
- from pandas.api.types import is_integer_dtype, is_string_dtype
 
9
 
 
10
  from src.datamodel.data import F1Data
11
- from src.display.formatting import styled_error, styled_message, styled_warning
12
  from src.display.utils import ModelType
13
- from src.envs import API, SUBMISSIONS_REPO, TOKEN
14
  from src.logger import get_logger
15
-
16
- # from src.submission.check_validity import (
17
- # already_submitted_models,
18
- # check_model_card,
19
- # get_model_size,
20
- # is_model_on_hub,
21
- # )
22
 
23
  logger = get_logger(__name__)
24
 
25
 
26
- def validate_submission(lbdb: F1Data, pd_ds: pd.DataFrame) -> str | None:
27
- logger.info("Validating DS size %d columns %s set %s", len(pd_ds), pd_ds.columns, set(pd_ds.columns))
 
 
 
28
  expected_cols = ["problem_id", "solution"]
29
 
30
  if set(pd_ds.columns) != set(expected_cols):
31
- return f"Expected attributes: {expected_cols}, Got: {pd_ds.columns.tolist()}"
32
 
33
  if not is_integer_dtype(pd_ds["problem_id"]):
34
- return "problem_id must be str convertible to int"
35
 
36
- if any(type(v) != str for v in pd_ds["solution"]):
37
- return "solution must be of type str"
38
 
39
  submitted_ids = set(pd_ds.problem_id.astype(str))
40
  if submitted_ids != lbdb.code_problem_ids:
41
  missing = lbdb.code_problem_ids - submitted_ids
42
  unknown = submitted_ids - lbdb.code_problem_ids
43
- return f"Mismatched problem IDs: {len(missing)} missing, {len(unknown)} unknown"
44
  if len(pd_ds) > len(lbdb.code_problem_ids):
45
- return "Duplicate problem IDs exist in uploaded file"
46
-
47
- return None
48
 
49
 
50
  def add_new_solutions(
@@ -53,36 +47,33 @@ def add_new_solutions(
53
  org: str,
54
  sys_type: str,
55
  submission_path: str,
56
- skip_validation: bool = False,
57
  ):
58
- logger.info("ADD SUBMISSION! %s path %s", str((system_name, org, sys_type)), submission_path)
59
- if not system_name:
60
- return styled_error("Please fill system name")
61
 
62
- if not org:
63
- return styled_error("Please fill organization name")
 
 
64
 
65
- if not sys_type:
66
- return styled_error("Please select system type")
67
  sys_type = ModelType.from_str(sys_type).name
68
 
69
- if not submission_path:
70
- return styled_error("Please upload JSONL solutions file")
71
-
72
  try:
73
  submission_df = pd.read_json(submission_path, lines=True)
74
- except Exception as e:
75
- return styled_error(f"Cannot read uploaded JSONL file: {str(e)}")
76
-
77
- if not skip_validation:
78
- validation_error = validate_submission(lbdb, submission_df)
79
- if validation_error:
80
- return styled_error(validation_error)
81
 
82
  submission_id = f"{system_name}_{org}_{sys_type}_{datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')}"
83
 
84
- # Seems good, creating the eval
85
- print(f"Adding new submission: {submission_id}")
86
  submission_ts = time.time_ns()
87
 
88
  def add_info(row):
@@ -96,31 +87,13 @@ def add_new_solutions(
96
  }
97
 
98
  ds = Dataset.from_pandas(submission_df).map(add_info)
99
-
100
- # dsdict = DatasetDict({submission_id: ds})
101
- # dsdict.push_to_hub(SUBMISSIONS_REPO, private=True)
102
-
103
- ds.push_to_hub(SUBMISSIONS_REPO, submission_id, private=True)
104
- # print("Creating eval file")
105
- # OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
106
- # os.makedirs(OUT_DIR, exist_ok=True)
107
- # out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
108
-
109
- # with open(out_path, "w") as f:
110
- # f.write(json.dumps(eval_entry))
111
-
112
- # print("Uploading eval file")
113
- # API.upload_file(
114
- # path_or_fileobj=out_path,
115
- # path_in_repo=out_path.split("eval-queue/")[1],
116
- # repo_id=QUEUE_REPO,
117
- # repo_type="dataset",
118
- # commit_message=f"Add {model} to eval queue",
119
- # )
120
-
121
- # # Remove the local file
122
- # os.remove(out_path)
123
 
124
  return styled_message(
125
- "Your request has been submitted to the evaluation queue!\nResults may take up to 24 hours to be processed and shown in the leaderboard."
 
126
  )
 
 
 
 
1
  import time
2
+ from datetime import datetime, timezone
3
 
 
4
  import pandas as pd
5
+ from datasets import Dataset
6
+ from pandas.api.types import is_integer_dtype
7
 
8
+ from src.validation.validate import is_valid
9
  from src.datamodel.data import F1Data
10
+ from src.display.formatting import styled_error, styled_message
11
  from src.display.utils import ModelType
12
+ from src.envs import SUBMISSIONS_REPO
13
  from src.logger import get_logger
14
+ from src.validation.validate import is_submission_file_valid
 
 
 
 
 
 
15
 
16
  logger = get_logger(__name__)
17
 
18
 
19
+ def _validate_all_submissions_present(
20
+ lbdb: F1Data,
21
+ pd_ds: pd.DataFrame,
22
+ ):
23
+ logger.info(f"Validating DS size {len(pd_ds)} columns {pd_ds.columns} set {set(pd_ds.columns)}")
24
  expected_cols = ["problem_id", "solution"]
25
 
26
  if set(pd_ds.columns) != set(expected_cols):
27
+ return ValueError(f"Expected attributes: {expected_cols}, Got: {pd_ds.columns.tolist()}")
28
 
29
  if not is_integer_dtype(pd_ds["problem_id"]):
30
+ return ValueError("problem_id must be str convertible to int")
31
 
32
+ if any(type(v) is not str for v in pd_ds["solution"]):
33
+ return ValueError("solution must be of type str")
34
 
35
  submitted_ids = set(pd_ds.problem_id.astype(str))
36
  if submitted_ids != lbdb.code_problem_ids:
37
  missing = lbdb.code_problem_ids - submitted_ids
38
  unknown = submitted_ids - lbdb.code_problem_ids
39
+ raise ValueError(f"Mismatched problem IDs: {len(missing)} missing, {len(unknown)} unknown")
40
  if len(pd_ds) > len(lbdb.code_problem_ids):
41
+ return ValueError("Duplicate problem IDs exist in uploaded file")
 
 
42
 
43
 
44
  def add_new_solutions(
 
47
  org: str,
48
  sys_type: str,
49
  submission_path: str,
50
+ ensure_all_present: bool = False,
51
  ):
52
+ logger.info(
53
+ f"Adding new submission! {system_name=}, {org=}, {sys_type=} and {submission_path=}",
54
+ )
55
 
56
+ # Double-checking.
57
+ for val in [system_name, org, sys_type]:
58
+ assert is_valid(val)
59
+ assert is_submission_file_valid(submission_path)
60
 
 
 
61
  sys_type = ModelType.from_str(sys_type).name
62
 
 
 
 
63
  try:
64
  submission_df = pd.read_json(submission_path, lines=True)
65
+ if ensure_all_present:
66
+ _validate_all_submissions_present(lbdb=lbdb, pd_ds=submission_df)
67
+ except Exception:
68
+ logger.warning("Failed to parse submission DF!", exc_info=True)
69
+ return styled_error(
70
+ "An error occurred. Please try again later."
71
+ ) # Use same message as external error. Avoid infoleak.
72
 
73
  submission_id = f"{system_name}_{org}_{sys_type}_{datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')}"
74
 
75
+ # Seems good, creating the eval.
76
+ logger.info(f"Adding new submission: {submission_id}")
77
  submission_ts = time.time_ns()
78
 
79
  def add_info(row):
 
87
  }
88
 
89
  ds = Dataset.from_pandas(submission_df).map(add_info)
90
+ ds.push_to_hub(
91
+ SUBMISSIONS_REPO,
92
+ submission_id,
93
+ private=True,
94
+ )
 
 
95
 
96
  return styled_message(
97
+ "Your request has been submitted to the evaluation queue!\n"
98
+ + "Results may take up to 24 hours to be processed and shown in the leaderboard."
99
  )
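Reviewer note (not part of this diff): a minimal sketch of writing a submission file in the shape `add_new_solutions` and the new validators expect — one JSON object per line with exactly the keys `problem_id` and `solution`. The solution bodies below are placeholders.

```python
import json

rows = [
    {"problem_id": str(i), "solution": "def solve(): ..."}  # placeholder solution text
    for i in range(3)
]
with open("submission.jsonl", "w") as f:
    for row in rows:
        f.write(json.dumps(row) + "\n")
```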
src/validation/__init__.py ADDED
File without changes
src/validation/validate.py ADDED
@@ -0,0 +1,89 @@
 
 
1
+ import json
2
+ import os
3
+ import string
4
+
5
+ DATASET_SIZE = 120
6
+
7
+ MIN_INPUT_LENGTH = 2
8
+ MAX_INPUT_LENGTH = 20
9
+
10
+ MIN_SUBMISSION_SIZE = 1
11
+ MAX_SUBMISSION_SIZE = 1024 * 1024 * 120 # 120 MB.
12
+ MAX_SINGLE_SUBMISSION_SIZE = 1024 * 1024 # 1MB.
13
+ MAX_SUBMISSION_LINES = DATASET_SIZE + 1 # Allow empty line.
14
+
15
+
16
+ def is_valid(
17
+ s: str,
18
+ min_length: int = MIN_INPUT_LENGTH,
19
+ max_length: int = MAX_INPUT_LENGTH,
20
+ ) -> bool:
21
+ """
22
+ @brief Checks whether the given string is valid.
23
+ @param s The string to validate.
24
+ @return True iff all characters are in [a-zA-Z0-9], spaces, or '.' and '-', and the length is between
25
+ min length and max length.
26
+ """
27
+
28
+ characters = [c for c in s] # Not using the length from len(.) as that includes unicode characters.
29
+ if len(characters) < min_length or len(characters) > max_length:
30
+ return False
31
+
32
+ # Very important: We delimit using underscores. So these _CANNOT_ be allowed in sanitised strings.
33
+ ALLOWED = (
34
+ [c for c in string.ascii_lowercase]
35
+ + [c for c in string.ascii_uppercase]
36
+ + [c for c in string.digits]
37
+ + [" ", ".", "-"]
38
+ )
39
+ for c in s:
40
+ if c not in ALLOWED:
41
+ return False
42
+ return True
43
+
44
+
45
+ def is_submission_file_valid(submission_path: str) -> bool:
46
+ """
47
+ @brief Checks whether the given submission file is valid.
48
+ @param submission_path The path to the submission file.
49
+ @return True iff the file is within the size constraints, a JSONL, and every line is no longer than
50
+ the fixed maximum bound.
51
+ """
52
+
53
+ if not os.path.exists(submission_path):
54
+ return False
55
+
56
+ submission_size = os.stat(submission_path).st_size
57
+ if submission_size < MIN_SUBMISSION_SIZE or submission_size > MAX_SUBMISSION_SIZE:
58
+ return False
59
+
60
+ with open(submission_path, "r") as f:
61
+
62
+ # Not using readlines() to avoid consuming a large buffer at once.
63
+ n_lines = 0
64
+ seen_ids = set()
65
+ while len(line := f.readline(MAX_SINGLE_SUBMISSION_SIZE)) > 0:
66
+ n_lines += 1
67
+ if n_lines > MAX_SUBMISSION_LINES:
68
+ return False
69
+
70
+ if not line.strip().startswith("{") or not line.strip().endswith("}"):  # readline() keeps the trailing newline.
71
+ return False
72
+
73
+ d = json.loads(line)
74
+ if set(d.keys()) != set(["problem_id", "solution"]):
75
+ return False
76
+
77
+ if not ((type(d["problem_id"]) is str or type(d["problem_id"]) is int) and type(d["solution"] is str)):
78
+ return False
79
+ if not d["problem_id"].isdigit():
80
+ return False
81
+ problem_id = int(d["problem_id"])
82
+ if problem_id < 0 or problem_id >= DATASET_SIZE:
83
+ return False
84
+
85
+ if problem_id in seen_ids:
86
+ return False # Duplicate submission.
87
+ seen_ids.add(problem_id)
88
+
89
+ return True
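Reviewer note (not part of this diff): a quick sketch exercising the new validators, assuming the repository root is on `PYTHONPATH` and that a `submission.jsonl` like the one sketched after `src/submission/submit.py` exists.

```python
from src.validation.validate import is_submission_file_valid, is_valid

assert is_valid("GPT-4.1")        # letters, digits, '.', '-' and spaces are allowed
assert not is_valid("a")          # shorter than MIN_INPUT_LENGTH
assert not is_valid("my_system")  # underscores are reserved as submission-id delimiters
print(is_submission_file_valid("submission.jsonl"))
```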