galb-dai committed on
Commit
8c131f3
·
1 Parent(s): ea641c7

Update flow.

Browse files
Files changed (3) hide show
  1. app.py +44 -143
  2. src/about.py +123 -25
  3. src/display/css_html_js.py +40 -104
app.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import gradio as gr
2
  import pandas as pd
3
  from apscheduler.schedulers.background import BackgroundScheduler
@@ -5,7 +7,8 @@ from gradio.themes import Base, colors, sizes
5
  from gradio_leaderboard import Leaderboard, SelectColumns
6
  from huggingface_hub import whoami
7
 
8
- from src.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT, TITLE
 
9
  from src.datamodel.data import F1Data
10
  from src.display.css_html_js import custom_css
11
  from src.display.formatting import styled_error
@@ -18,8 +21,8 @@ from src.validation.validate import MAX_INPUT_LENGTH, MIN_INPUT_LENGTH, is_submi
18
 
19
  logger = get_logger(__name__)
20
 
21
- ENSURE_ALL_PRESENT = False # TODO: Switch to True.
22
- SPLIT = "warmup" # TODO temp
23
 
24
  lbdb = F1Data(
25
  cp_ds_name=CODE_PROBLEMS_REPO,
@@ -29,39 +32,32 @@ lbdb = F1Data(
29
  )
30
 
31
  leaderboard_df = None
32
-
33
  logger.info("Initialized LBDB")
34
 
35
 
36
  def restart_space():
37
- logger.info("Restarting space ")
38
  API.restart_space(repo_id=REPO_ID)
39
 
40
 
41
  def refresh_leaderboard_data():
42
- """Refresh the leaderboard data from the latest results"""
43
  global leaderboard_df
44
  try:
45
  logger.info("Loading leaderboard data...")
46
  new_leaderboard_df = get_leaderboard_df(RESULTS_REPO)
47
-
48
  if new_leaderboard_df is not None:
49
  logger.info("Leaderboard data refreshed successfully")
50
  leaderboard_df = new_leaderboard_df
51
  else:
52
  logger.warning("No new leaderboard data found")
53
- return None
54
  except Exception as e:
55
  logger.error(f"Error refreshing leaderboard data: {e}")
56
- return None
57
 
58
 
59
  def init_leaderboard(dataframe: pd.DataFrame):
60
-
61
  if dataframe is None:
62
  raise ValueError("Leaderboard DataFrame is None.")
63
-
64
- lb = Leaderboard(
65
  value=dataframe,
66
  datatype=[c.type for c in fields(AutoEvalColumn)],
67
  select_columns=SelectColumns(
@@ -69,84 +65,51 @@ def init_leaderboard(dataframe: pd.DataFrame):
69
  cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
70
  label="Select Columns to Display:",
71
  ),
72
- search_columns=[AutoEvalColumn.system.name, AutoEvalColumn.organization.name],
73
  hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
74
  bool_checkboxgroup_label="Hide models",
75
  interactive=False,
76
  )
77
- lb.col_count = (1, "fixed")
78
- return lb
79
 
80
 
81
  def add_solution_cbk(
82
  system_name: str,
83
  org: str,
 
84
  submission_path: str,
85
  profile: gr.OAuthProfile | None,
86
  oauth_token: gr.OAuthToken | None,
87
  ):
88
- logger.info("Fetching user details for submission")
89
- logger.info("PROFILE %s", profile)
90
- logger.info("TOKEN %s", oauth_token)
91
-
92
  if profile is None or oauth_token is None:
93
  return styled_error("Please sign in with Hugging Face before submitting.")
94
-
95
- # Display handle and display name (may change over time)
96
- logger.info(f"User handle: {profile.username}")
97
- display_name = profile.name or profile.username
98
- logger.info(f"Display name: {display_name}")
99
-
100
- # Stable account id
101
  user_info = fetch_user_info(oauth_token)
102
- logger.info("Logged in user info: %s", user_info)
103
  stable_id = user_info.get("id") if user_info else None
104
- logger.info(f"User stable ID: {stable_id}")
105
-
106
  if not stable_id:
107
  return styled_error("Could not retrieve your stable user ID. Please try signing in again.")
108
- user_id = stable_id
109
-
110
  if not profile.username:
111
  return styled_error("Could not retrieve username. Please try signing in again.")
112
- # We rely on underscores as separators in submission ID, replace it with "-".
113
- # user_id = profile.username.replace("_", "-")
114
-
115
  try:
116
- # Validating the submission file.
117
  if not submission_path:
118
  return styled_error("Please upload JSONL submission file.")
119
-
120
- if not is_submission_file_valid(
121
- submission_path,
122
- is_warmup_dataset=(SPLIT == "warmup"),
123
- ):
124
  return styled_error("Failed to read JSONL submission file. Please try again later.")
125
-
126
- # Validating all user-supplied arguments.
127
- for val, val_name in [
128
- (system_name, "System name"),
129
- (org, "Organisation name"),
130
- ]:
131
  if len(val) == 0:
132
  return styled_error(f"Please fill in the '{val_name}' field.")
133
-
134
  if not is_valid(val):
135
  return styled_error(
136
- f"{val_name} is invalid! Must only contain characters [a-zA-Z0-9], spaces, "
137
- + "or the special characters '-' and '.', and be of length between "
138
- + f"{MIN_INPUT_LENGTH} and {MAX_INPUT_LENGTH}."
139
  )
140
  except Exception:
141
  logger.warning("Failed to process user submission", exc_info=True)
142
- return styled_error("An error occurred. Please try again later.") # Intentionally vague.
143
-
144
  return add_new_solutions(
145
  lbdb,
146
  profile.username,
147
- user_id,
148
  system_name,
149
  org,
 
150
  submission_path,
151
  is_warmup_dataset=(SPLIT == "warmup"),
152
  ensure_all_present=ENSURE_ALL_PRESENT,
@@ -154,46 +117,32 @@ def add_solution_cbk(
154
 
155
 
156
  def gate_submission(oauth_token: gr.OAuthToken | None):
157
- """
158
- @brief Toggles the visibility of the login box and submission panel based on the user's login status.
159
- """
160
- logger.info("GATE TOKEN %s", oauth_token)
161
  if oauth_token is None:
162
- logger.info("GATE: NO TOKEN")
163
  return gr.update(visible=True), gr.update(visible=False)
164
  try:
165
  whoami(oauth_token.token)
166
- logger.info("GATE: TOKEN IS VALID")
167
  return gr.update(visible=False), gr.update(visible=True)
168
  except Exception:
169
- logger.info("GATE: TOKEN HAS EXPIRED")
170
  return gr.update(visible=True), gr.update(visible=False)
171
 
172
 
173
  def get_theme():
174
- cyber_theme = Base(
175
- # neon-ish accents driven by hues (affects tabs, primary buttons, sliders, etc.)
176
- primary_hue=colors.cyan, # selected tab / primary controls
177
- secondary_hue=colors.pink, # secondary accents
178
- neutral_hue=colors.gray, # keep neutrals subtle
179
- # # techno font
180
- # font=gr.themes.GoogleFont("Orbitron"),
181
- # font_mono=gr.themes.GoogleFont("JetBrains Mono"),
182
- text_size=sizes.text_md, # keep defaults
183
  spacing_size=sizes.spacing_md,
184
  radius_size=sizes.radius_md,
185
  ).set(
186
- # keep overrides minimal—dark canvas; let hues do the rest
187
- body_background_fill="#0b0f14", # deep blue-black
188
- background_fill_primary="#0b0f14", # panels
189
- background_fill_secondary="#0e141a", # subtle contrast
190
  )
191
- return cyber_theme
192
 
193
 
194
  blocks = gr.Blocks(css=custom_css, theme=get_theme())
195
  with blocks:
196
-
197
  gr.Image(
198
  "assets/banner.png",
199
  interactive=False,
@@ -203,104 +152,56 @@ with blocks:
203
  elem_classes=["banner_image"],
204
  )
205
 
206
- gr.HTML(
207
- """
208
- <style>
209
- body {
210
- background-color: #121212;
211
- color: white;
212
- margin: 0; /* Reset browser default */
213
- }
214
-
215
- /* Outer container margin & spacing */
216
- .gradio-container {
217
- max-width: 1100px;
218
- margin: 2rem auto; /* top/bottom spacing + horizontal centering */
219
- padding: 2rem; /* inner spacing */
220
- background-color: rgba(0, 0, 0, 0.6); /* optional: semi-transparent panel */
221
- border-radius: 12px; /* rounded corners */
222
- }
223
- </style>
224
- """
225
- )
226
-
227
- gr.HTML(TITLE)
228
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
229
-
230
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
231
- with gr.TabItem("🏅 Leaderboard", elem_id="formulaone-leaderboard-tab-table", id=0):
232
- refresh_leaderboard_data() # updates leaderboard_df
233
- assert leaderboard_df is not None
 
 
 
234
  leaderboard_component = init_leaderboard(leaderboard_df)
235
 
236
- with gr.TabItem("🚀 Submit Solutions", elem_id="llm-benchmark-tab-table", id=2):
237
- logger.info("Tab submission")
238
  with gr.Column():
239
- with gr.Row():
240
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
241
-
242
- with gr.Row():
243
  gr.Markdown("# ✉️✨ Submit your solutions", elem_classes="markdown-text")
244
-
245
- # Shown when logged OUT
246
  login_box = gr.Group(visible=True)
247
  with login_box:
248
  gr.Markdown("Please sign in with Hugging Face to submit")
249
  gr.LoginButton()
250
-
251
- # Shown when logged IN
252
  submit_panel = gr.Group(visible=False)
253
  with submit_panel:
254
  with gr.Row():
255
  with gr.Column():
256
  system_name_textbox = gr.Textbox(label=AutoEvalColumn.system.name)
257
  org_textbox = gr.Textbox(label=AutoEvalColumn.organization.name)
258
- # sys_type_dropdown = gr.Dropdown(
259
- # choices=[t.to_str() for t in ModelType],
260
- # label=AutoEvalColumn.system_type.name,
261
- # multiselect=False,
262
- # value=ModelType.LLM.to_str(),
263
- # interactive=True,
264
- # )
265
-
266
  submission_file = gr.File(label="JSONL solutions file", file_types=[".jsonl"])
267
-
268
- logger.info("Submit button")
269
- submit_button = gr.Button("Submit")
270
- # gr.LoginButton()
271
  submission_result = gr.Markdown()
272
-
273
  submit_button.click(
274
  add_solution_cbk,
275
- [
276
- system_name_textbox,
277
- org_textbox,
278
- submission_file,
279
- ],
280
  submission_result,
281
  )
282
 
283
  with gr.Row():
284
- logger.info("Citation")
285
  with gr.Accordion(CITATION_BUTTON_LABEL, open=False):
286
- gr.Code(
287
- value=CITATION_BUTTON_TEXT.strip(),
288
- elem_id="citation-block",
289
- )
290
 
291
- # UI refresh triggers latest data swap.
292
- # The work already happened in the background - refresh_leaderboard_data().
293
  blocks.load(lambda: leaderboard_df, inputs=[], outputs=[leaderboard_component])
294
-
295
- # On initial load (and after OAuth redirect), toggle the UI based on login status.
296
  blocks.load(gate_submission, inputs=None, outputs=[login_box, submit_panel])
297
 
298
-
299
- logger.info("Scheduler")
300
  scheduler = BackgroundScheduler()
301
  scheduler.add_job(restart_space, "interval", seconds=1800)
302
  scheduler.add_job(refresh_leaderboard_data, "interval", seconds=120)
303
  scheduler.start()
304
- logger.info("Launch")
305
  blocks.queue(default_concurrency_limit=40).launch()
306
- logger.info("Done")
 
1
+ # app.py
2
+
3
  import gradio as gr
4
  import pandas as pd
5
  from apscheduler.schedulers.background import BackgroundScheduler
 
7
  from gradio_leaderboard import Leaderboard, SelectColumns
8
  from huggingface_hub import whoami
9
 
10
+ # Updated import to get the new HTML variable
11
+ from src.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT, WHAT_IS_F1_HTML
12
  from src.datamodel.data import F1Data
13
  from src.display.css_html_js import custom_css
14
  from src.display.formatting import styled_error
 
21
 
22
  logger = get_logger(__name__)
23
 
24
+ ENSURE_ALL_PRESENT = False
25
+ SPLIT = "warmup"
26
 
27
  lbdb = F1Data(
28
  cp_ds_name=CODE_PROBLEMS_REPO,
 
32
  )
33
 
34
  leaderboard_df = None
 
35
  logger.info("Initialized LBDB")
36
 
37
 
38
  def restart_space():
39
+ logger.info("Restarting space")
40
  API.restart_space(repo_id=REPO_ID)
41
 
42
 
43
  def refresh_leaderboard_data():
 
44
  global leaderboard_df
45
  try:
46
  logger.info("Loading leaderboard data...")
47
  new_leaderboard_df = get_leaderboard_df(RESULTS_REPO)
 
48
  if new_leaderboard_df is not None:
49
  logger.info("Leaderboard data refreshed successfully")
50
  leaderboard_df = new_leaderboard_df
51
  else:
52
  logger.warning("No new leaderboard data found")
 
53
  except Exception as e:
54
  logger.error(f"Error refreshing leaderboard data: {e}")
 
55
 
56
 
57
  def init_leaderboard(dataframe: pd.DataFrame):
 
58
  if dataframe is None:
59
  raise ValueError("Leaderboard DataFrame is None.")
60
+ return Leaderboard(
 
61
  value=dataframe,
62
  datatype=[c.type for c in fields(AutoEvalColumn)],
63
  select_columns=SelectColumns(
 
65
  cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
66
  label="Select Columns to Display:",
67
  ),
68
+ search_columns=[AutoEvalColumn.system.name, AutoEvalColumn.system_type.name],
69
  hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
70
  bool_checkboxgroup_label="Hide models",
71
  interactive=False,
72
  )
 
 
73
 
74
 
75
  def add_solution_cbk(
76
  system_name: str,
77
  org: str,
78
+ sys_type: str,
79
  submission_path: str,
80
  profile: gr.OAuthProfile | None,
81
  oauth_token: gr.OAuthToken | None,
82
  ):
 
 
 
 
83
  if profile is None or oauth_token is None:
84
  return styled_error("Please sign in with Hugging Face before submitting.")
 
 
 
 
 
 
 
85
  user_info = fetch_user_info(oauth_token)
 
86
  stable_id = user_info.get("id") if user_info else None
 
 
87
  if not stable_id:
88
  return styled_error("Could not retrieve your stable user ID. Please try signing in again.")
 
 
89
  if not profile.username:
90
  return styled_error("Could not retrieve username. Please try signing in again.")
 
 
 
91
  try:
 
92
  if not submission_path:
93
  return styled_error("Please upload JSONL submission file.")
94
+ if not is_submission_file_valid(submission_path, is_warmup_dataset=(SPLIT == "warmup")):
 
 
 
 
95
  return styled_error("Failed to read JSONL submission file. Please try again later.")
96
+ for val, val_name in [(system_name, "System name"), (org, "Organisation name"), (sys_type, "System type")]:
 
 
 
 
 
97
  if len(val) == 0:
98
  return styled_error(f"Please fill in the '{val_name}' field.")
 
99
  if not is_valid(val):
100
  return styled_error(
101
+ f"{val_name} is invalid! Must only contain characters [a-zA-Z0-9], spaces, or the special characters '-' and '.', and be of length between {MIN_INPUT_LENGTH} and {MAX_INPUT_LENGTH}."
 
 
102
  )
103
  except Exception:
104
  logger.warning("Failed to process user submission", exc_info=True)
105
+ return styled_error("An error occurred. Please try again later.")
 
106
  return add_new_solutions(
107
  lbdb,
108
  profile.username,
109
+ stable_id,
110
  system_name,
111
  org,
112
+ sys_type,
113
  submission_path,
114
  is_warmup_dataset=(SPLIT == "warmup"),
115
  ensure_all_present=ENSURE_ALL_PRESENT,
 
117
 
118
 
119
  def gate_submission(oauth_token: gr.OAuthToken | None):
 
 
 
 
120
  if oauth_token is None:
 
121
  return gr.update(visible=True), gr.update(visible=False)
122
  try:
123
  whoami(oauth_token.token)
 
124
  return gr.update(visible=False), gr.update(visible=True)
125
  except Exception:
 
126
  return gr.update(visible=True), gr.update(visible=False)
127
 
128
 
129
  def get_theme():
130
+ return Base(
131
+ primary_hue=colors.cyan,
132
+ secondary_hue=colors.pink,
133
+ neutral_hue=colors.gray,
134
+ text_size=sizes.text_md,
 
 
 
 
135
  spacing_size=sizes.spacing_md,
136
  radius_size=sizes.radius_md,
137
  ).set(
138
+ body_background_fill="#0b0f14",
139
+ background_fill_primary="#0b0f14",
140
+ background_fill_secondary="#0e141a",
 
141
  )
 
142
 
143
 
144
  blocks = gr.Blocks(css=custom_css, theme=get_theme())
145
  with blocks:
 
146
  gr.Image(
147
  "assets/banner.png",
148
  interactive=False,
 
152
  elem_classes=["banner_image"],
153
  )
154
 
155
+ # The main layout is now controlled by these three tabs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
157
+ with gr.TabItem("What is FormulaOne", id=0):
158
+ gr.HTML(WHAT_IS_F1_HTML)
159
+
160
+ with gr.TabItem("🏅 FormulaOne Leaderboard", id=1):
161
+ refresh_leaderboard_data()
162
+ assert leaderboard_df is not None, "Leaderboard data failed to load."
163
  leaderboard_component = init_leaderboard(leaderboard_df)
164
 
165
+ with gr.TabItem("🚀 Submit Solutions", id=2):
 
166
  with gr.Column():
167
+ gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
 
 
168
  gr.Markdown("# ✉️✨ Submit your solutions", elem_classes="markdown-text")
 
 
169
  login_box = gr.Group(visible=True)
170
  with login_box:
171
  gr.Markdown("Please sign in with Hugging Face to submit")
172
  gr.LoginButton()
 
 
173
  submit_panel = gr.Group(visible=False)
174
  with submit_panel:
175
  with gr.Row():
176
  with gr.Column():
177
  system_name_textbox = gr.Textbox(label=AutoEvalColumn.system.name)
178
  org_textbox = gr.Textbox(label=AutoEvalColumn.organization.name)
179
+ sys_type_dropdown = gr.Dropdown(
180
+ choices=[t.to_str() for t in ModelType],
181
+ label=AutoEvalColumn.system_type.name,
182
+ multiselect=False,
183
+ value=ModelType.LLM.to_str(),
184
+ interactive=True,
185
+ )
 
186
  submission_file = gr.File(label="JSONL solutions file", file_types=[".jsonl"])
187
+ submit_button = gr.Button("Submit", variant="primary")
 
 
 
188
  submission_result = gr.Markdown()
 
189
  submit_button.click(
190
  add_solution_cbk,
191
+ [system_name_textbox, org_textbox, sys_type_dropdown, submission_file],
 
 
 
 
192
  submission_result,
193
  )
194
 
195
  with gr.Row():
 
196
  with gr.Accordion(CITATION_BUTTON_LABEL, open=False):
197
+ gr.Code(value=CITATION_BUTTON_TEXT.strip(), elem_id="citation-block")
 
 
 
198
 
 
 
199
  blocks.load(lambda: leaderboard_df, inputs=[], outputs=[leaderboard_component])
 
 
200
  blocks.load(gate_submission, inputs=None, outputs=[login_box, submit_panel])
201
 
 
 
202
  scheduler = BackgroundScheduler()
203
  scheduler.add_job(restart_space, "interval", seconds=1800)
204
  scheduler.add_job(refresh_leaderboard_data, "interval", seconds=120)
205
  scheduler.start()
206
+
207
  blocks.queue(default_concurrency_limit=40).launch()
 
src/about.py CHANGED
@@ -1,28 +1,126 @@
1
- from dataclasses import dataclass
2
- from enum import Enum
3
-
4
- TITLE = """
5
- <h1 id="space-title" style="
6
- text-align: center;
7
- font-family: 'Segoe UI', 'Helvetica Neue', sans-serif;
8
- font-weight: 300;
9
- letter-spacing: 0.05em;
10
- color: white;
11
- text-transform: none;
12
- font-size: 2.6rem;
13
- ">
14
- FormulaOne Leaderboard
15
- </h1>
16
- """
17
-
18
- INTRODUCTION_TEXT = """
19
- Welcome to the official leaderboard for the paper:
20
-
21
- **FormulaOne: Measuring the Depth of Algorithmic Reasoning Beyond Competitive Programming** <br>
22
- *Gal Beniamini, Yuval Dor, Alon Vinnikov, Shir Granot Peled, Or Weinstein, Or Sharir, Noam Wies, Tomer Nussbaum, Nadav Schweiger, Ido Ben Shaul, Tomer Zekharya, Yoav Levine, Shai Shalev-Shwartz, Amnon Shashua* <br>
23
- **AAI, July 2025**
24
-
25
- FormulaOne is a new benchmark designed to challenge frontier AI models. The benchmark is constructed from a vast and conceptually diverse family of dynamic programming problems derived from Monadic Second-Order (MSO) logic on graphs, a framework with profound connections to theoretical computer science.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  """
27
 
28
  LLM_BENCHMARKS_TEXT = """
 
1
+ WHAT_IS_F1_HTML = """
2
+ <!DOCTYPE html>
3
+ <html lang="en">
4
+ <body>
5
+ <main class="max-w-4xl mx-auto">
6
+ <header class="text-center mb-12">
7
+ <h1 class="text-4xl md:text-5xl font-bold text-white f1-h1">FormulaOne</h1>
8
+ </header>
9
+ <section>
10
+ <p class="text-lg mb-4 f1-p">Frontier AI models have recently demonstrated strong performance on mathematical and algorithmic benchmarks, including earning gold medals in olympiads, and attaining top percentile ratings in competitive programming contests. How well do such benchmarks capture the true depth of algorithmic reasoning, as it arises in real-world research problems?</p>
11
+ <p class="text-lg mb-4 f1-p">We believe that existing benchmarks fail to capture the deep reasoning skills required for complex, research-level algorithmic problems. To address this gap, we introduce <strong>FormulaOne</strong>.</p>
12
+ <p class="mb-4 f1-p"><strong>FormulaOne</strong> consists of 220 novel dynamic programming problems over graphs. The problems are organised into three categories, ranging from moderate difficulty all the way up to research level.</p>
13
+ <div class="overflow-x-auto">
14
+ <table class="f1-table">
15
+ <thead>
16
+ <tr>
17
+ <th class="f1-th">Category</th>
18
+ <th class="f1-th">Description</th>
19
+ </tr>
20
+ </thead>
21
+ <tbody>
22
+ <tr>
23
+ <td class="f1-td"><strong>FormulaOne Warmup</strong></td>
24
+ <td class="f1-td">A set of 100 “easier” problems.</td>
25
+ </tr>
26
+ <tr>
27
+ <td class="f1-td"><strong>FormulaOne Tier 1</strong></td>
28
+ <td class="f1-td">A set of 100 challenging problems.</td>
29
+ </tr>
30
+ <tr>
31
+ <td class="f1-td"><strong>FormulaOne Tier 2</strong></td>
32
+ <td class="f1-td">A set of 20 highly challenging problems.</td>
33
+ </tr>
34
+ </tbody>
35
+ </table>
36
+ </div>
37
+ <div class="mt-8">
38
+ <div class="border-b border-gray-700">
39
+ <nav class="-mb-px flex space-x-8" aria-label="Tabs">
40
+ <p class="whitespace-nowrap py-4 px-1 border-b-2 font-medium text-sm border-blue-400 text-blue-400">Example Problems</p>
41
+ </nav>
42
+ </div>
43
+ <div class="mt-4">
44
+ <div class="f1-problem-box">
45
+ <p class="font-bold text-lg mb-2">Warmup: Union-of-Paths-and-Cycles</p>
46
+ <p class="mb-2"><strong>Objective:</strong> Compute the sum of all weights of sets S⊆V such that the graph G[S], induced over S, is a collection of disjoint paths and cycles.</p>
47
+ </div>
48
+ <div class="f1-problem-box">
49
+ <p class="font-bold text-lg mb-2">Tier 1: Maximal-Union-of-Paths-and-Cycles</p>
50
+ <p class="mb-2"><strong>Objective:</strong> Compute the sum of all weights of sets S⊆V such that the graph G[S], induced over S, is a collection of disjoint paths and cycles, and S is maximal with respect to this property.</p>
51
+ </div>
52
+ <div class="f1-problem-box">
53
+ <p class="font-bold text-lg mb-2">Tier 2: Maximal-Union-of-Cycles</p>
54
+ <p class="mb-2"><strong>Objective:</strong> Compute the sum of all weights of sets S⊆V such that the graph G[S], induced over S, is a collection of disjoint cycles, and S is maximal with respect to this property.</p>
55
+ </div>
56
+ </div>
57
+ </div>
58
+ <p class="mt-6 mb-4 f1-p">The final category, Tier 2, is incredibly demanding, requiring resolution of many points of uncertainty, and involving an array of reasoning steps, including topological and geometric insight, knowledge of mathematical domains such as extremal graph theory and logic, combinatorial considerations, precise implementation, and more.</p>
59
+ <p class="f1-p">Despite frontier models’ impressive performance on existing benchmarks, presently <strong>no model solves even a single FormulaOne Tier 2 problem</strong>.<sup><a href="#evaluation" class="f1-a">1</a></sup></p>
60
+ </section>
61
+ <section>
62
+ <h2 class="text-3xl font-bold text-white f1-h2">An “Infinite Well” of Problems</h2>
63
+ <p class="mb-4 f1-p">The novelty and vastness of FormulaOne stem from its theoretical foundation. The questions are not arbitrary puzzles, but are instead drawn from the highly expressive framework of <strong>Monadic Second-Order</strong> (MSO) logic on graphs. This provides a principled, semi-automatic way to generate a virtually infinite supply of mathematically deep algorithmic challenges. Despite their theoretical underpinnings, the problems in FormulaOne are natural and succinct:</p>
64
+ <div class="f1-problem-box">
65
+ <p class="font-bold text-lg mb-2">Problem #44</p>
66
+ <p class="mb-2"><strong>Input:</strong> A tree-like graph G=(V,E), a tree decomposition T of G, and a weight function w:V→N.</p>
67
+ <p class="mb-2"><strong>Objective:</strong> Compute the sum of all weights of sets S⊆V such that the graph G[S], induced over S, does not contain any cycle of length four.</p>
68
+ <p class="text-sm text-gray-400"><strong>Notation:</strong> The weight of a set of vertices S is defined as w(S) ≜ ∑<sub>v∈S</sub>w(v). The final result should be returned modulo 10<sup>9</sup>+7.</p>
69
+ </div>
70
+ <p class="mb-4 f1-p">While the problems are often natural to state, their solutions are far from obvious. The solvability of this vast class of problems is guaranteed by an algorithmic <strong>meta-theorem</strong> due to Courcelle, which broadly states:</p>
71
+ <blockquote class="my-6 f1-blockquote">
72
+ “For every sufficiently tree-like graph, any problem definable in an expressive formal logic — Monadic Second-Order (MSO) logic — can be solved by a dynamic programming algorithm that operates in time linear in the order of the graph.”
73
+ </blockquote>
74
+ <p class="f1-p">The key is to use a structure known as a tree decomposition, which organises the graph’s vertices into a series of overlapping sets, or “bags”, that are themselves arranged in a tree.</p>
75
+ <figure class="f1-figure">
76
+ <img src="/file=assets/bag_modifications.png" alt="An illustration of local modifications to bags (dashed boxes)" class="max-w-full md:max-w-2xl mx-auto rounded-lg shadow-md">
77
+ <figcaption class="f1-figcaption">An illustration of local modifications to bags: Introduce, Forget, and Join.</figcaption>
78
+ </figure>
79
+ <p class="mb-4 f1-p">An algorithm can then traverse this tree of bags, solving the problem piece by piece using dynamic programming. This process involves designing a “state” that summarises all necessary information about the partial solution within a bag, and then defining how this state transforms as vertices are introduced, forgotten, or bags are merged.</p>
80
+ <figure class="f1-figure">
81
+ <video class="w-full max-w-2xl mx-auto rounded-lg shadow-lg" autoplay loop muted playsinline>
82
+ <source src="/file=assets/dp_animation.mp4" type="video/mp4">
83
+ Your browser does not support the video tag.
84
+ </video>
85
+ <figcaption class="f1-figcaption">Animation showing the design of a compressed dynamic programming state-space.</figcaption>
86
+ </figure>
87
+ <p class="f1-p">The deceptive simplicity of the problem statements belies the <strong>extraordinary difficulty</strong> of discovering the correct dynamic programming solution. This process is riddled with subtle combinatorial and logical pitfalls, demanding a profound understanding of the problem’s underlying structure. For a detailed walkthrough of the fifteen interdependent reasoning steps required to solve a single hard problem – <strong>Maximal-Cluster-Graph</strong> – see the appendix of our paper.</p>
88
+ </section>
89
+ <section>
90
+ <h2 class="text-3xl font-bold text-white f1-h2">Guiding Principles</h2>
91
+ <ul class="list-disc list-inside space-y-4">
92
+ <li class="f1-li"><strong>An In-Distribution Benchmark for Reasoning.</strong> Unlike benchmarks that test for out-of-distribution generalisation, FormulaOne presents problems that are squarely <strong>in-distribution</strong> for models trained on code. Essentially, dynamic programming on graphs is the “bread and butter” of algorithmic programming. Thus, models’ current failure on FormulaOne highlights a fundamental deficit in deep, multi-step reasoning, rather than a lack of domain exposure.</li>
93
+ <li class="f1-li"><strong>An Unbounded Environment for Reinforcement Learning.</strong> The MSO framework allows for the generation of a nearly infinite stream of algorithmic problems with verifiable solutions, making it an ideal environment for training and evaluating agents with Reinforcement Learning with Verifiable Rewards (RLVR).</li>
94
+ <li class="f1-li"><strong>Probing the Frontiers of Complexity Theory.</strong> Many problems in our dataset are related to central conjectures in fine-grained complexity, such as the Strong Exponential Time Hypothesis. If a model were to discover a significantly faster algorithm for one of these problems, that would constitute a significant contribution to theoretical computer science.</li>
95
+ </ul>
96
+ </section>
97
+ <section id="evaluation">
98
+ <h2 class="text-3xl font-bold text-white f1-h2">Evaluation</h2>
99
+ <p class="mb-4 f1-p">To give models the best possible chance of success, we provide a generous few-shot prompt that covers a broad array of the ideas and techniques involved in solving these problems. All models were evaluated using their highest available reasoning settings and with the maximum context length permitted.</p>
100
+ <p class="mb-4 f1-p">Each submitted solution is subjected to a rigorous and automated test suite that measures three key aspects of its validity:</p>
101
+ <ul class="list-disc list-inside space-y-2 mb-6">
102
+ <li class="f1-li"><strong>Correctness:</strong> The output of the submitted algorithm must be correct on all graphs.</li>
103
+ <li class="f1-li"><strong>Consistency:</strong> The solution must produce the same output for a given graph, regardless of the specific structure of its tree decomposition.</li>
104
+ <li class="f1-li"><strong>Efficiency:</strong> The solution must be truly fixed-parameter linear.</li>
105
+ </ul>
106
+ <p class="mb-4 f1-p">To support research and encourage community contributions, the <code>FormulaOne-Warmup</code> dataset is released as a public resource for training and fine-tuning models. The complete test suite for all 100 Warmup problems is available, alongside a standalone evaluation environment, in our public GitHub repository: <a href="https://github.com/double-ai/formulaone-dataset/tree/main" target="_blank" rel="noopener noreferrer" class="f1-a">https://github.com/double-ai/formulaone-dataset/tree/main</a>.</p>
107
+ <p class="f1-p">In contrast, to maintain the integrity of the core benchmark, only a minimal subset of tests is released for the <code>FormulaOne Tier 1</code> and <code>Tier 2</code> problems.</p>
108
+ <h3 class="text-2xl font-bold text-white mt-8 mb-4">Model Accuracy</h3>
109
+ <p class="mb-4 f1-p">On the <strong>FormulaOne-Warmup</strong> problems, frontier models perform reasonably well. This confirms they have a foundational capability for these types of algorithmic tasks.</p>
110
+ <figure class="f1-figure">
111
+ <img src="/file=assets/warmup_performance.png" alt="Plot showing model performance on FormulaOne-Warmup" class="max-w-full md:max-w-2xl mx-auto rounded-lg shadow-md">
112
+ <figcaption class="f1-figcaption">Performance of frontier models on the FormulaOne-Warmup dataset.</figcaption>
113
+ </figure>
114
+ <p class="mb-4 f1-p">However, as the reasoning depth increases in <strong>FormulaOne Tier 1</strong>, and solutions require the discovery and integration of novel and more complex state representations, model performance drops off sharply.</p>
115
+ <figure class="f1-figure">
116
+ <img src="/file=assets/tier1_performance.png" alt="Plot showing model performance on FormulaOne Tier 1" class="max-w-full md:max-w-2xl mx-auto rounded-lg shadow-md">
117
+ <figcaption class="f1-figcaption">Figure 1: Performance of frontier reasoning models on the FormulaOne dataset.</figcaption>
118
+ </figure>
119
+ <p class="f1-p">This trend culminates in <strong>FormulaOne Tier 2</strong>, where the difficulty is characteristic of exploratory research problems. On this set of 20 problems, no current frontier model solves even a single one. This result starkly illustrates the gap that remains between high performance on existing benchmarks and the deep algorithmic reasoning required for truly complex problems.</p>
120
+ </section>
121
+ </main>
122
+ </body>
123
+ </html>
124
  """
125
 
126
  LLM_BENCHMARKS_TEXT = """
src/display/css_html_js.py CHANGED
@@ -1,117 +1,53 @@
1
  custom_css = """
2
-
3
  .markdown-text {
4
  font-size: 16px !important;
5
  }
6
-
7
  .banner_image { width: 75% !important; align-self: center !important; }
8
-
9
  @import url('https://fonts.googleapis.com/css2?family=Exo+2:wght@500;600&display=swap');
10
-
11
- button[role="tab"][aria-controls="formulaone-leaderboard-tab-table"],
12
- button[role="tab"][aria-controls="llm-benchmark-tab-table"] {
13
  font-family: 'Exo 2', system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif !important;
14
  letter-spacing: 0.25px;
15
  font-weight: 600;
16
- }
17
-
18
-
19
- #models-to-add-text {
20
- font-size: 18px !important;
21
- }
22
-
23
- #citation-button span {
24
- font-size: 16px !important;
25
- }
26
-
27
- #citation-button textarea {
28
- font-size: 16px !important;
29
- }
30
-
31
- #citation-button > label > button {
32
- margin: 6px;
33
- transform: scale(1.3);
34
- }
35
-
36
- #leaderboard-table {
37
- margin-top: 15px
38
- }
39
-
40
- #leaderboard-table-lite {
41
- margin-top: 15px
42
- }
43
-
44
- #search-bar-table-box > div:first-child {
45
- background: none;
46
- border: none;
47
- }
48
-
49
- #search-bar {
50
- padding: 0px;
51
- }
52
-
53
- /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
54
  #leaderboard-table td:nth-child(2),
55
- #leaderboard-table th:nth-child(2) {
56
- max-width: 400px;
57
- overflow: auto;
58
- white-space: nowrap;
59
- }
60
-
61
- /* Change leaderboard to 'single column' */
62
- #formulaone-leaderboard-tab-table .column .row .column {
63
- min-width: 100% !important;
64
- }
65
-
66
-
67
- .tab-buttons button {
68
- font-size: 20px;
69
- }
70
-
71
- #scale-logo {
72
- border-style: none !important;
73
- box-shadow: none;
74
- display: block;
75
- margin-left: auto;
76
- margin-right: auto;
77
- max-width: 600px;
78
- }
79
-
80
- #scale-logo .download {
81
- display: none;
82
- }
83
- #filter_type{
84
- border: 0;
85
- padding-left: 0;
86
- padding-top: 0;
87
- }
88
- #filter_type label {
89
- display: flex;
90
- }
91
- #filter_type label > span{
92
- margin-top: var(--spacing-lg);
93
- margin-right: 0.5em;
94
- }
95
- #filter_type label > .wrap{
96
- width: 103px;
97
- }
98
- #filter_type label > .wrap .wrap-inner{
99
- padding: 2px;
100
- }
101
- #filter_type label > .wrap .wrap-inner input{
102
- width: 1px
103
- }
104
- #filter-columns-type{
105
- border:0;
106
- padding:0.5;
107
- }
108
- #filter-columns-size{
109
- border:0;
110
- padding:0.5;
111
- }
112
- #box-filter > .form{
113
- border: 0
114
- }
115
  """
116
 
117
  get_window_url_params = """
 
1
  custom_css = """
 
2
  .markdown-text {
3
  font-size: 16px !important;
4
  }
 
5
  .banner_image { width: 75% !important; align-self: center !important; }
 
6
  @import url('https://fonts.googleapis.com/css2?family=Exo+2:wght@500;600&display=swap');
7
+ button[role="tab"] {
 
 
8
  font-family: 'Exo 2', system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif !important;
9
  letter-spacing: 0.25px;
10
  font-weight: 600;
11
+ font-size: 18px !important; /* Increased font size for tabs */
12
+ }
13
+ #models-to-add-text { font-size: 18px !important; }
14
+ #citation-button span { font-size: 16px !important; }
15
+ #citation-button textarea { font-size: 16px !important; }
16
+ #citation-button > label > button { margin: 6px; transform: scale(1.3); }
17
+ #leaderboard-table { margin-top: 15px }
18
+ #leaderboard-table-lite { margin-top: 15px }
19
+ #search-bar-table-box > div:first-child { background: none; border: none; }
20
+ #search-bar { padding: 0px; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  #leaderboard-table td:nth-child(2),
22
+ #leaderboard-table th:nth-child(2) { max-width: 400px; overflow: auto; white-space: nowrap; }
23
+ .tab-buttons button { font-size: 20px; }
24
+ #scale-logo { border-style: none !important; box-shadow: none; display: block; margin-left: auto; margin-right: auto; max-width: 600px; }
25
+ #scale-logo .download { display: none; }
26
+ #filter_type{ border: 0; padding-left: 0; padding-top: 0; }
27
+ #filter_type label { display: flex; }
28
+ #filter_type label > span{ margin-top: var(--spacing-lg); margin-right: 0.5em; }
29
+ #filter_type label > .wrap{ width: 103px; }
30
+ #filter_type label > .wrap .wrap-inner{ padding: 2px; }
31
+ #filter_type label > .wrap .wrap-inner input{ width: 1px }
32
+ #filter-columns-type{ border:0; padding:0.5; }
33
+ #filter-columns-size{ border:0; padding:0.5; }
34
+ #box-filter > .form{ border: 0 }
35
+ .banner_image img { height: 200px !important; object-fit: cover !important; }
36
+
37
+ /* Styles for the "What is FormulaOne" HTML content */
38
+ .f1-h1 { font-weight: 700; font-size: 2.25rem; line-height: 2.5rem; color: white; text-align: center; margin-bottom: 2rem;}
39
+ .f1-h2 { font-weight: 700; border-bottom: 1px solid #374151; padding-bottom: 0.5rem; margin-top: 2.5rem; margin-bottom: 1.5rem; color: white; font-size: 1.875rem; line-height: 2.25rem; }
40
+ .f1-p, .f1-li { line-height: 1.75; color: #d1d5db; }
41
+ .f1-a { color: #60a5fa; text-decoration: none; font-weight: 500; }
42
+ .f1-a:hover { text-decoration: underline; }
43
+ .f1-blockquote { border-left: 4px solid #4b5563; padding-left: 1rem; margin-left: 0; font-style: italic; color: #9ca3af; }
44
+ .f1-problem-box { background-color: #1f2937; border: 1px solid #374151; border-radius: 0.5rem; padding: 1.5rem; margin-top: 1.5rem; margin-bottom: 1.5rem; box-shadow: 0 1px 3px 0 rgb(0 0 0 / 0.1), 0 1px 2px -1px rgb(0 0 0 / 0.1); }
45
+ .f1-problem-box strong { color: #f9fafb; }
46
+ .f1-table { width: 100%; margin-top: 1.5rem; border-collapse: collapse; }
47
+ .f1-th, .f1-td { text-align: left; padding: 0.75rem 1rem; border-bottom: 1px solid #374151; }
48
+ .f1-th { background-color: #374151; font-weight: 600; color: #f9fafb; }
49
+ .f1-figure { margin-top: 1.5rem; margin-bottom: 1.5rem; text-align: center; }
50
+ .f1-figcaption { margin-top: 0.5rem; font-size: 0.875rem; color: #9ca3af; font-style: italic; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  """
52
 
53
  get_window_url_params = """