Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Update.
Browse files
- app.py +16 -48
- src/about.py +13 -18
- src/display/css_html_js.py +14 -13
app.py
CHANGED
@@ -83,7 +83,7 @@ def init_leaderboard(dataframe: pd.DataFrame):
|
|
83 |
def add_solution_cbk(
|
84 |
system_name: str,
|
85 |
org: str,
|
86 |
-
sys_type: str,
|
87 |
submission_path: str,
|
88 |
profile: gr.OAuthProfile | None,
|
89 |
oauth_token: gr.OAuthToken | None,
|
@@ -112,8 +112,6 @@ def add_solution_cbk(
|
|
112 |
|
113 |
if not profile.username:
|
114 |
return styled_error("Could not retrieve username. Please try signing in again.")
|
115 |
-
# We rely on underscores as separators in submission ID, replace it with "-".
|
116 |
-
# user_id = profile.username.replace("_", "-")
|
117 |
|
118 |
try:
|
119 |
# Validating the submission file.
|
@@ -127,10 +125,12 @@ def add_solution_cbk(
|
|
127 |
return styled_error("Failed to read JSONL submission file. Please try again later.")
|
128 |
|
129 |
# Validating all user-supplied arguments.
|
|
|
|
|
130 |
for val, val_name in [
|
131 |
(system_name, "System name"),
|
132 |
(org, "Organisation name"),
|
133 |
-
(sys_type, "System type"),
|
134 |
]:
|
135 |
if len(val) == 0:
|
136 |
return styled_error(f"Please fill in the '{val_name}' field.")
|
@@ -151,7 +151,7 @@ def add_solution_cbk(
|
|
151 |
user_id,
|
152 |
system_name,
|
153 |
org,
|
154 |
-
sys_type,
|
155 |
submission_path,
|
156 |
is_warmup_dataset=(SPLIT == "warmup"),
|
157 |
ensure_all_present=ENSURE_ALL_PRESENT,
|
@@ -176,45 +176,27 @@ def gate_submission(oauth_token: gr.OAuthToken | None):
|
|
176 |
|
177 |
|
178 |
def get_theme():
|
179 |
-
|
180 |
-
|
181 |
-
primary_hue=colors.
|
182 |
-
secondary_hue=colors.
|
183 |
-
neutral_hue=colors.gray,
|
184 |
-
# # techno font
|
185 |
-
# font=gr.themes.GoogleFont("Orbitron"),
|
186 |
-
# font_mono=gr.themes.GoogleFont("JetBrains Mono"),
|
187 |
-
text_size=sizes.text_md, # keep defaults
|
188 |
-
spacing_size=sizes.spacing_md,
|
189 |
-
radius_size=sizes.radius_md,
|
190 |
).set(
|
191 |
-
#
|
192 |
-
|
193 |
-
background_fill_primary="#0b0f14", # panels
|
194 |
-
background_fill_secondary="#0e141a", # subtle contrast
|
195 |
)
|
196 |
-
return cyber_theme
|
197 |
-
|
198 |
-
|
199 |
-
gr.Image(
|
200 |
-
"assets/banner.png",
|
201 |
-
interactive=False,
|
202 |
-
show_label=False,
|
203 |
-
show_download_button=False,
|
204 |
-
container=False,
|
205 |
-
elem_classes=["banner_image"],
|
206 |
-
)
|
207 |
|
208 |
|
209 |
blocks = gr.Blocks(css=custom_css, theme=get_theme())
|
210 |
with blocks:
|
|
|
211 |
|
212 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
213 |
with gr.TabItem("What is FormulaOne", id=0):
|
214 |
gr.HTML(WHAT_IS_F1_HTML)
|
215 |
|
216 |
with gr.TabItem("🏅 FormulaOne Leaderboard", elem_id="formulaone-leaderboard-tab-table", id=1):
|
217 |
-
refresh_leaderboard_data()
|
218 |
assert leaderboard_df is not None
|
219 |
leaderboard_component = init_leaderboard(leaderboard_df)
|
220 |
|
@@ -227,34 +209,24 @@ with blocks:
|
|
227 |
with gr.Row():
|
228 |
gr.Markdown("# ✉️✨ Submit your solutions", elem_classes="markdown-text")
|
229 |
|
230 |
-
# Shown when logged OUT
|
231 |
login_box = gr.Group(visible=True)
|
232 |
with login_box:
|
233 |
gr.Markdown("Please sign in with Hugging Face to submit")
|
234 |
gr.LoginButton()
|
235 |
|
236 |
-
# Shown when logged IN
|
237 |
submit_panel = gr.Group(visible=False)
|
238 |
with submit_panel:
|
239 |
with gr.Row():
|
240 |
with gr.Column():
|
241 |
system_name_textbox = gr.Textbox(label=AutoEvalColumn.system.name)
|
242 |
org_textbox = gr.Textbox(label=AutoEvalColumn.organization.name)
|
243 |
-
# sys_type_dropdown = gr.Dropdown(
|
244 |
-
# choices=[t.to_str() for t in ModelType],
|
245 |
-
# label=AutoEvalColumn.system_type.name,
|
246 |
-
# multiselect=False,
|
247 |
-
# value=ModelType.LLM.to_str(),
|
248 |
-
# interactive=True,
|
249 |
-
# )
|
250 |
-
|
251 |
submission_file = gr.File(label="JSONL solutions file", file_types=[".jsonl"])
|
252 |
|
253 |
logger.info("Submit button")
|
254 |
-
submit_button = gr.Button("Submit")
|
255 |
-
# gr.LoginButton()
|
256 |
submission_result = gr.Markdown()
|
257 |
|
|
|
258 |
submit_button.click(
|
259 |
add_solution_cbk,
|
260 |
[
|
@@ -273,11 +245,7 @@ with blocks:
|
|
273 |
elem_id="citation-block",
|
274 |
)
|
275 |
|
276 |
-
# UI refresh triggers latest data swap.
|
277 |
-
# The work already happened in the background - refresh_leaderboard_data().
|
278 |
blocks.load(lambda: leaderboard_df, inputs=[], outputs=[leaderboard_component])
|
279 |
-
|
280 |
-
# On initial load (and after OAuth redirect), toggle the UI based on login status.
|
281 |
blocks.load(gate_submission, inputs=None, outputs=[login_box, submit_panel])
|
282 |
|
283 |
|
|
|
83 |
def add_solution_cbk(
|
84 |
system_name: str,
|
85 |
org: str,
|
86 |
+
# sys_type: str, # This was commented out in your version
|
87 |
submission_path: str,
|
88 |
profile: gr.OAuthProfile | None,
|
89 |
oauth_token: gr.OAuthToken | None,
|
|
|
112 |
|
113 |
if not profile.username:
|
114 |
return styled_error("Could not retrieve username. Please try signing in again.")
|
|
|
|
|
115 |
|
116 |
try:
|
117 |
# Validating the submission file.
|
|
|
125 |
return styled_error("Failed to read JSONL submission file. Please try again later.")
|
126 |
|
127 |
# Validating all user-supplied arguments.
|
128 |
+
# I am adding a placeholder for sys_type since your original add_new_solutions call expects it
|
129 |
+
sys_type = "default" # Placeholder
|
130 |
for val, val_name in [
|
131 |
(system_name, "System name"),
|
132 |
(org, "Organisation name"),
|
133 |
+
# (sys_type, "System type"), # This was commented out
|
134 |
]:
|
135 |
if len(val) == 0:
|
136 |
return styled_error(f"Please fill in the '{val_name}' field.")
|
|
|
151 |
user_id,
|
152 |
system_name,
|
153 |
org,
|
154 |
+
sys_type, # Passing the placeholder
|
155 |
submission_path,
|
156 |
is_warmup_dataset=(SPLIT == "warmup"),
|
157 |
ensure_all_present=ENSURE_ALL_PRESENT,
|
|
|
176 |
|
177 |
|
178 |
def get_theme():
|
179 |
+
# MODIFICATION: Switched to a light theme
|
180 |
+
return gr.themes.Soft(
|
181 |
+
primary_hue=gr.themes.colors.blue,
|
182 |
+
secondary_hue=gr.themes.colors.sky,
|
183 |
+
neutral_hue=gr.themes.colors.gray,
|
|
|
|
|
|
|
|
|
|
|
|
|
184 |
).set(
|
185 |
+
body_background_fill="#FFFFFF",
|
186 |
+
panel_background_fill="#f3f4f6", # A light gray for panels
|
|
|
|
|
187 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
188 |
|
189 |
|
190 |
blocks = gr.Blocks(css=custom_css, theme=get_theme())
|
191 |
with blocks:
|
192 |
+
# MODIFICATION: Banner gr.Image call is removed.
|
193 |
|
194 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
195 |
with gr.TabItem("What is FormulaOne", id=0):
|
196 |
gr.HTML(WHAT_IS_F1_HTML)
|
197 |
|
198 |
with gr.TabItem("🏅 FormulaOne Leaderboard", elem_id="formulaone-leaderboard-tab-table", id=1):
|
199 |
+
refresh_leaderboard_data()
|
200 |
assert leaderboard_df is not None
|
201 |
leaderboard_component = init_leaderboard(leaderboard_df)
|
202 |
|
|
|
209 |
with gr.Row():
|
210 |
gr.Markdown("# ✉️✨ Submit your solutions", elem_classes="markdown-text")
|
211 |
|
|
|
212 |
login_box = gr.Group(visible=True)
|
213 |
with login_box:
|
214 |
gr.Markdown("Please sign in with Hugging Face to submit")
|
215 |
gr.LoginButton()
|
216 |
|
|
|
217 |
submit_panel = gr.Group(visible=False)
|
218 |
with submit_panel:
|
219 |
with gr.Row():
|
220 |
with gr.Column():
|
221 |
system_name_textbox = gr.Textbox(label=AutoEvalColumn.system.name)
|
222 |
org_textbox = gr.Textbox(label=AutoEvalColumn.organization.name)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
223 |
submission_file = gr.File(label="JSONL solutions file", file_types=[".jsonl"])
|
224 |
|
225 |
logger.info("Submit button")
|
226 |
+
submit_button = gr.Button("Submit", variant="primary")
|
|
|
227 |
submission_result = gr.Markdown()
|
228 |
|
229 |
+
# Using your original .click() call which does not include sys_type_dropdown
|
230 |
submit_button.click(
|
231 |
add_solution_cbk,
|
232 |
[
|
|
|
245 |
elem_id="citation-block",
|
246 |
)
|
247 |
|
|
|
|
|
248 |
blocks.load(lambda: leaderboard_df, inputs=[], outputs=[leaderboard_component])
|
|
|
|
|
249 |
blocks.load(gate_submission, inputs=None, outputs=[login_box, submit_panel])
|
250 |
|
251 |
|
src/about.py
CHANGED
@@ -1,10 +1,13 @@
|
|
1 |
-
|
|
|
|
|
|
|
2 |
<!DOCTYPE html>
|
3 |
<html lang="en">
|
4 |
<body>
|
5 |
<main class="max-w-4xl mx-auto">
|
6 |
<header class="text-center mb-12">
|
7 |
-
<h1 class="text-4xl md:text-5xl font-bold text-
|
8 |
</header>
|
9 |
<section>
|
10 |
<p class="text-lg mb-4 f1-p">Frontier AI models have recently demonstrated strong performance on mathematical and algorithmic benchmarks, including earning gold medals in olympiads, and attaining top percentile ratings in competitive programming contests. How well do such benchmarks capture the true depth of algorithmic reasoning, as it arises in real-world research problems?</p>
|
@@ -35,9 +38,9 @@ WHAT_IS_F1_HTML = """
|
|
35 |
</table>
|
36 |
</div>
|
37 |
<div class="mt-8">
|
38 |
-
<div class="border-b border-gray-
|
39 |
<nav class="-mb-px flex space-x-8" aria-label="Tabs">
|
40 |
-
<p class="whitespace-nowrap py-4 px-1 border-b-2 font-medium text-sm border-blue-
|
41 |
</nav>
|
42 |
</div>
|
43 |
<div class="mt-4">
|
@@ -59,13 +62,13 @@ WHAT_IS_F1_HTML = """
|
|
59 |
<p class="f1-p">Despite Frontier models’ impressive performance on existing benchmarks, presently <strong>no model solves even a single FormulaOne Tier 2 problem</strong>.<sup><a href="#evaluation" class="f1-a">1</a></sup></p>
|
60 |
</section>
|
61 |
<section>
|
62 |
-
<h2 class="text-3xl font-bold text-
|
63 |
<p class="mb-4 f1-p">The novelty and vastness of FormulaOne stems from its theoretical foundation. The questions are not arbitrary puzzles, but are instead drawn from the highly expressive framework of <strong>Monadic Second-Order</strong> (MSO) logic on graphs. This provides a principled, semi-automatic way to generate a virtually infinite supply of mathematically deep algorithmic challenges. Despite their theoretical underpinnings, the problems in FormulaOne are natural and succinct:</p>
|
64 |
<div class="f1-problem-box">
|
65 |
<p class="font-bold text-lg mb-2">Problem #44</p>
|
66 |
<p class="mb-2"><strong>Input:</strong> A tree-like graph G=(V,E), a tree decomposition T of G, and a weight function w:V→N.</p>
|
67 |
<p class="mb-2"><strong>Objective:</strong> Compute the sum of all weights of sets S⊆V such that the graph G[S], induced over S, does not contain any cycle of length four.</p>
|
68 |
-
<p class="text-sm text-gray-
|
69 |
</div>
|
70 |
<p class="mb-4 f1-p">While the problems are often natural to state, their solutions are far from obvious. The solvability of this vast class of problems is guaranteed by an algorithmic <strong>meta-theorem</strong> due to Courcelle, which broadly states:</p>
|
71 |
<blockquote class="my-6 f1-blockquote">
|
@@ -84,10 +87,10 @@ WHAT_IS_F1_HTML = """
|
|
84 |
</video>
|
85 |
<figcaption class="f1-figcaption">Animation showing the design of a compressed dynamic programming state-space.</figcaption>
|
86 |
</figure>
|
87 |
-
<p class="f1-p">The deceptive simplicity of the problem statements belies the <strong>extraordinary difficulty</strong> of discovering the correct dynamic programming solution. This process is riddled with subtle combinatorial and logical pitfalls, demanding a profound understanding of the problem’s underlying structure. For a detailed walkthrough of the fifteen interdependent reasoning steps required to solve a single hard problem – <strong>Maximal-Cluster-Graph</strong> – see the appendix of our paper
|
88 |
</section>
|
89 |
<section>
|
90 |
-
<h2 class="text-3xl font-bold text-
|
91 |
<ul class="list-disc list-inside space-y-4">
|
92 |
<li class="f1-li"><strong>An In-Distribution Benchmark for Reasoning.</strong> Unlike benchmarks that test for out-of-distribution generalisation, FormulaOne presents problems that are squarely <strong>in-distribution</strong> for models trained on code. Essentially, dynamic programming on graphs is the “bread and butter” of algorithmic programming. Thus, models’ current failure on FormulaOne highlights a fundamental deficit in deep, multi-step reasoning, rather than a lack of domain exposure.</li>
|
93 |
<li class="f1-li"><strong>An Unbounded Environment for Reinforcement Learning.</strong> The MSO framework allows for the generation of a nearly infinite stream of algorithmic problems with verifiable solutions, making it an ideal environment for training and evaluating agents with Reinforcement Learning with Verifiable Rewards (RLVR).</li>
|
@@ -95,7 +98,7 @@ WHAT_IS_F1_HTML = """
|
|
95 |
</ul>
|
96 |
</section>
|
97 |
<section id="evaluation">
|
98 |
-
<h2 class="text-3xl font-bold text-
|
99 |
<p class="mb-4 f1-p">To give models the best possible chance of success, we provide a generous few-shot prompt that covers a broad array of the ideas and techniques involved in solving these problems. All models were evaluated using their highest available reasoning settings and with the maximum context length permitted.</p>
|
100 |
<p class="mb-4 f1-p">Each submitted solution is subjected to a rigorous and automated test suite that measures three key aspects of its validity:</p>
|
101 |
<ul class="list-disc list-inside space-y-2 mb-6">
|
@@ -105,7 +108,7 @@ WHAT_IS_F1_HTML = """
|
|
105 |
</ul>
|
106 |
<p class="mb-4 f1-p">To support research and encourage community contributions, the <code>FormulaOne-Warmup</code> dataset is released as a public resource for training and fine-tuning models. The complete test suite for all 100 Warmup problems is available, alongside a standalone evaluation environment, in our public GitHub repository: <a href="https://github.com/double-ai/formulaone-dataset/tree/main" target="_blank" rel="noopener noreferrer" class="f1-a">https://github.com/double-ai/formulaone-dataset/tree/main</a>.</p>
|
107 |
<p class="f1-p">In contrast, to maintain the integrity of the core benchmark, only a minimal subset of tests is released for the <code>FormulaOne Tier 1</code> and <code>Tier 2</code> problems.</p>
|
108 |
-
<h3 class="text-2xl font-bold text-
|
109 |
<p class="mb-4 f1-p">On the <strong>FormulaOne-Warmup</strong> problems, frontier models perform reasonably well. This confirms they have a foundational capability for these types of algorithmic tasks.</p>
|
110 |
<figure class="f1-figure">
|
111 |
<img src="/file=assets/warmup_performance.png" alt="Plot showing model performance on FormulaOne-Warmup" class="max-w-full md:max-w-2xl mx-auto rounded-lg shadow-md">
|
@@ -123,14 +126,6 @@ WHAT_IS_F1_HTML = """
|
|
123 |
</html>
|
124 |
"""
|
125 |
|
126 |
-
LLM_BENCHMARKS_TEXT = """
|
127 |
-
## How it works
|
128 |
-
|
129 |
-
## Reproducibility
|
130 |
-
To reproduce our results, here is the commands you can run:
|
131 |
-
|
132 |
-
"""
|
133 |
-
|
134 |
EVALUATION_QUEUE_TEXT = """
|
135 |
## Submitting to the FormulaOne Leaderboard
|
136 |
|
|
|
1 |
+
# The paper's URL for linking
|
2 |
+
PAPER_URL = "https://arxiv.org/abs/2507.13337"
|
3 |
+
|
4 |
+
WHAT_IS_F1_HTML = f"""
|
5 |
<!DOCTYPE html>
|
6 |
<html lang="en">
|
7 |
<body>
|
8 |
<main class="max-w-4xl mx-auto">
|
9 |
<header class="text-center mb-12">
|
10 |
+
<h1 class="text-4xl md:text-5xl font-bold text-gray-900 f1-h1">FormulaOne</h1>
|
11 |
</header>
|
12 |
<section>
|
13 |
<p class="text-lg mb-4 f1-p">Frontier AI models have recently demonstrated strong performance on mathematical and algorithmic benchmarks, including earning gold medals in olympiads, and attaining top percentile ratings in competitive programming contests. How well do such benchmarks capture the true depth of algorithmic reasoning, as it arises in real-world research problems?</p>
|
|
|
38 |
</table>
|
39 |
</div>
|
40 |
<div class="mt-8">
|
41 |
+
<div class="border-b border-gray-200">
|
42 |
<nav class="-mb-px flex space-x-8" aria-label="Tabs">
|
43 |
+
<p class="whitespace-nowrap py-4 px-1 border-b-2 font-medium text-sm border-blue-600 text-blue-600">Example Problems</p>
|
44 |
</nav>
|
45 |
</div>
|
46 |
<div class="mt-4">
|
|
|
62 |
<p class="f1-p">Despite Frontier models’ impressive performance on existing benchmarks, presently <strong>no model solves even a single FormulaOne Tier 2 problem</strong>.<sup><a href="#evaluation" class="f1-a">1</a></sup></p>
|
63 |
</section>
|
64 |
<section>
|
65 |
+
<h2 class="text-3xl font-bold text-gray-900 f1-h2">An “Infinite Well” of Problems</h2>
|
66 |
<p class="mb-4 f1-p">The novelty and vastness of FormulaOne stems from its theoretical foundation. The questions are not arbitrary puzzles, but are instead drawn from the highly expressive framework of <strong>Monadic Second-Order</strong> (MSO) logic on graphs. This provides a principled, semi-automatic way to generate a virtually infinite supply of mathematically deep algorithmic challenges. Despite their theoretical underpinnings, the problems in FormulaOne are natural and succinct:</p>
|
67 |
<div class="f1-problem-box">
|
68 |
<p class="font-bold text-lg mb-2">Problem #44</p>
|
69 |
<p class="mb-2"><strong>Input:</strong> A tree-like graph G=(V,E), a tree decomposition T of G, and a weight function w:V→N.</p>
|
70 |
<p class="mb-2"><strong>Objective:</strong> Compute the sum of all weights of sets S⊆V such that the graph G[S], induced over S, does not contain any cycle of length four.</p>
|
71 |
+
<p class="text-sm text-gray-600"><strong>Notation:</strong> The weight of a set of vertices S is defined as w(S) ≜ ∑<sub>v∈S</sub>w(v). The final result should be returned modulo 10<sup>9</sup>+7.</p>
|
72 |
</div>
|
73 |
<p class="mb-4 f1-p">While the problems are often natural to state, their solutions are far from obvious. The solvability of this vast class of problems is guaranteed by an algorithmic <strong>meta-theorem</strong> due to Courcelle, which broadly states:</p>
|
74 |
<blockquote class="my-6 f1-blockquote">
|
|
|
87 |
</video>
|
88 |
<figcaption class="f1-figcaption">Animation showing the design of a compressed dynamic programming state-space.</figcaption>
|
89 |
</figure>
|
90 |
+
<p class="f1-p">The deceptive simplicity of the problem statements belies the <strong>extraordinary difficulty</strong> of discovering the correct dynamic programming solution. This process is riddled with subtle combinatorial and logical pitfalls, demanding a profound understanding of the problem’s underlying structure. For a detailed walkthrough of the fifteen interdependent reasoning steps required to solve a single hard problem – <strong>Maximal-Cluster-Graph</strong> – see the <a href="{PAPER_URL}" target="_blank" rel="noopener noreferrer" class="f1-a">appendix of our paper</a>.</p>
|
91 |
</section>
|
92 |
<section>
|
93 |
+
<h2 class="text-3xl font-bold text-gray-900 f1-h2">Guiding Principles</h2>
|
94 |
<ul class="list-disc list-inside space-y-4">
|
95 |
<li class="f1-li"><strong>An In-Distribution Benchmark for Reasoning.</strong> Unlike benchmarks that test for out-of-distribution generalisation, FormulaOne presents problems that are squarely <strong>in-distribution</strong> for models trained on code. Essentially, dynamic programming on graphs is the “bread and butter” of algorithmic programming. Thus, models’ current failure on FormulaOne highlights a fundamental deficit in deep, multi-step reasoning, rather than a lack of domain exposure.</li>
|
96 |
<li class="f1-li"><strong>An Unbounded Environment for Reinforcement Learning.</strong> The MSO framework allows for the generation of a nearly infinite stream of algorithmic problems with verifiable solutions, making it an ideal environment for training and evaluating agents with Reinforcement Learning with Verifiable Rewards (RLVR).</li>
|
|
|
98 |
</ul>
|
99 |
</section>
|
100 |
<section id="evaluation">
|
101 |
+
<h2 class="text-3xl font-bold text-gray-900 f1-h2">Evaluation</h2>
|
102 |
<p class="mb-4 f1-p">To give models the best possible chance of success, we provide a generous few-shot prompt that covers a broad array of the ideas and techniques involved in solving these problems. All models were evaluated using their highest available reasoning settings and with the maximum context length permitted.</p>
|
103 |
<p class="mb-4 f1-p">Each submitted solution is subjected to a rigorous and automated test suite that measures three key aspects of its validity:</p>
|
104 |
<ul class="list-disc list-inside space-y-2 mb-6">
|
|
|
108 |
</ul>
|
109 |
<p class="mb-4 f1-p">To support research and encourage community contributions, the <code>FormulaOne-Warmup</code> dataset is released as a public resource for training and fine-tuning models. The complete test suite for all 100 Warmup problems is available, alongside a standalone evaluation environment, in our public GitHub repository: <a href="https://github.com/double-ai/formulaone-dataset/tree/main" target="_blank" rel="noopener noreferrer" class="f1-a">https://github.com/double-ai/formulaone-dataset/tree/main</a>.</p>
|
110 |
<p class="f1-p">In contrast, to maintain the integrity of the core benchmark, only a minimal subset of tests is released for the <code>FormulaOne Tier 1</code> and <code>Tier 2</code> problems.</p>
|
111 |
+
<h3 class="text-2xl font-bold text-gray-900 mt-8 mb-4">Model Accuracy</h3>
|
112 |
<p class="mb-4 f1-p">On the <strong>FormulaOne-Warmup</strong> problems, frontier models perform reasonably well. This confirms they have a foundational capability for these types of algorithmic tasks.</p>
|
113 |
<figure class="f1-figure">
|
114 |
<img src="/file=assets/warmup_performance.png" alt="Plot showing model performance on FormulaOne-Warmup" class="max-w-full md:max-w-2xl mx-auto rounded-lg shadow-md">
|
|
|
126 |
</html>
|
127 |
"""
|
128 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
129 |
EVALUATION_QUEUE_TEXT = """
|
130 |
## Submitting to the FormulaOne Leaderboard
|
131 |
|
src/display/css_html_js.py
CHANGED
@@ -1,3 +1,5 @@
|
|
|
|
|
|
1 |
custom_css = """
|
2 |
.markdown-text {
|
3 |
font-size: 16px !important;
|
@@ -8,7 +10,7 @@ button[role="tab"] {
|
|
8 |
font-family: 'Exo 2', system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif !important;
|
9 |
letter-spacing: 0.25px;
|
10 |
font-weight: 600;
|
11 |
-
font-size: 18px !important;
|
12 |
}
|
13 |
#models-to-add-text { font-size: 18px !important; }
|
14 |
#citation-button span { font-size: 16px !important; }
|
@@ -32,22 +34,21 @@ button[role="tab"] {
|
|
32 |
#filter-columns-type{ border:0; padding:0.5; }
|
33 |
#filter-columns-size{ border:0; padding:0.5; }
|
34 |
#box-filter > .form{ border: 0 }
|
35 |
-
.banner_image img { height: 200px !important; object-fit: cover !important; }
|
36 |
|
37 |
-
/* Styles for the "What is FormulaOne" HTML content */
|
38 |
-
.f1-h1 { font-weight: 700; font-size: 2.25rem; line-height: 2.5rem; color:
|
39 |
-
.f1-h2 { font-weight: 700; border-bottom: 1px solid #
|
40 |
-
.f1-p, .f1-li { line-height: 1.75; color: #
|
41 |
-
.f1-a { color: #
|
42 |
.f1-a:hover { text-decoration: underline; }
|
43 |
-
.f1-blockquote { border-left: 4px solid #
|
44 |
-
.f1-problem-box { background-color: #
|
45 |
-
.f1-problem-box strong { color: #
|
46 |
.f1-table { width: 100%; margin-top: 1.5rem; border-collapse: collapse; }
|
47 |
-
.f1-th, .f1-td { text-align: left; padding: 0.75rem 1rem; border-bottom: 1px solid #
|
48 |
-
.f1-th { background-color: #
|
49 |
.f1-figure { margin-top: 1.5rem; margin-bottom: 1.5rem; text-align: center; }
|
50 |
-
.f1-figcaption { margin-top: 0.5rem; font-size: 0.875rem; color: #
|
51 |
"""
|
52 |
|
53 |
get_window_url_params = """
|
|
|
1 |
+
# src/display/css_html_js.py
|
2 |
+
|
3 |
custom_css = """
|
4 |
.markdown-text {
|
5 |
font-size: 16px !important;
|
|
|
10 |
font-family: 'Exo 2', system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif !important;
|
11 |
letter-spacing: 0.25px;
|
12 |
font-weight: 600;
|
13 |
+
font-size: 18px !important;
|
14 |
}
|
15 |
#models-to-add-text { font-size: 18px !important; }
|
16 |
#citation-button span { font-size: 16px !important; }
|
|
|
34 |
#filter-columns-type{ border:0; padding:0.5; }
|
35 |
#filter-columns-size{ border:0; padding:0.5; }
|
36 |
#box-filter > .form{ border: 0 }
|
|
|
37 |
|
38 |
+
/* Light Theme Styles for the "What is FormulaOne" HTML content */
|
39 |
+
.f1-h1 { font-weight: 700; font-size: 2.25rem; line-height: 2.5rem; color: #111827; text-align: center; margin-bottom: 2rem;}
|
40 |
+
.f1-h2 { font-weight: 700; border-bottom: 1px solid #e5e7eb; padding-bottom: 0.5rem; margin-top: 2.5rem; margin-bottom: 1.5rem; color: #111827; font-size: 1.875rem; line-height: 2.25rem; }
|
41 |
+
.f1-p, .f1-li { line-height: 1.75; color: #374151; }
|
42 |
+
.f1-a { color: #2563eb; text-decoration: none; font-weight: 500; }
|
43 |
.f1-a:hover { text-decoration: underline; }
|
44 |
+
.f1-blockquote { border-left: 4px solid #d1d5db; padding-left: 1rem; margin-left: 0; font-style: italic; color: #4b5563; }
|
45 |
+
.f1-problem-box { background-color: #f9fafb; border: 1px solid #e5e7eb; border-radius: 0.5rem; padding: 1.5rem; margin-top: 1.5rem; margin-bottom: 1.5rem; box-shadow: 0 1px 2px 0 rgb(0 0 0 / 0.05); }
|
46 |
+
.f1-problem-box strong { color: #111827; }
|
47 |
.f1-table { width: 100%; margin-top: 1.5rem; border-collapse: collapse; }
|
48 |
+
.f1-th, .f1-td { text-align: left; padding: 0.75rem 1rem; border-bottom: 1px solid #e5e7eb; }
|
49 |
+
.f1-th { background-color: #f9fafb; font-weight: 600; color: #374151; }
|
50 |
.f1-figure { margin-top: 1.5rem; margin-bottom: 1.5rem; text-align: center; }
|
51 |
+
.f1-figcaption { margin-top: 0.5rem; font-size: 0.875rem; color: #6b7280; font-style: italic; }
|
52 |
"""
|
53 |
|
54 |
get_window_url_params = """
|