galb-dai committed on
Commit
260568f
·
1 Parent(s): 180dbc4
Files changed (3) hide show
  1. app.py +9 -5
  2. src/about.py +97 -105
  3. src/display/css_html_js.py +37 -26
app.py CHANGED
@@ -6,6 +6,7 @@ from apscheduler.schedulers.background import BackgroundScheduler
6
  from gradio_leaderboard import Leaderboard, SelectColumns
7
  from huggingface_hub import whoami
8
 
 
9
  from src.about import (
10
  CITATION_BUTTON_LABEL,
11
  CITATION_BUTTON_TEXT,
@@ -208,11 +209,11 @@ blocks = gr.Blocks(
208
  with blocks:
209
 
210
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
211
- with gr.TabItem("What is FormulaOne", id=0):
212
- # Top content up to (but not including) the Examples and the 'latter category' paragraph
213
  gr.HTML(WHAT_IS_F1_HTML_TOP)
214
 
215
- # Examples of FormulaOne problems (LaTeX via Markdown; bottom 'tabs' via Radio)
216
  with gr.Group(elem_id="f1-examples"):
217
  gr.HTML(
218
  '<div class="f1-tabs-body"><h3 class="f1-examples-title">Examples of FormulaOne problems</h3></div>'
@@ -231,6 +232,7 @@ with blocks:
231
  "Given a tree-like graph $G=(V,E)$ and a weight function $w:V\\to\\mathbb{N}$, compute the sum of all weights of sets $S\\subseteq V$ such that the induced subgraph $G[S]$ is a disjoint union of paths and cycles."
232
  ),
233
  latex_delimiters=_latex,
 
234
  )
235
  md_tier1 = gr.Markdown(
236
  value=(
@@ -239,6 +241,7 @@ with blocks:
239
  ),
240
  visible=False,
241
  latex_delimiters=_latex,
 
242
  )
243
  md_tier2 = gr.Markdown(
244
  value=(
@@ -247,6 +250,7 @@ with blocks:
247
  ),
248
  visible=False,
249
  latex_delimiters=_latex,
 
250
  )
251
 
252
  tab_radio = gr.Radio(
@@ -257,10 +261,10 @@ with blocks:
257
  )
258
  tab_radio.change(_select_example_tab, inputs=tab_radio, outputs=[md_warmup, md_tier1, md_tier2])
259
 
260
- # Bottom content: the 'latter category' paragraph and all following sections
261
  gr.HTML(WHAT_IS_F1_HTML_BOTTOM)
262
 
263
- # (5) Rename tab to "Leaderboard"
264
  with gr.TabItem("Leaderboard", elem_id="formulaone-leaderboard-tab-table", id=1):
265
  gr.Markdown(
266
  """
 
6
  from gradio_leaderboard import Leaderboard, SelectColumns
7
  from huggingface_hub import whoami
8
 
9
+ # NOTE: split WHAT_IS_F1_HTML into top/bottom so we can insert a Gradio-based tabbed element between them.
10
  from src.about import (
11
  CITATION_BUTTON_LABEL,
12
  CITATION_BUTTON_TEXT,
 
209
  with blocks:
210
 
211
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
212
+ with gr.TabItem("What is FormulaOne", id=0, elem_id="what-is-tab"):
213
+ # Top content
214
  gr.HTML(WHAT_IS_F1_HTML_TOP)
215
 
216
+ # Examples (kept inside a centered, 800px container)
217
  with gr.Group(elem_id="f1-examples"):
218
  gr.HTML(
219
  '<div class="f1-tabs-body"><h3 class="f1-examples-title">Examples of FormulaOne problems</h3></div>'
 
232
  "Given a tree-like graph $G=(V,E)$ and a weight function $w:V\\to\\mathbb{N}$, compute the sum of all weights of sets $S\\subseteq V$ such that the induced subgraph $G[S]$ is a disjoint union of paths and cycles."
233
  ),
234
  latex_delimiters=_latex,
235
+ elem_classes=["f1-problem-markdown"],
236
  )
237
  md_tier1 = gr.Markdown(
238
  value=(
 
241
  ),
242
  visible=False,
243
  latex_delimiters=_latex,
244
+ elem_classes=["f1-problem-markdown"],
245
  )
246
  md_tier2 = gr.Markdown(
247
  value=(
 
250
  ),
251
  visible=False,
252
  latex_delimiters=_latex,
253
+ elem_classes=["f1-problem-markdown"],
254
  )
255
 
256
  tab_radio = gr.Radio(
 
261
  )
262
  tab_radio.change(_select_example_tab, inputs=tab_radio, outputs=[md_warmup, md_tier1, md_tier2])
263
 
264
+ # Bottom content
265
  gr.HTML(WHAT_IS_F1_HTML_BOTTOM)
266
 
267
+ # Rename tab to "Leaderboard"
268
  with gr.TabItem("Leaderboard", elem_id="formulaone-leaderboard-tab-table", id=1):
269
  gr.Markdown(
270
  """
src/about.py CHANGED
@@ -1,117 +1,109 @@
1
  # The paper's URL for linking
2
  PAPER_URL = "https://arxiv.org/abs/2507.13337"
3
 
4
- # Top part (through the categories table). We insert the Gradio-based examples AFTER this.
5
  WHAT_IS_F1_HTML_TOP = f"""
6
- <!DOCTYPE html>
7
- <html lang="en">
8
- <body>
9
- <main class="f1-container">
10
- <header class="text-center mb-12">
11
- <h1 class="text-4xl md:text-5xl font-bold text-gray-900 f1-h1">FormulaOne</h1>
12
- </header>
13
-
14
- <section>
15
- <p class="text-lg mb-4 f1-p">Frontier AI models have recently demonstrated strong performance on mathematical and algorithmic benchmarks, including earning <a href="https://deepmind.google/discover/blog/advanced-version-of-gemini-with-deep-think-officially-achieves-gold-medal-standard-at-the-international-mathematical-olympiad/" target="_blank" rel="noopener noreferrer" class="f1-a">gold medals in olympiads</a>, and attaining <a href="https://arxiv.org/html/2502.06807v1" target="_blank" rel="noopener noreferrer" class="f1-a">top percentile ratings</a> in competitive programming contests. How well do such benchmarks capture the true depth of algorithmic reasoning, as it arises in real-world research problems?</p>
16
-
17
- <p class="text-lg mb-4 f1-p">We believe that existing benchmarks fail to capture the deep reasoning skills required for complex, research-level algorithmic problems. To address this gap, <a href="{PAPER_URL}" target="_blank" rel="noopener noreferrer" class="f1-a">we introduce <strong>FormulaOne</strong></a>.</p>
18
-
19
- <p class="mb-4 f1-p"><strong>FormulaOne</strong> consists of 220 novel dynamic programming problems over graphs. The problems are organised into three categories, ranging from moderate difficulty and all the way up to research-level.</p>
20
-
21
- <!-- Clean, centered table -->
22
- <div class="f1-table-wrap">
23
- <table class="f1-table" aria-label="FormulaOne categories">
24
- <thead>
25
- <tr>
26
- <th>Category</th>
27
- <th>Size</th>
28
- <th>Description</th>
29
- </tr>
30
- </thead>
31
- <tbody>
32
- <tr>
33
- <td>Warmup</td>
34
- <td>100</td>
35
- <td>A set of “easier” problems.</td>
36
- </tr>
37
- <tr>
38
- <td>Tier 1</td>
39
- <td>100</td>
40
- <td>A set of challenging problems.</td>
41
- </tr>
42
- <tr>
43
- <td>Tier 2</td>
44
- <td>20</td>
45
- <td>A set of highly challenging problems.</td>
46
- </tr>
47
- </tbody>
48
- </table>
49
- </div>
50
- </section>
51
  """
52
 
53
- # Bottom part (the paragraph after Examples, all remaining sections).
54
  WHAT_IS_F1_HTML_BOTTOM = """
55
- <section>
56
- <p class="mb-4 f1-p">The latter category is incredibly demanding, requiring resolution of many points of uncertainty, and involving an array of reasoning steps, including topological and geometric insight, knowledge of mathematical domains such as extremal graph theory and logic, combinatorial considerations, precise implementation, and more.</p>
57
- <p class="f1-p">Despite <a href="https://epoch.ai/frontiermath" target="_blank" rel="noopener noreferrer" class="f1-a">impressive</a> <a href="https://artificialanalysis.ai/evaluations/gpqa-diamond" target="_blank" rel="noopener noreferrer" class="f1-a">performance</a> on existing benchmarks, presently <strong>no model solves even a single FormulaOne Tier 2 problem</strong>.<sup><a href="#evaluation" class="f1-a">1</a></sup></p>
58
- </section>
59
-
60
- <section>
61
- <h2 class="f1-h2">An “Infinite Well” of Problems</h2>
62
- <!-- Removed the example problem box, per request -->
63
- <p class="mb-4 f1-p">While the problems are often natural to state, their solutions are far from obvious. The solvability of this vast class of problems is guaranteed by an algorithmic <strong>meta-theorem</strong> due to <a href="https://en.wikipedia.org/wiki/Courcelle%27s_theorem" target="_blank" rel="noopener noreferrer" class="f1-a">Courcelle</a>, which broadly states:</p>
64
- <blockquote class="my-6 f1-blockquote">
65
- “For every sufficiently tree-like graph, any problem definable in an expressive formal logic — Monadic Second-Order (MSO) logic — can be solved by a dynamic programming algorithm that operates in time linear in the order of the graph.”
66
- </blockquote>
67
- <p class="f1-p">The key is to use a structure known as a tree decomposition, which organises the graph’s vertices into a series of overlapping sets, or “bags”, that are themselves arranged in a tree.</p>
68
- <figure class="f1-figure">
69
- <img src="/file=assets/bag_modifications.png" alt="An illustration of local modifications to bags (dashed boxes)" class="max-w-full md:max-w-2xl mx-auto rounded-lg shadow-md">
70
- <figcaption class="f1-figcaption">An illustration of local modifications to bags: Introduce, Forget, and Join.</figcaption>
71
- </figure>
72
- <p class="mb-4 f1-p">An algorithm can then traverse this tree of bags, solving the problem piece by piece using dynamic programming. This process involves designing a “state” that summarises all necessary information about the partial solution within a bag, and then defining how this state transforms as vertices are introduced, forgotten, or bags are merged.</p>
73
- <figure class="f1-figure">
74
- <video class="w-full max-w-2xl mx-auto rounded-lg shadow-lg" autoplay loop muted playsinline>
75
- <source src="/file=assets/dp_animation.mp4" type="video/mp4">
76
- Your browser does not support the video tag.
77
- </video>
78
- <figcaption class="f1-figcaption">Animation showing the design of a compressed dynamic programming state-space.</figcaption>
79
- </figure>
80
- <p class="f1-p">The deceptive simplicity of the problem statements belies the <strong>extraordinary difficulty</strong> of discovering the correct dynamic programming solution. This process is riddled with subtle combinatorial and logical pitfalls, demanding a profound understanding of the problem’s underlying structure. For a detailed walkthrough of the fifteen interdependent reasoning steps required to solve a single hard problem – <strong>Maximal-Cluster-Graph</strong> – <a href="https://arxiv.org/pdf/2507.13337#appendix.A" target="_blank" rel="noopener noreferrer" class="f1-a">see the appendix of our paper</a>.</p>
81
- </section>
82
-
83
- <section id="evaluation">
84
- <h2 class="f1-h2">Evaluation</h2>
85
- <p class="mb-4 f1-p">To give models the best possible chance of success, we provide a generous few-shot prompt that covers a broad array of the ideas and techniques involved in solving these problems. All models were evaluated using their highest available reasoning settings and with the maximum context length permitted.</p>
86
- <p class="mb-4 f1-p">Each submitted solution is subjected to a rigorous and automated <a href="https://arxiv.org/pdf/2507.13337#section.4" target="_blank" rel="noopener noreferrer" class="f1-a">test suite</a> that measures three key aspects of its validity:</p>
87
- <ul class="list-disc list-inside space-y-2 mb-6">
88
- <li class="f1-li"><strong>Correctness:</strong> The output of the submitted algorithm must be correct on all graphs.</li>
89
- <li class="f1-li"><strong>Consistency:</strong> The solution must produce the same output for a given graph, regardless of the specific structure of its tree decomposition.</li>
90
- <li class="f1-li"><strong>Efficiency:</strong> The solution must be truly <a href="https://en.wikipedia.org/wiki/Parameterized_complexity" target="_blank" rel="noopener noreferrer" class="f1-a">fixed-parameter linear</a>.</li>
91
- </ul>
92
- <p class="mb-4 f1-p">To support research and encourage community contributions, the <code>FormulaOne-Warmup</code> dataset is released as a public resource for training and fine-tuning models. The complete test suite for all 100 Warmup problems is available, alongside a standalone evaluation environment, in our <a href="https://github.com/double-ai/formulaone-dataset/tree/main" target="_blank" rel="noopener noreferrer" class="f1-a">GitHub repository</a>.</p>
93
- <p class="f1-p">To maintain the integrity of the core benchmark, only a minimal subset of tests is released for the Tier 1 and Tier 2 problems.</p>
94
-
95
- <!-- (6) Same level as Evaluation -->
96
- <h2 class="f1-h2">Model Accuracy</h2>
97
- <p class="mb-4 f1-p">On the <strong>FormulaOne-Warmup</strong> problems, frontier models perform reasonably well. This confirms they have a foundational capability for these types of algorithmic tasks.</p>
98
- <figure class="f1-figure">
99
- <img src="/file=assets/warmup_performance.png" alt="Plot showing model performance on FormulaOne-Warmup" class="max-w-full md:max-w-2xl mx-auto rounded-lg shadow-md">
100
- <figcaption class="f1-figcaption">Performance of frontier models on the FormulaOne-Warmup dataset.</figcaption>
101
- </figure>
102
- <p class="mb-4 f1-p">However, as the reasoning depth increases in <strong>Tier 1</strong>, and solutions require the discovery and integration of novel and more complex state representations, model performance drops off sharply.</p>
103
- <figure class="f1-figure">
104
- <img src="/file=assets/tier1_performance.png" alt="Plot showing model performance on FormulaOne Tier 1" class="max-w-full md:max-w-2xl mx-auto rounded-lg shadow-md">
105
- <figcaption class="f1-figcaption">Figure 1: Performance of frontier reasoning models on the FormulaOne dataset.</figcaption>
106
- </figure>
107
- <p class="f1-p">This trend culminates in <strong>Tier 2</strong>, where the difficulty is characteristic of exploratory research problems. On this set of 20 problems, no current frontier model solves even a single one. This result starkly illustrates the gap that remains between high performance on existing benchmarks and the deep algorithmic reasoning required for truly complex problems.</p>
108
- </section>
109
- </main>
110
- </body>
111
- </html>
112
  """
113
 
114
-
115
  EVALUATION_QUEUE_TEXT = """
116
  ## Submitting to the FormulaOne Leaderboard
117
 
 
1
  # The paper's URL for linking
2
  PAPER_URL = "https://arxiv.org/abs/2507.13337"
3
 
4
+ # Top chunk — self-contained (no dangling <main/>). Includes the clean "table" (via divs).
5
  WHAT_IS_F1_HTML_TOP = f"""
6
+ <div class="f1-container">
7
+ <header class="text-center mb-12">
8
+ <h1 class="text-4xl md:text-5xl font-bold text-gray-900 f1-h1">FormulaOne</h1>
9
+ </header>
10
+
11
+ <section>
12
+ <p class="text-lg mb-4 f1-p">Frontier AI models have recently demonstrated strong performance on mathematical and algorithmic benchmarks, including earning <a href="https://deepmind.google/discover/blog/advanced-version-of-gemini-with-deep-think-officially-achieves-gold-medal-standard-at-the-international-mathematical-olympiad/" target="_blank" rel="noopener noreferrer" class="f1-a">gold medals in olympiads</a>, and attaining <a href="https://arxiv.org/html/2502.06807v1" target="_blank" rel="noopener noreferrer" class="f1-a">top percentile ratings</a> in competitive programming contests. How well do such benchmarks capture the true depth of algorithmic reasoning, as it arises in real-world research problems?</p>
13
+
14
+ <p class="text-lg mb-4 f1-p">We believe that existing benchmarks fail to capture the deep reasoning skills required for complex, research-level algorithmic problems. To address this gap, <a href="{PAPER_URL}" target="_blank" rel="noopener noreferrer" class="f1-a">we introduce <strong>FormulaOne</strong></a>.</p>
15
+
16
+ <p class="mb-4 f1-p"><strong>FormulaOne</strong> consists of 220 novel dynamic programming problems over graphs. The problems are organised into three categories, ranging from moderate difficulty and all the way up to research-level.</p>
17
+
18
+ <!-- Clean, centered "table" using divs -->
19
+ <div class="f1-grid-wrap" role="region" aria-label="FormulaOne categories">
20
+ <div class="f1-grid-table" role="table">
21
+ <div class="f1-grid-row f1-grid-head" role="row">
22
+ <div class="f1-grid-cell" role="columnheader">Category</div>
23
+ <div class="f1-grid-cell" role="columnheader">Size</div>
24
+ <div class="f1-grid-cell" role="columnheader">Description</div>
25
+ </div>
26
+ <div class="f1-grid-row" role="row">
27
+ <div class="f1-grid-cell" role="cell">Warmup</div>
28
+ <div class="f1-grid-cell" role="cell">100</div>
29
+ <div class="f1-grid-cell" role="cell">A set of “easier” problems.</div>
30
+ </div>
31
+ <div class="f1-grid-row" role="row">
32
+ <div class="f1-grid-cell" role="cell">Tier 1</div>
33
+ <div class="f1-grid-cell" role="cell">100</div>
34
+ <div class="f1-grid-cell" role="cell">A set of challenging problems.</div>
35
+ </div>
36
+ <div class="f1-grid-row" role="row">
37
+ <div class="f1-grid-cell" role="cell">Tier 2</div>
38
+ <div class="f1-grid-cell" role="cell">20</div>
39
+ <div class="f1-grid-cell" role="cell">A set of highly challenging problems.</div>
40
+ </div>
41
+ </div>
42
+ </div>
43
+ </section>
44
+ </div>
 
 
 
 
 
 
45
  """
46
 
47
+ # Bottom chunk — self-contained, width-constrained.
48
  WHAT_IS_F1_HTML_BOTTOM = """
49
+ <div class="f1-container">
50
+ <section>
51
+ <p class="mb-4 f1-p">The latter category is incredibly demanding, requiring resolution of many points of uncertainty, and involving an array of reasoning steps, including topological and geometric insight, knowledge of mathematical domains such as extremal graph theory and logic, combinatorial considerations, precise implementation, and more.</p>
52
+ <p class="f1-p">Despite <a href="https://epoch.ai/frontiermath" target="_blank" rel="noopener noreferrer" class="f1-a">impressive</a> <a href="https://artificialanalysis.ai/evaluations/gpqa-diamond" target="_blank" rel="noopener noreferrer" class="f1-a">performance</a> on existing benchmarks, presently <strong>no model solves even a single FormulaOne Tier 2 problem</strong>.<sup><a href="#evaluation" class="f1-a">1</a></sup></p>
53
+ </section>
54
+
55
+ <section>
56
+ <h2 class="f1-h2">An “Infinite Well” of Problems</h2>
57
+ <!-- Example problem removed, per request -->
58
+ <p class="mb-4 f1-p">While the problems are often natural to state, their solutions are far from obvious. The solvability of this vast class of problems is guaranteed by an algorithmic <strong>meta-theorem</strong> due to <a href="https://en.wikipedia.org/wiki/Courcelle%27s_theorem" target="_blank" rel="noopener noreferrer" class="f1-a">Courcelle</a>, which broadly states:</p>
59
+ <blockquote class="my-6 f1-blockquote">
60
+ “For every sufficiently tree-like graph, any problem definable in an expressive formal logic — Monadic Second-Order (MSO) logic — can be solved by a dynamic programming algorithm that operates in time linear in the order of the graph.”
61
+ </blockquote>
62
+ <p class="f1-p">The key is to use a structure known as a tree decomposition, which organises the graph’s vertices into a series of overlapping sets, or “bags”, that are themselves arranged in a tree.</p>
63
+ <figure class="f1-figure">
64
+ <img src="/file=assets/bag_modifications.png" alt="An illustration of local modifications to bags (dashed boxes)" class="max-w-full md:max-w-2xl mx-auto rounded-lg shadow-md">
65
+ <figcaption class="f1-figcaption">An illustration of local modifications to bags: Introduce, Forget, and Join.</figcaption>
66
+ </figure>
67
+ <p class="mb-4 f1-p">An algorithm can then traverse this tree of bags, solving the problem piece by piece using dynamic programming. This process involves designing a “state” that summarises all necessary information about the partial solution within a bag, and then defining how this state transforms as vertices are introduced, forgotten, or bags are merged.</p>
68
+ <figure class="f1-figure">
69
+ <video class="w-full max-w-2xl mx-auto rounded-lg shadow-lg" autoplay loop muted playsinline>
70
+ <source src="/file=assets/dp_animation.mp4" type="video/mp4">
71
+ Your browser does not support the video tag.
72
+ </video>
73
+ <figcaption class="f1-figcaption">Animation showing the design of a compressed dynamic programming state-space.</figcaption>
74
+ </figure>
75
+ <p class="f1-p">The deceptive simplicity of the problem statements belies the <strong>extraordinary difficulty</strong> of discovering the correct dynamic programming solution. This process is riddled with subtle combinatorial and logical pitfalls, demanding a profound understanding of the problem’s underlying structure. For a detailed walkthrough of the fifteen interdependent reasoning steps required to solve a single hard problem – <strong>Maximal-Cluster-Graph</strong> – <a href="https://arxiv.org/pdf/2507.13337#appendix.A" target="_blank" rel="noopener noreferrer" class="f1-a">see the appendix of our paper</a>.</p>
76
+ </section>
77
+
78
+ <section id="evaluation">
79
+ <h2 class="f1-h2">Evaluation</h2>
80
+ <p class="mb-4 f1-p">To give models the best possible chance of success, we provide a generous few-shot prompt that covers a broad array of the ideas and techniques involved in solving these problems. All models were evaluated using their highest available reasoning settings and with the maximum context length permitted.</p>
81
+ <p class="mb-4 f1-p">Each submitted solution is subjected to a rigorous and automated <a href="https://arxiv.org/pdf/2507.13337#section.4" target="_blank" rel="noopener noreferrer" class="f1-a">test suite</a> that measures three key aspects of its validity:</p>
82
+ <ul class="list-disc list-inside space-y-2 mb-6">
83
+ <li class="f1-li"><strong>Correctness:</strong> The output of the submitted algorithm must be correct on all graphs.</li>
84
+ <li class="f1-li"><strong>Consistency:</strong> The solution must produce the same output for a given graph, regardless of the specific tree decomposition.</li>
85
+ <li class="f1-li"><strong>Efficiency:</strong> The solution must be truly <a href="https://en.wikipedia.org/wiki/Parameterized_complexity" target="_blank" rel="noopener noreferrer" class="f1-a">fixed-parameter linear</a>.</li>
86
+ </ul>
87
+ <p class="mb-4 f1-p">To support research and encourage community contributions, the <code>FormulaOne-Warmup</code> dataset is released as a public resource for training and fine-tuning models. The complete test suite for all 100 Warmup problems is available, alongside a standalone evaluation environment, in our <a href="https://github.com/double-ai/formulaone-dataset/tree/main" target="_blank" rel="noopener noreferrer" class="f1-a">GitHub repository</a>.</p>
88
+ <p class="f1-p">To maintain the integrity of the core benchmark, only a minimal subset of tests is released for the Tier 1 and Tier 2 problems.</p>
89
+
90
+ <!-- Make "Model Accuracy" same level as Evaluation -->
91
+ <h2 class="f1-h2">Model Accuracy</h2>
92
+ <p class="mb-4 f1-p">On the <strong>FormulaOne-Warmup</strong> problems, frontier models perform reasonably well. This confirms they have a foundational capability for these types of algorithmic tasks.</p>
93
+ <figure class="f1-figure">
94
+ <img src="/file=assets/warmup_performance.png" alt="Plot showing model performance on FormulaOne-Warmup" class="max-w-full md:max-w-2xl mx-auto rounded-lg shadow-md">
95
+ <figcaption class="f1-figcaption">Performance of frontier models on the FormulaOne-Warmup dataset.</figcaption>
96
+ </figure>
97
+ <p class="mb-4 f1-p">However, as the reasoning depth increases in <strong>Tier 1</strong>, and solutions require the discovery and integration of novel and more complex state representations, model performance drops off sharply.</p>
98
+ <figure class="f1-figure">
99
+ <img src="/file=assets/tier1_performance.png" alt="Plot showing model performance on FormulaOne Tier 1" class="max-w-full md:max-w-2xl mx-auto rounded-lg shadow-md">
100
+ <figcaption class="f1-figcaption">Figure 1: Performance of frontier reasoning models on the FormulaOne dataset.</figcaption>
101
+ </figure>
102
+ <p class="f1-p">This trend culminates in <strong>Tier 2</strong>, where the difficulty is characteristic of exploratory research problems. On this set of 20 problems, no current frontier model solves even a single one. This result starkly illustrates the gap that remains between high performance on existing benchmarks and the deep algorithmic reasoning required for truly complex problems.</p>
103
+ </section>
104
+ </div>
 
105
  """
106
 
 
107
  EVALUATION_QUEUE_TEXT = """
108
  ## Submitting to the FormulaOne Leaderboard
109
 
src/display/css_html_js.py CHANGED
@@ -7,9 +7,11 @@ custom_css = """
7
  --f1-bg-muted: #f9fafb;
8
  }
9
 
10
- /* Readable width */
11
  .f1-container { max-width: 800px; margin: 0 auto; padding: 0 16px; }
12
  .markdown-text { font-size: 16px !important; max-width: 800px; margin: 0 auto; }
 
 
13
 
14
  /* Paragraphs: nice wrapping */
15
  .f1-p, .f1-li {
@@ -39,41 +41,50 @@ custom_css = """
39
  .f1-a { color: #2563eb; text-decoration: none; font-weight: 500; }
40
  .f1-a:hover { text-decoration: underline; }
41
 
42
- /* Blockquote & problem box */
43
  .f1-blockquote { border-left: 4px solid #d1d5db; padding-left: 1rem; margin-left: 0; font-style: italic; color: #4b5563; }
44
- .f1-problem-box { background-color: var(--f1-bg-muted); border: 1px solid var(--f1-border); border-radius: 8px; padding: 16px; margin-top: 16px; margin-bottom: 16px; box-shadow: 0 1px 2px 0 rgb(0 0 0 / 0.04); }
45
  .f1-problem-name { font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace; font-weight: 600; text-align: center; }
46
 
47
- /* Clean, centered table: only top & left borders; not full width; headers centered */
48
- .f1-table-wrap { margin: 10px auto 8px auto; text-align: center; }
49
- .f1-table {
50
- border-collapse: collapse;
51
- width: auto; /* not full width */
52
- margin: 0 auto; /* centered */
53
  border-top: 1px solid var(--f1-border);
54
  border-left: 1px solid var(--f1-border);
55
  background: var(--f1-bg);
56
  }
57
- .f1-table th, .f1-table td {
58
- padding: 8px 12px;
59
- text-align: left;
60
- vertical-align: top;
61
- }
62
- .f1-table th { text-align: center; } /* header cells centered */
63
- .f1-table tr + tr td { border-top: 1px solid var(--f1-border); } /* row separators that visually keep top line only */
64
- .f1-table td + td, .f1-table th + th { border-left: 1px solid var(--f1-border); } /* vertical grid from left border only */
65
 
66
- /* Examples block (Gradio-based) */
67
- #f1-examples { border: 1px solid var(--f1-border); border-radius: 8px; background: var(--f1-bg); margin-bottom: 12px; }
68
  #f1-examples .f1-examples-title { font-weight: 700; margin: 12px 14px 4px 14px; color: var(--f1-text); font-size: 1.1rem; }
69
- #f1-example-radio { border-top: 1px solid var(--f1-border); padding: 6px 8px; }
70
- #f1-example-radio .wrap { display: flex; gap: 8px; flex-wrap: wrap; justify-content: flex-start; }
71
- #f1-example-radio label { border: 1px solid transparent; border-radius: 6px; padding: 6px 10px; cursor: pointer; }
72
- #f1-example-radio input[type="radio"]:checked + span { background: var(--f1-bg-muted); border: 1px solid var(--f1-border); border-radius: 6px; padding: 6px 10px; }
73
 
74
- /* Leaderboard: full-width controls, nesting rule, center container */
75
- #formulaone-leaderboard-tab-table { max-width: 1200px; margin-left: auto; margin-right: auto; } /* center the whole tab (8) */
76
- #formulaone-leaderboard-tab-table .gr-column .gr-row .gr-column { min-width: 80% !important; } /* (7) exact chain rule */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  #formulaone-leaderboard-tab-table .gr-row, #formulaone-leaderboard-tab-table .gr-column { width: 100% !important; max-width: 100% !important; }
78
  #formulaone-leaderboard-tab-table [data-testid="dropdown"], #formulaone-leaderboard-tab-table input[type="text"] { width: 100% !important; }
79
 
 
7
  --f1-bg-muted: #f9fafb;
8
  }
9
 
10
+ /* Readable width everywhere */
11
  .f1-container { max-width: 800px; margin: 0 auto; padding: 0 16px; }
12
  .markdown-text { font-size: 16px !important; max-width: 800px; margin: 0 auto; }
13
+ #what-is-tab { max-width: 800px; margin-left: auto; margin-right: auto; } /* keep the whole tab narrow */
14
+ #f1-examples { max-width: 800px; margin: 0 auto; } /* ensure examples stay narrow */
15
 
16
  /* Paragraphs: nice wrapping */
17
  .f1-p, .f1-li {
 
41
  .f1-a { color: #2563eb; text-decoration: none; font-weight: 500; }
42
  .f1-a:hover { text-decoration: underline; }
43
 
44
+ /* Blockquote & problem name */
45
  .f1-blockquote { border-left: 4px solid #d1d5db; padding-left: 1rem; margin-left: 0; font-style: italic; color: #4b5563; }
 
46
  .f1-problem-name { font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace; font-weight: 600; text-align: center; }
47
 
48
+ /* ===== Clean "table" using divs (centered, not full width, only top & left borders) ===== */
49
+ .f1-grid-wrap { text-align: center; margin: 10px auto 8px auto; }
50
+ .f1-grid-table {
51
+ display: inline-block; /* center by shrink-to-fit */
 
 
52
  border-top: 1px solid var(--f1-border);
53
  border-left: 1px solid var(--f1-border);
54
  background: var(--f1-bg);
55
  }
56
+ .f1-grid-row { display: grid; grid-template-columns: auto auto 1fr; align-items: start; }
57
+ .f1-grid-row + .f1-grid-row { border-top: 1px solid var(--f1-border); } /* horizontal separators */
58
+ .f1-grid-cell { padding: 8px 12px; text-align: left; }
59
+ .f1-grid-head .f1-grid-cell { font-weight: 600; text-align: center; } /* centered headers */
 
 
 
 
60
 
61
+ /* ===== Examples card (look like the nicer previous version) ===== */
62
+ #f1-examples { border: 1px solid var(--f1-border); border-radius: 10px; background: var(--f1-bg); box-shadow: 0 1px 2px rgba(0,0,0,0.04); margin-bottom: 12px; }
63
  #f1-examples .f1-examples-title { font-weight: 700; margin: 12px 14px 4px 14px; color: var(--f1-text); font-size: 1.1rem; }
 
 
 
 
64
 
65
+ /* Problem content: consistent, subtle background */
66
+ #f1-examples .f1-problem-markdown .markdown {
67
+ background: var(--f1-bg-muted);
68
+ border: 1px solid var(--f1-border);
69
+ border-radius: 8px;
70
+ padding: 16px;
71
+ margin: 0 14px 6px 14px;
72
+ }
73
+
74
+ /* Bottom "tabs" using Radio, styled as quiet pills aligned to the bottom edge */
75
+ #f1-example-radio { border-top: 1px solid var(--f1-border); padding: 6px 8px 8px 8px; margin: 0 6px 6px; }
76
+ #f1-example-radio .wrap { display: flex; gap: 6px; flex-wrap: wrap; justify-content: flex-start; }
77
+ #f1-example-radio label { border: 1px solid transparent; border-radius: 999px; padding: 6px 10px; cursor: pointer; }
78
+ #f1-example-radio input[type="radio"]:checked + span {
79
+ background: var(--f1-bg-muted);
80
+ border: 1px solid var(--f1-border);
81
+ border-radius: 999px;
82
+ padding: 6px 10px;
83
+ }
84
+
85
+ /* Leaderboard: center the whole tab and satisfy the nesting/min-width rule */
86
+ #formulaone-leaderboard-tab-table { max-width: 1200px; margin-left: auto; margin-right: auto; } /* center (8) */
87
+ #formulaone-leaderboard-tab-table .gr-column .gr-row .gr-column { min-width: 80% !important; } /* (7) exact chain rule */
88
  #formulaone-leaderboard-tab-table .gr-row, #formulaone-leaderboard-tab-table .gr-column { width: 100% !important; max-width: 100% !important; }
89
  #formulaone-leaderboard-tab-table [data-testid="dropdown"], #formulaone-leaderboard-tab-table input[type="text"] { width: 100% !important; }
90