FormulaOne-Leaderboard

Runtime error

App Files Files Community

galb-dai committed on Aug 13

Commit

005b269

1 Parent(s): cfae876

Changed.

Browse files

Files changed (3) hide show

app.py +15 -10
src/about.py +123 -128
src/display/css_html_js.py +79 -27

app.py CHANGED Viewed

@@ -173,18 +173,23 @@ def gate_submission(oauth_token: gr.OAuthToken | None):
 def get_theme():
-    # return gr.themes.Soft(
-    #     primary_hue=gr.themes.colors.blue,
-    #     secondary_hue=gr.themes.colors.sky,
-    #     neutral_hue=gr.themes.colors.gray,
-    # ).set(
-    #     body_background_fill="#FFFFFF",
-    #     panel_background_fill="#f3f4f6",
-    # )
     return "light"
-blocks = gr.Blocks(css=custom_css, theme=get_theme(), js="() => { document.body.classList.remove('dark') }")
 with blocks:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
@@ -214,7 +219,7 @@ with blocks:
             login_box = gr.Group(visible=True)
             with login_box:
-                gr.Markdown("Please sign in with Hugging Face to submit")
                 gr.LoginButton()
             submit_panel = gr.Group(visible=False)

 def get_theme():
     return "light"
+# Force light theme even if HF user prefers dark
+_force_light_js = (
+    "() => {\n"
+    "  try {\n"
+    "    document.body.classList.remove('dark');\n"
+    "    document.documentElement.classList.remove('dark');\n"
+    "    document.documentElement.setAttribute('data-theme','light');\n"
+    "    // Some HF shells add data-color-mode\n"
+    "    document.documentElement.setAttribute('data-color-mode','light');\n"
+    "  } catch (e) {}\n"
+    "}"
+)
+blocks = gr.Blocks(css=custom_css, theme=get_theme(), js=_force_light_js)
 with blocks:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
             login_box = gr.Group(visible=True)
             with login_box:
+                gr.Markdown("Please sign in with Hugging Face to submit", elem_classes="markdown-text")
                 gr.LoginButton()
             submit_panel = gr.Group(visible=False)

src/about.py CHANGED Viewed

@@ -3,162 +3,157 @@ PAPER_URL = "https://arxiv.org/abs/2507.13337"
 WHAT_IS_F1_HTML = f"""
 <!DOCTYPE html>
-<html lang="en">
 <body>
-    <main class="max-w-3xl mx-auto">
-        <header class="text-center mb-12">
-            <h1 class="text-4xl md:text-5xl font-bold text-gray-900 f1-h1">FormulaOne</h1>
         </header>
         <section>
-            <p class="text-lg mb-4 f1-p">Frontier AI models have recently demonstrated strong performance on mathematical and algorithmic benchmarks, including earning <a href="https://deepmind.google/discover/blog/advanced-version-of-gemini-with-deep-think-officially-achieves-gold-medal-standard-at-the-international-mathematical-olympiad/" target="_blank" rel="noopener noreferrer" class="f1-a">gold medals in olympiads</a>, and attaining <a href="https://arxiv.org/html/2502.06807v1" target="_blank" rel="noopener noreferrer" class="f1-a">top percentile ratings</a> in competitive programming contests. How well do such benchmarks capture the true depth of algorithmic reasoning, as it arises in real-world research problems?</p>
-            <p class="text-lg mb-4 f1-p">We believe that existing benchmarks fail to capture the deep reasoning skills required for complex, research-level algorithmic problems. To address this gap, <a href="{PAPER_URL}" target="_blank" rel="noopener noreferrer" class="f1-a">we introduce <strong>FormulaOne</strong></a>.</p>
-            <p class="mb-4 f1-p"><strong>FormulaOne</strong> consists of 220 novel dynamic programming problems over graphs. The problems are organised into three categories, ranging from moderate difficulty and all the way up to research-level.</p>
-            <!-- Nicely Styled Card Layout for Categories -->
-            <div class="my-8 grid grid-cols-1 gap-4 sm:grid-cols-3 text-center">
-                <div class="f1-category-card">
-                    <h3 class="text-lg font-bold text-gray-900">FormulaOne Warmup</h3>
-                    <p class="mt-1 text-sm text-gray-600">A set of 100 “easier” problems.</p>
-                </div>
-                <div class="f1-category-card">
-                    <h3 class="text-lg font-bold text-gray-900">FormulaOne Tier 1</h3>
-                    <p class="mt-1 text-sm text-gray-600">A set of 100 challenging problems.</p>
-                </div>
-                <div class="f1-category-card">
-                    <h3 class="text-lg font-bold text-gray-900">FormulaOne Tier 2</h3>
-                    <p class="mt-1 text-sm text-gray-600">A set of 20 highly challenging problems.</p>
-                </div>
             </div>
-            <!-- Tabbed Problem Viewer -->
-            <div class="mt-8">
-                <div class="border-b border-gray-200">
-                    <nav class="-mb-px flex space-x-6" aria-label="Tabs">
-                        <button id="problem-tab-btn-warmup" class="problem-tab-button shrink-0 border-b-2 px-1 pb-4 text-sm font-medium">Warmup Example</button>
-                        <button id="problem-tab-btn-tier1" class="problem-tab-button shrink-0 border-b-2 px-1 pb-4 text-sm font-medium">Tier 1 Example</button>
-                        <button id="problem-tab-btn-tier2" class="problem-tab-button shrink-0 border-b-2 px-1 pb-4 text-sm font-medium">Tier 2 Example</button>
-                    </nav>
                 </div>
-                <div class="mt-4">
-                    <div id="problem-tab-content-warmup" class="problem-tab-content f1-problem-box">
-                        <p class="font-bold text-lg mb-2">Union-of-Paths-and-Cycles</p>
-                        <p class="mb-2"><strong>Objective:</strong> Compute the sum of all weights of sets S⊆V such that the graph G[S], induced over S, is a collection of disjoint paths and cycles.</p>
-                    </div>
-                    <div id="problem-tab-content-tier1" class="problem-tab-content f1-problem-box hidden">
-                        <p class="font-bold text-lg mb-2">Maximal-Union-of-Paths-and-Cycles</p>
-                        <p class="mb-2"><strong>Objective:</strong> Compute the sum of all weights of sets S⊆V such that the graph G[S], induced over S, is a collection of disjoint paths and cycles, and S is maximal with respect to this property.</p>
-                    </div>
-                    <div id="problem-tab-content-tier2" class="problem-tab-content f1-problem-box hidden">
-                        <p class="font-bold text-lg mb-2">Maximal-Union-of-Cycles</p>
-                        <p class="mb-2"><strong>Objective:</strong> Compute the sum of all weights of sets S⊆V such that the graph G[S], induced over S, is a collection of disjoint cycles, and S is maximal with respect to this property.</p>
-                    </div>
                 </div>
             </div>
-            <p class="mt-6 mb-4 f1-p">The latter category is incredibly demanding, requiring resolution of many points of uncertainty, and involving an array of reasoning steps, including topological and geometric insight, knowledge of mathematical domains such as extremal graph theory and logic, combinatorial considerations, precise implementation, and more.</p>
-            <p class="f1-p">Despite <a href="https://epoch.ai/frontiermath" target="_blank" rel="noopener noreferrer" class="f1-a">impressive</a> <a href="https://artificialanalysis.ai/evaluations/gpqa-diamond" target="_blank" rel="noopener noreferrer" class="f1-a">performance</a> on existing benchmarks, presently <strong>no model solves even a single FormulaOne Tier 2 problem</strong>.<sup><a href="#evaluation" class="f1-a">1</a></sup></p>
         </section>
         <section>
-            <h2 class="text-3xl font-bold text-gray-900 f1-h2">An “Infinite Well” of Problems</h2>
-            <p class="mb-4 f1-p">The novelty and vastness of FormulaOne stems from its theoretical foundation. The questions are not arbitrary puzzles, but are instead drawn from the highly expressive framework of <a href="https://en.wikipedia.org/wiki/Monadic_second-order_logic" target="_blank" rel="noopener noreferrer" class="f1-a"><strong>Monadic Second-Order</strong> (MSO) logic on graphs</a>. This provides a principled, semi-automatic way to generate a virtually infinite supply of mathematically deep algorithmic challenges. Despite their theoretical underpinnings, the problems in FormulaOne are natural and succinct:</p>
-            <div class="f1-problem-box">
-                <p class="font-bold text-lg mb-2">Problem #44</p>
-                <p class="mb-2"><strong>Input:</strong> A tree-like graph G=(V,E), a tree decomposition T of G, and a weight function w:V→N.</p>
-                <p class="mb-2"><strong>Objective:</strong> Compute the sum of all weights of sets S⊆V such that the graph G[S], induced over S, does not contain any cycle of length four.</p>
-                <p class="text-sm text-gray-600"><strong>Notation:</strong> The weight of a set of vertices S is defined as w(S) ≜ ∑<sub>v∈S</sub>w(v). The final result should be returned modulo 10<sup>9</sup>+7.</p>
             </div>
-            <p class="mb-4 f1-p">While the problems are often natural to state, their solutions are far from obvious. The solvability of this vast class of problems is guaranteed by an algorithmic <strong>meta-theorem</strong> due to <a href="https://en.wikipedia.org/wiki/Courcelle%27s_theorem" target="_blank" rel="noopener noreferrer" class="f1-a">Courcelle</a>, which broadly states:</p>
-            <blockquote class="my-6 f1-blockquote">
-                “For every sufficiently tree-like graph, any problem definable in an expressive formal logic — Monadic Second-Order (MSO) logic — can be solved by a dynamic programming algorithm that operates in time linear in the order of the graph.”
-            </blockquote>
-            <p class="f1-p">The key is to use a structure known as a tree decomposition, which organises the graph’s vertices into a series of overlapping sets, or “bags”, that are themselves arranged in a tree.</p>
-            <figure class="f1-figure">
-                <img src="/file=assets/bag_modifications.png" alt="An illustration of local modifications to bags (dashed boxes)" class="max-w-full md:max-w-2xl mx-auto rounded-lg shadow-md">
-                <figcaption class="f1-figcaption">An illustration of local modifications to bags: Introduce, Forget, and Join.</figcaption>
             </figure>
-            <p class="mb-4 f1-p">An algorithm can then traverse this tree of bags, solving the problem piece by piece using dynamic programming. This process involves designing a “state” that summarises all necessary information about the partial solution within a bag, and then defining how this state transforms as vertices are introduced, forgotten, or bags are merged.</p>
-            <figure class="f1-figure">
-                <video class="w-full max-w-2xl mx-auto rounded-lg shadow-lg" autoplay loop muted playsinline>
-                    <source src="/file=assets/dp_animation.mp4" type="video/mp4">
                     Your browser does not support the video tag.
                 </video>
-                <figcaption class="f1-figcaption">Animation showing the design of a compressed dynamic programming state-space.</figcaption>
             </figure>
-            <p class="f1-p">The deceptive simplicity of the problem statements belies the <strong>extraordinary difficulty</strong> of discovering the correct dynamic programming solution. This process is riddled with subtle combinatorial and logical pitfalls, demanding a profound understanding of the problem’s underlying structure. For a detailed walkthrough of the fifteen interdependent reasoning steps required to solve a single hard problem – <strong>Maximal-Cluster-Graph</strong> – <a href="https://arxiv.org/pdf/2507.13337#appendix.A" target="_blank" rel="noopener noreferrer" class="f1-a">see the appendix of our paper</a>.</p>
-        </section>
-        <section>
-            <h2 class="text-3xl font-bold text-gray-900 f1-h2">Guiding Principles</h2>
-            <ul class="list-disc list-inside space-y-4">
-                <li class="f1-li"><strong>An In-Distribution Benchmark for Reasoning.</strong> Unlike <a href="https://arcprize.org/arc-agi" target="_blank" rel="noopener noreferrer" class="f1-a">benchmarks</a> that test for out-of-distribution generalisation, FormulaOne presents problems that are squarely <strong>in-distribution</strong> for models trained on code. Essentially, dynamic programming on graphs is the “bread and butter” of algorithmic programming. Thus, models’ current failure on FormulaOne highlights a fundamental deficit in deep, multi-step reasoning, rather than a lack of domain exposure.</li>
-                <li class="f1-li"><strong>An Unbounded Environment for Reinforcement Learning.</strong> The MSO framework allows for the generation of a nearly infinite stream of algorithmic problems with verifiable solutions, making it an ideal environment for training and evaluating agents with Reinforcement Learning with Verifiable Rewards (RLVR).</li>
-                <li class="f1-li"><strong>Probing the Frontiers of Complexity Theory.</strong> Many problems in our dataset are related to central conjectures in fine-grained complexity, such as the <a href="https://en.wikipedia.org/wiki/Exponential_time_hypothesis" target="_blank" rel="noopener noreferrer" class="f1-a">Strong Exponential Time Hypothesis</a> (SETH). If a model were to discover a significantly faster algorithm for one of these problems, that would constitute a significant contribution to theoretical computer science.</li>
-            </ul>
         </section>
-        <section id="evaluation">
-            <h2 class="text-3xl font-bold text-gray-900 f1-h2">Evaluation</h2>
-            <p class="mb-4 f1-p">To give models the best possible chance of success, we provide a generous few-shot prompt that covers a broad array of the ideas and techniques involved in solving these problems. All models were evaluated using their highest available reasoning settings and with the maximum context length permitted.</p>
-            <p class="mb-4 f1-p">Each submitted solution is subjected to a rigorous and automated <a href="https://arxiv.org/pdf/2507.13337#section.4" target="_blank" rel="noopener noreferrer" class="f1-a">test suite</a> that measures three key aspects of its validity:</p>
-            <ul class="list-disc list-inside space-y-2 mb-6">
-                <li class="f1-li"><strong>Correctness:</strong> The output of the submitted algorithm must be correct on all graphs.</li>
-                <li class="f1-li"><strong>Consistency:</strong> The solution must produce the same output for a given graph, regardless of the specific structure of its tree decomposition.</li>
-                <li class="f1-li"><strong>Efficiency:</strong> The solution must be truly <a href="https://en.wikipedia.org/wiki/Parameterized_complexity" target="_blank" rel="noopener noreferrer" class="f1-a">fixed-parameter linear</a>.</li>
             </ul>
-            <p class="mb-4 f1-p">To support research and encourage community contributions, the <code>FormulaOne-Warmup</code> dataset is released as a public resource for training and fine-tuning models. The complete test suite for all 100 Warmup problems is available, alongside a standalone evaluation environment, in our public GitHub repository: <a href="https://github.com/double-ai/formulaone-dataset/tree/main" target="_blank" rel="noopener noreferrer" class="f1-a">https://github.com/double-ai/formulaone-dataset/tree/main</a>.</p>
-            <p class="f1-p">In contrast, to maintain the integrity of the core benchmark, only a minimal subset of tests is released for the <code>FormulaOne Tier 1</code> and <code>Tier 2</code> problems.</p>
-            <h3 class="text-2xl font-bold text-gray-900 mt-8 mb-4">Model Accuracy</h3>
-            <p class="mb-4 f1-p">On the <strong>FormulaOne-Warmup</strong> problems, frontier models perform reasonably well. This confirms they have a foundational capability for these types of algorithmic tasks.</p>
-            <figure class="f1-figure">
-                <img src="/file=assets/warmup_performance.png" alt="Plot showing model performance on FormulaOne-Warmup" class="max-w-full md:max-w-2xl mx-auto rounded-lg shadow-md">
-                <figcaption class="f1-figcaption">Performance of frontier models on the FormulaOne-Warmup dataset.</figcaption>
             </figure>
-            <p class="mb-4 f1-p">However, as the reasoning depth increases in <strong>FormulaOne Tier 1</strong>, and solutions require the discovery and integration of novel and more complex state representations, model performance drops off sharply.</p>
-            <figure class="f1-figure">
-                <img src="/file=assets/tier1_performance.png" alt="Plot showing model performance on FormulaOne Tier 1" class="max-w-full md:max-w-2xl mx-auto rounded-lg shadow-md">
-                <figcaption class="f1-figcaption">Figure 1: Performance of frontier reasoning models on the FormulaOne dataset.</figcaption>
             </figure>
-            <p class="f1-p">This trend culminates in <strong>FormulaOne Tier 2</strong>, where the difficulty is characteristic of exploratory research problems. On this set of 20 problems, no current frontier model solves even a single one. This result starkly illustrates the gap that remains between high performance on existing benchmarks and the deep algorithmic reasoning required for truly complex problems.</p>
         </section>
     </main>
     <script>
-        // This script needs to be run after the DOM is loaded.
-        // Gradio's gr.HTML component loads content dynamically, so we use a small delay.
-        setTimeout(() => {{
-            const problemTabs = [
-                {{ btn: document.getElementById('problem-tab-btn-warmup'), content: document.getElementById('problem-tab-content-warmup') }},
-                {{ btn: document.getElementById('problem-tab-btn-tier1'), content: document.getElementById('problem-tab-content-tier1') }},
-                {{ btn: document.getElementById('problem-tab-btn-tier2'), content: document.getElementById('problem-tab-content-tier2') }}
-            ];
-            const activeClasses = 'border-blue-600 text-blue-600';
-            const inactiveClasses = 'border-transparent text-gray-500 hover:text-gray-700 hover:border-gray-300';
-            problemTabs.forEach(tab => {{
-                if(tab.btn) {{
-                    tab.btn.addEventListener('click', () => {{
-                        problemTabs.forEach(t => {{
-                            if(t.btn) {{
-                                t.btn.classList.remove(...activeClasses.split(' '));
-                                t.btn.classList.add(...inactiveClasses.split(' '));
-                                t.content.classList.add('hidden');
-                            }}
-                        }});
-                        tab.btn.classList.add(...activeClasses.split(' '));
-                        tab.btn.classList.remove(...inactiveClasses.split(' '));
-                        tab.content.classList.remove('hidden');
-                    }});
-                }}
-            }});
-            // Set initial active tab
-            if (problemTabs.length > 0 && problemTabs[0].btn) {{
-                problemTabs[0].btn.click();
-            }}
-        }}, 100); // 100ms delay to ensure elements are in the DOM
     </script>
 </body>
 </html>
 """
 EVALUATION_QUEUE_TEXT = """
 ## Submitting to the FormulaOne Leaderboard

 WHAT_IS_F1_HTML = f"""
 <!DOCTYPE html>
+<html lang=\"en\">
 <body>
+    <main class=\"f1-container\">
+        <header class=\"text-center mb-12\">
+            <h1 class=\"f1-h1\">FormulaOne</h1>
         </header>
         <section>
+            <p class=\"text-lg mb-4 f1-p\">Frontier AI models have recently demonstrated strong performance on mathematical and algorithmic benchmarks, including earning <a href=\"https://deepmind.google/discover/blog/advanced-version-of-gemini-with-deep-think-officially-achieves-gold-medal-standard-at-the-international-mathematical-olympiad/\" target=\"_blank\" rel=\"noopener noreferrer\" class=\"f1-a\">gold medals in olympiads</a>, and attaining <a href=\"https://arxiv.org/html/2502.06807v1\" target=\"_blank\" rel=\"noopener noreferrer\" class=\"f1-a\">top percentile ratings</a> in competitive programming contests. How well do such benchmarks capture the true depth of algorithmic reasoning, as it arises in real-world research problems?</p>
+            <p class=\"text-lg mb-4 f1-p\">We believe that existing benchmarks fail to capture the deep reasoning skills required for complex, research-level algorithmic problems. To address this gap, <a href=\"{PAPER_URL}\" target=\"_blank\" rel=\"noopener noreferrer\" class=\"f1-a\">we introduce <strong>FormulaOne</strong></a>.</p>
+            <p class=\"mb-4 f1-p\"><strong>FormulaOne</strong> consists of 220 novel dynamic programming problems over graphs. The problems are organised into three categories, ranging from moderate difficulty all the way up to research-level.</p>
+            <!-- Beautiful comparison table -->
+            <div class=\"f1-table-wrapper\">
+              <table class=\"f1-table\" aria-label=\"Comparison of FormulaOne tiers\">
+                <thead>
+                  <tr>
+                    <th scope=\"col\">Attribute</th>
+                    <th scope=\"col\">Warmup</th>
+                    <th scope=\"col\">Tier 1</th>
+                    <th scope=\"col\">Tier 2</th>
+                  </tr>
+                </thead>
+                <tbody>
+                  <tr>
+                    <td><strong>Problems</strong></td>
+                    <td>100</td>
+                    <td>100</td>
+                    <td>20</td>
+                  </tr>
+                  <tr>
+                    <td><strong>Difficulty</strong></td>
+                    <td>Moderate</td>
+                    <td>Challenging</td>
+                    <td>Research‑level</td>
+                  </tr>
+                  <tr>
+                    <td><strong>Description</strong></td>
+                    <td>A set of 100 “easier” problems.</td>
+                    <td>A set of 100 challenging problems.</td>
+                    <td>A set of 20 highly challenging problems.</td>
+                  </tr>
+                </tbody>
+              </table>
             </div>
+            <!-- Tabbed Problem Viewer with bottom tabs -->
+            <div class=\"f1-tabs\" id=\"f1-problem-tabs\">
+              <div class=\"f1-tabs-header\">Examples of FormulaOne problems</div>
+              <div class=\"f1-tabs-content\">
+                <div id=\"f1-tab-warmup\" class=\"f1-problem-box\" role=\"tabpanel\">
+                    <p class=\"mb-2\"><strong>Union of Paths and Cycles</strong></p>
+                    <p class=\"mb-2 f1-p\"><strong>Objective:</strong> Compute the sum of all weights of sets \( S\\subseteq V \) such that the induced subgraph \( G[S] \) is a disjoint union of paths and cycles.</p>
                 </div>
+                <div id=\"f1-tab-tier1\" class=\"f1-problem-box\" role=\"tabpanel\" hidden>
+                    <p class=\"mb-2\"><strong>Maximal Union of Paths and Cycles</strong></p>
+                    <p class=\"mb-2 f1-p\"><strong>Objective:</strong> Compute the sum of all weights of sets \( S\\subseteq V \) such that \( G[S] \) is a disjoint union of paths and cycles, and \( S \) is maximal with respect to this property.</p>
                 </div>
+                <div id=\"f1-tab-tier2\" class=\"f1-problem-box\" role=\"tabpanel\" hidden>
+                    <p class=\"mb-2\"><strong>Maximal Union of Cycles</strong></p>
+                    <p class=\"mb-2 f1-p\"><strong>Objective:</strong> Compute the sum of all weights of sets \( S\\subseteq V \) such that \( G[S] \) is a disjoint union of cycles, and \( S \) is maximal with respect to this property.</p>
+                </div>
+              </div>
+              <div class=\"f1-tabs-bar\" role=\"tablist\" aria-label=\"Example tiers\">
+                <button class=\"f1-tab-btn is-active\" data-target=\"f1-tab-warmup\" role=\"tab\" aria-controls=\"f1-tab-warmup\" aria-selected=\"true\">Warmup</button>
+                <button class=\"f1-tab-btn\" data-target=\"f1-tab-tier1\" role=\"tab\" aria-controls=\"f1-tab-tier1\">Tier 1</button>
+                <button class=\"f1-tab-btn\" data-target=\"f1-tab-tier2\" role=\"tab\" aria-controls=\"f1-tab-tier2\">Tier 2</button>
+              </div>
             </div>
+            <p class=\"mt-6 mb-4 f1-p\">The last category is incredibly demanding, requiring resolution of many points of uncertainty, and involving an array of reasoning steps, including topological and geometric insight, knowledge of mathematical domains such as extremal graph theory and logic, combinatorial considerations, precise implementation, and more.</p>
+            <p class=\"f1-p\">Despite <a href=\"https://epoch.ai/frontiermath\" target=\"_blank\" rel=\"noopener noreferrer\" class=\"f1-a\">impressive</a> <a href=\"https://artificialanalysis.ai/evaluations/gpqa-diamond\" target=\"_blank\" rel=\"noopener noreferrer\" class=\"f1-a\">performance</a> on existing benchmarks, presently <strong>no model solves even a single FormulaOne Tier 2 problem</strong>.<sup><a href=\"#evaluation\" class=\"f1-a\">1</a></sup></p>
         </section>
         <section>
+            <h2 class=\"f1-h2\">An “Infinite Well” of Problems</h2>
+            <div class=\"f1-problem-box\">
+                <p class=\"mb-2 f1-p\"><strong>Input:</strong> A tree‑like graph \( G=(V,E) \), a tree decomposition \( T \) of \( G \), and a weight function \( w:V\\to\\mathbb{{N}} \).</p>
+                <p class=\"mb-2 f1-p\"><strong>Objective:</strong> Compute the sum of all weights of sets \( S\\subseteq V \) such that the induced subgraph \( G[S] \) does not contain any 4‑cycle \( C_4 \).</p>
+                <p class=\"text-sm text-gray-600 f1-p\"><strong>Notation:</strong> The weight of a set of vertices \( S \) is defined as \( w(S) \\coloneqq \\sum_{{v\\in S}} w(v) \). The final result should be returned modulo \( 10^9+7 \).</p>
             </div>
+            <p class=\"mb-4 f1-p\">While the problems are often natural to state, their solutions are far from obvious. The solvability of this vast class of problems is guaranteed by an algorithmic <strong>meta-theorem</strong> due to <a href=\"https://en.wikipedia.org/wiki/Courcelle%27s_theorem\" target=\"_blank\" rel=\"noopener noreferrer\" class=\"f1-a\">Courcelle</a>, which broadly states:</p>
+            <blockquote class=\"my-6 f1-blockquote\">“For every sufficiently tree-like graph, any problem definable in an expressive formal logic — Monadic Second-Order (MSO) logic — can be solved by a dynamic programming algorithm that operates in time linear in the order of the graph.”</blockquote>
+            <p class=\"f1-p\">The key is to use a structure known as a tree decomposition, which organises the graph’s vertices into a series of overlapping sets, or “bags”, that are themselves arranged in a tree.</p>
+            <figure class=\"f1-figure\">
+                <img src=\"/file=assets/bag_modifications.png\" alt=\"An illustration of local modifications to bags (dashed boxes)\" class=\"max-w-full md:max-w-2xl mx-auto rounded-lg shadow-md\">
+                <figcaption class=\"f1-figcaption\">An illustration of local modifications to bags: Introduce, Forget, and Join.</figcaption>
             </figure>
+            <p class=\"mb-4 f1-p\">An algorithm can then traverse this tree of bags, solving the problem piece by piece using dynamic programming. This process involves designing a “state” that summarises all necessary information about the partial solution within a bag, and then defining how this state transforms as vertices are introduced, forgotten, or bags are merged.</p>
+            <figure class=\"f1-figure\">
+                <video class=\"w-full max-w-2xl mx-auto rounded-lg shadow-lg\" autoplay loop muted playsinline>
+                    <source src=\"/file=assets/dp_animation.mp4\" type=\"video/mp4\">
                     Your browser does not support the video tag.
                 </video>
+                <figcaption class=\"f1-figcaption\">Animation showing the design of a compressed dynamic programming state-space.</figcaption>
             </figure>
+            <p class=\"f1-p\">The deceptive simplicity of the problem statements belies the <strong>extraordinary difficulty</strong> of discovering the correct dynamic programming solution. This process is riddled with subtle combinatorial and logical pitfalls, demanding a profound understanding of the problem’s underlying structure. For a detailed walkthrough of the fifteen interdependent reasoning steps required to solve a single hard problem – <strong>Maximal-Cluster-Graph</strong> – <a href=\"https://arxiv.org/pdf/2507.13337#appendix.A\" target=\"_blank\" rel=\"noopener noreferrer\" class=\"f1-a\">see the appendix of our paper</a>.</p>
         </section>
+        <section id=\"evaluation\">
+            <h2 class=\"f1-h2\">Evaluation</h2>
+            <p class=\"mb-4 f1-p\">To give models the best possible chance of success, we provide a generous few-shot prompt that covers a broad array of the ideas and techniques involved in solving these problems. All models were evaluated using their highest available reasoning settings and with the maximum context length permitted.</p>
+            <p class=\"mb-4 f1-p\">Each submitted solution is subjected to a rigorous and automated <a href=\"https://arxiv.org/pdf/2507.13337#section.4\" target=\"_blank\" rel=\"noopener noreferrer\" class=\"f1-a\">test suite</a> that measures three key aspects of its validity:</p>
+            <ul class=\"list-disc list-inside space-y-2 mb-6\">
+                <li class=\"f1-li\"><strong>Correctness:</strong> The output of the submitted algorithm must be correct on all graphs.</li>
+                <li class=\"f1-li\"><strong>Consistency:</strong> The solution must produce the same output for a given graph, regardless of the specific structure of its tree decomposition.</li>
+                <li class=\"f1-li\"><strong>Efficiency:</strong> The solution must be truly <a href=\"https://en.wikipedia.org/wiki/Parameterized_complexity\" target=\"_blank\" rel=\"noopener noreferrer\" class=\"f1-a\">fixed-parameter linear</a>.</li>
             </ul>
+            <p class=\"mb-4 f1-p\">To support research and encourage community contributions, the <code>FormulaOne-Warmup</code> dataset is released as a public resource for training and fine-tuning models. The complete test suite for all 100 Warmup problems is available, alongside a standalone evaluation environment, in our public GitHub repository: <a href=\"https://github.com/double-ai/formulaone-dataset/tree/main\" target=\"_blank\" rel=\"noopener noreferrer\" class=\"f1-a\">https://github.com/double-ai/formulaone-dataset/tree/main</a>.</p>
+            <p class=\"f1-p\">In contrast, to maintain the integrity of the core benchmark, only a minimal subset of tests is released for the <code>FormulaOne Tier 1</code> and <code>Tier 2</code> problems.</p>
+            <h3 class=\"text-2xl font-bold text-gray-900 mt-8 mb-4\">Model Accuracy</h3>
+            <p class=\"mb-4 f1-p\">On the <strong>FormulaOne-Warmup</strong> problems, frontier models perform reasonably well. This confirms they have a foundational capability for these types of algorithmic tasks.</p>
+            <figure class=\"f1-figure\">
+                <img src=\"/file=assets/warmup_performance.png\" alt=\"Plot showing model performance on FormulaOne-Warmup\" class=\"max-w-full md:max-w-2xl mx-auto rounded-lg shadow-md\">
+                <figcaption class=\"f1-figcaption\">Performance of frontier models on the FormulaOne-Warmup dataset.</figcaption>
             </figure>
+            <p class=\"mb-4 f1-p\">However, as the reasoning depth increases in <strong>FormulaOne Tier 1</strong>, and solutions require the discovery and integration of novel and more complex state representations, model performance drops off sharply.</p>
+            <figure class=\"f1-figure\">
+                <img src=\"/file=assets/tier1_performance.png\" alt=\"Plot showing model performance on FormulaOne Tier 1\" class=\"max-w-full md:max-w-2xl mx-auto rounded-lg shadow-md\">
+                <figcaption class=\"f1-figcaption\">Figure 1: Performance of frontier reasoning models on the FormulaOne dataset.</figcaption>
             </figure>
+            <p class=\"f1-p\">This trend culminates in <strong>FormulaOne Tier 2</strong>, where the difficulty is characteristic of exploratory research problems. On this set of 20 problems, no current frontier model solves even a single one. This result starkly illustrates the gap that remains between high performance on existing benchmarks and the deep algorithmic reasoning required for truly complex problems.</p>
         </section>
     </main>
+    <!-- MathJax for LaTeX rendering -->
+    <script id=\"MathJax-script\" async src=\"https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js\"></script>
     <script>
+      // Initialize bottom tabs and retrigger MathJax when switching
+      setTimeout(() => {{
+        const container = document.getElementById('f1-problem-tabs');
+        if (!container) return;
+        const buttons = container.querySelectorAll('.f1-tab-btn');
+        const panels = Array.from(container.querySelectorAll('[role="tabpanel"]'));
+        function activate(targetId) {{
+          panels.forEach(p => p.hidden = (p.id !== targetId));
+          buttons.forEach(b => b.classList.toggle('is-active', b.dataset.target === targetId));
+          if (window.MathJax && window.MathJax.typesetPromise) {{
+            window.MathJax.typesetPromise();
+          }}
+        }}
+        buttons.forEach(btn => btn.addEventListener('click', () => activate(btn.dataset.target)));
+        // Initial state
+        activate('f1-tab-warmup');
+      }}, 120);
     </script>
 </body>
 </html>
 """
 EVALUATION_QUEUE_TEXT = """
 ## Submitting to the FormulaOne Leaderboard

src/display/css_html_js.py CHANGED Viewed

@@ -1,26 +1,41 @@
 custom_css = """
 .markdown-text {
     font-size: 16px !important;
 }
 .banner_image { width: 75% !important; align-self: center !important; }
-@import url('https://fonts.googleapis.com/css2?family=Exo+2:wght@500;600&display=swap');
 button[role="tab"] {
-  font-family: 'Exo 2', system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif !important;
   letter-spacing: 0.25px;
   font-weight: 600;
   font-size: 18px !important;
 }
 #models-to-add-text { font-size: 18px !important; }
 #citation-button span { font-size: 16px !important; }
 #citation-button textarea { font-size: 16px !important; }
 #citation-button > label > button { margin: 6px; transform: scale(1.3); }
 #leaderboard-table { margin-top: 15px }
 #leaderboard-table-lite { margin-top: 15px }
 #search-bar-table-box > div:first-child { background: none; border: none; }
 #search-bar { padding: 0px; }
 #leaderboard-table td:nth-child(2),
 #leaderboard-table th:nth-child(2) { max-width: 400px; overflow: auto; white-space: nowrap; }
-.tab-buttons button { font-size: 20px; }
 #scale-logo { border-style: none !important; box-shadow: none; display: block; margin-left: auto; margin-right: auto; max-width: 600px; }
 #scale-logo .download { display: none; }
 #filter_type{ border: 0; padding-left: 0; padding-top: 0; }
@@ -33,38 +48,75 @@ button[role="tab"] {
 #filter-columns-size{ border:0; padding:0.5; }
 #box-filter > .form{ border: 0 }
-/* Light Theme Styles for the "What is FormulaOne" HTML content */
-.f1-h1 { font-weight: 700; font-size: 2.25rem; line-height: 2.5rem; color: #111827; text-align: center; margin-bottom: 3rem !important;}
-.f1-h2 { font-weight: 700; border-bottom: 1px solid #e5e7eb; padding-bottom: 0.5rem; margin-top: 2.5rem; margin-bottom: 1.5rem; color: #111827; font-size: 1.875rem; line-height: 2.25rem; }
 .f1-p, .f1-li { line-height: 1.75; color: #374151; }
 .f1-a { color: #2563eb; text-decoration: none; font-weight: 500; }
 .f1-a:hover { text-decoration: underline; }
-.f1-blockquote { border-left: 4px solid #d1d5db; padding-left: 1rem; margin-left: 0; font-style: italic; color: #4b5563; }
-.f1-problem-box { background-color: #f9fafb; border: 1px solid #e5e7eb; border-radius: 0.5rem; padding: 1.5rem; margin-top: 1.5rem; margin-bottom: 1.5rem; box-shadow: 0 1px 2px 0 rgb(0 0 0 / 0.05); }
-.f1-problem-box strong { color: #111827; }
-.f1-figure { margin-top: 1.5rem; margin-bottom: 1.5rem; text-align: center; }
-.f1-figcaption { margin-top: 0.5rem; font-size: 0.875rem; color: #6b7280; font-style: italic; }
-.problem-tab-button { cursor: pointer; }
-/* New styles for the category cards */
-.f1-category-card {
-    display: block;
-    border-radius: 0.5rem;
-    border: 1px solid #e5e7eb;
-    padding: 1rem;
-    background-color: white;
-    box-shadow: 0 1px 2px 0 rgb(0 0 0 / 0.05);
-}
-/* Override for the dark Login Button */
-div[data-testid="login-button"] > button {
     background: #ffffff !important;
     color: #374151 !important;
     border: 1px solid #d1d5db !important;
 }
-div[data-testid="login-button"] > button:hover {
-    background: #f9fafb !important;
-}
 """
 get_window_url_params = """

 custom_css = """
+/* Typography + readable width */
 .markdown-text {
     font-size: 16px !important;
+    line-height: 1.75 !important;
+    max-width: 900px; /* constrain paragraphs */
+    margin: 0 auto;   /* center */
 }
+/* Banner/image helpers */
 .banner_image { width: 75% !important; align-self: center !important; }
+@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&family=Exo+2:wght@500;600&display=swap');
+:root { font-family: Inter, system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif; }
+/* Tabs (top) */
 button[role="tab"] {
+  font-family: 'Exo 2', Inter, system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif !important;
   letter-spacing: 0.25px;
   font-weight: 600;
   font-size: 18px !important;
 }
+.tab-buttons button { font-size: 20px; }
 #models-to-add-text { font-size: 18px !important; }
 #citation-button span { font-size: 16px !important; }
 #citation-button textarea { font-size: 16px !important; }
 #citation-button > label > button { margin: 6px; transform: scale(1.3); }
+/* Leaderboard spacing */
 #leaderboard-table { margin-top: 15px }
 #leaderboard-table-lite { margin-top: 15px }
 #search-bar-table-box > div:first-child { background: none; border: none; }
 #search-bar { padding: 0px; }
 #leaderboard-table td:nth-child(2),
 #leaderboard-table th:nth-child(2) { max-width: 400px; overflow: auto; white-space: nowrap; }
 #scale-logo { border-style: none !important; box-shadow: none; display: block; margin-left: auto; margin-right: auto; max-width: 600px; }
 #scale-logo .download { display: none; }
 #filter_type{ border: 0; padding-left: 0; padding-top: 0; }
 #filter-columns-size{ border:0; padding:0.5; }
 #box-filter > .form{ border: 0 }
+/* ---------- "What is FormulaOne" (HTML) ---------- */
+/* Constrain width of the whole page section */
+.f1-container { max-width: 900px; margin: 0 auto; padding: 0 1rem; }
+/* Heading spacing fix */
+.f1-h1 { font-weight: 700; font-size: 2.5rem; line-height: 1.1; color: #111827; text-align: center; margin-bottom: 1.25rem !important; }
+.f1-h2 { font-weight: 700; border-bottom: 1px solid #e5e7eb; padding-bottom: 0.5rem; margin-top: 2.25rem; margin-bottom: 1rem; color: #111827; font-size: 1.75rem; line-height: 2.1rem; }
 .f1-p, .f1-li { line-height: 1.75; color: #374151; }
 .f1-a { color: #2563eb; text-decoration: none; font-weight: 500; }
 .f1-a:hover { text-decoration: underline; }
+.f1-blockquote { border-left: 4px solid #d1d5db; padding-left: 1rem; margin-left: 0; font-style: italic; color: #4b5563; background: #fafafa; }
+.f1-problem-box { background-color: #f9fafb; border: 1px solid #e5e7eb; border-radius: 0.75rem; padding: 1.25rem 1.25rem; margin-top: 1.25rem; margin-bottom: 1.25rem; box-shadow: 0 1px 2px 0 rgb(0 0 0 / 0.05); }
+.f1-figure { margin-top: 1.25rem; margin-bottom: 1.25rem; text-align: center; }
+.f1-figcaption { margin-top: 0.5rem; font-size: 0.9rem; color: #6b7280; font-style: italic; }
+/* Beautiful table for Warmup / Tier 1 / Tier 2 */
+.f1-table-wrapper { margin: 1.5rem 0; overflow-x: auto; }
+.f1-table { width: 100%; border-collapse: separate; border-spacing: 0; background: #fff; border: 1px solid #e5e7eb; border-radius: 0.75rem; box-shadow: 0 1px 2px 0 rgba(0,0,0,0.04); }
+.f1-table thead th { background: #f3f4f6; color: #111827; font-weight: 700; text-align: left; padding: 0.85rem 1rem; border-bottom: 1px solid #e5e7eb; }
+.f1-table tbody td { padding: 0.75rem 1rem; color: #374151; border-bottom: 1px solid #f1f5f9; }
+.f1-table tbody tr:nth-child(odd) td { background: #fafafa; }
+.f1-table tbody tr:last-child td { border-bottom: none; }
+.f1-table th:first-child, .f1-table td:first-child { border-top-left-radius: 0.75rem; }
+.f1-table th:last-child, .f1-table td:last-child { border-top-right-radius: 0.75rem; }
+/* Bottom tabs: "Examples of FormulaOne problems" */
+.f1-tabs { border: 1px solid #e5e7eb; border-radius: 0.75rem; background: #fff; box-shadow: 0 1px 2px 0 rgba(0,0,0,0.04); margin-top: 1.25rem; }
+.f1-tabs .f1-tabs-header { padding: 1rem 1rem 0 1rem; font-weight: 700; color: #111827; font-size: 1.1rem; }
+.f1-tabs .f1-tabs-content { padding: 1rem; }
+.f1-tabs .f1-tabs-bar { display: flex; gap: 0.5rem; justify-content: center; border-top: 1px solid #e5e7eb; padding: 0.6rem; background: #fafafa; border-bottom-left-radius: 0.75rem; border-bottom-right-radius: 0.75rem; }
+.f1-tabs .f1-tab-btn { appearance: none; border: 1px solid #d1d5db; background: #ffffff; padding: 0.45rem 0.9rem; border-radius: 999px; font-weight: 600; font-size: 0.95rem; color: #374151; cursor: pointer; transition: all 120ms ease; }
+.f1-tabs .f1-tab-btn:hover { transform: translateY(-1px); box-shadow: 0 1px 2px rgba(0,0,0,0.06); }
+.f1-tabs .f1-tab-btn.is-active { background: #2563eb; color: white; border-color: #2563eb; box-shadow: 0 1px 2px rgba(37,99,235,.35); }
+/* Code blocks in citation look cleaner */
+#citation-block { border-radius: 0.5rem; }
+/* Nice cards used elsewhere */
+.f1-category-card { display: block; border-radius: 0.5rem; border: 1px solid #e5e7eb; padding: 1rem; background-color: white; box-shadow: 0 1px 2px 0 rgb(0 0 0 / 0.05); }
+/* Login Button — force light */
+div[data-testid="login-button"] > button,
+button.sso-button,
+[data-testid="login-button"] button,
+button[data-testid="oauth-login"],
+.gr-login button {
     background: #ffffff !important;
     color: #374151 !important;
     border: 1px solid #d1d5db !important;
 }
+div[data-testid="login-button"] > button:hover,
+button.sso-button:hover,
+[data-testid="login-button"] button:hover,
+button[data-testid="oauth-login"]:hover,
+.gr-login button:hover { background: #f9fafb !important; }
+/* Leaderboard controls should stretch full width */
+#formulaone-leaderboard-tab-table .gr-row,
+#formulaone-leaderboard-tab-table .gr-column { width: 100% !important; }
+#formulaone-leaderboard-tab-table input[type="text"],
+#formulaone-leaderboard-tab-table select,
+#formulaone-leaderboard-tab-table .wrap,
+#formulaone-leaderboard-tab-table .wrap-inner,
+#formulaone-leaderboard-tab-table .container { width: 100% !important; max-width: 100% !important; }
+/* Light theme enforcement if outer shell is dark */
+html.dark, body.dark { background: #ffffff !important; color-scheme: light; }
+html.dark *, body.dark * { --tw-ring-color: rgba(37, 99, 235, 0.4); }
 """
 get_window_url_params = """