galb-dai committed on
Commit
e30b579
·
1 Parent(s): 97d33e1
Files changed (3) hide show
  1. app.py +3 -2
  2. src/about.py +7 -9
  3. src/display/css_html_js.py +49 -37
app.py CHANGED
@@ -215,8 +215,9 @@ with blocks:
215
 
216
  # Examples (kept inside a centered, 800px container)
217
  with gr.Group(elem_id="f1-examples", elem_classes=["f1-container"]):
 
218
  gr.HTML(
219
- '<div class="f1-tabs-body"><h3 class="f1-examples-title">Examples of FormulaOne problems</h3></div>'
220
  )
221
 
222
  _latex = [
@@ -257,7 +258,7 @@ with blocks:
257
  choices=["Warmup", "Tier 1", "Tier 2"],
258
  value="Warmup",
259
  label=None,
260
- show_label=False, # hide the "Radio" caption
261
  elem_id="f1-example-radio",
262
  )
263
  tab_radio.change(_select_example_tab, inputs=tab_radio, outputs=[md_warmup, md_tier1, md_tier2])
 
215
 
216
  # Examples (kept inside a centered, 800px container)
217
  with gr.Group(elem_id="f1-examples", elem_classes=["f1-container"]):
218
+ # centered pill title
219
  gr.HTML(
220
+ '<div class="f1-tabs-body"><div class="f1-examples-chip">Examples of FormulaOne problems</div></div>'
221
  )
222
 
223
  _latex = [
 
258
  choices=["Warmup", "Tier 1", "Tier 2"],
259
  value="Warmup",
260
  label=None,
261
+ show_label=False, # hide caption
262
  elem_id="f1-example-radio",
263
  )
264
  tab_radio.change(_select_example_tab, inputs=tab_radio, outputs=[md_warmup, md_tier1, md_tier2])
src/about.py CHANGED
@@ -1,7 +1,7 @@
1
  # The paper's URL for linking
2
  PAPER_URL = "https://arxiv.org/abs/2507.13337"
3
 
4
- # Top chunk — self-contained (no dangling <main/>). Includes the clean "table" (via divs).
5
  WHAT_IS_F1_HTML_TOP = f"""
6
  <div class="f1-container">
7
  <header class="text-center mb-12">
@@ -15,7 +15,7 @@ WHAT_IS_F1_HTML_TOP = f"""
15
 
16
  <p class="mb-4 f1-p"><strong>FormulaOne</strong> consists of 220 novel dynamic programming problems over graphs. The problems are organised into three categories, ranging from moderate difficulty and all the way up to research-level.</p>
17
 
18
- <!-- Clean, centered "table" using divs -->
19
  <div class="f1-grid-wrap" role="region" aria-label="FormulaOne categories">
20
  <div class="f1-grid-table" role="table">
21
  <div class="f1-grid-row f1-grid-head" role="row">
@@ -44,7 +44,7 @@ WHAT_IS_F1_HTML_TOP = f"""
44
  </div>
45
  """
46
 
47
- # Bottom chunk — self-contained, width-constrained.
48
  WHAT_IS_F1_HTML_BOTTOM = """
49
  <div class="f1-container">
50
  <section>
@@ -54,7 +54,6 @@ WHAT_IS_F1_HTML_BOTTOM = """
54
 
55
  <section>
56
  <h2 class="f1-h2">An “Infinite Well” of Problems</h2>
57
- <!-- Example problem removed, per request -->
58
  <p class="mb-4 f1-p">While the problems are often natural to state, their solutions are far from obvious. The solvability of this vast class of problems is guaranteed by an algorithmic <strong>meta-theorem</strong> due to <a href="https://en.wikipedia.org/wiki/Courcelle%27s_theorem" target="_blank" rel="noopener noreferrer" class="f1-a">Courcelle</a>, which broadly states:</p>
59
  <blockquote class="my-6 f1-blockquote">
60
  “For every sufficiently tree-like graph, any problem definable in an expressive formal logic — Monadic Second-Order (MSO) logic — can be solved by a dynamic programming algorithm that operates in time linear in the order of the graph.”
@@ -67,12 +66,12 @@ WHAT_IS_F1_HTML_BOTTOM = """
67
  <p class="mb-4 f1-p">An algorithm can then traverse this tree of bags, solving the problem piece by piece using dynamic programming. This process involves designing a “state” that summarises all necessary information about the partial solution within a bag, and then defining how this state transforms as vertices are introduced, forgotten, or bags are merged.</p>
68
  <figure class="f1-figure">
69
  <video class="w-full max-w-2xl mx-auto rounded-lg shadow-lg" autoplay loop muted playsinline>
70
- <source src="/file=assets/dp_animation.mp4" type="video/mp4">
71
  Your browser does not support the video tag.
72
  </video>
73
  <figcaption class="f1-figcaption">Animation showing the design of a compressed dynamic programming state-space.</figcaption>
74
  </figure>
75
- <p class="f1-p">The deceptive simplicity of the problem statements belies the <strong>extraordinary difficulty</strong> of discovering the correct dynamic programming solution. This process is riddled with subtle combinatorial and logical pitfalls, demanding a profound understanding of the problem’s underlying structure. For a detailed walkthrough of the fifteen interdependent reasoning steps required to solve a single hard problem --- <code>Maximal-Cluster-Graph</code> --- <a href="https://arxiv.org/pdf/2507.13337#appendix.A" target="_blank" rel="noopener noreferrer" class="f1-a">see the appendix of our paper</a>.</p>
76
  </section>
77
 
78
  <section id="evaluation">
@@ -87,7 +86,6 @@ WHAT_IS_F1_HTML_BOTTOM = """
87
  <p class="mb-4 f1-p">To support research and encourage community contributions, the <code>FormulaOne-Warmup</code> dataset is released as a public resource for training and fine-tuning models. The complete test suite for all 100 Warmup problems is available, alongside a standalone evaluation environment, in our <a href="https://github.com/double-ai/formulaone-dataset/tree/main" target="_blank" rel="noopener noreferrer" class="f1-a">GitHub repository</a>.</p>
88
  <p class="f1-p">To maintain the integrity of the core benchmark, only a minimal subset of tests is released for the Tier 1 and Tier 2 problems.</p>
89
 
90
- <!-- Same level as Evaluation -->
91
  <h2 class="f1-h2">Model Accuracy</h2>
92
  <p class="mb-4 f1-p">On the <strong>FormulaOne-Warmup</strong> problems, frontier models perform reasonably well. This confirms they have a foundational capability for these types of algorithmic tasks.</p>
93
  <figure class="f1-figure">
@@ -96,8 +94,8 @@ WHAT_IS_F1_HTML_BOTTOM = """
96
  </figure>
97
  <p class="mb-4 f1-p">However, as the reasoning depth increases in <strong>Tier 1</strong>, and solutions require the discovery and integration of novel and more complex state representations, model performance drops off sharply.</p>
98
  <figure class="f1-figure">
99
- <img src="/file=assets/tier1_performance.png" alt="Plot showing model performance on FormulaOne Tier 1" class="max-w-full md:max-w-2xl mx-auto rounded-lg shadow-md">
100
- <figcaption class="f1-figcaption">Figure 1: Performance of frontier reasoning models on the FormulaOne dataset.</figcaption>
101
  </figure>
102
  <p class="f1-p">This trend culminates in <strong>Tier 2</strong>, where the difficulty is characteristic of exploratory research problems. On this set of 20 problems, no current frontier model solves even a single one. This result starkly illustrates the gap that remains between high performance on existing benchmarks and the deep algorithmic reasoning required for truly complex problems.</p>
103
  </section>
 
1
  # The paper's URL for linking
2
  PAPER_URL = "https://arxiv.org/abs/2507.13337"
3
 
4
+ # Top chunk — self-contained (div grid table).
5
  WHAT_IS_F1_HTML_TOP = f"""
6
  <div class="f1-container">
7
  <header class="text-center mb-12">
 
15
 
16
  <p class="mb-4 f1-p"><strong>FormulaOne</strong> consists of 220 novel dynamic programming problems over graphs. The problems are organised into three categories, ranging from moderate difficulty and all the way up to research-level.</p>
17
 
18
+ <!-- Clean, centered "table" using a single grid -->
19
  <div class="f1-grid-wrap" role="region" aria-label="FormulaOne categories">
20
  <div class="f1-grid-table" role="table">
21
  <div class="f1-grid-row f1-grid-head" role="row">
 
44
  </div>
45
  """
46
 
47
+ # Bottom chunk — width-constrained; em-dashes; corrected video; captions centered/dark; wording tweak.
48
  WHAT_IS_F1_HTML_BOTTOM = """
49
  <div class="f1-container">
50
  <section>
 
54
 
55
  <section>
56
  <h2 class="f1-h2">An “Infinite Well” of Problems</h2>
 
57
  <p class="mb-4 f1-p">While the problems are often natural to state, their solutions are far from obvious. The solvability of this vast class of problems is guaranteed by an algorithmic <strong>meta-theorem</strong> due to <a href="https://en.wikipedia.org/wiki/Courcelle%27s_theorem" target="_blank" rel="noopener noreferrer" class="f1-a">Courcelle</a>, which broadly states:</p>
58
  <blockquote class="my-6 f1-blockquote">
59
  “For every sufficiently tree-like graph, any problem definable in an expressive formal logic — Monadic Second-Order (MSO) logic — can be solved by a dynamic programming algorithm that operates in time linear in the order of the graph.”
 
66
  <p class="mb-4 f1-p">An algorithm can then traverse this tree of bags, solving the problem piece by piece using dynamic programming. This process involves designing a “state” that summarises all necessary information about the partial solution within a bag, and then defining how this state transforms as vertices are introduced, forgotten, or bags are merged.</p>
67
  <figure class="f1-figure">
68
  <video class="w-full max-w-2xl mx-auto rounded-lg shadow-lg" autoplay loop muted playsinline>
69
+ <source src="/file=assets/DominatingSetAnimation.mp4" type="video/mp4">
70
  Your browser does not support the video tag.
71
  </video>
72
  <figcaption class="f1-figcaption">Animation showing the design of a compressed dynamic programming state-space.</figcaption>
73
  </figure>
74
+ <p class="f1-p">The deceptive simplicity of the problem statements belies the <strong>extraordinary difficulty</strong> of discovering the correct dynamic programming solution. This process is riddled with subtle combinatorial and logical pitfalls, demanding a profound understanding of the problem’s underlying structure. For a detailed walkthrough of the fifteen interdependent reasoning steps required to solve a single hard problem &mdash; <code>Maximal-Cluster-Graph</code> &mdash; <a href="https://arxiv.org/pdf/2507.13337#appendix.A" target="_blank" rel="noopener noreferrer" class="f1-a">see the appendix of our paper</a>.</p>
75
  </section>
76
 
77
  <section id="evaluation">
 
86
  <p class="mb-4 f1-p">To support research and encourage community contributions, the <code>FormulaOne-Warmup</code> dataset is released as a public resource for training and fine-tuning models. The complete test suite for all 100 Warmup problems is available, alongside a standalone evaluation environment, in our <a href="https://github.com/double-ai/formulaone-dataset/tree/main" target="_blank" rel="noopener noreferrer" class="f1-a">GitHub repository</a>.</p>
87
  <p class="f1-p">To maintain the integrity of the core benchmark, only a minimal subset of tests is released for the Tier 1 and Tier 2 problems.</p>
88
 
 
89
  <h2 class="f1-h2">Model Accuracy</h2>
90
  <p class="mb-4 f1-p">On the <strong>FormulaOne-Warmup</strong> problems, frontier models perform reasonably well. This confirms they have a foundational capability for these types of algorithmic tasks.</p>
91
  <figure class="f1-figure">
 
94
  </figure>
95
  <p class="mb-4 f1-p">However, as the reasoning depth increases in <strong>Tier 1</strong>, and solutions require the discovery and integration of novel and more complex state representations, model performance drops off sharply.</p>
96
  <figure class="f1-figure">
97
+ <img src="/file=assets/tier1_performance.png" alt="Plot showing model performance on Tier 1" class="max-w-full md:max-w-2xl mx-auto rounded-lg shadow-md">
98
+ <figcaption class="f1-figcaption">Performance of frontier reasoning models on the FormulaOne dataset.</figcaption>
99
  </figure>
100
  <p class="f1-p">This trend culminates in <strong>Tier 2</strong>, where the difficulty is characteristic of exploratory research problems. On this set of 20 problems, no current frontier model solves even a single one. This result starkly illustrates the gap that remains between high performance on existing benchmarks and the deep algorithmic reasoning required for truly complex problems.</p>
101
  </section>
src/display/css_html_js.py CHANGED
@@ -10,23 +10,21 @@ custom_css = """
10
  /* Readable width everywhere */
11
  .f1-container { max-width: 800px; margin: 0 auto; padding: 0 16px; }
12
  .markdown-text { font-size: 16px !important; max-width: 800px; margin: 0 auto; }
13
- #what-is-tab { max-width: 800px; margin-left: auto; margin-right: auto; } /* keep the whole tab narrow */
14
- #f1-examples { max-width: 800px; margin: 0 auto; } /* ensure examples stay narrow */
15
 
16
- /* Paragraphs: nice wrapping */
17
  .f1-p, .f1-li {
18
  line-height: 1.75;
19
  color: #374151;
20
- text-align: left;
21
  text-wrap: pretty;
22
  overflow-wrap: break-word;
23
  hyphens: auto;
24
- word-break: normal;
25
  }
26
 
27
  /* Headings */
28
  .f1-h1 { font-weight: 700; font-size: 2.25rem; line-height: 2.5rem; color: var(--f1-text); text-align: center; margin-bottom: 1.25rem !important; }
29
- .f1-h2 { /* smaller section headings */
30
  font-weight: 700;
31
  border-bottom: 1px solid var(--f1-border);
32
  padding-bottom: 0.45rem;
@@ -41,69 +39,83 @@ custom_css = """
41
  .f1-a { color: #2563eb; text-decoration: none; font-weight: 500; }
42
  .f1-a:hover { text-decoration: underline; }
43
 
44
- /* Blockquote & problem name */
45
- .f1-blockquote { border-left: 4px solid #d1d5db; padding-left: 1rem; margin-left: 0; font-style: italic; color: #4b5563; }
 
46
  .f1-problem-name { font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace; font-weight: 600; text-align: center; }
47
 
48
- /* ===== Clean "table" using divs (centered, not full width, borders all around) ===== */
49
  .f1-grid-wrap { text-align: center; margin: 10px auto 8px auto; }
50
  .f1-grid-table {
51
- display: inline-block; /* center by shrink-to-fit */
52
- border-top: 1px solid var(--f1-border);
53
- border-left: 1px solid var(--f1-border);
54
- border-right: 1px solid var(--f1-border); /* add right border */
55
- border-bottom: 1px solid var(--f1-border); /* add bottom border */
56
  background: var(--f1-bg);
57
  border-radius: 8px;
58
  overflow: hidden;
59
  }
60
- .f1-grid-row { display: grid; grid-template-columns: auto auto 1fr; align-items: start; }
61
- .f1-grid-row + .f1-grid-row { border-top: 1px solid var(--f1-border); } /* horizontal separators */
62
- .f1-grid-cell { padding: 10px 14px; text-align: left; }
63
- .f1-grid-head .f1-grid-cell { font-weight: 600; text-align: center; } /* centered headers */
64
- .f1-grid-row .f1-grid-cell + .f1-grid-cell { border-left: 1px solid var(--f1-border); }
 
 
 
 
 
 
 
 
65
 
66
- /* ===== Examples card (restore nice look, fix width, unify background, center title) ===== */
67
  #f1-examples {
68
- background: var(--f1-bg-muted); /* match problem box bg */
69
  border: 1px solid var(--f1-border);
70
  border-radius: 10px;
71
  box-shadow: 0 1px 2px rgba(0,0,0,0.04);
72
  margin-bottom: 12px;
73
  }
74
- #f1-examples .f1-examples-title {
 
 
 
 
 
 
75
  font-weight: 700;
76
- margin: 12px 14px 8px 14px;
77
- color: var(--f1-text);
78
- font-size: 1.1rem;
79
- text-align: center; /* center heading */
80
  }
81
 
82
  /* Problem content: consistent background + padding */
83
  #f1-examples .f1-problem-markdown .markdown {
84
- background: var(--f1-bg-muted); /* same as container */
85
  border: 1px solid var(--f1-border);
86
  border-radius: 8px;
87
- padding: 18px; /* ensure inner padding */
88
- margin: 0 14px 8px 14px;
89
  }
90
 
91
- /* Bottom "tabs" using Radio -> show only pills (hide inputs), no "Radio" label */
92
  #f1-example-radio { border-top: 1px solid var(--f1-border); padding: 8px 10px 10px 10px; margin: 0 8px 8px; }
93
- #f1-example-radio input[type="radio"] { display: none; } /* hide the radio bullet */
94
  #f1-example-radio .wrap { display: flex; gap: 8px; flex-wrap: wrap; justify-content: flex-start; }
95
- #f1-example-radio label { border: 1px solid transparent; border-radius: 999px; padding: 6px 12px; cursor: pointer; background: #fff; }
96
- #f1-example-radio label:hover { background: #f3f4f6; }
97
- #f1-example-radio input[type="radio"]:checked + span {
98
- background: #e5e7eb; /* subtle active pill */
99
  border: 1px solid var(--f1-border);
100
  border-radius: 999px;
101
  padding: 6px 12px;
 
 
 
 
 
 
 
 
102
  }
103
 
104
  /* Leaderboard: center the whole tab and apply requested nesting/min-width rule with .column/.row */
105
- #formulaone-leaderboard-tab-table { max-width: 1200px; margin-left: auto; margin-right: auto; } /* center */
106
- #formulaone-leaderboard-tab-table .column .row .column { min-width: 80% !important; } /* exact chain rule */
107
  #formulaone-leaderboard-tab-table .row, #formulaone-leaderboard-tab-table .column { width: 100% !important; max-width: 100% !important; }
108
  #formulaone-leaderboard-tab-table [data-testid="dropdown"], #formulaone-leaderboard-tab-table input[type="text"] { width: 100% !important; }
109
 
 
10
  /* Readable width everywhere */
11
  .f1-container { max-width: 800px; margin: 0 auto; padding: 0 16px; }
12
  .markdown-text { font-size: 16px !important; max-width: 800px; margin: 0 auto; }
13
+ #what-is-tab { max-width: 800px; margin-left: auto; margin-right: auto; }
14
+ #f1-examples { max-width: 800px; margin: 0 auto; }
15
 
16
+ /* Body text */
17
  .f1-p, .f1-li {
18
  line-height: 1.75;
19
  color: #374151;
 
20
  text-wrap: pretty;
21
  overflow-wrap: break-word;
22
  hyphens: auto;
 
23
  }
24
 
25
  /* Headings */
26
  .f1-h1 { font-weight: 700; font-size: 2.25rem; line-height: 2.5rem; color: var(--f1-text); text-align: center; margin-bottom: 1.25rem !important; }
27
+ .f1-h2 {
28
  font-weight: 700;
29
  border-bottom: 1px solid var(--f1-border);
30
  padding-bottom: 0.45rem;
 
39
  .f1-a { color: #2563eb; text-decoration: none; font-weight: 500; }
40
  .f1-a:hover { text-decoration: underline; }
41
 
42
+ /* Blockquote, captions, problem name */
43
+ .f1-blockquote { border-left: 4px solid #d1d5db; padding-left: 1rem; font-style: italic; color: #4b5563; }
44
+ .f1-figcaption { margin-top: 0.5rem; font-size: 0.875rem; color: #111827; text-align: center; } /* centered + very dark */
45
  .f1-problem-name { font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace; font-weight: 600; text-align: center; }
46
 
47
+ /* ===== Clean "table" using a single grid (equal column widths across rows) ===== */
48
  .f1-grid-wrap { text-align: center; margin: 10px auto 8px auto; }
49
  .f1-grid-table {
50
+ display: grid; /* one grid for all rows */
51
+ grid-template-columns: 1fr 1fr 1fr; /* equal column widths */
52
+ border: 1px solid var(--f1-border); /* full border */
 
 
53
  background: var(--f1-bg);
54
  border-radius: 8px;
55
  overflow: hidden;
56
  }
57
+ .f1-grid-row { display: contents; } /* rows don’t reset column widths */
58
+ .f1-grid-cell {
59
+ padding: 10px 14px;
60
+ text-align: left;
61
+ border-left: 1px solid var(--f1-border);
62
+ border-top: 1px solid var(--f1-border);
63
+ }
64
+ .f1-grid-cell:nth-child(3n+1) { border-left: none; } /* first col cells: no left border */
65
+ .f1-grid-head .f1-grid-cell {
66
+ font-weight: 600;
67
+ text-align: center;
68
+ }
69
+ .f1-grid-head .f1-grid-cell { border-top: none; } /* top border only on body rows */
70
 
71
+ /* ===== Examples card: background, centered pill heading, pretty pills ===== */
72
  #f1-examples {
73
+ background: var(--f1-bg-muted); /* #f9fafb */
74
  border: 1px solid var(--f1-border);
75
  border-radius: 10px;
76
  box-shadow: 0 1px 2px rgba(0,0,0,0.04);
77
  margin-bottom: 12px;
78
  }
79
+ .f1-tabs-body { padding-top: 12px; text-align: center; }
80
+ .f1-examples-chip {
81
+ display: inline-block;
82
+ background: #e5e7eb; /* slightly darker gray pill */
83
+ color: #111827;
84
+ padding: 6px 12px;
85
+ border-radius: 999px;
86
  font-weight: 700;
 
 
 
 
87
  }
88
 
89
  /* Problem content: consistent background + padding */
90
  #f1-examples .f1-problem-markdown .markdown {
91
+ background: var(--f1-bg-muted);
92
  border: 1px solid var(--f1-border);
93
  border-radius: 8px;
94
+ padding: 18px;
95
+ margin: 10px 14px 8px 14px;
96
  }
97
 
98
+ /* Bottom "tabs" using Radio -> show only pills (hide inputs) */
99
  #f1-example-radio { border-top: 1px solid var(--f1-border); padding: 8px 10px 10px 10px; margin: 0 8px 8px; }
100
+ #f1-example-radio input[type="radio"] { display: none; }
101
  #f1-example-radio .wrap { display: flex; gap: 8px; flex-wrap: wrap; justify-content: flex-start; }
102
+ #f1-example-radio label {
 
 
 
103
  border: 1px solid var(--f1-border);
104
  border-radius: 999px;
105
  padding: 6px 12px;
106
+ cursor: pointer;
107
+ background: #f3f4f6; /* light gray for inactive pills */
108
+ }
109
+ #f1-example-radio input[type="radio"]:checked + span {
110
+ background: #e5e7eb; /* selected pill slightly darker */
111
+ border-color: var(--f1-border);
112
+ border-radius: 999px;
113
+ padding: 6px 12px;
114
  }
115
 
116
  /* Leaderboard: center the whole tab and apply requested nesting/min-width rule with .column/.row */
117
+ #formulaone-leaderboard-tab-table { max-width: 1200px; margin-left: auto; margin-right: auto; }
118
+ #formulaone-leaderboard-tab-table .column .row .column { min-width: 80% !important; } /* exact chain rule */
119
  #formulaone-leaderboard-tab-table .row, #formulaone-leaderboard-tab-table .column { width: 100% !important; max-width: 100% !important; }
120
  #formulaone-leaderboard-tab-table [data-testid="dropdown"], #formulaone-leaderboard-tab-table input[type="text"] { width: 100% !important; }
121