galb-dai committed
Commit 985eb9a · 1 Parent(s): ba1146a

Update tiers.

app.py CHANGED
@@ -203,9 +203,9 @@ def get_theme():
 # --- Gradio-based tabs for examples (no JS in HTML) ---
 def _select_example_tab(choice: str):
     return (
-        gr.update(visible=(choice == "Warmup")),
-        gr.update(visible=(choice == "Tier 1")),
-        gr.update(visible=(choice == "Tier 2")),
+        gr.update(visible=(choice == "Shallow")),
+        gr.update(visible=(choice == "Deeper")),
+        gr.update(visible=(choice == "Deepest")),
     )
 
 
@@ -219,25 +219,25 @@ MODEL_RELEASES = {
     "o3 Pro": "2025-06-10",
 }
 
-TIER_TOTALS = {"Warmup": 100, "Tier 1": 100, "Tier 2": 20}
+TIER_TOTALS = {"Shallow Tier": 100, "Deeper Tier": 100, "Deepest Tier": 20}
 MODELS_ORDER = ["GPT-5", "Gemini 2.5 Pro", "Grok 4", "Claude Opus 4", "o3 Pro"]
 
 ACCURACY_PCT = {
-    "Warmup": {
+    "Shallow Tier": {
         "GPT-5": 49,
         "Gemini 2.5 Pro": 30,
         "Grok 4": 28,
         "Claude Opus 4": 30,
         "o3 Pro": 24,
     },
-    "Tier 1": {
+    "Deeper Tier": {
         "GPT-5": 4,
         "Gemini 2.5 Pro": 0,
         "Grok 4": 0,
         "Claude Opus 4": 0,
         "o3 Pro": 0,
     },
-    "Tier 2": {
+    "Deepest Tier": {
         "GPT-5": 0,
         "Gemini 2.5 Pro": 0,
         "Grok 4": 0,
@@ -301,7 +301,7 @@ def build_accuracy_figure(tier: str):
     return fig
 
 
-_initial_accuracy_fig = build_accuracy_figure("Tier 1")
+_initial_accuracy_fig = build_accuracy_figure("Deeper Tier")
 
 # Force light theme even if HF user prefers dark
 blocks = gr.Blocks(
@@ -369,7 +369,7 @@ with blocks:
     with gr.Row(elem_id="f1-tier-select-row"):
         tier_selector = gr.Radio(
             choices=list(TIER_TOTALS.keys()),
-            value="Tier 1",
+            value="Deeper Tier",
             label=None,
            show_label=False,
             elem_id="f1-tier-select",
@@ -461,8 +461,8 @@ with blocks:
     )
 
     tab_radio = gr.Radio(
-        choices=["Warmup", "Tier 1", "Tier 2"],
-        value="Warmup",
+        choices=["Shallow", "Deeper", "Deepest"],
+        value="Shallow",
         label=None,
         show_label=False,
         elem_id="f1-example-radio",
@@ -492,7 +492,7 @@ with blocks:
     # Evaluation: Warmup figure
     gr.HTML(WHAT_IS_F1_HTML_EVAL_BEFORE_WARMUPFIG, padding=False)
     gr.Image(
-        "assets/warmup_performance.png",
+        "assets/shallow_tier_performance.png",
         width=600,
         show_label=False,
         elem_classes=["f1-image"],
@@ -500,14 +500,16 @@ with blocks:
         show_download_button=False,
         show_fullscreen_button=False,
     )
-    gr.HTML('<div class="f1-figcaption">Performance of frontier models on the FormulaOne-Warmup dataset.</div>')
+    gr.HTML(
+        '<div class="f1-figcaption">Performance of frontier models on the FormulaOne-Shallow ("warmup") dataset.</div>'
+    )
 
     # Between warmup and tier1 figs
     gr.HTML(WHAT_IS_F1_HTML_AFTER_WARMUPFIG)
 
-    # Tier 1 figure with corrected caption text
+    # Deeper tier figure with corrected caption text
     gr.Image(
-        "assets/tier1_performance.png",
+        "assets/deeper_tier_performance.png",
         width=600,
         show_label=False,
         elem_classes=["f1-image"],
@@ -516,10 +518,10 @@
         show_fullscreen_button=False,
     )
     gr.HTML(
-        '<div class="f1-figcaption">Performance of frontier reasoning models on Tier 1 of FormulaOne.</div>'
+        '<div class="f1-figcaption">Performance of frontier reasoning models on the Deeper Tier of FormulaOne.</div>'
    )
 
-    # Tail after Tier 1 fig
+    # Tail after Deeper Tier fig
     gr.HTML(WHAT_IS_F1_HTML_AFTER_TIER1FIG_TAIL)
 
     # Rename tab to "Leaderboard" and cap at 800px width
assets/{tier1_performance.png → deeper_tier_performance.png} RENAMED
File without changes
assets/{warmup_performance.png → shallow_tier_performance.png} RENAMED
File without changes
src/about.py CHANGED
@@ -23,17 +23,17 @@ WHAT_IS_F1_HTML_TOP = f"""
     <div class="f1-grid-cell" role="columnheader">Description</div>
   </div>
   <div class="f1-grid-row" role="row">
-    <div class="f1-grid-cell" role="cell">Warmup</div>
+    <div class="f1-grid-cell" role="cell">Shallow</div>
     <div class="f1-grid-cell" role="cell">100</div>
     <div class="f1-grid-cell" role="cell">A set of “easier” problems.</div>
   </div>
   <div class="f1-grid-row" role="row">
-    <div class="f1-grid-cell" role="cell">Tier 1</div>
+    <div class="f1-grid-cell" role="cell">Deeper</div>
     <div class="f1-grid-cell" role="cell">100</div>
     <div class="f1-grid-cell" role="cell">A set of challenging problems.</div>
   </div>
   <div class="f1-grid-row" role="row">
-    <div class="f1-grid-cell" role="cell">Tier 2</div>
+    <div class="f1-grid-cell" role="cell">Deepest</div>
     <div class="f1-grid-cell" role="cell">20</div>
     <div class="f1-grid-cell" role="cell">A set of highly challenging problems.</div>
   </div>
@@ -83,7 +83,7 @@ WHAT_IS_F1_HTML_AFTER_VIDEO = """
 <li class="f1-li"><strong>Efficiency:</strong> The solution must be truly <a href="https://en.wikipedia.org/wiki/Parameterized_complexity" target="_blank" rel="noopener noreferrer" class="f1-a">fixed-parameter linear</a>.</li>
 </ul>
 <p class="mb-4 f1-p">To support research and encourage community contributions, the <code>FormulaOne-Warmup</code> dataset is released as a public resource for training and fine-tuning models. The complete test suite for all 100 Warmup problems is available, alongside a standalone evaluation environment, in our <a href="https://github.com/double-ai/formulaone-dataset/tree/main" target="_blank" rel="noopener noreferrer" class="f1-a">GitHub repository</a>.</p>
-<p class="f1-p">To maintain the integrity of the core benchmark, only a minimal subset of tests is released for the Tier 1 and Tier 2 problems. Solutions submitted for evaluation on our benchmark are evaluated against a withheld comprehensive test-suite.</p>
+<p class="f1-p">To maintain the integrity of the core benchmark, only a minimal subset of tests is released for the Deeper and Deepest Tier problems. Solutions submitted for evaluation on our benchmark are evaluated against a withheld comprehensive test-suite.</p>
 """
 
 # Evaluation: begins the "Model Accuracy" subsection and the Warmup paragraph, up to (but not including) the Warmup figure.
@@ -93,13 +93,13 @@ WHAT_IS_F1_HTML_EVAL_BEFORE_WARMUPFIG = """
 <!-- warmup_performance figure inserted via gr.Image in app.py -->
 """
 
-# Between Warmup and Tier 1 figures
+# Between Shallow and Deeper figures
 WHAT_IS_F1_HTML_AFTER_WARMUPFIG = """
-<p class="mb-4 f1-p">However, as the reasoning depth increases in <strong>Tier 1</strong>, and solutions require the discovery and integration of novel and more complex state representations, model performance drops off sharply.</p>
+<p class="mb-4 f1-p">However, as the reasoning depth increases in the <strong>Deeper</strong> tier, and solutions require the discovery and integration of novel and more complex state representations, model performance drops off sharply.</p>
 <!-- tier1_performance figure inserted via gr.Image in app.py -->
 """
 
-# Tail after Tier 1 figure (closes evaluation section + container)
+# Tail after Deeper figure (closes evaluation section + container)
 WHAT_IS_F1_HTML_AFTER_TIER1FIG_TAIL = """
 <p class="f1-p">This trend culminates in <strong>Tier 2</strong>, where the difficulty is characteristic of exploratory research problems. On this set of 20 problems, no current frontier model solves even a single one. This result starkly illustrates the gap that remains between high performance on existing benchmarks and the deep algorithmic reasoning required for truly complex problems.</p>
 </section>
src/display/utils.py CHANGED
@@ -24,8 +24,8 @@ class AutoEvalColumn:
     system = ColumnContent("System Name", "markdown", True, never_hidden=True)
     organization = ColumnContent("Organization", "str", True, never_hidden=True)
     success_rate_overall = ColumnContent("Overall Success (%)", "number", True)
-    success_rate_tier1 = ColumnContent("Tier 1 Success (%)", "number", True)
-    success_rate_tier2 = ColumnContent("Tier 2 Success (%)", "number", True)
+    success_rate_tier1 = ColumnContent("Deeper Tier Success (%)", "number", True)
+    success_rate_tier2 = ColumnContent("Deepest Tier Success (%)", "number", True)
     submitted_on = ColumnContent("Submitted On", "datetime", True)
 
 