Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Update tiers.
Browse files
app.py
CHANGED
@@ -203,9 +203,9 @@ def get_theme():
|
|
203 |
# --- Gradio-based tabs for examples (no JS in HTML) ---
|
204 |
def _select_example_tab(choice: str):
|
205 |
return (
|
206 |
-
gr.update(visible=(choice == "
|
207 |
-
gr.update(visible=(choice == "
|
208 |
-
gr.update(visible=(choice == "
|
209 |
)
|
210 |
|
211 |
|
@@ -219,25 +219,25 @@ MODEL_RELEASES = {
|
|
219 |
"o3 Pro": "2025-06-10",
|
220 |
}
|
221 |
|
222 |
-
TIER_TOTALS = {"
|
223 |
MODELS_ORDER = ["GPT-5", "Gemini 2.5 Pro", "Grok 4", "Claude Opus 4", "o3 Pro"]
|
224 |
|
225 |
ACCURACY_PCT = {
|
226 |
-
"
|
227 |
"GPT-5": 49,
|
228 |
"Gemini 2.5 Pro": 30,
|
229 |
"Grok 4": 28,
|
230 |
"Claude Opus 4": 30,
|
231 |
"o3 Pro": 24,
|
232 |
},
|
233 |
-
"Tier
|
234 |
"GPT-5": 4,
|
235 |
"Gemini 2.5 Pro": 0,
|
236 |
"Grok 4": 0,
|
237 |
"Claude Opus 4": 0,
|
238 |
"o3 Pro": 0,
|
239 |
},
|
240 |
-
"Tier
|
241 |
"GPT-5": 0,
|
242 |
"Gemini 2.5 Pro": 0,
|
243 |
"Grok 4": 0,
|
@@ -301,7 +301,7 @@ def build_accuracy_figure(tier: str):
|
|
301 |
return fig
|
302 |
|
303 |
|
304 |
-
_initial_accuracy_fig = build_accuracy_figure("Tier
|
305 |
|
306 |
# Force light theme even if HF user prefers dark
|
307 |
blocks = gr.Blocks(
|
@@ -369,7 +369,7 @@ with blocks:
|
|
369 |
with gr.Row(elem_id="f1-tier-select-row"):
|
370 |
tier_selector = gr.Radio(
|
371 |
choices=list(TIER_TOTALS.keys()),
|
372 |
-
value="Tier
|
373 |
label=None,
|
374 |
show_label=False,
|
375 |
elem_id="f1-tier-select",
|
@@ -461,8 +461,8 @@ with blocks:
|
|
461 |
)
|
462 |
|
463 |
tab_radio = gr.Radio(
|
464 |
-
choices=["
|
465 |
-
value="
|
466 |
label=None,
|
467 |
show_label=False,
|
468 |
elem_id="f1-example-radio",
|
@@ -492,7 +492,7 @@ with blocks:
|
|
492 |
# Evaluation: Warmup figure
|
493 |
gr.HTML(WHAT_IS_F1_HTML_EVAL_BEFORE_WARMUPFIG, padding=False)
|
494 |
gr.Image(
|
495 |
-
"assets/
|
496 |
width=600,
|
497 |
show_label=False,
|
498 |
elem_classes=["f1-image"],
|
@@ -500,14 +500,16 @@ with blocks:
|
|
500 |
show_download_button=False,
|
501 |
show_fullscreen_button=False,
|
502 |
)
|
503 |
-
gr.HTML(
|
|
|
|
|
504 |
|
505 |
# Between warmup and tier1 figs
|
506 |
gr.HTML(WHAT_IS_F1_HTML_AFTER_WARMUPFIG)
|
507 |
|
508 |
-
#
|
509 |
gr.Image(
|
510 |
-
"assets/
|
511 |
width=600,
|
512 |
show_label=False,
|
513 |
elem_classes=["f1-image"],
|
@@ -516,10 +518,10 @@ with blocks:
|
|
516 |
show_fullscreen_button=False,
|
517 |
)
|
518 |
gr.HTML(
|
519 |
-
'<div class="f1-figcaption">Performance of frontier reasoning models on Tier
|
520 |
)
|
521 |
|
522 |
-
# Tail after Tier
|
523 |
gr.HTML(WHAT_IS_F1_HTML_AFTER_TIER1FIG_TAIL)
|
524 |
|
525 |
# Rename tab to "Leaderboard" and cap at 800px width
|
|
|
203 |
# --- Gradio-based tabs for examples (no JS in HTML) ---
|
204 |
def _select_example_tab(choice: str):
|
205 |
return (
|
206 |
+
gr.update(visible=(choice == "Shallow")),
|
207 |
+
gr.update(visible=(choice == "Deeper")),
|
208 |
+
gr.update(visible=(choice == "Deepest")),
|
209 |
)
|
210 |
|
211 |
|
|
|
219 |
"o3 Pro": "2025-06-10",
|
220 |
}
|
221 |
|
222 |
+
TIER_TOTALS = {"Shallow Tier": 100, "Deeper Tier": 100, "Deepest Tier": 20}
|
223 |
MODELS_ORDER = ["GPT-5", "Gemini 2.5 Pro", "Grok 4", "Claude Opus 4", "o3 Pro"]
|
224 |
|
225 |
ACCURACY_PCT = {
|
226 |
+
"Shallow Tier": {
|
227 |
"GPT-5": 49,
|
228 |
"Gemini 2.5 Pro": 30,
|
229 |
"Grok 4": 28,
|
230 |
"Claude Opus 4": 30,
|
231 |
"o3 Pro": 24,
|
232 |
},
|
233 |
+
"Deeper Tier": {
|
234 |
"GPT-5": 4,
|
235 |
"Gemini 2.5 Pro": 0,
|
236 |
"Grok 4": 0,
|
237 |
"Claude Opus 4": 0,
|
238 |
"o3 Pro": 0,
|
239 |
},
|
240 |
+
"Deepest Tier": {
|
241 |
"GPT-5": 0,
|
242 |
"Gemini 2.5 Pro": 0,
|
243 |
"Grok 4": 0,
|
|
|
301 |
return fig
|
302 |
|
303 |
|
304 |
+
_initial_accuracy_fig = build_accuracy_figure("Deeper Tier")
|
305 |
|
306 |
# Force light theme even if HF user prefers dark
|
307 |
blocks = gr.Blocks(
|
|
|
369 |
with gr.Row(elem_id="f1-tier-select-row"):
|
370 |
tier_selector = gr.Radio(
|
371 |
choices=list(TIER_TOTALS.keys()),
|
372 |
+
value="Deeper Tier",
|
373 |
label=None,
|
374 |
show_label=False,
|
375 |
elem_id="f1-tier-select",
|
|
|
461 |
)
|
462 |
|
463 |
tab_radio = gr.Radio(
|
464 |
+
choices=["Shallow", "Deeper", "Deepest"],
|
465 |
+
value="Shallow",
|
466 |
label=None,
|
467 |
show_label=False,
|
468 |
elem_id="f1-example-radio",
|
|
|
492 |
# Evaluation: Warmup figure
|
493 |
gr.HTML(WHAT_IS_F1_HTML_EVAL_BEFORE_WARMUPFIG, padding=False)
|
494 |
gr.Image(
|
495 |
+
"assets/shallow_tier_performance.png",
|
496 |
width=600,
|
497 |
show_label=False,
|
498 |
elem_classes=["f1-image"],
|
|
|
500 |
show_download_button=False,
|
501 |
show_fullscreen_button=False,
|
502 |
)
|
503 |
+
gr.HTML(
|
504 |
+
'<div class="f1-figcaption">Performance of frontier models on the FormulaOne-Shallow ("warmup") dataset.</div>'
|
505 |
+
)
|
506 |
|
507 |
# Between Shallow and Deeper tier figs
|
508 |
gr.HTML(WHAT_IS_F1_HTML_AFTER_WARMUPFIG)
|
509 |
|
510 |
+
# Deeper tier figure with corrected caption text
|
511 |
gr.Image(
|
512 |
+
"assets/deeper_tier_performance.png",
|
513 |
width=600,
|
514 |
show_label=False,
|
515 |
elem_classes=["f1-image"],
|
|
|
518 |
show_fullscreen_button=False,
|
519 |
)
|
520 |
gr.HTML(
|
521 |
+
'<div class="f1-figcaption">Performance of frontier reasoning models on the Deeper Tier of FormulaOne.</div>'
|
522 |
)
|
523 |
|
524 |
+
# Tail after Deeper Tier fig
|
525 |
gr.HTML(WHAT_IS_F1_HTML_AFTER_TIER1FIG_TAIL)
|
526 |
|
527 |
# Rename tab to "Leaderboard" and cap at 800px width
|
assets/{tier1_performance.png → deeper_tier_performance.png}
RENAMED
File without changes
|
assets/{warmup_performance.png → shallow_tier_performance.png}
RENAMED
File without changes
|
src/about.py
CHANGED
@@ -23,17 +23,17 @@ WHAT_IS_F1_HTML_TOP = f"""
|
|
23 |
<div class="f1-grid-cell" role="columnheader">Description</div>
|
24 |
</div>
|
25 |
<div class="f1-grid-row" role="row">
|
26 |
-
<div class="f1-grid-cell" role="cell">
|
27 |
<div class="f1-grid-cell" role="cell">100</div>
|
28 |
<div class="f1-grid-cell" role="cell">A set of “easier” problems.</div>
|
29 |
</div>
|
30 |
<div class="f1-grid-row" role="row">
|
31 |
-
<div class="f1-grid-cell" role="cell">
|
32 |
<div class="f1-grid-cell" role="cell">100</div>
|
33 |
<div class="f1-grid-cell" role="cell">A set of challenging problems.</div>
|
34 |
</div>
|
35 |
<div class="f1-grid-row" role="row">
|
36 |
-
<div class="f1-grid-cell" role="cell">
|
37 |
<div class="f1-grid-cell" role="cell">20</div>
|
38 |
<div class="f1-grid-cell" role="cell">A set of highly challenging problems.</div>
|
39 |
</div>
|
@@ -83,7 +83,7 @@ WHAT_IS_F1_HTML_AFTER_VIDEO = """
|
|
83 |
<li class="f1-li"><strong>Efficiency:</strong> The solution must be truly <a href="https://en.wikipedia.org/wiki/Parameterized_complexity" target="_blank" rel="noopener noreferrer" class="f1-a">fixed-parameter linear</a>.</li>
|
84 |
</ul>
|
85 |
<p class="mb-4 f1-p">To support research and encourage community contributions, the <code>FormulaOne-Warmup</code> dataset is released as a public resource for training and fine-tuning models. The complete test suite for all 100 Warmup problems is available, alongside a standalone evaluation environment, in our <a href="https://github.com/double-ai/formulaone-dataset/tree/main" target="_blank" rel="noopener noreferrer" class="f1-a">GitHub repository</a>.</p>
|
86 |
-
<p class="f1-p">To maintain the integrity of the core benchmark, only a minimal subset of tests is released for the
|
87 |
"""
|
88 |
|
89 |
# Evaluation: begins the "Model Accuracy" subsection and the Warmup paragraph, up to (but not including) the Warmup figure.
|
@@ -93,13 +93,13 @@ WHAT_IS_F1_HTML_EVAL_BEFORE_WARMUPFIG = """
|
|
93 |
<!-- warmup_performance figure inserted via gr.Image in app.py -->
|
94 |
"""
|
95 |
|
96 |
-
# Between
|
97 |
WHAT_IS_F1_HTML_AFTER_WARMUPFIG = """
|
98 |
-
<p class="mb-4 f1-p">However, as the reasoning depth increases in <strong>
|
99 |
<!-- tier1_performance figure inserted via gr.Image in app.py -->
|
100 |
"""
|
101 |
|
102 |
-
# Tail after
|
103 |
WHAT_IS_F1_HTML_AFTER_TIER1FIG_TAIL = """
|
104 |
<p class="f1-p">This trend culminates in <strong>Tier 2</strong>, where the difficulty is characteristic of exploratory research problems. On this set of 20 problems, no current frontier model solves even a single one. This result starkly illustrates the gap that remains between high performance on existing benchmarks and the deep algorithmic reasoning required for truly complex problems.</p>
|
105 |
</section>
|
|
|
23 |
<div class="f1-grid-cell" role="columnheader">Description</div>
|
24 |
</div>
|
25 |
<div class="f1-grid-row" role="row">
|
26 |
+
<div class="f1-grid-cell" role="cell">Shallow</div>
|
27 |
<div class="f1-grid-cell" role="cell">100</div>
|
28 |
<div class="f1-grid-cell" role="cell">A set of “easier” problems.</div>
|
29 |
</div>
|
30 |
<div class="f1-grid-row" role="row">
|
31 |
+
<div class="f1-grid-cell" role="cell">Deeper</div>
|
32 |
<div class="f1-grid-cell" role="cell">100</div>
|
33 |
<div class="f1-grid-cell" role="cell">A set of challenging problems.</div>
|
34 |
</div>
|
35 |
<div class="f1-grid-row" role="row">
|
36 |
+
<div class="f1-grid-cell" role="cell">Deepest</div>
|
37 |
<div class="f1-grid-cell" role="cell">20</div>
|
38 |
<div class="f1-grid-cell" role="cell">A set of highly challenging problems.</div>
|
39 |
</div>
|
|
|
83 |
<li class="f1-li"><strong>Efficiency:</strong> The solution must be truly <a href="https://en.wikipedia.org/wiki/Parameterized_complexity" target="_blank" rel="noopener noreferrer" class="f1-a">fixed-parameter linear</a>.</li>
|
84 |
</ul>
|
85 |
<p class="mb-4 f1-p">To support research and encourage community contributions, the <code>FormulaOne-Warmup</code> dataset is released as a public resource for training and fine-tuning models. The complete test suite for all 100 Warmup problems is available, alongside a standalone evaluation environment, in our <a href="https://github.com/double-ai/formulaone-dataset/tree/main" target="_blank" rel="noopener noreferrer" class="f1-a">GitHub repository</a>.</p>
|
86 |
+
<p class="f1-p">To maintain the integrity of the core benchmark, only a minimal subset of tests is released for the Deeper and Deepest Tier problems. Solutions submitted for evaluation on our benchmark are evaluated against a withheld comprehensive test-suite.</p>
|
87 |
"""
|
88 |
|
89 |
# Evaluation: begins the "Model Accuracy" subsection and the Warmup paragraph, up to (but not including) the Warmup figure.
|
|
|
93 |
<!-- shallow_tier_performance figure inserted via gr.Image in app.py -->
|
94 |
"""
|
95 |
|
96 |
+
# Between Shallow and Deeper figures
|
97 |
WHAT_IS_F1_HTML_AFTER_WARMUPFIG = """
|
98 |
+
<p class="mb-4 f1-p">However, as the reasoning depth increases in the <strong>Deeper</strong> tier, and solutions require the discovery and integration of novel and more complex state representations, model performance drops off sharply.</p>
|
99 |
<!-- deeper_tier_performance figure inserted via gr.Image in app.py -->
|
100 |
"""
|
101 |
|
102 |
+
# Tail after Deeper figure (closes evaluation section + container)
|
103 |
WHAT_IS_F1_HTML_AFTER_TIER1FIG_TAIL = """
|
104 |
<p class="f1-p">This trend culminates in the <strong>Deepest</strong> tier, where the difficulty is characteristic of exploratory research problems. On this set of 20 problems, no current frontier model solves even a single one. This result starkly illustrates the gap that remains between high performance on existing benchmarks and the deep algorithmic reasoning required for truly complex problems.</p>
|
105 |
</section>
|
src/display/utils.py
CHANGED
@@ -24,8 +24,8 @@ class AutoEvalColumn:
|
|
24 |
system = ColumnContent("System Name", "markdown", True, never_hidden=True)
|
25 |
organization = ColumnContent("Organization", "str", True, never_hidden=True)
|
26 |
success_rate_overall = ColumnContent("Overall Success (%)", "number", True)
|
27 |
-
success_rate_tier1 = ColumnContent("Tier
|
28 |
-
success_rate_tier2 = ColumnContent("Tier
|
29 |
submitted_on = ColumnContent("Submitted On", "datetime", True)
|
30 |
|
31 |
|
|
|
24 |
system = ColumnContent("System Name", "markdown", True, never_hidden=True)
|
25 |
organization = ColumnContent("Organization", "str", True, never_hidden=True)
|
26 |
success_rate_overall = ColumnContent("Overall Success (%)", "number", True)
|
27 |
+
success_rate_tier1 = ColumnContent("Deeper Tier Success (%)", "number", True)
|
28 |
+
success_rate_tier2 = ColumnContent("Deepest Tier Success (%)", "number", True)
|
29 |
submitted_on = ColumnContent("Submitted On", "datetime", True)
|
30 |
|
31 |
|