Spaces:

Ashokdll
/

agent_unit4

Running

App Files Community

Ashokdll committed on Jun 4

Commit

8fea07e

verified ·

1 Parent(s): 2d5d543

Update app.py

Browse files

Files changed (1) hide show

app.py +98 -25

app.py CHANGED Viewed

@@ -809,8 +809,10 @@ def create_gaia_app():
                     outputs=[download_output]
                 )
             # ===============================
-            # TAB 4: FULL BENCHMARK (NEW)
             # ===============================
             with gr.Tab("🏆 Full Benchmark"):
                 gr.Markdown("## Official GAIA Leaderboard Benchmark")
@@ -826,20 +828,69 @@ def create_gaia_app():
                             value="Click above to preview official test questions"
                         )
-                        # Full benchmark
-                        gr.Markdown("### 🚀 Run Complete Benchmark")
-                        gr.Markdown("""
-                        **Warning**: This will evaluate your model on all ~300 official GAIA test questions.
-                        This process may take 1-3 hours depending on your model and hardware.
-                        """)
-                        full_benchmark_btn = gr.Button(
-                            "🏆 Start Full Benchmark (300 Questions)",
-                            variant="primary",
-                            size="lg"
                         )
-                # Benchmark results
                 benchmark_status = gr.Textbox(
                     label="📊 Benchmark Status",
                     value="Ready to run benchmark",
@@ -875,12 +926,26 @@ def create_gaia_app():
                 # Event handlers
                 test_preview_btn.click(
-                    fn=load_test_questions_interface,
                     outputs=[test_preview_output]
                 )
-                def full_benchmark_with_files(*args):
-                    status, report, sub_file, meta_file = run_leaderboard_benchmark_interface(*args)
                     return (
                         status,
                         report,
@@ -890,18 +955,26 @@ def create_gaia_app():
                         gr.update(visible=True)   # Show metadata file
                     )
                 full_benchmark_btn.click(
-                    fn=full_benchmark_with_files,
-                    outputs=[
-                        benchmark_status,
-                        benchmark_report,
-                        submission_file,
-                        metadata_file,
-                        submission_file,  # Update visibility
-                        metadata_file     # Update visibility
-                    ]
                 )
             # ===============================
             # TAB 5: INFORMATION (UPDATED)
             # ===============================

                     outputs=[download_output]
                 )
+            # Add this to your Full Benchmark tab in app.py
             # ===============================
+            # TAB 4: FULL BENCHMARK (UPDATED FOR 300 QUESTIONS)
             # ===============================
             with gr.Tab("🏆 Full Benchmark"):
                 gr.Markdown("## Official GAIA Leaderboard Benchmark")
                             value="Click above to preview official test questions"
                         )
+                        # Dataset structure preview (NEW)
+                        dataset_structure_btn = gr.Button("📁 Preview Dataset Structure", variant="secondary")
+                        dataset_structure_output = gr.Markdown(
+                            value="Click above to see actual GAIA dataset structure"
+                        )
+                # Benchmark Configuration Section (NEW)
+                gr.Markdown("### 🎛️ Benchmark Configuration")
+                with gr.Row():
+                    with gr.Column():
+                        # Question count selection
+                        question_count = gr.Slider(
+                            minimum=10,
+                            maximum=300,
+                            value=50,
+                            step=10,
+                            label="Number of Questions",
+                            info="Choose how many questions to evaluate (300 = full benchmark)"
+                        )
+                        # Selection strategy
+                        selection_strategy = gr.Dropdown(
+                            choices=["balanced", "random", "sequential"],
+                            value="balanced",
+                            label="Question Selection Strategy",
+                            info="Balanced recommended for representative evaluation"
+                        )
+                    with gr.Column():
+                        # Configuration info
+                        config_info = gr.Markdown(
+                            value=get_question_selection_info()
                         )
+                # Benchmark execution
+                gr.Markdown("### 🚀 Run Benchmark")
+                with gr.Row():
+                    # Custom benchmark button
+                    custom_benchmark_btn = gr.Button(
+                        "🎯 Start Custom Benchmark",
+                        variant="primary",
+                        size="lg"
+                    )
+                    # Full 300-question benchmark button
+                    full_benchmark_btn = gr.Button(
+                        "🏆 Start FULL 300-Question Benchmark",
+                        variant="secondary",
+                        size="lg"
+                    )
+                # Warning message for full benchmark
+                gr.Markdown("""
+                **⚠️ Full 300-Question Benchmark Warning**:
+                - **Time**: 1-3 hours depending on model and hardware
+                - **Cost**: ~$1-3 on GPU (T4 Small recommended)
+                - **Purpose**: Official leaderboard submission
+                - **Recommendation**: Test with smaller batches first
+                """)
+                # Results section
                 benchmark_status = gr.Textbox(
                     label="📊 Benchmark Status",
                     value="Ready to run benchmark",
                 # Event handlers
                 test_preview_btn.click(
+                    fn=lambda: load_test_questions_interface(max_questions=10, selection_type="balanced"),
                     outputs=[test_preview_output]
                 )
+                # NEW: Dataset structure preview
+                dataset_structure_btn.click(
+                    fn=preview_dataset_structure_interface,
+                    outputs=[dataset_structure_output]
+                )
+                # Click handler for the custom benchmark button: forwards the selected question count, selection strategy, and progress tracker to run_custom_benchmark_interface
+                def run_custom_benchmark_with_settings(num_questions, strategy, progress=gr.Progress()):
+                    return run_custom_benchmark_interface(num_questions, strategy, progress)  # progress goes in positionally as the third argument; the result is returned unchanged
+                # Click handler for the full benchmark button: takes no UI inputs and always requests 300 questions with the "balanced" strategy from run_custom_benchmark_interface
+                def run_full_300_benchmark(progress=gr.Progress()):
+                    return run_custom_benchmark_interface(300, "balanced", progress)  # same helper as the custom run, with fixed count and strategy; the result is returned unchanged
+                def benchmark_with_files(*args):
+                    status, report, sub_file, meta_file = args[0], args[1], args[2], args[3]
                     return (
                         status,
                         report,
                         gr.update(visible=True)   # Show metadata file
                     )
+                # Custom benchmark event
+                custom_benchmark_btn.click(
+                    fn=run_custom_benchmark_with_settings,
+                    inputs=[question_count, selection_strategy],
+                    outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
+                ).then(
+                    fn=benchmark_with_files,
+                    inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
+                    outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
+                )
+                # Full 300-question benchmark event
                 full_benchmark_btn.click(
+                    fn=run_full_300_benchmark,
+                    outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
+                ).then(
+                    fn=benchmark_with_files,
+                    inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
+                    outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
                 )
             # ===============================
             # TAB 5: INFORMATION (UPDATED)
             # ===============================