Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -809,8 +809,10 @@ def create_gaia_app():
|
|
809 |
outputs=[download_output]
|
810 |
)
|
811 |
|
|
|
|
|
812 |
# ===============================
|
813 |
-
# TAB 4: FULL BENCHMARK (
|
814 |
# ===============================
|
815 |
with gr.Tab("🏆 Full Benchmark"):
|
816 |
gr.Markdown("## Official GAIA Leaderboard Benchmark")
|
@@ -826,20 +828,69 @@ def create_gaia_app():
|
|
826 |
value="Click above to preview official test questions"
|
827 |
)
|
828 |
|
829 |
-
#
|
830 |
-
gr.
|
831 |
-
gr.Markdown(
|
832 |
-
|
833 |
-
|
834 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
835 |
|
836 |
-
|
837 |
-
|
838 |
-
|
839 |
-
|
840 |
)
|
841 |
|
842 |
-
# Benchmark
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
843 |
benchmark_status = gr.Textbox(
|
844 |
label="📊 Benchmark Status",
|
845 |
value="Ready to run benchmark",
|
@@ -875,12 +926,26 @@ def create_gaia_app():
|
|
875 |
|
876 |
# Event handlers
|
877 |
test_preview_btn.click(
|
878 |
-
fn=load_test_questions_interface,
|
879 |
outputs=[test_preview_output]
|
880 |
)
|
881 |
|
882 |
-
|
883 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
884 |
return (
|
885 |
status,
|
886 |
report,
|
@@ -890,18 +955,26 @@ def create_gaia_app():
|
|
890 |
gr.update(visible=True) # Show metadata file
|
891 |
)
|
892 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
893 |
full_benchmark_btn.click(
|
894 |
-
fn=
|
895 |
-
outputs=[
|
896 |
-
|
897 |
-
|
898 |
-
|
899 |
-
|
900 |
-
submission_file, # Update visibility
|
901 |
-
metadata_file # Update visibility
|
902 |
-
]
|
903 |
)
|
904 |
-
|
905 |
# ===============================
|
906 |
# TAB 5: INFORMATION (UPDATED)
|
907 |
# ===============================
|
|
|
809 |
outputs=[download_output]
|
810 |
)
|
811 |
|
812 |
+
# Full Benchmark tab: dataset preview, benchmark configuration, and run controls
|
813 |
+
|
814 |
# ===============================
|
815 |
+
# TAB 4: FULL BENCHMARK (UPDATED FOR 300 QUESTIONS)
|
816 |
# ===============================
|
817 |
with gr.Tab("🏆 Full Benchmark"):
|
818 |
gr.Markdown("## Official GAIA Leaderboard Benchmark")
|
|
|
828 |
value="Click above to preview official test questions"
|
829 |
)
|
830 |
|
831 |
+
# Dataset structure preview (NEW)
|
832 |
+
dataset_structure_btn = gr.Button("📁 Preview Dataset Structure", variant="secondary")
|
833 |
+
dataset_structure_output = gr.Markdown(
|
834 |
+
value="Click above to see actual GAIA dataset structure"
|
835 |
+
)
|
836 |
+
|
837 |
+
# Benchmark Configuration Section (NEW)
|
838 |
+
gr.Markdown("### 🎛️ Benchmark Configuration")
|
839 |
+
|
840 |
+
with gr.Row():
|
841 |
+
with gr.Column():
|
842 |
+
# Question count selection
|
843 |
+
question_count = gr.Slider(
|
844 |
+
minimum=10,
|
845 |
+
maximum=300,
|
846 |
+
value=50,
|
847 |
+
step=10,
|
848 |
+
label="Number of Questions",
|
849 |
+
info="Choose how many questions to evaluate (300 = full benchmark)"
|
850 |
+
)
|
851 |
+
|
852 |
+
# Selection strategy
|
853 |
+
selection_strategy = gr.Dropdown(
|
854 |
+
choices=["balanced", "random", "sequential"],
|
855 |
+
value="balanced",
|
856 |
+
label="Question Selection Strategy",
|
857 |
+
info="Balanced recommended for representative evaluation"
|
858 |
+
)
|
859 |
|
860 |
+
with gr.Column():
|
861 |
+
# Configuration info
|
862 |
+
config_info = gr.Markdown(
|
863 |
+
value=get_question_selection_info()
|
864 |
)
|
865 |
|
866 |
+
# Benchmark execution
|
867 |
+
gr.Markdown("### 🚀 Run Benchmark")
|
868 |
+
|
869 |
+
with gr.Row():
|
870 |
+
# Custom benchmark button
|
871 |
+
custom_benchmark_btn = gr.Button(
|
872 |
+
"🎯 Start Custom Benchmark",
|
873 |
+
variant="primary",
|
874 |
+
size="lg"
|
875 |
+
)
|
876 |
+
|
877 |
+
# Full 300-question benchmark button
|
878 |
+
full_benchmark_btn = gr.Button(
|
879 |
+
"🏆 Start FULL 300-Question Benchmark",
|
880 |
+
variant="secondary",
|
881 |
+
size="lg"
|
882 |
+
)
|
883 |
+
|
884 |
+
# Warning message for full benchmark
|
885 |
+
gr.Markdown("""
|
886 |
+
**⚠️ Full 300-Question Benchmark Warning**:
|
887 |
+
- **Time**: 1-3 hours depending on model and hardware
|
888 |
+
- **Cost**: ~$1-3 on GPU (T4 Small recommended)
|
889 |
+
- **Purpose**: Official leaderboard submission
|
890 |
+
- **Recommendation**: Test with smaller batches first
|
891 |
+
""")
|
892 |
+
|
893 |
+
# Results section
|
894 |
benchmark_status = gr.Textbox(
|
895 |
label="📊 Benchmark Status",
|
896 |
value="Ready to run benchmark",
|
|
|
926 |
|
927 |
# Event handlers
|
928 |
test_preview_btn.click(
|
929 |
+
fn=lambda: load_test_questions_interface(max_questions=10, selection_type="balanced"),
|
930 |
outputs=[test_preview_output]
|
931 |
)
|
932 |
|
933 |
+
# NEW: Dataset structure preview
|
934 |
+
dataset_structure_btn.click(
|
935 |
+
fn=preview_dataset_structure_interface,
|
936 |
+
outputs=[dataset_structure_output]
|
937 |
+
)
|
938 |
+
|
939 |
+
# Custom benchmark with user settings
|
940 |
+
def run_custom_benchmark_with_settings(num_questions, strategy, progress=gr.Progress()):
    """Run the benchmark with the question count and strategy chosen in the UI.

    Args:
        num_questions: Value of the "Number of Questions" slider. Gradio
            documents slider values as floats, so it is coerced to ``int``
            before being forwarded.
        strategy: Value of the "Question Selection Strategy" dropdown.
        progress: Gradio progress tracker, forwarded unchanged.

    Returns:
        Whatever ``run_custom_benchmark_interface`` returns; the click handler
        routes it to the status, report, submission-file and metadata-file
        outputs.
    """
    # A question count is a whole number; normalise e.g. 50.0 -> 50 so the
    # downstream selection logic never receives a float count.
    return run_custom_benchmark_interface(int(num_questions), strategy, progress)
|
942 |
+
|
943 |
+
# Full 300-question benchmark
|
944 |
+
def run_full_300_benchmark(progress=gr.Progress()):
    """Run the full benchmark: 300 questions with the "balanced" strategy.

    Args:
        progress: Gradio progress tracker, forwarded unchanged.

    Returns:
        Whatever ``run_custom_benchmark_interface`` returns for these fixed
        settings.
    """
    total_questions = 300
    selection = "balanced"
    return run_custom_benchmark_interface(total_questions, selection, progress)
|
946 |
+
|
947 |
+
def benchmark_with_files(*args):
|
948 |
+
status, report, sub_file, meta_file = args[0], args[1], args[2], args[3]
|
949 |
return (
|
950 |
status,
|
951 |
report,
|
|
|
955 |
gr.update(visible=True) # Show metadata file
|
956 |
)
|
957 |
|
958 |
+
# Custom benchmark event
|
959 |
+
custom_benchmark_btn.click(
|
960 |
+
fn=run_custom_benchmark_with_settings,
|
961 |
+
inputs=[question_count, selection_strategy],
|
962 |
+
outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
|
963 |
+
).then(
|
964 |
+
fn=benchmark_with_files,
|
965 |
+
inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
|
966 |
+
outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
|
967 |
+
)
|
968 |
+
|
969 |
+
# Full 300-question benchmark event
|
970 |
full_benchmark_btn.click(
|
971 |
+
fn=run_full_300_benchmark,
|
972 |
+
outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
|
973 |
+
).then(
|
974 |
+
fn=benchmark_with_files,
|
975 |
+
inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
|
976 |
+
outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
|
|
|
|
|
|
|
977 |
)
|
|
|
978 |
# ===============================
|
979 |
# TAB 5: INFORMATION (UPDATED)
|
980 |
# ===============================
|