Ashokdll committed on
Commit
8fea07e
·
verified ·
1 Parent(s): 2d5d543

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +98 -25
app.py CHANGED
@@ -809,8 +809,10 @@ def create_gaia_app():
809
  outputs=[download_output]
810
  )
811
 
 
 
812
  # ===============================
813
- # TAB 4: FULL BENCHMARK (NEW)
814
  # ===============================
815
  with gr.Tab("🏆 Full Benchmark"):
816
  gr.Markdown("## Official GAIA Leaderboard Benchmark")
@@ -826,20 +828,69 @@ def create_gaia_app():
826
  value="Click above to preview official test questions"
827
  )
828
 
829
- # Full benchmark
830
- gr.Markdown("### 🚀 Run Complete Benchmark")
831
- gr.Markdown("""
832
- **Warning**: This will evaluate your model on all ~300 official GAIA test questions.
833
- This process may take 1-3 hours depending on your model and hardware.
834
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
835
 
836
- full_benchmark_btn = gr.Button(
837
- "🏆 Start Full Benchmark (300 Questions)",
838
- variant="primary",
839
- size="lg"
840
  )
841
 
842
- # Benchmark results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
843
  benchmark_status = gr.Textbox(
844
  label="📊 Benchmark Status",
845
  value="Ready to run benchmark",
@@ -875,12 +926,26 @@ def create_gaia_app():
875
 
876
  # Event handlers
877
  test_preview_btn.click(
878
- fn=load_test_questions_interface,
879
  outputs=[test_preview_output]
880
  )
881
 
882
- def full_benchmark_with_files(*args):
883
- status, report, sub_file, meta_file = run_leaderboard_benchmark_interface(*args)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
884
  return (
885
  status,
886
  report,
@@ -890,18 +955,26 @@ def create_gaia_app():
890
  gr.update(visible=True) # Show metadata file
891
  )
892
 
 
 
 
 
 
 
 
 
 
 
 
 
893
  full_benchmark_btn.click(
894
- fn=full_benchmark_with_files,
895
- outputs=[
896
- benchmark_status,
897
- benchmark_report,
898
- submission_file,
899
- metadata_file,
900
- submission_file, # Update visibility
901
- metadata_file # Update visibility
902
- ]
903
  )
904
-
905
  # ===============================
906
  # TAB 5: INFORMATION (UPDATED)
907
  # ===============================
 
809
  outputs=[download_output]
810
  )
811
 
812
+ # Add this to your Full Benchmark tab in app.py
813
+
814
  # ===============================
815
+ # TAB 4: FULL BENCHMARK (UPDATED FOR 300 QUESTIONS)
816
  # ===============================
817
  with gr.Tab("🏆 Full Benchmark"):
818
  gr.Markdown("## Official GAIA Leaderboard Benchmark")
 
828
  value="Click above to preview official test questions"
829
  )
830
 
831
+ # Dataset structure preview (NEW)
832
+ dataset_structure_btn = gr.Button("📁 Preview Dataset Structure", variant="secondary")
833
+ dataset_structure_output = gr.Markdown(
834
+ value="Click above to see actual GAIA dataset structure"
835
+ )
836
+
837
+ # Benchmark Configuration Section (NEW)
838
+ gr.Markdown("### 🎛️ Benchmark Configuration")
839
+
840
+ with gr.Row():
841
+ with gr.Column():
842
+ # Question count selection
843
+ question_count = gr.Slider(
844
+ minimum=10,
845
+ maximum=300,
846
+ value=50,
847
+ step=10,
848
+ label="Number of Questions",
849
+ info="Choose how many questions to evaluate (300 = full benchmark)"
850
+ )
851
+
852
+ # Selection strategy
853
+ selection_strategy = gr.Dropdown(
854
+ choices=["balanced", "random", "sequential"],
855
+ value="balanced",
856
+ label="Question Selection Strategy",
857
+ info="Balanced recommended for representative evaluation"
858
+ )
859
 
860
+ with gr.Column():
861
+ # Configuration info
862
+ config_info = gr.Markdown(
863
+ value=get_question_selection_info()
864
  )
865
 
866
+ # Benchmark execution
867
+ gr.Markdown("### 🚀 Run Benchmark")
868
+
869
+ with gr.Row():
870
+ # Custom benchmark button
871
+ custom_benchmark_btn = gr.Button(
872
+ "🎯 Start Custom Benchmark",
873
+ variant="primary",
874
+ size="lg"
875
+ )
876
+
877
+ # Full 300-question benchmark button
878
+ full_benchmark_btn = gr.Button(
879
+ "🏆 Start FULL 300-Question Benchmark",
880
+ variant="secondary",
881
+ size="lg"
882
+ )
883
+
884
+ # Warning message for full benchmark
885
+ gr.Markdown("""
886
+ **⚠️ Full 300-Question Benchmark Warning**:
887
+ - **Time**: 1-3 hours depending on model and hardware
888
+ - **Cost**: ~$1-3 on GPU (T4 Small recommended)
889
+ - **Purpose**: Official leaderboard submission
890
+ - **Recommendation**: Test with smaller batches first
891
+ """)
892
+
893
+ # Results section
894
  benchmark_status = gr.Textbox(
895
  label="📊 Benchmark Status",
896
  value="Ready to run benchmark",
 
926
 
927
  # Event handlers
928
  test_preview_btn.click(
929
+ fn=lambda: load_test_questions_interface(max_questions=10, selection_type="balanced"),
930
  outputs=[test_preview_output]
931
  )
932
 
933
+ # NEW: Dataset structure preview
934
+ dataset_structure_btn.click(
935
+ fn=preview_dataset_structure_interface,
936
+ outputs=[dataset_structure_output]
937
+ )
938
+
939
+ # Custom benchmark with user settings
940
+ def run_custom_benchmark_with_settings(num_questions, strategy, progress=gr.Progress()):
941
+ return run_custom_benchmark_interface(num_questions, strategy, progress)
942
+
943
+ # Full 300-question benchmark
944
+ def run_full_300_benchmark(progress=gr.Progress()):
945
+ return run_custom_benchmark_interface(300, "balanced", progress)
946
+
947
+ def benchmark_with_files(*args):
948
+ status, report, sub_file, meta_file = args[0], args[1], args[2], args[3]
949
  return (
950
  status,
951
  report,
 
955
  gr.update(visible=True) # Show metadata file
956
  )
957
 
958
+ # Custom benchmark event
959
+ custom_benchmark_btn.click(
960
+ fn=run_custom_benchmark_with_settings,
961
+ inputs=[question_count, selection_strategy],
962
+ outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
963
+ ).then(
964
+ fn=benchmark_with_files,
965
+ inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
966
+ outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
967
+ )
968
+
969
+ # Full 300-question benchmark event
970
  full_benchmark_btn.click(
971
+ fn=run_full_300_benchmark,
972
+ outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
973
+ ).then(
974
+ fn=benchmark_with_files,
975
+ inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
976
+ outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
 
 
 
977
  )
 
978
  # ===============================
979
  # TAB 5: INFORMATION (UPDATED)
980
  # ===============================