PeterKruger committed on
Commit 3116207 · verified · 1 Parent(s): dfa358f

Update app.py

Files changed (1)
  1. app.py +115 -84
app.py CHANGED
@@ -12,13 +12,17 @@ import sys
 import io
 from contextlib import redirect_stdout, redirect_stderr
 
-# Initialize session state variables
+# Initialize session state variables properly at the very beginning
 if 'main_output' not in st.session_state:
-    st.session_state.main_output = []
+    st.session_state['main_output'] = []
 if 'debug_output' not in st.session_state:
-    st.session_state.debug_output = []
+    st.session_state['debug_output'] = []
 if 'progress' not in st.session_state:
-    st.session_state.progress = 0
+    st.session_state['progress'] = 0
+if 'results_df' not in st.session_state:
+    st.session_state['results_df'] = pd.DataFrame()
+if 'is_running' not in st.session_state:
+    st.session_state['is_running'] = False
 
 # FILES
 iteration_output_file = "llm_benchmark_iteration_results.csv" # File to store iteration results, defined as global
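
The widened initialization block above is what later lets the log tabs and the auto-refresh check read `results_df` and `is_running` safely on every rerun. For readers unfamiliar with the idiom, here is a minimal, self-contained sketch of the same guard-and-default pattern; the `init_state` helper and its default factories are illustrative assumptions, not code from app.py.

```python
import pandas as pd
import streamlit as st

def init_state(defaults: dict) -> None:
    """Create each session_state key only if it is missing (hypothetical helper)."""
    for key, factory in defaults.items():
        if key not in st.session_state:
            # Call the factory so every rerun reuses the same stored object.
            st.session_state[key] = factory()

init_state({
    "main_output": list,          # log lines shown in the Execution Log tab
    "debug_output": list,         # warnings/errors shown in the Debug Log tab
    "progress": lambda: 0.0,      # fraction displayed by st.progress
    "results_df": pd.DataFrame,   # last benchmark results
    "is_running": lambda: False,  # flag polled by the auto-refresh block
})
```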
@@ -55,21 +59,15 @@ def custom_print(*args, **kwargs):
     output = ' '.join(map(str, args))
 
     # Add to main output list
-    st.session_state.main_output.append(output)
+    st.session_state['main_output'].append(output)
 
     # Also print to standard output for console logging
     print(*args, **kwargs)
-
-    # Force an immediate update of the UI (when used inside a function)
-    st.session_state.update_counter = st.session_state.get('update_counter', 0) + 1
 
 # Custom function to capture warnings and errors
 def log_debug(message):
-    st.session_state.debug_output.append(message)
+    st.session_state['debug_output'].append(message)
     print(f"DEBUG: {message}", file=sys.stderr)
-
-    # Force an immediate update of the UI
-    st.session_state.update_counter = st.session_state.get('update_counter', 0) + 1
 
 def retry_api_request(max_retries=3, wait_time=10):
     """Decorator for retrying API requests with rate limit handling."""
@@ -511,6 +509,8 @@ def get_answer_from_model(model_id, question, consecutive_failures, failure_thre
 
 # --- Core Logic ---
 def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
+    st.session_state['is_running'] = True
+
     results = {
         "model_name": [],
         "topic": [],
@@ -561,7 +561,7 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
 
     for iteration in range(t): # Added iteration counter
         # Update progress in the Streamlit app
-        st.session_state.progress = (iteration + 1) / t
+        st.session_state['progress'] = (iteration + 1) / t
 
         if len(active_models) < 2:
             custom_print("Fewer than 2 active models remaining. Exiting benchmark.")
@@ -647,8 +647,11 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
                 question_ranking_futures.append(future)
 
         for future in concurrent.futures.as_completed(question_ranking_futures): # Collect ranks as they become available
-            ranking_model_id, rank = future.result() # Get model_id and rank
-            question_ranks[ranking_model_id] = rank # Store rank with model_id as key
+            try:
+                ranking_model_id, rank = future.result() # Get model_id and rank
+                question_ranks[ranking_model_id] = rank # Store rank with model_id as key
+            except Exception as e:
+                log_debug(f"Error getting question rank result: {e}")
 
         question_ranking_end_time = time.time()
         question_ranking_duration_total = question_ranking_end_time - question_ranking_start_time
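
The change above wraps `future.result()` in `try/except` because `concurrent.futures` re-raises a worker's exception at the `result()` call, so a single failed ranking request would otherwise abort the whole collection loop. A minimal, standalone sketch of the same pattern (the `get_rank` stub and model names are invented for illustration):

```python
import concurrent.futures
import random

def get_rank(model_id: str) -> tuple[str, int]:
    """Stand-in for a ranking API call: may raise, like a real request."""
    if random.random() < 0.2:
        raise RuntimeError(f"simulated API failure for {model_id}")
    return model_id, random.randint(1, 5)

models = ["model-a", "model-b", "model-c"]
ranks: dict[str, int] = {}

with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(get_rank, m) for m in models]
    for future in concurrent.futures.as_completed(futures):
        try:
            model_id, rank = future.result()  # the worker's exception re-raises here
            ranks[model_id] = rank
        except Exception as exc:              # one bad worker no longer kills the loop
            print(f"rank collection failed: {exc}")

print(ranks)
```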
@@ -674,11 +677,11 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
 
         if question_avg_rank >= question_treshold and all(rank > reject_rank for rank in valid_question_ranks_values): # Question acceptance criteria
             question_accepted = True
-            custom_print(f"Question accepted. Avg Question Rank: {question_avg_rank:.2f}, Min Rank: {min_question_rank}, Ranks: {[question_ranks[m] for m in active_models if m in question_ranks]}")
+            custom_print(f"Question accepted. Avg Question Rank: {question_avg_rank:.2f}, Min Rank: {min_question_rank}, Ranks: {[question_ranks.get(m, None) for m in active_models if m in question_ranks]}")
             s_t += 1
         else:
             question_accepted = False
-            custom_print(f"Question rejected. Avg Question Rank: {question_avg_rank:.2f}, Min Rank: {min_question_rank}, Ranks: {[question_ranks[m] for m in active_models if m in question_ranks]}")
+            custom_print(f"Question rejected. Avg Question Rank: {question_avg_rank:.2f}, Min Rank: {min_question_rank}, Ranks: {[question_ranks.get(m, None) for m in active_models if m in question_ranks]}")
 
         if not question_accepted:
             custom_print("Generated question was not accepted. Regenerating question.")
@@ -709,35 +712,43 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
                         token,
                         timeout=60
                     )
-                    answer_futures.append(future)
-                except TimeoutError as e:
-                    log_debug(f"Answer generation for model {model_id} timed out: {e}")
-                    answer = "I am struggling to answer this question" # Treat timeout as error
-                    duration = 120 # You can set a default duration or handle it differently if needed
-                    answers[model_id] = answer # Store error answer
-                    answer_durations[model_id] = duration # Store default duration
-
-        for future in concurrent.futures.as_completed(answer_futures):
-            model_id = active_models[answer_futures.index(future)] # Get model_id based on future index (order is preserved)
-            answer, duration = future.result() # Get both answer and duration - unpack the returned tuple
-            answers[model_id] = answer
-            answer_durations[model_id] = duration # Store duration - store the duration in the answer_durations dictionary
+                    answer_futures.append((model_id, future))
+                except Exception as e:
+                    log_debug(f"Error submitting answer task for {model_id}: {e}")
+                    answer = "Error answering - Task submission failed"
+                    duration = 0
+                    answers[model_id] = answer
+                    answer_durations[model_id] = duration
+
+        for model_id, future in answer_futures:
+            try:
+                answer, duration = future.result() # Get both answer and duration
+                answers[model_id] = answer
+                answer_durations[model_id] = duration
+            except Exception as e:
+                log_debug(f"Error getting answer from {model_id}: {e}")
+                answers[model_id] = "Error answering - Future result failed"
+                answer_durations[model_id] = 0
 
         # --- Ranking Process ---
 
         # Prepare to write to file (open in append mode outside the model loop but inside iteration loop)
-        iteration_results_file_opened = open(iteration_output_file, 'a')
-        if iteration == 0: # Write header only for the first iteration
-            iteration_results_file_opened.write("Iteration, Topic, Difficulty, Question Rank, QR Duration, Model,Cumulative Avg Rank,Iteration Avg Rank,Ranks,Ranking Duration (sec)\n") # Added Ranking Duration to header
+        try:
+            iteration_results_file_opened = open(iteration_output_file, 'a')
+            if iteration == 0: # Write header only for the first iteration
+                iteration_results_file_opened.write("Iteration, Topic, Difficulty, Question Rank, QR Duration, Model,Cumulative Avg Rank,Iteration Avg Rank,Ranks,Ranking Duration (sec)\n") # Added Ranking Duration to header
+        except Exception as e:
+            log_debug(f"Error opening results file: {e}")
+            iteration_results_file_opened = None
 
 
         for model_id in active_models:
-            answer = answers.get(model_id)
-            if not answer: # Add guard clause
+            if model_id not in answers:
                 log_debug(f"No answer found for model {model_id}. Skipping ranking.")
                 continue
 
-            if answer == "Error answering": # Handle answer generation errors
+            answer = answers[model_id]
+            if answer == "Error answering" or answer.startswith("Error answering -"): # Handle answer generation errors
                 consecutive_failures[model_id] += 1
                 if consecutive_failures[model_id] >= failure_threshold:
                     custom_print(f"Model {model_id} is consistently failing to answer. Removing from active models.")
@@ -759,24 +770,30 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
            for ranking_model_id in active_models:
                # --- Filter for ranking roles ("rank" or "both") ---
                if model_config[ranking_model_id].get("role", "both") in ["rank", "both"]:
-                    future = executor.submit(
-                        get_rank_from_model,
-                        ranking_model_id,
-                        question,
-                        answer,
-                        consecutive_failures,
-                        failure_threshold,
-                        unresponsive_models,
-                        model_config,
-                        topic,
-                        token,
-                        timeout=60
-                    )
-                    ranking_futures.append(future)
+                    try:
+                        future = executor.submit(
+                            get_rank_from_model,
+                            ranking_model_id,
+                            question,
+                            answer,
+                            consecutive_failures,
+                            failure_threshold,
+                            unresponsive_models,
+                            model_config,
+                            topic,
+                            token,
+                            timeout=60
+                        )
+                        ranking_futures.append(future)
+                    except Exception as e:
+                        log_debug(f"Error submitting ranking task for {ranking_model_id}: {e}")
 
            for future in concurrent.futures.as_completed(ranking_futures): # Collect ranks as they become available
-                ranking_model_id, rank = future.result() # Get model_id and rank
-                ranks[ranking_model_id] = rank # Store rank with model_id as key
+                try:
+                    ranking_model_id, rank = future.result() # Get model_id and rank
+                    ranks[ranking_model_id] = rank # Store rank with model_id as key
+                except Exception as e:
+                    log_debug(f"Error getting rank result: {e}")
 
            ranking_end_time = time.time() # Record end time of ranking
            ranking_duration = ranking_end_time - ranking_start_time # Calculate duration
@@ -807,20 +824,29 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
            results["answer"].append(answer)
            results["answer_generation_duration"].append(answer_durations.get(model_id, 0))
            results["average_rank"].append(average_rank)
-            results["ranks"].append([ranks[m] for m in active_models if m in ranks]) # Store raw ranks including Nones, ensure order
+            results["ranks"].append([ranks.get(m, None) for m in active_models if m in ranks]) # Store raw ranks including Nones, ensure order
            results["question_rank_average"].append(question_avg_rank) # Store question rank average
-            results["question_ranks"].append([question_ranks[m] for m in active_models if m in question_ranks]) # Store question ranks
+            results["question_ranks"].append([question_ranks.get(m, None) for m in active_models if m in question_ranks]) # Store question ranks
            results["question_rank_duration"].append(question_ranking_duration_total) # Store question ranking duration
 
-            cumulative_model_ranks[model_id].append(average_rank) # Append current iteration's average rank
-            cumulative_avg_rank[model_id] = np.nanmean(cumulative_model_ranks[model_id]) if cumulative_model_ranks[model_id] else np.nan
+            if model_id in cumulative_model_ranks:
+                cumulative_model_ranks[model_id].append(average_rank) # Append current iteration's average rank
+
+            if model_id in cumulative_model_ranks and cumulative_model_ranks[model_id]:
+                cumulative_avg_rank[model_id] = np.nanmean([r for r in cumulative_model_ranks[model_id] if not np.isnan(r)])
+            else:
+                cumulative_avg_rank[model_id] = np.nan
 
            # --- Print and store iteration results IMMEDIATELY after ranking for this model ---
-            ranks_str = "[" + ", ".join(map(str, [ranks[m] for m in active_models if m in ranks])) + "]" if ranks else "[]" # Format ranks for CSV, ensure order
-            custom_print(f"{topic}, {difficulty_mapping[difficulty]}, {model_id}, {cumulative_avg_rank[model_id]:.2f}, {average_rank:.5f}, {ranks_str}, {ranking_duration:.2f} sec")
+            ranks_str = "[" + ", ".join(map(str, [ranks.get(m, None) for m in active_models if m in ranks])) + "]" if ranks else "[]" # Format ranks for CSV, ensure order
+            custom_print(f"{topic}, {difficulty_mapping[difficulty]}, {model_id}, {cumulative_avg_rank.get(model_id, np.nan):.2f}, {average_rank:.5f}, {ranks_str}, {ranking_duration:.2f} sec")
 
            # Write iteration results to file (append mode) - write for each model right after ranking
-            iteration_results_file_opened.write(f"{iteration+1},{topic}, {difficulty_mapping[difficulty]},{question_avg_rank:.2f},{question_ranking_duration_total:.2f},{model_id},{cumulative_avg_rank[model_id]:.2f},{average_rank:.2f},{ranks_str},{ranking_duration:.2f}\n")
+            if iteration_results_file_opened:
+                try:
+                    iteration_results_file_opened.write(f"{iteration+1},{topic}, {difficulty_mapping[difficulty]},{question_avg_rank:.2f},{question_ranking_duration_total:.2f},{model_id},{cumulative_avg_rank.get(model_id, np.nan):.2f},{average_rank:.2f},{ranks_str},{ranking_duration:.2f}\n")
+                except Exception as e:
+                    log_debug(f"Error writing to results file: {e}")
 
            # Update model weights based on cumulative average ranks, handling NaNs
            temp_weights = {}
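
The cumulative average now filters NaNs before calling `np.nanmean`. Strictly, `np.nanmean` already ignores NaN entries; the extra guard mainly avoids the "Mean of empty slice" RuntimeWarning and the NaN result on an all-NaN list. A small sketch of both behaviors (the `safe_avg_rank` helper is illustrative, not taken from app.py):

```python
import warnings
import numpy as np

def safe_avg_rank(ranks: list[float]) -> float:
    """NaN-ignoring mean that returns NaN quietly when nothing is left (illustrative)."""
    finite = [r for r in ranks if not np.isnan(r)]
    return float(np.mean(finite)) if finite else float("nan")

print(np.nanmean([3.0, np.nan, 5.0]))     # 4.0 - NaNs are skipped automatically
print(safe_avg_rank([3.0, np.nan, 5.0]))  # 4.0 - same result

with warnings.catch_warnings():
    warnings.simplefilter("error")
    try:
        np.nanmean([np.nan, np.nan])      # all-NaN input triggers a RuntimeWarning
    except RuntimeWarning as w:
        print(f"np.nanmean warned: {w}")
print(safe_avg_rank([np.nan, np.nan]))    # nan, with no warning
```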
@@ -842,9 +868,11 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
            for m_id in active_models:
                model_weights[m_id] = 1.0 / len(active_models)
 
-        iteration_results_file_opened.close()
+        if iteration_results_file_opened:
+            iteration_results_file_opened.close()
 
    custom_print(f"Unresponsive models during this run: {unresponsive_models}")
+    st.session_state['is_running'] = False
    return results, cumulative_avg_rank, s_t
 
 def check_model_availability(models, token):
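
The commit guards `open`, `write`, and `close` with explicit checks. An alternative worth noting, and not what the commit does, is to let a `with` block own the file handle so it is closed even when a write fails. The sketch below reuses the CSV filename and header from the diff but writes a dummy row purely for illustration:

```python
import csv
from pathlib import Path

results_path = Path("llm_benchmark_iteration_results.csv")  # same name the app uses

header = ["Iteration", "Topic", "Difficulty", "Question Rank", "QR Duration",
          "Model", "Cumulative Avg Rank", "Iteration Avg Rank", "Ranks",
          "Ranking Duration (sec)"]
row = [1, "physics", "hard", 4.25, 1.93, "model-a", 4.10, 4.25, "[4, 5, 4]", 12.7]  # dummy data

write_header = not results_path.exists()
# The context manager closes the file even if a write raises,
# so no explicit close() or "is it open?" guard is needed.
with results_path.open("a", newline="") as fh:
    writer = csv.writer(fh)
    if write_header:
        writer.writerow(header)
    writer.writerow(row)
```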
@@ -970,14 +998,14 @@ with tab1:
         st.error("You need at least 2 available models to run the benchmark")
 
     # Progress bar
-    progress_bar = st.progress(st.session_state.progress)
+    progress_bar = st.progress(st.session_state['progress'])
 
     # Start benchmark button
     if st.sidebar.button("Start Benchmark"):
         # Clear previous outputs
-        st.session_state.main_output = []
-        st.session_state.debug_output = []
-        st.session_state.progress = 0
+        st.session_state['main_output'] = []
+        st.session_state['debug_output'] = []
+        st.session_state['progress'] = 0
 
         if not hf_token:
             st.error("Please enter your Hugging Face API token")
@@ -989,10 +1017,6 @@ with tab1:
         # Setup to capture results for display
         results_container = st.container()
 
-        # Create a global variable to store intermediate results
-        if 'results_df' not in st.session_state:
-            st.session_state.results_df = pd.DataFrame()
-
         # Run the benchmark
         try:
             # Run benchmark and get results
@@ -1003,19 +1027,19 @@ with tab1:
             )
 
             # Update progress to complete
-            st.session_state.progress = 1.0
+            st.session_state['progress'] = 1.0
             progress_bar.progress(1.0)
 
             # Display results
             if total_successful > 0:
                 results_df = pd.DataFrame(results)
-                st.session_state.results_df = results_df
+                st.session_state['results_df'] = results_df
 
                 # Show model rankings
                 st.subheader("Model Rankings")
                 ranking_df = pd.DataFrame({
                     "Model": list(cumulative_avg_rank.keys()),
-                    "Average Rank": [round(r, 2) for r in cumulative_avg_rank.values()]
+                    "Average Rank": [round(r, 2) if not np.isnan(r) else 'N/A' for r in cumulative_avg_rank.values()]
                 })
                 ranking_df = ranking_df.sort_values("Average Rank", ascending=False)
                 st.dataframe(ranking_df)
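
One caveat with the new 'N/A' fallback: mixing strings and floats turns "Average Rank" into an object-dtype column, which can make `sort_values` behave unexpectedly. If that ever becomes an issue, a possible alternative (not what the commit does) is to keep the column numeric and control NaN placement explicitly; the values below are dummies, not app data:

```python
import numpy as np
import pandas as pd

cumulative_avg_rank = {"model-a": 4.2, "model-b": np.nan, "model-c": 3.7}  # dummy values

ranking_df = pd.DataFrame({
    "Model": list(cumulative_avg_rank.keys()),
    "Average Rank": [round(r, 2) if not np.isnan(r) else np.nan
                     for r in cumulative_avg_rank.values()],
})

# Numeric NaN keeps the column float-typed, so the sort stays numeric
# and models without a rank can be pushed to the bottom explicitly.
ranking_df = ranking_df.sort_values("Average Rank", ascending=False, na_position="last")
print(ranking_df.to_string(index=False))
```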
@@ -1039,35 +1063,42 @@ with tab1:
             st.exception(e)
 
     # Show previous results if available
-    elif 'results_df' in st.session_state and not st.session_state.results_df.empty:
+    elif 'results_df' in st.session_state and not st.session_state['results_df'].empty:
         st.subheader("Previous Results")
-        st.dataframe(st.session_state.results_df)
+        st.dataframe(st.session_state['results_df'])
 
 with tab2:
     # Display main output log
     st.subheader("Execution Log")
 
     # Display logs
-    log_text = "\n".join(st.session_state.main_output)
-    st.text_area("Progress Log", log_text, height=400)
+    if 'main_output' in st.session_state:
+        log_text = "\n".join(st.session_state['main_output'])
+        st.text_area("Progress Log", log_text, height=400)
+    else:
+        st.text_area("Progress Log", "No progress logs yet.", height=400)
 
     # Add a refresh button for the log
     if st.button("Refresh Progress Log"):
-        pass # The rerun happens automatically at the end
+        st.experimental_rerun()
 
 with tab3:
     # Display debug output
     st.subheader("Debug Log")
 
     # Display debug logs
-    debug_text = "\n".join(st.session_state.debug_output)
-    st.text_area("Debug Information", debug_text, height=400)
+    if 'debug_output' in st.session_state:
+        debug_text = "\n".join(st.session_state['debug_output'])
+        st.text_area("Debug Information", debug_text, height=400)
+    else:
+        st.text_area("Debug Information", "No debug logs yet.", height=400)
 
     # Add a refresh button for the debug log
     if st.button("Refresh Debug Log"):
-        pass # The rerun happens automatically at the end
-
-# Auto-refresh mechanism
-if st.session_state.get('update_counter', 0) > 0:
-    time.sleep(0.1) # Brief pause to allow UI to update
-    st.experimental_rerun()
+        st.experimental_rerun()
+
+# Auto-update while benchmark is running
+if st.session_state.get('is_running', False):
+    st.empty()
+    time.sleep(5) # Update every 5 seconds while running
+    st.rerun()
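
The new tail polls `st.session_state['is_running']` and calls `st.rerun()` (the current name for the older `st.experimental_rerun()`) every five seconds. A related Streamlit idiom for live logs, shown here only as an optional sketch, is to create one placeholder with `st.empty()` and overwrite it in place rather than forcing full-page reruns:

```python
import time
import streamlit as st

st.subheader("Live log (sketch)")

log_placeholder = st.empty()   # reserve one slot in the page and overwrite it in place
lines: list[str] = []

for i in range(5):
    lines.append(f"step {i + 1} finished")
    # st.code is not a stateful widget, so re-rendering it in a loop is safe.
    log_placeholder.code("\n".join(lines))
    time.sleep(0.5)

st.caption("Loop finished - the placeholder holds the final log without any rerun.")
```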
 
 