PeterKruger committed (verified)
Commit 819adb7 · 1 Parent(s): 130972f

Update app.py

Files changed (1)
  1. app.py +202 -196
app.py CHANGED
@@ -8,52 +8,9 @@ import re
8
  import time
9
  import random
10
  import functools
11
-
12
- # Create areas for different outputs
13
- status_area = st.empty() # For current status
14
- progress_area = st.empty() # For progress updates
15
-
16
- # Initialize session state variables
17
- if 'log_messages' not in st.session_state:
18
- st.session_state.log_messages = []
19
- if 'results_df' not in st.session_state:
20
- st.session_state.results_df = pd.DataFrame()
21
-
22
- # Collapsible section for logs
23
- with st.expander("Execution Log", expanded=False):
24
- log_area = st.empty()
25
-
26
- def update_log():
27
- """Update the log display with current messages"""
28
- log_area.text_area("System Log", value="\n".join(st.session_state.log_messages), height=300)
29
-
30
- def log_message(message, level="INFO"):
31
- """Log a message with timestamp and level"""
32
- timestamp = time.strftime("%H:%M:%S")
33
- formatted_msg = f"[{timestamp}] {level}: {message}"
34
- st.session_state.log_messages.append(formatted_msg)
35
- # Limit log size
36
- if len(st.session_state.log_messages) > 500:
37
- st.session_state.log_messages = st.session_state.log_messages[-500:]
38
- update_log()
39
-
40
- # Specialized logging functions
41
- def log_info(message):
42
- log_message(message, "INFO")
43
-
44
- def log_warning(message):
45
- log_message(message, "WARNING")
46
-
47
- def log_error(message):
48
- log_message(message, "ERROR")
49
-
50
- # Function to update status
51
- def update_status(message):
52
- status_area.write(message)
53
-
54
- # Function to update progress message
55
- def update_progress(message):
56
- progress_area.write(message)
57
 
58
  # FILES
59
  iteration_output_file = "llm_benchmark_iteration_results.csv" # File to store iteration results, defined as global
@@ -84,6 +41,28 @@ difficulty_probabilities = {
84
  "a very difficult": 0.6
85
  }
86
 
87
  def retry_api_request(max_retries=3, wait_time=10):
88
  """Decorator for retrying API requests with rate limit handling."""
89
  def decorator(func):
@@ -94,13 +73,16 @@ def retry_api_request(max_retries=3, wait_time=10):
94
  try:
95
  return func(*args, **kwargs)
96
  except Exception as e:
97
- log_error(f"API error: {e}")
 
98
  if retries < max_retries:
99
- log_info(f"Waiting for {wait_time} seconds before retrying... (Retry {retries + 1}/{max_retries})")
 
100
  time.sleep(wait_time)
101
  retries += 1
102
  else:
103
- log_error(f"Max retries reached. Request failed.")
 
104
  return None
105
 
106
  return None
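For reference, a minimal standalone sketch of the retry-decorator pattern that `retry_api_request` implements (simplified and hypothetical; the real function also routes messages through the app's logging helpers):

```python
import functools
import time

def retry(max_retries=3, wait_time=10):
    """Retry a callable on any exception, sleeping between attempts."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(max_retries + 1):
                try:
                    return func(*args, **kwargs)
                except Exception:
                    if attempt < max_retries:
                        time.sleep(wait_time)  # wait before the next attempt
                    else:
                        return None  # give up quietly, as app.py does
        return wrapper
    return decorator

@retry(max_retries=3, wait_time=2)
def flaky_call():
    ...  # stand-in for an Inference API request
```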
@@ -147,7 +129,8 @@ def make_hf_request(model_name, messages, temperature, max_tokens, token=None):
147
  )
148
  return response
149
  except Exception as e:
150
- log_error(f"Hugging Face Inference API error: {e}")
 
151
  return None
152
 
153
  # --- Prompting Functions ---
@@ -332,7 +315,7 @@ def generate_question_prompt(topic, difficulty):
332
  if topic in topic_instructions:
333
  prompt += random.choice(topic_instructions[topic]) + "\n"
334
  else:
335
- log_warning(f"No topic_instructions defined for topic '{topic}'")
336
 
337
  # 5. Conditional Question Types (Not for math, logics, grammar)
338
  if topic not in ["math", "logics", "grammar", "coding", "creative writing"]:
@@ -418,14 +401,14 @@ def parse_rank_string(rank_str, ranking_model_id):
418
  try:
419
  rank_val = int(rank_str) # Convert to integer *after* regex extraction
420
  if not 1 <= rank_val <= 5: # Check if rank is within valid range
421
- log_warning(f"Model {ranking_model_id} returned rank outside of valid range [1-5]: {rank_val}. Rank set to None.")
422
  return None
423
  return rank_val
424
  except ValueError:
425
- log_warning(f"Model {ranking_model_id} returned non-integer rank after regex extraction: '{rank_str}'. Rank set to None.")
426
  return None
427
  else:
428
- log_warning(f"Model {ranking_model_id} returned non-numeric rank: '{rank_str}'. Rank set to None.")
429
  return None
430
 
431
  # --- Helper Function for Parallel Ranking ---
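A small illustration of the kind of extraction and validation `parse_rank_string` performs: pull the first integer out of a model reply and accept it only if it lies in the 1-5 range (names and regex here are illustrative, not the app's exact implementation):

```python
import re

def extract_rank(reply: str):
    """Return the first integer in reply if it is a valid rank (1-5), else None."""
    match = re.search(r"\d+", reply)
    if match is None:
        return None          # non-numeric reply
    rank = int(match.group())
    return rank if 1 <= rank <= 5 else None

print(extract_rank("Rank: 4"))        # 4
print(extract_rank("I'd say 9/10"))   # None (out of range)
print(extract_rank("no idea"))        # None
```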
@@ -442,18 +425,18 @@ def get_rank_from_model(ranking_model_id, question, answer, consecutive_failures
442
  rank_str = response.strip()
443
  rank = parse_rank_string(rank_str, ranking_model_id)
444
  except ValueError:
445
- log_warning(f"Model {ranking_model_id} returned non-integer rank: '{rank_str}'. Rank set to None.")
446
  rank = None
447
  else:
448
- log_warning(f"Model {ranking_model_id} failed to provide rank. Rank set to None.")
449
  except Exception as e:
450
  duration = time.time() - start_time
451
- log_warning(f"Model {ranking_model_id} ranking timed out or failed after {duration:.2f}s: {e}")
452
  rank = None
453
 
454
  duration = time.time() - start_time # Calculate total duration of ranking attempt
455
  if duration > timeout:
456
- log_warning(f"Ranking by model {ranking_model_id} exceeded timeout of {timeout:.2f}s and took {duration:.2f}s.")
457
  rank = None # Ensure rank is None if timeout occurs
458
 
459
  time.sleep(time_sleep) # Keep a small delay to avoid overwhelming APIs even in parallel
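The helper above is written to be submitted to a thread pool, one call per ranking model. A generic sketch of that fan-out/collect pattern (the worker below is a placeholder, not the app's actual API call):

```python
import concurrent.futures

def rank_with_model(model_id, question, answer):
    """Placeholder worker; the real app calls the Hugging Face Inference API here."""
    return model_id, 3  # pretend every model returns rank 3

ranking_models = ["model-a", "model-b", "model-c"]
ranks = {}
with concurrent.futures.ThreadPoolExecutor(max_workers=len(ranking_models)) as executor:
    futures = [executor.submit(rank_with_model, m, "Q", "A") for m in ranking_models]
    for future in concurrent.futures.as_completed(futures):
        model_id, rank = future.result()
        ranks[model_id] = rank

print(ranks)
```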
@@ -473,18 +456,18 @@ def get_question_rank_from_model(ranking_model_id, question, topic, difficulty,
473
  rank_str = response.strip()
474
  rank = parse_rank_string(rank_str, ranking_model_id)
475
  except ValueError:
476
- log_warning(f"Model {ranking_model_id} returned non-integer rank for question: '{rank_str}'. Rank set to None.")
477
  rank = None
478
  else:
479
- log_warning(f"Model {ranking_model_id} failed to provide rank for question. Rank set to None.")
480
  except Exception as e:
481
  duration = time.time() - start_time
482
- log_warning(f"Model {ranking_model_id} ranking question timed out or failed after {duration:.2f}s: {e}")
483
  rank = None
484
 
485
  duration = time.time() - start_time # Calculate total duration of ranking attempt
486
  if duration > timeout:
487
- log_warning(f"Ranking question by model {ranking_model_id} exceeded timeout of {timeout:.2f}s and took {duration:.2f}s.")
488
  rank = None # Ensure rank is None if timeout occurs
489
 
490
  time.sleep(time_sleep) # Keep a small delay to avoid overwhelming APIs even in parallel
@@ -508,13 +491,13 @@ def get_answer_from_model(model_id, question, consecutive_failures, failure_thre
508
  answer = response.strip()
509
  except Exception as e:
510
  duration = time.time() - start_time
511
- log_warning(f"Model {model_id} answering timed out or failed after {duration:.2f}s: {e}")
512
  answer = "Error answering - Timeout" # Or a specific timeout error message
513
  return answer, duration # Return error answer and duration
514
 
515
  time.sleep(time_sleep) # Small delay
516
  duration = time.time() - start_time # Calculate duration
517
- st.write(f"Answer generation by \"{model_id}\": {duration:.2f}s") # Print answer generation duration separately
518
 
519
  return answer, duration # Return answer and duration
520
 
@@ -569,18 +552,17 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
569
  s_t = 0 # count successful iterations
570
 
571
  for iteration in range(t): # Added iteration counter
572
- # Update the progress bar
573
- progress_percentage = min(100, (iteration / t) * 100)
574
- st.progress(progress_percentage)
575
-
576
  if len(active_models) < 2:
577
- st.warning("Fewer than 2 active models remaining. Exiting benchmark.")
578
  break
579
 
580
  topic = random.choice(topics)
581
  # --- Select difficulty with probabilities ---
582
  difficulty = random.choices(difficulty_choices, weights=probability_values, k=1)[0] # Weighted random choice
583
- update_status(f"--- Iteration {s_t + 1}/{t}: {difficulty} question ({difficulty_mapping[difficulty]}) on {topic} ---")
584
 
585
  # --- Question Generation ---
586
  question = None
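Difficulty selection above relies on `random.choices` with the `difficulty_probabilities` weights. A self-contained sketch of that mechanism; only the 0.6 weight for "a very difficult" is visible in the hunk further up, so the other values here are placeholders:

```python
import random

difficulty_probabilities = {
    "a very simple": 0.05,    # placeholder weight
    "a simple": 0.10,         # placeholder weight
    "a": 0.10,                # placeholder weight
    "a difficult": 0.15,      # placeholder weight
    "a very difficult": 0.6,  # weight shown in the diff above
}
difficulty_choices = list(difficulty_probabilities.keys())
probability_values = list(difficulty_probabilities.values())

# Weighted random choice, as in run_benchmark
difficulty = random.choices(difficulty_choices, weights=probability_values, k=1)[0]
print(difficulty)
```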
@@ -601,13 +583,12 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
601
  if model_config[model_id].get("role", "both") in ["answer", "both"]
602
  ]
603
  if not question_gen_candidates: # No suitable models left
604
- st.warning("No models available for question generation with 'answer' or 'both' role. Skipping iteration.")
605
  continue # Skip to next iteration
606
 
607
  question_generator_model_id = random.choice(question_gen_candidates)
608
 
609
  # --- Question Generation ---
610
- update_progress(f"Generating question using model {question_generator_model_id}...")
611
  response = make_hf_request(model_config[question_generator_model_id]["name"],
612
  [{"role": "user", "content": question_prompt}],
613
  question_temp,
@@ -619,26 +600,25 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
619
  consecutive_failures[question_generator_model_id] = 0 # Reset on success
620
  break
621
  else:
622
- log_warning(f"Skipping due to request failure for model {question_generator_model_id}.")
623
  consecutive_failures[question_generator_model_id] += 1
624
 
625
  if consecutive_failures[question_generator_model_id] >= failure_threshold:
626
- st.warning(f"Model {question_generator_model_id} is unresponsive (question gen). Removing from active models.")
627
  if question_generator_model_id in active_models:
628
  active_models.remove(question_generator_model_id)
629
  unresponsive_models.add(question_generator_model_id)
630
  time.sleep(time_sleep)
631
 
632
  if question is None:
633
- st.warning(f"Failed to generate a question after {max_attempts} attempts. Skipping this round.")
634
  continue
635
 
636
  # --- Parallel Question Ranking ---
637
  question_ranks = {}
638
  question_ranking_futures = []
639
  question_ranking_start_time = time.time()
640
-
641
- update_progress(f"Ranking generated question...")
642
  with concurrent.futures.ThreadPoolExecutor(max_workers=len(active_models) or 1) as executor:
643
  for ranking_model_id in active_models:
644
  # --- Filter for ranking roles ("rank" or "both") ---
@@ -677,34 +657,33 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
677
 
678
  #check that the length is correct
679
  if len(weights_for_valid_question_ranks) != len(valid_question_ranks_values):
680
- log_warning("Mismatch length of weights and valid question ranks")
681
- log_info(f'weights_for_valid_question_ranks {weights_for_valid_question_ranks}')
682
- log_info(f'valid_question_ranks_values: {valid_question_ranks_values}')
683
 
684
  question_avg_rank = np.average(valid_question_ranks_values, weights=weights_for_valid_question_ranks)
685
  min_question_rank = min(valid_question_ranks_values) if valid_question_ranks_values else 0 # To avoid error if no valid rank
686
 
687
  if question_avg_rank >= question_treshold and all(rank > reject_rank for rank in valid_question_ranks_values): # Question acceptance criteria
688
  question_accepted = True
689
- st.write(f"Question accepted. Avg Question Rank: {question_avg_rank:.2f}, Min Rank: {min_question_rank}, Ranks: {[question_ranks[m] for m in active_models if m in question_ranks]}")
690
  s_t += 1
691
  else:
692
  question_accepted = False
693
- st.write(f"Question rejected. Avg Question Rank: {question_avg_rank:.2f}, Min Rank: {min_question_rank}, Ranks: {[question_ranks[m] for m in active_models if m in question_ranks]}")
694
 
695
  if not question_accepted:
696
- update_progress("Generated question was not accepted. Regenerating question.")
697
  continue
698
 
699
  if len(active_models) < 2:
700
- st.warning("Fewer than 2 active models remaining. Exiting benchmark.")
701
  break
702
 
703
  # --- Parallel Answer Generation ---
704
  answers = {}
705
  answer_futures = []
706
  answer_durations = {}
707
- update_progress("Generating answers from all models...")
708
  with concurrent.futures.ThreadPoolExecutor(max_workers=len(active_models)) as executor:
709
  for model_id in active_models:
710
  # --- Filter for answer generation roles ("answer" or "both") ---
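The question-acceptance test earlier in this hunk combines a weighted average (`np.average`) with a per-rank floor. A compact sketch of that check with made-up numbers (threshold names are spelled out here for clarity):

```python
import numpy as np

ranks = [4, 5, 3]           # valid ranks returned by the ranking models
weights = [0.5, 0.3, 0.2]   # per-model weights, summing to 1
question_threshold = 3.5    # minimum acceptable weighted average
reject_rank = 2             # any single rank at or below this rejects the question

avg_rank = np.average(ranks, weights=weights)   # ~4.1
accepted = avg_rank >= question_threshold and all(r > reject_rank for r in ranks)
print(round(avg_rank, 2), accepted)             # 4.1 True
```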
@@ -724,7 +703,7 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
724
  )
725
  answer_futures.append(future)
726
  except TimeoutError as e:
727
- log_error(f"Answer generation for model {model_id} timed out: {e}")
728
  answer = "I am struggling to answer this question" # Treat timeout as error
729
  duration = 120 # You can set a default duration or handle it differently if needed
730
  answers[model_id] = answer # Store error answer
@@ -743,15 +722,14 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
743
  if iteration == 0: # Write header only for the first iteration
744
  iteration_results_file_opened.write("Iteration, Topic, Difficulty, Question Rank, QR Duration, Model,Cumulative Avg Rank,Iteration Avg Rank,Ranks,Ranking Duration (sec)\n") # Added Ranking Duration to header
745
 
746
- update_progress("Ranking all answers...")
747
  for model_id in active_models:
748
- answer = answers.get(model_id, "Error answering") # Retrieve pre-generated answer, default if not found
749
- duration = answer_durations.get(model_id, 0) # Get duration with default
750
 
751
  if answer == "Error answering": # Handle answer generation errors
752
  consecutive_failures[model_id] += 1
753
  if consecutive_failures[model_id] >= failure_threshold:
754
- st.warning(f"Model {model_id} is consistently failing to answer. Removing from active models.")
755
  if model_id in active_models: # double check before removing, might have been removed in another thread
756
  active_models.remove(model_id)
757
  unresponsive_models.add(model_id)
@@ -759,7 +737,7 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
759
 
760
 
761
  if len(active_models) < 2: # Re-check active models before ranking
762
- st.warning("Fewer than 2 active models remaining. Exiting benchmark.")
763
  break
764
 
765
  ranks = {}
@@ -804,9 +782,9 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
804
 
805
 
806
  if len(weights_for_valid_ranks) != len(valid_ranks_values):
807
- log_warning("Mismatch length of weights and valid answer ranks")
808
- log_info(f'weights_for_valid_ranks {weights_for_valid_ranks}')
809
- log_info(f'valid_ranks_values: {valid_ranks_values}')
810
 
811
  average_rank = np.average(valid_ranks_values, weights=weights_for_valid_ranks)
812
 
@@ -824,14 +802,11 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
824
  results["question_rank_duration"].append(question_ranking_duration_total) # Store question ranking duration
825
 
826
  cumulative_model_ranks[model_id].append(average_rank) # Append current iteration's average rank
827
- if cumulative_model_ranks[model_id]: # Check if the list is not empty
828
- cumulative_avg_rank[model_id] = np.nanmean(cumulative_model_ranks[model_id])
829
- else:
830
- cumulative_avg_rank[model_id] = np.nan
831
 
832
  # --- Print and store iteration results IMMEDIATELY after ranking for this model ---
833
  ranks_str = "[" + ", ".join(map(str, [ranks[m] for m in active_models if m in ranks])) + "]" if ranks else "[]" # Format ranks for CSV, ensure order
834
- st.write(f"{topic}, {difficulty_mapping[difficulty]}, {model_id}, {cumulative_avg_rank[model_id]:.2f}, {average_rank:.5f}, {ranks_str}, {ranking_duration:.2f} sec")
835
 
836
  # Write iteration results to file (append mode) - write for each model right after ranking
837
  iteration_results_file_opened.write(f"{iteration+1},{topic}, {difficulty_mapping[difficulty]},{question_avg_rank:.2f},{question_ranking_duration_total:.2f},{model_id},{cumulative_avg_rank[model_id]:.2f},{average_rank:.2f},{ranks_str},{ranking_duration:.2f}\n")
@@ -841,10 +816,10 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
841
  total_valid_rank = 0 # Keep track of the sum of valid (non-NaN) ranks
842
 
843
  for m_id in active_models:
844
- if m_id in cumulative_avg_rank and not np.isnan(cumulative_avg_rank[m_id]):
845
  temp_weights[m_id] = cumulative_avg_rank[m_id]
846
  total_valid_rank += cumulative_avg_rank[m_id]
847
- else: # if cumulative is empty or NaN, keep original
848
  temp_weights[m_id] = model_weights.get(m_id, 1.0 / len(active_models))
849
 
850
  # Normalize the weights so they sum to 1, handling cases where total_valid_rank might be zero
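The weight update above copies each model's cumulative average rank into `temp_weights` and then renormalises so the weights sum to 1. A sketch of that renormalisation step with hypothetical values:

```python
cumulative_avg_rank = {"model-a": 4.2, "model-b": 3.1, "model-c": 2.7}

total = sum(cumulative_avg_rank.values())
model_weights = (
    {m: rank / total for m, rank in cumulative_avg_rank.items()}
    if total > 0
    else {m: 1.0 / len(cumulative_avg_rank) for m in cumulative_avg_rank}
)
print(model_weights)  # proportional to average rank, sums to 1.0
```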
@@ -858,7 +833,7 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
858
 
859
  iteration_results_file_opened.close()
860
 
861
- st.write(f"Unresponsive models during this run: {unresponsive_models}")
862
  return results, cumulative_avg_rank, s_t
863
 
864
  def check_model_availability(models, token):
@@ -909,6 +884,10 @@ def check_model_availability(models, token):
909
  # Streamlit UI
910
  st.title("LLM Benchmark")
911
 
912
  # Setup sidebar for configuration
913
  st.sidebar.header("Configuration")
914
 
@@ -950,103 +929,130 @@ model_config = {}
950
  for model in selected_models:
951
  model_config[model] = {"name": model, "role": "both"}
952
 
953
- if st.sidebar.button("Test Selected Models"):
954
- if not hf_token:
955
- st.error("Please enter your Hugging Face API token")
956
- elif not selected_models:
957
- st.error("Please select at least one model")
958
- else:
959
- with st.spinner("Testing model availability..."):
960
- availability = check_model_availability(selected_models, hf_token)
961
-
962
- # Show results in a table
963
- availability_df = pd.DataFrame([
964
- {
965
- "Model": model,
966
- "Available": info["available"],
967
- "Status": "Available" if info["available"] else "Error",
968
- "Details": info.get("response", "") if info["available"] else info.get("error", "")
969
- }
970
- for model, info in availability.items()
971
- ])
972
-
973
- st.dataframe(availability_df)
974
-
975
- # Check if we have enough models to run the benchmark
976
- available_models = [m for m, info in availability.items() if info["available"]]
977
- if len(available_models) >= 2:
978
- st.success(f"{len(available_models)} models are available for benchmarking")
979
- else:
980
- st.error("You need at least 2 available models to run the benchmark")
981
-
982
- # Start benchmark button
983
- if st.sidebar.button("Start Benchmark"):
984
- if not hf_token:
985
- st.error("Please enter your Hugging Face API token")
986
- elif not selected_models:
987
- st.error("Please select at least two models")
988
- elif not selected_topics:
989
- st.error("Please select at least one topic")
990
- else:
991
- # Create progress bar
992
- progress_bar = st.progress(0)
993
- status_text = st.empty()
994
-
995
- # Setup to capture results for display
996
- results_container = st.container()
997
- with results_container:
998
- results_placeholder = st.empty()
999
- iterations_table = st.empty()
1000
 
1001
- # Run the benchmark
1002
- try:
1003
- # Update status
1004
- status_text.text("Benchmark running...")
1005
-
1006
- # Run benchmark and get results
1007
- results, cumulative_avg_rank, total_successful = run_benchmark(
1008
- selected_models, selected_topics,
1009
- ["a very simple", "a simple", "a", "a difficult", "a very difficult"],
1010
- num_iterations, model_config, hf_token
1011
- )
1012
 
1013
- # Update progress to complete
1014
- progress_bar.progress(100)
1015
- status_text.text(f"Benchmark completed! {total_successful} successful iterations")
1016
 
1017
- # Display results
1018
- if total_successful > 0:
1019
- results_df = pd.DataFrame(results)
1020
- st.session_state.results_df = results_df
1021
 
1022
- # Show model rankings
1023
- st.subheader("Model Rankings")
1024
- ranking_df = pd.DataFrame({
1025
- "Model": list(cumulative_avg_rank.keys()),
1026
- "Average Rank": [round(r, 2) for r in cumulative_avg_rank.values()]
1027
- })
1028
- ranking_df = ranking_df.sort_values("Average Rank", ascending=False)
1029
- results_placeholder.dataframe(ranking_df)
1030
 
1031
- # Show detailed results
1032
- st.subheader("Detailed Results")
1033
- st.dataframe(results_df)
1034
 
1035
- # Option to download results
1036
- csv = results_df.to_csv(index=False)
1037
- st.download_button(
1038
- label="Download Results CSV",
1039
- data=csv,
1040
- file_name="llm_benchmark_results.csv",
1041
- mime="text/csv",
1042
- )
1043
- else:
1044
- st.warning("The benchmark did not complete any successful iterations.")
1045
- except Exception as e:
1046
- st.error(f"An error occurred: {e}")
1047
- st.exception(e)
1048
-
1049
- # Show previous results if available
1050
- elif 'results_df' in st.session_state and not st.session_state.results_df.empty:
1051
- st.subheader("Previous Results")
1052
- st.dataframe(st.session_state.results_df)
 
8
  import time
9
  import random
10
  import functools
11
+ import sys
12
+ import io
13
+ from contextlib import redirect_stdout, redirect_stderr
14
 
15
  # FILES
16
  iteration_output_file = "llm_benchmark_iteration_results.csv" # File to store iteration results, defined as global
 
41
  "a very difficult": 0.6
42
  }
43
 
44
+ # Create output displays for main log and debug log
45
+ if 'main_output' not in st.session_state:
46
+ st.session_state.main_output = []
47
+ if 'debug_output' not in st.session_state:
48
+ st.session_state.debug_output = []
49
+
50
+ # Custom print function to capture output
51
+ def custom_print(*args, **kwargs):
52
+ # Convert args to string and join with spaces
53
+ output = ' '.join(map(str, args))
54
+
55
+ # Add to main output list
56
+ st.session_state.main_output.append(output)
57
+
58
+ # Also print to standard output for console logging
59
+ print(*args, **kwargs)
60
+
61
+ # Custom function to capture warnings and errors
62
+ def log_debug(message):
63
+ st.session_state.debug_output.append(message)
64
+ print(f"DEBUG: {message}", file=sys.stderr)
65
+
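The new imports (`io`, `redirect_stdout`, `redirect_stderr`) suggest capturing printed output into a buffer; the hunks shown here don't use them yet, so this is only a sketch of that general pattern:

```python
import io
from contextlib import redirect_stdout

buffer = io.StringIO()
with redirect_stdout(buffer):
    print("Benchmark step finished")   # captured in the buffer, not the console

captured = buffer.getvalue()
# captured could then be appended to st.session_state.main_output for display
```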
66
  def retry_api_request(max_retries=3, wait_time=10):
67
  """Decorator for retrying API requests with rate limit handling."""
68
  def decorator(func):
 
73
  try:
74
  return func(*args, **kwargs)
75
  except Exception as e:
76
+ error_msg = f"API error: {e}"
77
+ log_debug(error_msg)
78
  if retries < max_retries:
79
+ retry_msg = f"Waiting for {wait_time} seconds before retrying... (Retry {retries + 1}/{max_retries})"
80
+ log_debug(retry_msg)
81
  time.sleep(wait_time)
82
  retries += 1
83
  else:
84
+ failure_msg = f"Max retries reached. Request failed."
85
+ log_debug(failure_msg)
86
  return None
87
 
88
  return None
 
129
  )
130
  return response
131
  except Exception as e:
132
+ error_msg = f"Hugging Face Inference API error: {e}"
133
+ log_debug(error_msg)
134
  return None
135
 
136
  # --- Prompting Functions ---
 
315
  if topic in topic_instructions:
316
  prompt += random.choice(topic_instructions[topic]) + "\n"
317
  else:
318
+ log_debug(f"Warning: No topic_instructions defined for topic '{topic}'")
319
 
320
  # 5. Conditional Question Types (Not for math, logics, grammar)
321
  if topic not in ["math", "logics", "grammar", "coding", "creative writing"]:
 
401
  try:
402
  rank_val = int(rank_str) # Convert to integer *after* regex extraction
403
  if not 1 <= rank_val <= 5: # Check if rank is within valid range
404
+ log_debug(f"Warning: Model {ranking_model_id} returned rank outside of valid range [1-5]: {rank_val}. Rank set to None.")
405
  return None
406
  return rank_val
407
  except ValueError:
408
+ log_debug(f"Warning: Model {ranking_model_id} returned non-integer rank after regex extraction: '{rank_str}'. Rank set to None.")
409
  return None
410
  else:
411
+ log_debug(f"Warning: Model {ranking_model_id} returned non-numeric rank: '{rank_str}'. Rank set to None.")
412
  return None
413
 
414
  # --- Helper Function for Parallel Ranking ---
 
425
  rank_str = response.strip()
426
  rank = parse_rank_string(rank_str, ranking_model_id)
427
  except ValueError:
428
+ log_debug(f"Warning: Model {ranking_model_id} returned non-integer rank: '{rank_str}'. Rank set to None.")
429
  rank = None
430
  else:
431
+ log_debug(f"Warning: Model {ranking_model_id} failed to provide rank. Rank set to None.")
432
  except Exception as e:
433
  duration = time.time() - start_time
434
+ log_debug(f"Warning: Model {ranking_model_id} ranking timed out or failed after {duration:.2f}s: {e}")
435
  rank = None
436
 
437
  duration = time.time() - start_time # Calculate total duration of ranking attempt
438
  if duration > timeout:
439
+ log_debug(f"Warning: Ranking by model {ranking_model_id} exceeded timeout of {timeout:.2f}s and took {duration:.2f}s.")
440
  rank = None # Ensure rank is None if timeout occurs
441
 
442
  time.sleep(time_sleep) # Keep a small delay to avoid overwhelming APIs even in parallel
 
456
  rank_str = response.strip()
457
  rank = parse_rank_string(rank_str, ranking_model_id)
458
  except ValueError:
459
+ log_debug(f"Warning: Model {ranking_model_id} returned non-integer rank for question: '{rank_str}'. Rank set to None.")
460
  rank = None
461
  else:
462
+ log_debug(f"Warning: Model {ranking_model_id} failed to provide rank for question. Rank set to None.")
463
  except Exception as e:
464
  duration = time.time() - start_time
465
+ log_debug(f"Warning: Model {ranking_model_id} ranking question timed out or failed after {duration:.2f}s: {e}")
466
  rank = None
467
 
468
  duration = time.time() - start_time # Calculate total duration of ranking attempt
469
  if duration > timeout:
470
+ log_debug(f"Warning: Ranking question by model {ranking_model_id} exceeded timeout of {timeout:.2f}s and took {duration:.2f}s.")
471
  rank = None # Ensure rank is None if timeout occurs
472
 
473
  time.sleep(time_sleep) # Keep a small delay to avoid overwhelming APIs even in parallel
 
491
  answer = response.strip()
492
  except Exception as e:
493
  duration = time.time() - start_time
494
+ log_debug(f"Warning: Model {model_id} answering timed out or failed after {duration:.2f}s: {e}")
495
  answer = "Error answering - Timeout" # Or a specific timeout error message
496
  return answer, duration # Return error answer and duration
497
 
498
  time.sleep(time_sleep) # Small delay
499
  duration = time.time() - start_time # Calculate duration
500
+ custom_print(f"Answer generation by \"{model_id}\": {duration:.2f}s") # Print answer generation duration separately as requested
501
 
502
  return answer, duration # Return answer and duration
503
 
 
552
  s_t = 0 # count successful iterations
553
 
554
  for iteration in range(t): # Added iteration counter
555
+ # Update progress in the Streamlit app
556
+ st.session_state.progress = (iteration + 1) / t
557
+
 
558
  if len(active_models) < 2:
559
+ custom_print("Fewer than 2 active models remaining. Exiting benchmark.")
560
  break
561
 
562
  topic = random.choice(topics)
563
  # --- Select difficulty with probabilities ---
564
  difficulty = random.choices(difficulty_choices, weights=probability_values, k=1)[0] # Weighted random choice
565
+ custom_print(f"--- Iteration {s_t + 1}/{t}: {difficulty} question ({difficulty_mapping[difficulty]}) on {topic} ---") # Print iteration number
566
 
567
  # --- Question Generation ---
568
  question = None
 
583
  if model_config[model_id].get("role", "both") in ["answer", "both"]
584
  ]
585
  if not question_gen_candidates: # No suitable models left
586
+ custom_print("No models available for question generation with 'answer' or 'both' role. Skipping iteration.")
587
  continue # Skip to next iteration
588
 
589
  question_generator_model_id = random.choice(question_gen_candidates)
590
 
591
  # --- Question Generation ---
 
592
  response = make_hf_request(model_config[question_generator_model_id]["name"],
593
  [{"role": "user", "content": question_prompt}],
594
  question_temp,
 
600
  consecutive_failures[question_generator_model_id] = 0 # Reset on success
601
  break
602
  else:
603
+ custom_print(f"Skipping due to request failure for model {question_generator_model_id}.")
604
  consecutive_failures[question_generator_model_id] += 1
605
 
606
  if consecutive_failures[question_generator_model_id] >= failure_threshold:
607
+ custom_print(f"Model {question_generator_model_id} is unresponsive (question gen). Removing from active models.")
608
  if question_generator_model_id in active_models:
609
  active_models.remove(question_generator_model_id)
610
  unresponsive_models.add(question_generator_model_id)
611
  time.sleep(time_sleep)
612
 
613
  if question is None:
614
+ custom_print(f"Failed to generate a question after {max_attempts} attempts. Skipping this round.")
615
  continue
616
 
617
  # --- Parallel Question Ranking ---
618
  question_ranks = {}
619
  question_ranking_futures = []
620
  question_ranking_start_time = time.time()
621
+
 
622
  with concurrent.futures.ThreadPoolExecutor(max_workers=len(active_models) or 1) as executor:
623
  for ranking_model_id in active_models:
624
  # --- Filter for ranking roles ("rank" or "both") ---
 
657
 
658
  #check that the length is correct
659
  if len(weights_for_valid_question_ranks) != len(valid_question_ranks_values):
660
+ log_debug("Warning: Mismatch length of weights and valid question ranks")
661
+ log_debug(f'weights_for_valid_question_ranks {weights_for_valid_question_ranks}')
662
+ log_debug(f'valid_question_ranks_values: {valid_question_ranks_values}')
663
 
664
  question_avg_rank = np.average(valid_question_ranks_values, weights=weights_for_valid_question_ranks)
665
  min_question_rank = min(valid_question_ranks_values) if valid_question_ranks_values else 0 # To avoid error if no valid rank
666
 
667
  if question_avg_rank >= question_treshold and all(rank > reject_rank for rank in valid_question_ranks_values): # Question acceptance criteria
668
  question_accepted = True
669
+ custom_print(f"Question accepted. Avg Question Rank: {question_avg_rank:.2f}, Min Rank: {min_question_rank}, Ranks: {[question_ranks[m] for m in active_models if m in question_ranks]}")
670
  s_t += 1
671
  else:
672
  question_accepted = False
673
+ custom_print(f"Question rejected. Avg Question Rank: {question_avg_rank:.2f}, Min Rank: {min_question_rank}, Ranks: {[question_ranks[m] for m in active_models if m in question_ranks]}")
674
 
675
  if not question_accepted:
676
+ custom_print("Generated question was not accepted. Regenerating question.")
677
  continue
678
 
679
  if len(active_models) < 2:
680
+ custom_print("Fewer than 2 active models remaining. Exiting benchmark.")
681
  break
682
 
683
  # --- Parallel Answer Generation ---
684
  answers = {}
685
  answer_futures = []
686
  answer_durations = {}
 
687
  with concurrent.futures.ThreadPoolExecutor(max_workers=len(active_models)) as executor:
688
  for model_id in active_models:
689
  # --- Filter for answer generation roles ("answer" or "both") ---
 
703
  )
704
  answer_futures.append(future)
705
  except TimeoutError as e:
706
+ log_debug(f"Answer generation for model {model_id} timed out: {e}")
707
  answer = "I am struggling to answer this question" # Treat timeout as error
708
  duration = 120 # You can set a default duration or handle it differently if needed
709
  answers[model_id] = answer # Store error answer
 
722
  if iteration == 0: # Write header only for the first iteration
723
  iteration_results_file_opened.write("Iteration, Topic, Difficulty, Question Rank, QR Duration, Model,Cumulative Avg Rank,Iteration Avg Rank,Ranks,Ranking Duration (sec)\n") # Added Ranking Duration to header
724
 
725
+
726
  for model_id in active_models:
727
+ answer = answers[model_id] # Retrieve pre-generated answer
 
728
 
729
  if answer == "Error answering": # Handle answer generation errors
730
  consecutive_failures[model_id] += 1
731
  if consecutive_failures[model_id] >= failure_threshold:
732
+ custom_print(f"Model {model_id} is consistently failing to answer. Removing from active models.")
733
  if model_id in active_models: # double check before removing, might have been removed in another thread
734
  active_models.remove(model_id)
735
  unresponsive_models.add(model_id)
 
737
 
738
 
739
  if len(active_models) < 2: # Re-check active models before ranking
740
+ custom_print("Fewer than 2 active models remaining. Exiting benchmark.")
741
  break
742
 
743
  ranks = {}
 
782
 
783
 
784
  if len(weights_for_valid_ranks) != len(valid_ranks_values):
785
+ log_debug("Warning: Mismatch length of weights and valid answer ranks")
786
+ log_debug(f'weights_for_valid_ranks {weights_for_valid_ranks}')
787
+ log_debug(f'valid_ranks_values: {valid_ranks_values}')
788
 
789
  average_rank = np.average(valid_ranks_values, weights=weights_for_valid_ranks)
790
 
 
802
  results["question_rank_duration"].append(question_ranking_duration_total) # Store question ranking duration
803
 
804
  cumulative_model_ranks[model_id].append(average_rank) # Append current iteration's average rank
805
+ cumulative_avg_rank[model_id] = np.nanmean(cumulative_model_ranks[model_id]) if cumulative_model_ranks[model_id] else np.nan
 
 
 
806
 
807
  # --- Print and store iteration results IMMEDIATELY after ranking for this model ---
808
  ranks_str = "[" + ", ".join(map(str, [ranks[m] for m in active_models if m in ranks])) + "]" if ranks else "[]" # Format ranks for CSV, ensure order
809
+ custom_print(f"{topic}, {difficulty_mapping[difficulty]}, {model_id}, {cumulative_avg_rank[model_id]:.2f}, {average_rank:.5f}, {ranks_str}, {ranking_duration:.2f} sec")
810
 
811
  # Write iteration results to file (append mode) - write for each model right after ranking
812
  iteration_results_file_opened.write(f"{iteration+1},{topic}, {difficulty_mapping[difficulty]},{question_avg_rank:.2f},{question_ranking_duration_total:.2f},{model_id},{cumulative_avg_rank[model_id]:.2f},{average_rank:.2f},{ranks_str},{ranking_duration:.2f}\n")
 
816
  total_valid_rank = 0 # Keep track of the sum of valid (non-NaN) ranks
817
 
818
  for m_id in active_models:
819
+ if cumulative_avg_rank[m_id]:
820
  temp_weights[m_id] = cumulative_avg_rank[m_id]
821
  total_valid_rank += cumulative_avg_rank[m_id]
822
+ else: # if cumulative is empty, keep original
823
  temp_weights[m_id] = model_weights.get(m_id, 1.0 / len(active_models))
824
 
825
  # Normalize the weights so they sum to 1, handling cases where total_valid_rank might be zero
 
833
 
834
  iteration_results_file_opened.close()
835
 
836
+ custom_print(f"Unresponsive models during this run: {unresponsive_models}")
837
  return results, cumulative_avg_rank, s_t
838
 
839
  def check_model_availability(models, token):
 
884
  # Streamlit UI
885
  st.title("LLM Benchmark")
886
 
887
+ # Initialize session state variables for progress tracking
888
+ if 'progress' not in st.session_state:
889
+ st.session_state.progress = 0
890
+
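A minimal sketch of the session-state-backed progress pattern introduced here: the benchmark loop stores a 0-1 fraction in `st.session_state.progress`, and the UI renders it with `st.progress` (standalone toy example, not the full app):

```python
import time
import streamlit as st

if "progress" not in st.session_state:
    st.session_state.progress = 0.0

progress_bar = st.progress(st.session_state.progress)

if st.button("Run"):
    total = 10
    for i in range(total):
        time.sleep(0.1)  # stand-in for one benchmark iteration
        st.session_state.progress = (i + 1) / total
        progress_bar.progress(st.session_state.progress)
```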
891
  # Setup sidebar for configuration
892
  st.sidebar.header("Configuration")
893
 
 
929
  for model in selected_models:
930
  model_config[model] = {"name": model, "role": "both"}
931
 
932
+ # Create tabs for different views
933
+ tab1, tab2, tab3 = st.tabs(["Benchmark", "Progress Log", "Debug Log"])
934
+
935
+ with tab1:
936
+ if st.sidebar.button("Test Selected Models"):
937
+ if not hf_token:
938
+ st.error("Please enter your Hugging Face API token")
939
+ elif not selected_models:
940
+ st.error("Please select at least one model")
941
+ else:
942
+ with st.spinner("Testing model availability..."):
943
+ availability = check_model_availability(selected_models, hf_token)
944
+
945
+ # Show results in a table
946
+ availability_df = pd.DataFrame([
947
+ {
948
+ "Model": model,
949
+ "Available": info["available"],
950
+ "Status": "Available" if info["available"] else "Error",
951
+ "Details": info.get("response", "") if info["available"] else info.get("error", "")
952
+ }
953
+ for model, info in availability.items()
954
+ ])
955
+
956
+ st.dataframe(availability_df)
957
+
958
+ # Check if we have enough models to run the benchmark
959
+ available_models = [m for m, info in availability.items() if info["available"]]
960
+ if len(available_models) >= 2:
961
+ st.success(f"{len(available_models)} models are available for benchmarking")
962
+ else:
963
+ st.error("You need at least 2 available models to run the benchmark")
964
+
965
+ # Progress bar
966
+ progress_bar = st.progress(st.session_state.progress)
967
+
968
+ # Start benchmark button
969
+ if st.sidebar.button("Start Benchmark"):
970
+ # Clear previous outputs
971
+ st.session_state.main_output = []
972
+ st.session_state.debug_output = []
 
 
 
 
 
 
973
 
974
+ if not hf_token:
975
+ st.error("Please enter your Hugging Face API token")
976
+ elif not selected_models:
977
+ st.error("Please select at least two models")
978
+ elif not selected_topics:
979
+ st.error("Please select at least one topic")
980
+ else:
981
+ # Setup to capture results for display
982
+ results_container = st.container()
 
 
983
 
984
+ # Create a global variable to store intermediate results
985
+ if 'results_df' not in st.session_state:
986
+ st.session_state.results_df = pd.DataFrame()
987
 
988
+ # Run the benchmark
989
+ try:
990
+ # Run benchmark and get results
991
+ results, cumulative_avg_rank, total_successful = run_benchmark(
992
+ selected_models, selected_topics,
993
+ ["a very simple", "a simple", "a", "a difficult", "a very difficult"],
994
+ num_iterations, model_config, hf_token
995
+ )
996
 
997
+ # Update progress to complete
998
+ st.session_state.progress = 1.0
999
+ progress_bar.progress(1.0)
 
 
 
 
 
1000
 
1001
+ # Display results
1002
+ if total_successful > 0:
1003
+ results_df = pd.DataFrame(results)
1004
+ st.session_state.results_df = results_df
1005
+
1006
+ # Show model rankings
1007
+ st.subheader("Model Rankings")
1008
+ ranking_df = pd.DataFrame({
1009
+ "Model": list(cumulative_avg_rank.keys()),
1010
+ "Average Rank": [round(r, 2) for r in cumulative_avg_rank.values()]
1011
+ })
1012
+ ranking_df = ranking_df.sort_values("Average Rank", ascending=False)
1013
+ st.dataframe(ranking_df)
1014
+
1015
+ # Show detailed results
1016
+ st.subheader("Detailed Results")
1017
+ st.dataframe(results_df)
1018
+
1019
+ # Option to download results
1020
+ csv = results_df.to_csv(index=False)
1021
+ st.download_button(
1022
+ label="Download Results CSV",
1023
+ data=csv,
1024
+ file_name="llm_benchmark_results.csv",
1025
+ mime="text/csv",
1026
+ )
1027
+ else:
1028
+ st.warning("The benchmark did not complete any successful iterations.")
1029
+ except Exception as e:
1030
+ st.error(f"An error occurred: {e}")
1031
+ st.exception(e)
1032
 
1033
+ # Show previous results if available
1034
+ elif 'results_df' in st.session_state and not st.session_state.results_df.empty:
1035
+ st.subheader("Previous Results")
1036
+ st.dataframe(st.session_state.results_df)
1037
+
1038
+ with tab2:
1039
+ # Display main output log
1040
+ st.subheader("Execution Log")
1041
+ log_container = st.container()
1042
+
1043
+ # Display logs
1044
+ log_text = "\n".join(st.session_state.main_output)
1045
+ log_container.text_area("Progress Log", log_text, height=400)
1046
+
1047
+ # Add a refresh button for the log
1048
+ if st.button("Refresh Log"):
1049
+ st.experimental_rerun()
1050
+
1051
+ with tab3:
1052
+ # Display debug output
1053
+ st.subheader("Debug Log")
1054
+ debug_container = st.container()
1055
+
1056
+ # Display debug logs
1057
+ debug_text = "\n".join(st.session_state.debug_output)
1058
+ debug_container.text_area("Debug Information", debug_text, height=400)