PeterKruger committed
Commit 2ee78d6 · verified · 1 Parent(s): 1bc57b4

Update app.py

Files changed (1)
  1. app.py +91 -41
app.py CHANGED
@@ -9,6 +9,50 @@ import time
  import random
  import functools
 
+ # Create a logging system for errors and warnings
+ if 'log_messages' not in st.session_state:
+     st.session_state.log_messages = []
+
+ # Create areas for different outputs
+ status_area = st.empty() # For current status
+ progress_area = st.empty() # For progress updates
+
+ # Collapsible section for logs
+ with st.expander("Execution Log", expanded=False):
+     log_area = st.empty()
+
+ def update_log():
+     """Update the log display with current messages"""
+     log_area.text_area("System Log", value="\n".join(st.session_state.log_messages), height=300)
+
+ def log_message(message, level="INFO"):
+     """Log a message with timestamp and level"""
+     timestamp = time.strftime("%H:%M:%S")
+     formatted_msg = f"[{timestamp}] {level}: {message}"
+     st.session_state.log_messages.append(formatted_msg)
+     # Limit log size
+     if len(st.session_state.log_messages) > 500:
+         st.session_state.log_messages = st.session_state.log_messages[-500:]
+     update_log()
+
+ # Specialized logging functions
+ def log_info(message):
+     log_message(message, "INFO")
+
+ def log_warning(message):
+     log_message(message, "WARNING")
+
+ def log_error(message):
+     log_message(message, "ERROR")
+
+ # Function to update status
+ def update_status(message):
+     status_area.write(message)
+
+ # Function to update progress message
+ def update_progress(message):
+     progress_area.write(message)
+
  # FILES
  iteration_output_file = "llm_benchmark_iteration_results.csv" # File to store iteration results, defined as global
  results_file = "llm_benchmark_results.csv" # all data
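The added block above routes the script's console output into a Streamlit session-state log with timestamped entries and a fixed size. Below is a minimal, framework-free sketch of the same pattern for reference; the RingLog class is illustrative only and is not part of app.py.

import time
from collections import deque

class RingLog:
    """Illustrative stand-in for the new log helpers: keep only the last N messages."""
    def __init__(self, maxlen=500):
        self.entries = deque(maxlen=maxlen)  # oldest entries drop off automatically

    def log(self, message, level="INFO"):
        timestamp = time.strftime("%H:%M:%S")
        self.entries.append(f"[{timestamp}] {level}: {message}")  # same format as log_message

    def render(self):
        # app.py refreshes a st.text_area here; this sketch just joins the lines.
        return "\n".join(self.entries)

log = RingLog()
log.log("API error: connection reset", level="ERROR")
log.log("Waiting for 10 seconds before retrying...", level="INFO")
print(log.render())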
@@ -48,13 +92,13 @@ def retry_api_request(max_retries=3, wait_time=10):
  try:
  return func(*args, **kwargs)
  except Exception as e:
- print(f"API error: {e}")
+ log_error(f"API error: {e}")
  if retries < max_retries:
- print(f"Waiting for {wait_time} seconds before retrying... (Retry {retries + 1}/{max_retries})")
+ log_info(f"Waiting for {wait_time} seconds before retrying... (Retry {retries + 1}/{max_retries})")
  time.sleep(wait_time)
  retries += 1
  else:
- print(f"Max retries reached. Request failed.")
+ log_error(f"Max retries reached. Request failed.")
  return None
 
  return None
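For context, retry_api_request is a decorator factory: the wrapped call is retried on any exception, with a pause between attempts, and gives up with None once max_retries is exhausted. The sketch below reproduces that behaviour in a self-contained form, using plain print instead of the new log_error/log_info helpers; flaky_call is a made-up stand-in and the body may differ in detail from app.py.

import functools
import time

def retry_api_request(max_retries=3, wait_time=10):
    """Decorator factory: retry the wrapped call on any exception, then give up with None."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            retries = 0
            while retries <= max_retries:
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    print(f"API error: {e}")
                    if retries < max_retries:
                        print(f"Waiting for {wait_time} seconds before retrying... "
                              f"(Retry {retries + 1}/{max_retries})")
                        time.sleep(wait_time)
                        retries += 1
                    else:
                        print("Max retries reached. Request failed.")
                        return None
            return None
        return wrapper
    return decorator

@retry_api_request(max_retries=2, wait_time=1)
def flaky_call():
    raise RuntimeError("simulated API failure")

print(flaky_call())  # prints the errors, then None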
@@ -101,7 +145,7 @@ def make_hf_request(model_name, messages, temperature, max_tokens, token=None):
  )
  return response
  except Exception as e:
- print(f"Hugging Face Inference API error: {e}")
+ log_error(f"Hugging Face Inference API error: {e}")
  return None
 
  # --- Prompting Functions ---
@@ -286,7 +330,7 @@ def generate_question_prompt(topic, difficulty):
  if topic in topic_instructions:
  prompt += random.choice(topic_instructions[topic]) + "\n"
  else:
- print(f"Warning: No topic_instructions defined for topic '{topic}'")
+ log_warning(f"No topic_instructions defined for topic '{topic}'")
 
  # 5. Conditional Question Types (Not for math, logics, grammar)
  if topic not in ["math", "logics", "grammar", "coding", "creative writing"]:
@@ -372,14 +416,14 @@ def parse_rank_string(rank_str, ranking_model_id):
  try:
  rank_val = int(rank_str) # Convert to integer *after* regex extraction
  if not 1 <= rank_val <= 5: # Check if rank is within valid range
- print(f"Warning: Model {ranking_model_id} returned rank outside of valid range [1-5]: {rank_val}. Rank set to None.")
+ log_warning(f"Model {ranking_model_id} returned rank outside of valid range [1-5]: {rank_val}. Rank set to None.")
  return None
  return rank_val
  except ValueError:
- print(f"Warning: Model {ranking_model_id} returned non-integer rank after regex extraction: '{rank_str}'. Rank set to None.")
+ log_warning(f"Model {ranking_model_id} returned non-integer rank after regex extraction: '{rank_str}'. Rank set to None.")
  return None
  else:
- print(f"Warning: Model {ranking_model_id} returned non-numeric rank: '{rank_str}'. Rank set to None.")
+ log_warning(f"Model {ranking_model_id} returned non-numeric rank: '{rank_str}'. Rank set to None.")
  return None
 
  # --- Helper Function for Parallel Ranking ---
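parse_rank_string validates a rank that has already been pulled out of the model's reply by a regex; the regex itself is not visible in this hunk. The sketch below assumes a simple first-integer match and only approximates the committed function.

import re

def parse_rank_sketch(raw_reply):
    """Approximate flow: extract the first integer, then enforce the 1-5 range."""
    match = re.search(r"\d+", raw_reply)  # assumed extraction step, not shown in the diff
    if not match:
        return None                       # non-numeric reply -> None
    rank_val = int(match.group(0))
    if not 1 <= rank_val <= 5:
        return None                       # outside the valid [1-5] range -> None
    return rank_val

print(parse_rank_sketch("Rank: 4/5"))  # 4
print(parse_rank_sketch("seven"))      # None
print(parse_rank_sketch("0"))          # None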
@@ -396,18 +440,18 @@ def get_rank_from_model(ranking_model_id, question, answer, consecutive_failures
  rank_str = response.strip()
  rank = parse_rank_string(rank_str, ranking_model_id)
  except ValueError:
- print(f"Warning: Model {ranking_model_id} returned non-integer rank: '{rank_str}'. Rank set to None.")
+ log_warning(f"Model {ranking_model_id} returned non-integer rank: '{rank_str}'. Rank set to None.")
  rank = None
  else:
- print(f"Warning: Model {ranking_model_id} failed to provide rank. Rank set to None.")
+ log_warning(f"Model {ranking_model_id} failed to provide rank. Rank set to None.")
  except Exception as e:
  duration = time.time() - start_time
- print(f"Warning: Model {ranking_model_id} ranking timed out or failed after {duration:.2f}s: {e}")
+ log_warning(f"Model {ranking_model_id} ranking timed out or failed after {duration:.2f}s: {e}")
  rank = None
 
  duration = time.time() - start_time # Calculate total duration of ranking attempt
  if duration > timeout:
- print(f"Warning: Ranking by model {ranking_model_id} exceeded timeout of {timeout:.2f}s and took {duration:.2f}s.")
+ log_warning(f"Ranking by model {ranking_model_id} exceeded timeout of {timeout:.2f}s and took {duration:.2f}s.")
  rank = None # Ensure rank is None if timeout occurs
 
  time.sleep(time_sleep) # Keep a small delay to avoid overwhelming APIs even in parallel
@@ -427,18 +471,18 @@ def get_question_rank_from_model(ranking_model_id, question, topic, difficulty,
  rank_str = response.strip()
  rank = parse_rank_string(rank_str, ranking_model_id)
  except ValueError:
- print(f"Warning: Model {ranking_model_id} returned non-integer rank for question: '{rank_str}'. Rank set to None.")
+ log_warning(f"Model {ranking_model_id} returned non-integer rank for question: '{rank_str}'. Rank set to None.")
  rank = None
  else:
- print(f"Warning: Model {ranking_model_id} failed to provide rank for question. Rank set to None.")
+ log_warning(f"Model {ranking_model_id} failed to provide rank for question. Rank set to None.")
  except Exception as e:
  duration = time.time() - start_time
- print(f"Warning: Model {ranking_model_id} ranking question timed out or failed after {duration:.2f}s: {e}")
+ log_warning(f"Model {ranking_model_id} ranking question timed out or failed after {duration:.2f}s: {e}")
  rank = None
 
  duration = time.time() - start_time # Calculate total duration of ranking attempt
  if duration > timeout:
- print(f"Warning: Ranking question by model {ranking_model_id} exceeded timeout of {timeout:.2f}s and took {duration:.2f}s.")
+ log_warning(f"Ranking question by model {ranking_model_id} exceeded timeout of {timeout:.2f}s and took {duration:.2f}s.")
  rank = None # Ensure rank is None if timeout occurs
 
  time.sleep(time_sleep) # Keep a small delay to avoid overwhelming APIs even in parallel
@@ -462,13 +506,13 @@ def get_answer_from_model(model_id, question, consecutive_failures, failure_thre
  answer = response.strip()
  except Exception as e:
  duration = time.time() - start_time
- print(f"Warning: Model {model_id} answering timed out or failed after {duration:.2f}s: {e}")
+ log_warning(f"Model {model_id} answering timed out or failed after {duration:.2f}s: {e}")
  answer = "Error answering - Timeout" # Or a specific timeout error message
  return answer, duration # Return error answer and duration
 
  time.sleep(time_sleep) # Small delay
  duration = time.time() - start_time # Calculate duration
- print(f"Answer generation by \"{model_id}\": {duration:.2f}s") # Print answer generation duration separately as requested - as requested
+ st.write(f"Answer generation by \"{model_id}\": {duration:.2f}s") # Print answer generation duration separately
 
  return answer, duration # Return answer and duration
 
@@ -523,15 +567,18 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
  s_t = 0 #count succesful iterations
 
  for iteration in range(t): # Added iteration counter
+ # Update the progress bar
+ progress_percentage = min(100, (iteration / t) * 100)
+ st.progress(progress_percentage)
 
  if len(active_models) < 2:
- print("Fewer than 2 active models remaining. Exiting benchmark.")
+ st.warning("Fewer than 2 active models remaining. Exiting benchmark.")
  break
 
  topic = random.choice(topics)
  # --- Select difficulty with probabilities ---
  difficulty = random.choices(difficulty_choices, weights=probability_values, k=1)[0] # Weighted random choice
- print(f"--- Iteration {s_t + 1}/{t}: {difficulty} question ({difficulty_mapping[difficulty]}) on {topic} ---") # Print iteration number
+ update_status(f"--- Iteration {s_t + 1}/{t}: {difficulty} question ({difficulty_mapping[difficulty]}) on {topic} ---")
 
  # --- Question Generation ---
  question = None
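One note on the new progress bar: st.progress accepts an int in [0, 100] or a float in [0.0, 1.0], so passing the float percentage computed above directly can raise an exception once it exceeds 1.0. A small sketch of one safe variant, meant to run inside a Streamlit app, with variable names mirroring the diff:

import streamlit as st

t = 20                          # total iterations, as in the benchmark loop
progress_bar = st.progress(0)   # create the bar once, then update it in place

for iteration in range(t):
    progress_percentage = min(100, int((iteration / t) * 100))  # int form, 0-100
    progress_bar.progress(progress_percentage)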
@@ -552,12 +599,13 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
  if model_config[model_id].get("role", "both") in ["answer", "both"]
  ]
  if not question_gen_candidates: # No suitable models left
- print("No models available for question generation with 'answer' or 'both' role. Skipping iteration.")
+ st.warning("No models available for question generation with 'answer' or 'both' role. Skipping iteration.")
  continue # Skip to next iteration
 
  question_generator_model_id = random.choice(question_gen_candidates)
 
  # --- Question Generation ---
+ update_progress(f"Generating question using model {question_generator_model_id}...")
  response = make_hf_request(model_config[question_generator_model_id]["name"],
  [{"role": "user", "content": question_prompt}],
  question_temp,
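Question generators are drawn only from models whose configured role is "answer" or "both". A short sketch of that filter over a hypothetical model_config dict (the dict contents are invented for illustration):

import random

model_config = {
    "model-a": {"name": "org/model-a", "role": "both"},
    "model-b": {"name": "org/model-b", "role": "rank"},
    "model-c": {"name": "org/model-c"},  # no role key -> defaults to "both"
}
active_models = list(model_config)

question_gen_candidates = [
    model_id for model_id in active_models
    if model_config[model_id].get("role", "both") in ["answer", "both"]
]
print(question_gen_candidates)  # ['model-a', 'model-c']
question_generator_model_id = random.choice(question_gen_candidates)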
@@ -569,25 +617,26 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
  consecutive_failures[question_generator_model_id] = 0 # Reset on success
  break
  else:
- print(f"Skipping due to request failure for model {question_generator_model_id}.")
+ log_warning(f"Skipping due to request failure for model {question_generator_model_id}.")
  consecutive_failures[question_generator_model_id] += 1
 
  if consecutive_failures[question_generator_model_id] >= failure_threshold:
- print(f"Model {question_generator_model_id} is unresponsive (question gen). Removing from active models.")
+ st.warning(f"Model {question_generator_model_id} is unresponsive (question gen). Removing from active models.")
  if question_generator_model_id in active_models:
  active_models.remove(question_generator_model_id)
  unresponsive_models.add(question_generator_model_id)
  time.sleep(time_sleep)
 
  if question is None:
- print(f"Failed to generate a question after {max_attempts} attempts. Skipping this round.")
+ st.warning(f"Failed to generate a question after {max_attempts} attempts. Skipping this round.")
  continue
 
  # --- Parallel Question Ranking ---
  question_ranks = {}
  question_ranking_futures = []
  question_ranking_start_time = time.time()
-
+
+ update_progress(f"Ranking generated question...")
  with concurrent.futures.ThreadPoolExecutor(max_workers=len(active_models) or 1) as executor:
  for ranking_model_id in active_models:
  # --- Filter for ranking roles ("rank" or "both") ---
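The question-ranking fan-out uses the standard concurrent.futures pattern: submit one ranking task per active model, then collect results as they complete. A self-contained sketch of that pattern follows; rank_with_model and the model names are placeholders, not the helpers defined in app.py.

import concurrent.futures
import random
import time

def rank_with_model(model_id, question):
    """Placeholder ranking call: sleep briefly and return a fake 1-5 rank."""
    time.sleep(random.uniform(0.1, 0.3))
    return model_id, random.randint(1, 5)

active_models = ["model-a", "model-b", "model-c"]
question = "Example question to be ranked"
question_ranks = {}

with concurrent.futures.ThreadPoolExecutor(max_workers=len(active_models) or 1) as executor:
    futures = [executor.submit(rank_with_model, m, question) for m in active_models]
    for future in concurrent.futures.as_completed(futures):
        model_id, rank = future.result()
        question_ranks[model_id] = rank

print(question_ranks)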
@@ -626,33 +675,34 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
 
  #check that the length is correct
  if len(weights_for_valid_question_ranks) != len(valid_question_ranks_values):
- print("Warning: Mismatch length of weights and valid question ranks")
- print(f'weights_for_valid_question_ranks {weights_for_valid_question_ranks}')
- print(f'valid_question_ranks_values: {valid_question_ranks_values}')
+ log_warning("Mismatch length of weights and valid question ranks")
+ log_info(f'weights_for_valid_question_ranks {weights_for_valid_question_ranks}')
+ log_info(f'valid_question_ranks_values: {valid_question_ranks_values}')
 
  question_avg_rank = np.average(valid_question_ranks_values, weights=weights_for_valid_question_ranks)
  min_question_rank = min(valid_question_ranks_values) if valid_question_ranks_values else 0 # To avoid error if no valid rank
 
  if question_avg_rank >= question_treshold and all(rank > reject_rank for rank in valid_question_ranks_values): # Question acceptance criteria
  question_accepted = True
- print(f"Question accepted. Avg Question Rank: {question_avg_rank:.2f}, Min Rank: {min_question_rank}, Ranks: {[question_ranks[m] for m in active_models if m in question_ranks]}")
+ st.write(f"Question accepted. Avg Question Rank: {question_avg_rank:.2f}, Min Rank: {min_question_rank}, Ranks: {[question_ranks[m] for m in active_models if m in question_ranks]}")
  s_t += 1
  else:
  question_accepted = False
- print(f"Question rejected. Avg Question Rank: {question_avg_rank:.2f}, Min Rank: {min_question_rank}, Ranks: {[question_ranks[m] for m in active_models if m in question_ranks]}")
+ st.write(f"Question rejected. Avg Question Rank: {question_avg_rank:.2f}, Min Rank: {min_question_rank}, Ranks: {[question_ranks[m] for m in active_models if m in question_ranks]}")
 
  if not question_accepted:
- print("Generated question was not accepted. Regenerating question.")
+ update_progress("Generated question was not accepted. Regenerating question.")
  continue
 
  if len(active_models) < 2:
- print("Fewer than 2 active models remaining. Exiting benchmark.")
+ st.warning("Fewer than 2 active models remaining. Exiting benchmark.")
  break
 
  # --- Parallel Answer Generation ---
  answers = {}
  answer_futures = []
  answer_durations = {}
+ update_progress("Generating answers from all models...")
  with concurrent.futures.ThreadPoolExecutor(max_workers=len(active_models)) as executor:
  for model_id in active_models:
  # --- Filter for answer generation roles ("answer" or "both") ---
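Acceptance of a question combines a weighted mean of the collected ranks (np.average with weights) with a hard floor on every individual rank. A worked example with invented numbers and thresholds:

import numpy as np

valid_question_ranks_values = [4, 5, 3]              # ranks from three ranking models
weights_for_valid_question_ranks = [1.0, 0.8, 0.5]   # e.g. weight by each ranker's standing

question_avg_rank = np.average(valid_question_ranks_values,
                               weights=weights_for_valid_question_ranks)
# (4*1.0 + 5*0.8 + 3*0.5) / (1.0 + 0.8 + 0.5) = 9.5 / 2.3 ≈ 4.13

question_treshold = 3.5   # illustrative threshold (name spelled as in the source)
reject_rank = 2           # illustrative per-rank floor

question_accepted = (question_avg_rank >= question_treshold
                     and all(rank > reject_rank for rank in valid_question_ranks_values))
print(f"avg={question_avg_rank:.2f} accepted={question_accepted}")  # avg=4.13 accepted=True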
@@ -672,7 +722,7 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
  )
  answer_futures.append(future)
  except TimeoutError as e:
- print(f"Answer generation for model {model_id} timed out: {e}")
+ log_error(f"Answer generation for model {model_id} timed out: {e}")
  answer = "I am struggling to answer this question" # Treat timeout as error
  duration = 120 # You can set a default duration or handle it differently if needed
  answers[model_id] = answer # Store error answer
@@ -691,14 +741,14 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
  if iteration == 0: # Write header only for the first iteration
  iteration_results_file_opened.write("Iteration, Topic, Difficulty, Question Rank, QR Duration, Model,Cumulative Avg Rank,Iteration Avg Rank,Ranks,Ranking Duration (sec)\n") # Added Ranking Duration to header
 
-
+ update_progress("Ranking all answers...")
  for model_id in active_models:
  answer = answers[model_id] # Retrieve pre-generated answer
 
  if answer == "Error answering": # Handle answer generation errors
  consecutive_failures[model_id] += 1
  if consecutive_failures[model_id] >= failure_threshold:
- print(f"Model {model_id} is consistently failing to answer. Removing from active models.")
+ st.warning(f"Model {model_id} is consistently failing to answer. Removing from active models.")
  if model_id in active_models: # double check before removing, might have been removed in another thread
  active_models.remove(model_id)
  unresponsive_models.add(model_id)
@@ -706,7 +756,7 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
 
 
  if len(active_models) < 2: # Re-check active models before ranking
- print("Fewer than 2 active models remaining. Exiting benchmark.")
+ st.warning("Fewer than 2 active models remaining. Exiting benchmark.")
  break
 
  ranks = {}
@@ -751,9 +801,9 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
 
 
  if len(weights_for_valid_ranks) != len(valid_ranks_values):
- print("Warning: Mismatch length of weights and valid answer ranks")
- print(f'weights_for_valid_ranks {weights_for_valid_ranks}')
- print(f'valid_ranks_values: {valid_ranks_values}')
+ log_warning("Mismatch length of weights and valid answer ranks")
+ log_info(f'weights_for_valid_ranks {weights_for_valid_ranks}')
+ log_info(f'valid_ranks_values: {valid_ranks_values}')
 
  average_rank = np.average(valid_ranks_values, weights=weights_for_valid_ranks)
 
@@ -775,7 +825,7 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
 
  # --- Print and store iteration results IMMEDIATELY after ranking for this model ---
  ranks_str = "[" + ", ".join(map(str, [ranks[m] for m in active_models if m in ranks])) + "]" if ranks else "[]" # Format ranks for CSV, ensure order
- print(f"{topic}, {difficulty_mapping[difficulty]}, {model_id}, {cumulative_avg_rank[model_id]:.2f}, {average_rank:.5f}, {ranks_str}, {ranking_duration:.2f} sec")
+ st.write(f"{topic}, {difficulty_mapping[difficulty]}, {model_id}, {cumulative_avg_rank[model_id]:.2f}, {average_rank:.5f}, {ranks_str}, {ranking_duration:.2f} sec")
 
  # Write iteration results to file (append mode) - write for each model right after ranking
  iteration_results_file_opened.write(f"{iteration+1},{topic}, {difficulty_mapping[difficulty]},{question_avg_rank:.2f},{question_ranking_duration_total:.2f},{model_id},{cumulative_avg_rank[model_id]:.2f},{average_rank:.2f},{ranks_str},{ranking_duration:.2f}\n")
@@ -802,7 +852,7 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
 
  iteration_results_file_opened.close()
 
- print(f"Unresponsive models during this run: {unresponsive_models}")
+ st.write(f"Unresponsive models during this run: {unresponsive_models}")
  return results, cumulative_avg_rank, s_t
 
  def check_model_availability(models, token):
 