Update app.py
app.py (CHANGED)
@@ -9,6 +9,50 @@ import time
import random
import functools

+# Create a logging system for errors and warnings
+if 'log_messages' not in st.session_state:
+    st.session_state.log_messages = []
+
+# Create areas for different outputs
+status_area = st.empty() # For current status
+progress_area = st.empty() # For progress updates
+
+# Collapsible section for logs
+with st.expander("Execution Log", expanded=False):
+    log_area = st.empty()
+
+def update_log():
+    """Update the log display with current messages"""
+    log_area.text_area("System Log", value="\n".join(st.session_state.log_messages), height=300)
+
+def log_message(message, level="INFO"):
+    """Log a message with timestamp and level"""
+    timestamp = time.strftime("%H:%M:%S")
+    formatted_msg = f"[{timestamp}] {level}: {message}"
+    st.session_state.log_messages.append(formatted_msg)
+    # Limit log size
+    if len(st.session_state.log_messages) > 500:
+        st.session_state.log_messages = st.session_state.log_messages[-500:]
+    update_log()
+
+# Specialized logging functions
+def log_info(message):
+    log_message(message, "INFO")
+
+def log_warning(message):
+    log_message(message, "WARNING")
+
+def log_error(message):
+    log_message(message, "ERROR")
+
+# Function to update status
+def update_status(message):
+    status_area.write(message)
+
+# Function to update progress message
+def update_progress(message):
+    progress_area.write(message)
+
# FILES
iteration_output_file = "llm_benchmark_iteration_results.csv" # File to store iteration results, defined as global
results_file = "llm_benchmark_results.csv" # all data

@@ -48,13 +92,13 @@ def retry_api_request(max_retries=3, wait_time=10):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
-
+                    log_error(f"API error: {e}")
                    if retries < max_retries:
-
+                        log_info(f"Waiting for {wait_time} seconds before retrying... (Retry {retries + 1}/{max_retries})")
                        time.sleep(wait_time)
                        retries += 1
                    else:
-
+                        log_error(f"Max retries reached. Request failed.")
                        return None

            return None

@@ -101,7 +145,7 @@ def make_hf_request(model_name, messages, temperature, max_tokens, token=None):
        )
        return response
    except Exception as e:
-
+        log_error(f"Hugging Face Inference API error: {e}")
        return None

# --- Prompting Functions ---

@@ -286,7 +330,7 @@ def generate_question_prompt(topic, difficulty):
    if topic in topic_instructions:
        prompt += random.choice(topic_instructions[topic]) + "\n"
    else:
-
+        log_warning(f"No topic_instructions defined for topic '{topic}'")

    # 5. Conditional Question Types (Not for math, logics, grammar)
    if topic not in ["math", "logics", "grammar", "coding", "creative writing"]:

@@ -372,14 +416,14 @@ def parse_rank_string(rank_str, ranking_model_id):
        try:
            rank_val = int(rank_str) # Convert to integer *after* regex extraction
            if not 1 <= rank_val <= 5: # Check if rank is within valid range
-
+                log_warning(f"Model {ranking_model_id} returned rank outside of valid range [1-5]: {rank_val}. Rank set to None.")
                return None
            return rank_val
        except ValueError:
-
+            log_warning(f"Model {ranking_model_id} returned non-integer rank after regex extraction: '{rank_str}'. Rank set to None.")
            return None
    else:
-
+        log_warning(f"Model {ranking_model_id} returned non-numeric rank: '{rank_str}'. Rank set to None.")
        return None

# --- Helper Function for Parallel Ranking ---

@@ -396,18 +440,18 @@ def get_rank_from_model(ranking_model_id, question, answer, consecutive_failures
                rank_str = response.strip()
                rank = parse_rank_string(rank_str, ranking_model_id)
            except ValueError:
-
+                log_warning(f"Model {ranking_model_id} returned non-integer rank: '{rank_str}'. Rank set to None.")
                rank = None
        else:
-
+            log_warning(f"Model {ranking_model_id} failed to provide rank. Rank set to None.")
    except Exception as e:
        duration = time.time() - start_time
-
+        log_warning(f"Model {ranking_model_id} ranking timed out or failed after {duration:.2f}s: {e}")
        rank = None

    duration = time.time() - start_time # Calculate total duration of ranking attempt
    if duration > timeout:
-
+        log_warning(f"Ranking by model {ranking_model_id} exceeded timeout of {timeout:.2f}s and took {duration:.2f}s.")
        rank = None # Ensure rank is None if timeout occurs

    time.sleep(time_sleep) # Keep a small delay to avoid overwhelming APIs even in parallel

@@ -427,18 +471,18 @@ def get_question_rank_from_model(ranking_model_id, question, topic, difficulty,
                rank_str = response.strip()
                rank = parse_rank_string(rank_str, ranking_model_id)
            except ValueError:
-
+                log_warning(f"Model {ranking_model_id} returned non-integer rank for question: '{rank_str}'. Rank set to None.")
                rank = None
        else:
-
+            log_warning(f"Model {ranking_model_id} failed to provide rank for question. Rank set to None.")
    except Exception as e:
        duration = time.time() - start_time
-
+        log_warning(f"Model {ranking_model_id} ranking question timed out or failed after {duration:.2f}s: {e}")
        rank = None

    duration = time.time() - start_time # Calculate total duration of ranking attempt
    if duration > timeout:
-
+        log_warning(f"Ranking question by model {ranking_model_id} exceeded timeout of {timeout:.2f}s and took {duration:.2f}s.")
        rank = None # Ensure rank is None if timeout occurs

    time.sleep(time_sleep) # Keep a small delay to avoid overwhelming APIs even in parallel

@@ -462,13 +506,13 @@ def get_answer_from_model(model_id, question, consecutive_failures, failure_thre
        answer = response.strip()
    except Exception as e:
        duration = time.time() - start_time
-
+        log_warning(f"Model {model_id} answering timed out or failed after {duration:.2f}s: {e}")
        answer = "Error answering - Timeout" # Or a specific timeout error message
        return answer, duration # Return error answer and duration

    time.sleep(time_sleep) # Small delay
    duration = time.time() - start_time # Calculate duration
-
+    st.write(f"Answer generation by \"{model_id}\": {duration:.2f}s") # Print answer generation duration separately

    return answer, duration # Return answer and duration

@@ -523,15 +567,18 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
    s_t = 0 #count succesful iterations

    for iteration in range(t): # Added iteration counter
+        # Update the progress bar
+        progress_percentage = min(100, (iteration / t) * 100)
+        st.progress(progress_percentage)

        if len(active_models) < 2:
-
+            st.warning("Fewer than 2 active models remaining. Exiting benchmark.")
            break

        topic = random.choice(topics)
        # --- Select difficulty with probabilities ---
        difficulty = random.choices(difficulty_choices, weights=probability_values, k=1)[0] # Weighted random choice
-
+        update_status(f"--- Iteration {s_t + 1}/{t}: {difficulty} question ({difficulty_mapping[difficulty]}) on {topic} ---")

        # --- Question Generation ---
        question = None

@@ -552,12 +599,13 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
                if model_config[model_id].get("role", "both") in ["answer", "both"]
            ]
            if not question_gen_candidates: # No suitable models left
-
+                st.warning("No models available for question generation with 'answer' or 'both' role. Skipping iteration.")
                continue # Skip to next iteration

            question_generator_model_id = random.choice(question_gen_candidates)

            # --- Question Generation ---
+            update_progress(f"Generating question using model {question_generator_model_id}...")
            response = make_hf_request(model_config[question_generator_model_id]["name"],
                                       [{"role": "user", "content": question_prompt}],
                                       question_temp,

@@ -569,25 +617,26 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
                consecutive_failures[question_generator_model_id] = 0 # Reset on success
                break
            else:
-
+                log_warning(f"Skipping due to request failure for model {question_generator_model_id}.")
                consecutive_failures[question_generator_model_id] += 1

                if consecutive_failures[question_generator_model_id] >= failure_threshold:
-
+                    st.warning(f"Model {question_generator_model_id} is unresponsive (question gen). Removing from active models.")
                    if question_generator_model_id in active_models:
                        active_models.remove(question_generator_model_id)
                        unresponsive_models.add(question_generator_model_id)
            time.sleep(time_sleep)

        if question is None:
-
+            st.warning(f"Failed to generate a question after {max_attempts} attempts. Skipping this round.")
            continue

        # --- Parallel Question Ranking ---
        question_ranks = {}
        question_ranking_futures = []
        question_ranking_start_time = time.time()
-
+
+        update_progress(f"Ranking generated question...")
        with concurrent.futures.ThreadPoolExecutor(max_workers=len(active_models) or 1) as executor:
            for ranking_model_id in active_models:
                # --- Filter for ranking roles ("rank" or "both") ---

@@ -626,33 +675,34 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):

        #check that the length is correct
        if len(weights_for_valid_question_ranks) != len(valid_question_ranks_values):
-
-
-
+            log_warning("Mismatch length of weights and valid question ranks")
+            log_info(f'weights_for_valid_question_ranks {weights_for_valid_question_ranks}')
+            log_info(f'valid_question_ranks_values: {valid_question_ranks_values}')

        question_avg_rank = np.average(valid_question_ranks_values, weights=weights_for_valid_question_ranks)
        min_question_rank = min(valid_question_ranks_values) if valid_question_ranks_values else 0 # To avoid error if no valid rank

        if question_avg_rank >= question_treshold and all(rank > reject_rank for rank in valid_question_ranks_values): # Question acceptance criteria
            question_accepted = True
-
+            st.write(f"Question accepted. Avg Question Rank: {question_avg_rank:.2f}, Min Rank: {min_question_rank}, Ranks: {[question_ranks[m] for m in active_models if m in question_ranks]}")
            s_t += 1
        else:
            question_accepted = False
-
+            st.write(f"Question rejected. Avg Question Rank: {question_avg_rank:.2f}, Min Rank: {min_question_rank}, Ranks: {[question_ranks[m] for m in active_models if m in question_ranks]}")

        if not question_accepted:
-
+            update_progress("Generated question was not accepted. Regenerating question.")
            continue

        if len(active_models) < 2:
-
+            st.warning("Fewer than 2 active models remaining. Exiting benchmark.")
            break

        # --- Parallel Answer Generation ---
        answers = {}
        answer_futures = []
        answer_durations = {}
+        update_progress("Generating answers from all models...")
        with concurrent.futures.ThreadPoolExecutor(max_workers=len(active_models)) as executor:
            for model_id in active_models:
                # --- Filter for answer generation roles ("answer" or "both") ---

@@ -672,7 +722,7 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
                    )
                    answer_futures.append(future)
                except TimeoutError as e:
-
+                    log_error(f"Answer generation for model {model_id} timed out: {e}")
                    answer = "I am struggling to answer this question" # Treat timeout as error
                    duration = 120 # You can set a default duration or handle it differently if needed
                    answers[model_id] = answer # Store error answer

@@ -691,14 +741,14 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
        if iteration == 0: # Write header only for the first iteration
            iteration_results_file_opened.write("Iteration, Topic, Difficulty, Question Rank, QR Duration, Model,Cumulative Avg Rank,Iteration Avg Rank,Ranks,Ranking Duration (sec)\n") # Added Ranking Duration to header

-
+        update_progress("Ranking all answers...")
        for model_id in active_models:
            answer = answers[model_id] # Retrieve pre-generated answer

            if answer == "Error answering": # Handle answer generation errors
                consecutive_failures[model_id] += 1
                if consecutive_failures[model_id] >= failure_threshold:
-
+                    st.warning(f"Model {model_id} is consistently failing to answer. Removing from active models.")
                    if model_id in active_models: # double check before removing, might have been removed in another thread
                        active_models.remove(model_id)
                        unresponsive_models.add(model_id)

@@ -706,7 +756,7 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):


            if len(active_models) < 2: # Re-check active models before ranking
-
+                st.warning("Fewer than 2 active models remaining. Exiting benchmark.")
                break

            ranks = {}

@@ -751,9 +801,9 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):


            if len(weights_for_valid_ranks) != len(valid_ranks_values):
-
-
-
+                log_warning("Mismatch length of weights and valid answer ranks")
+                log_info(f'weights_for_valid_ranks {weights_for_valid_ranks}')
+                log_info(f'valid_ranks_values: {valid_ranks_values}')

            average_rank = np.average(valid_ranks_values, weights=weights_for_valid_ranks)

@@ -775,7 +825,7 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):

            # --- Print and store iteration results IMMEDIATELY after ranking for this model ---
            ranks_str = "[" + ", ".join(map(str, [ranks[m] for m in active_models if m in ranks])) + "]" if ranks else "[]" # Format ranks for CSV, ensure order
-
+            st.write(f"{topic}, {difficulty_mapping[difficulty]}, {model_id}, {cumulative_avg_rank[model_id]:.2f}, {average_rank:.5f}, {ranks_str}, {ranking_duration:.2f} sec")

            # Write iteration results to file (append mode) - write for each model right after ranking
            iteration_results_file_opened.write(f"{iteration+1},{topic}, {difficulty_mapping[difficulty]},{question_avg_rank:.2f},{question_ranking_duration_total:.2f},{model_id},{cumulative_avg_rank[model_id]:.2f},{average_rank:.2f},{ranks_str},{ranking_duration:.2f}\n")

@@ -802,7 +852,7 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):

    iteration_results_file_opened.close()

-
+    st.write(f"Unresponsive models during this run: {unresponsive_models}")
    return results, cumulative_avg_rank, s_t

def check_model_availability(models, token):
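The additions above amount to a small Streamlit logging and progress layer that the rest of the diff then calls from the retry, question-generation, ranking, and answer paths. The following is a minimal, self-contained sketch of that pattern, using only public Streamlit APIs (st.session_state, st.empty, st.expander, st.text_area); the helper names mirror the diff, while the button-driven demo at the end and its placeholder messages (including "model-a") are illustrative assumptions, not part of the benchmark app.

# Minimal sketch of the logging/progress pattern added in this commit.
# Run with `streamlit run sketch.py`. The demo button at the end is
# illustrative only and does not exist in the benchmark app.
import time
import streamlit as st

# Persist log lines across Streamlit reruns.
if "log_messages" not in st.session_state:
    st.session_state.log_messages = []

# Placeholders that later calls overwrite in place.
status_area = st.empty()    # current status line
progress_area = st.empty()  # progress messages

# Collapsible log view.
with st.expander("Execution Log", expanded=False):
    log_area = st.empty()

def update_log():
    """Re-render the collected log lines inside the expander."""
    log_area.text_area("System Log",
                       value="\n".join(st.session_state.log_messages),
                       height=300)

def log_message(message, level="INFO"):
    """Append a timestamped line, cap the buffer at 500 lines, refresh the view."""
    timestamp = time.strftime("%H:%M:%S")
    st.session_state.log_messages.append(f"[{timestamp}] {level}: {message}")
    st.session_state.log_messages = st.session_state.log_messages[-500:]
    update_log()

def log_info(message):
    log_message(message, "INFO")

def log_warning(message):
    log_message(message, "WARNING")

def log_error(message):
    log_message(message, "ERROR")

def update_status(message):
    status_area.write(message)

def update_progress(message):
    progress_area.write(message)

# Illustrative driver (hypothetical): one click simulates a benchmark step.
if st.button("Simulate one benchmark step"):
    update_status("--- Iteration 1/10: easy question on math ---")
    update_progress("Generating question...")
    log_info("Question accepted. Avg Question Rank: 4.20")
    log_warning("Model model-a returned non-integer rank: 'five'. Rank set to None.")

Because the helpers live at module level and write into st.empty() placeholders, each call overwrites the previous status or progress message in place, while the log itself accumulates in st.session_state across reruns.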
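The retry_api_request hunk only shows the inner try/except; the decorator scaffolding around it lies outside the diff's context lines. Purely for orientation, a decorator of roughly the following shape would be consistent with the visible lines. The wrapper, the while loop, and the retries initialisation are assumptions, and log_error/log_info are the helpers defined in the diff (or in the sketch above), not defined here.

# Sketch of a retry decorator consistent with the retry_api_request hunk.
# Assumes log_error / log_info from the diff are in scope.
import functools
import time

def retry_api_request(max_retries=3, wait_time=10):
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            retries = 0  # assumed initialisation, not shown in the diff
            while retries <= max_retries:
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    log_error(f"API error: {e}")
                    if retries < max_retries:
                        log_info(f"Waiting for {wait_time} seconds before retrying... "
                                 f"(Retry {retries + 1}/{max_retries})")
                        time.sleep(wait_time)
                        retries += 1
                    else:
                        log_error("Max retries reached. Request failed.")
                        return None
            return None
        return wrapper
    return decorator

Under these assumptions the wrapped call is attempted once plus up to max_retries retries, and the wrapper returns None once the retry budget is exhausted.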