Update app.py
app.py (CHANGED)
@@ -9,6 +9,50 @@ import time
import random
import functools

+# Create a logging system for errors and warnings
+if 'log_messages' not in st.session_state:
+    st.session_state.log_messages = []
+
+# Create areas for different outputs
+status_area = st.empty() # For current status
+progress_area = st.empty() # For progress updates
+
+# Collapsible section for logs
+with st.expander("Execution Log", expanded=False):
+    log_area = st.empty()
+
+def update_log():
+    """Update the log display with current messages"""
+    log_area.text_area("System Log", value="\n".join(st.session_state.log_messages), height=300)
+
+def log_message(message, level="INFO"):
+    """Log a message with timestamp and level"""
+    timestamp = time.strftime("%H:%M:%S")
+    formatted_msg = f"[{timestamp}] {level}: {message}"
+    st.session_state.log_messages.append(formatted_msg)
+    # Limit log size
+    if len(st.session_state.log_messages) > 500:
+        st.session_state.log_messages = st.session_state.log_messages[-500:]
+    update_log()
+
+# Specialized logging functions
+def log_info(message):
+    log_message(message, "INFO")
+
+def log_warning(message):
+    log_message(message, "WARNING")
+
+def log_error(message):
+    log_message(message, "ERROR")
+
+# Function to update status
+def update_status(message):
+    status_area.write(message)
+
+# Function to update progress message
+def update_progress(message):
+    progress_area.write(message)
+
# FILES
iteration_output_file = "llm_benchmark_iteration_results.csv" # File to store iteration results, defined as global
results_file = "llm_benchmark_results.csv" # all data

@@ -48,13 +92,13 @@ def retry_api_request(max_retries=3, wait_time=10):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
-
+                    log_error(f"API error: {e}")
                    if retries < max_retries:
-
+                        log_info(f"Waiting for {wait_time} seconds before retrying... (Retry {retries + 1}/{max_retries})")
                        time.sleep(wait_time)
                        retries += 1
                    else:
-
+                        log_error(f"Max retries reached. Request failed.")
                        return None

            return None

@@ -101,7 +145,7 @@ def make_hf_request(model_name, messages, temperature, max_tokens, token=None):
        )
        return response
    except Exception as e:
-
+        log_error(f"Hugging Face Inference API error: {e}")
        return None

# --- Prompting Functions ---

@@ -286,7 +330,7 @@ def generate_question_prompt(topic, difficulty):
    if topic in topic_instructions:
        prompt += random.choice(topic_instructions[topic]) + "\n"
    else:
-
+        log_warning(f"No topic_instructions defined for topic '{topic}'")

    # 5. Conditional Question Types (Not for math, logics, grammar)
    if topic not in ["math", "logics", "grammar", "coding", "creative writing"]:

@@ -372,14 +416,14 @@ def parse_rank_string(rank_str, ranking_model_id):
        try:
            rank_val = int(rank_str) # Convert to integer *after* regex extraction
            if not 1 <= rank_val <= 5: # Check if rank is within valid range
-
+                log_warning(f"Model {ranking_model_id} returned rank outside of valid range [1-5]: {rank_val}. Rank set to None.")
                return None
            return rank_val
        except ValueError:
-
+            log_warning(f"Model {ranking_model_id} returned non-integer rank after regex extraction: '{rank_str}'. Rank set to None.")
            return None
    else:
-
+        log_warning(f"Model {ranking_model_id} returned non-numeric rank: '{rank_str}'. Rank set to None.")
        return None

# --- Helper Function for Parallel Ranking ---

@@ -396,18 +440,18 @@ def get_rank_from_model(ranking_model_id, question, answer, consecutive_failures
                rank_str = response.strip()
                rank = parse_rank_string(rank_str, ranking_model_id)
            except ValueError:
-
+                log_warning(f"Model {ranking_model_id} returned non-integer rank: '{rank_str}'. Rank set to None.")
                rank = None
        else:
-
+            log_warning(f"Model {ranking_model_id} failed to provide rank. Rank set to None.")
    except Exception as e:
        duration = time.time() - start_time
-
+        log_warning(f"Model {ranking_model_id} ranking timed out or failed after {duration:.2f}s: {e}")
        rank = None

    duration = time.time() - start_time # Calculate total duration of ranking attempt
    if duration > timeout:
-
+        log_warning(f"Ranking by model {ranking_model_id} exceeded timeout of {timeout:.2f}s and took {duration:.2f}s.")
        rank = None # Ensure rank is None if timeout occurs

    time.sleep(time_sleep) # Keep a small delay to avoid overwhelming APIs even in parallel

@@ -427,18 +471,18 @@ def get_question_rank_from_model(ranking_model_id, question, topic, difficulty,
                rank_str = response.strip()
                rank = parse_rank_string(rank_str, ranking_model_id)
            except ValueError:
-
+                log_warning(f"Model {ranking_model_id} returned non-integer rank for question: '{rank_str}'. Rank set to None.")
                rank = None
        else:
-
+            log_warning(f"Model {ranking_model_id} failed to provide rank for question. Rank set to None.")
    except Exception as e:
        duration = time.time() - start_time
-
+        log_warning(f"Model {ranking_model_id} ranking question timed out or failed after {duration:.2f}s: {e}")
        rank = None

    duration = time.time() - start_time # Calculate total duration of ranking attempt
    if duration > timeout:
-
+        log_warning(f"Ranking question by model {ranking_model_id} exceeded timeout of {timeout:.2f}s and took {duration:.2f}s.")
        rank = None # Ensure rank is None if timeout occurs

    time.sleep(time_sleep) # Keep a small delay to avoid overwhelming APIs even in parallel

@@ -462,13 +506,13 @@ def get_answer_from_model(model_id, question, consecutive_failures, failure_thre
        answer = response.strip()
    except Exception as e:
        duration = time.time() - start_time
-
+        log_warning(f"Model {model_id} answering timed out or failed after {duration:.2f}s: {e}")
        answer = "Error answering - Timeout" # Or a specific timeout error message
        return answer, duration # Return error answer and duration

    time.sleep(time_sleep) # Small delay
    duration = time.time() - start_time # Calculate duration
-
+    st.write(f"Answer generation by \"{model_id}\": {duration:.2f}s") # Print answer generation duration separately

    return answer, duration # Return answer and duration

@@ -523,15 +567,18 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
    s_t = 0 #count succesful iterations

    for iteration in range(t): # Added iteration counter
+        # Update the progress bar
+        progress_percentage = min(100, (iteration / t) * 100)
+        st.progress(progress_percentage)

        if len(active_models) < 2:
-
+            st.warning("Fewer than 2 active models remaining. Exiting benchmark.")
            break

        topic = random.choice(topics)
        # --- Select difficulty with probabilities ---
        difficulty = random.choices(difficulty_choices, weights=probability_values, k=1)[0] # Weighted random choice
-
+        update_status(f"--- Iteration {s_t + 1}/{t}: {difficulty} question ({difficulty_mapping[difficulty]}) on {topic} ---")

        # --- Question Generation ---
        question = None

@@ -552,12 +599,13 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
                if model_config[model_id].get("role", "both") in ["answer", "both"]
            ]
            if not question_gen_candidates: # No suitable models left
-
+                st.warning("No models available for question generation with 'answer' or 'both' role. Skipping iteration.")
                continue # Skip to next iteration

            question_generator_model_id = random.choice(question_gen_candidates)

            # --- Question Generation ---
+            update_progress(f"Generating question using model {question_generator_model_id}...")
            response = make_hf_request(model_config[question_generator_model_id]["name"],
                                       [{"role": "user", "content": question_prompt}],
                                       question_temp,

@@ -569,25 +617,26 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
                consecutive_failures[question_generator_model_id] = 0 # Reset on success
                break
            else:
-
+                log_warning(f"Skipping due to request failure for model {question_generator_model_id}.")
                consecutive_failures[question_generator_model_id] += 1

                if consecutive_failures[question_generator_model_id] >= failure_threshold:
-
+                    st.warning(f"Model {question_generator_model_id} is unresponsive (question gen). Removing from active models.")
                    if question_generator_model_id in active_models:
                        active_models.remove(question_generator_model_id)
                        unresponsive_models.add(question_generator_model_id)
            time.sleep(time_sleep)

        if question is None:
-
+            st.warning(f"Failed to generate a question after {max_attempts} attempts. Skipping this round.")
            continue

        # --- Parallel Question Ranking ---
        question_ranks = {}
        question_ranking_futures = []
        question_ranking_start_time = time.time()
-
+
+        update_progress(f"Ranking generated question...")
        with concurrent.futures.ThreadPoolExecutor(max_workers=len(active_models) or 1) as executor:
            for ranking_model_id in active_models:
                # --- Filter for ranking roles ("rank" or "both") ---

@@ -626,33 +675,34 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):

        #check that the length is correct
        if len(weights_for_valid_question_ranks) != len(valid_question_ranks_values):
-
-
-
+            log_warning("Mismatch length of weights and valid question ranks")
+            log_info(f'weights_for_valid_question_ranks {weights_for_valid_question_ranks}')
+            log_info(f'valid_question_ranks_values: {valid_question_ranks_values}')

        question_avg_rank = np.average(valid_question_ranks_values, weights=weights_for_valid_question_ranks)
        min_question_rank = min(valid_question_ranks_values) if valid_question_ranks_values else 0 # To avoid error if no valid rank

        if question_avg_rank >= question_treshold and all(rank > reject_rank for rank in valid_question_ranks_values): # Question acceptance criteria
            question_accepted = True
-
+            st.write(f"Question accepted. Avg Question Rank: {question_avg_rank:.2f}, Min Rank: {min_question_rank}, Ranks: {[question_ranks[m] for m in active_models if m in question_ranks]}")
            s_t += 1
        else:
            question_accepted = False
-
+            st.write(f"Question rejected. Avg Question Rank: {question_avg_rank:.2f}, Min Rank: {min_question_rank}, Ranks: {[question_ranks[m] for m in active_models if m in question_ranks]}")

        if not question_accepted:
-
+            update_progress("Generated question was not accepted. Regenerating question.")
            continue

        if len(active_models) < 2:
-
+            st.warning("Fewer than 2 active models remaining. Exiting benchmark.")
            break

        # --- Parallel Answer Generation ---
        answers = {}
        answer_futures = []
        answer_durations = {}
+        update_progress("Generating answers from all models...")
        with concurrent.futures.ThreadPoolExecutor(max_workers=len(active_models)) as executor:
            for model_id in active_models:
                # --- Filter for answer generation roles ("answer" or "both") ---

@@ -672,7 +722,7 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
                    )
                    answer_futures.append(future)
                except TimeoutError as e:
-
+                    log_error(f"Answer generation for model {model_id} timed out: {e}")
                    answer = "I am struggling to answer this question" # Treat timeout as error
                    duration = 120 # You can set a default duration or handle it differently if needed
                    answers[model_id] = answer # Store error answer

@@ -691,14 +741,14 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
        if iteration == 0: # Write header only for the first iteration
            iteration_results_file_opened.write("Iteration, Topic, Difficulty, Question Rank, QR Duration, Model,Cumulative Avg Rank,Iteration Avg Rank,Ranks,Ranking Duration (sec)\n") # Added Ranking Duration to header

-
+        update_progress("Ranking all answers...")
        for model_id in active_models:
            answer = answers[model_id] # Retrieve pre-generated answer

            if answer == "Error answering": # Handle answer generation errors
                consecutive_failures[model_id] += 1
                if consecutive_failures[model_id] >= failure_threshold:
-
+                    st.warning(f"Model {model_id} is consistently failing to answer. Removing from active models.")
                    if model_id in active_models: # double check before removing, might have been removed in another thread
                        active_models.remove(model_id)
                        unresponsive_models.add(model_id)

@@ -706,7 +756,7 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):


            if len(active_models) < 2: # Re-check active models before ranking
-
+                st.warning("Fewer than 2 active models remaining. Exiting benchmark.")
                break

            ranks = {}

@@ -751,9 +801,9 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):


            if len(weights_for_valid_ranks) != len(valid_ranks_values):
-
-
-
+                log_warning("Mismatch length of weights and valid answer ranks")
+                log_info(f'weights_for_valid_ranks {weights_for_valid_ranks}')
+                log_info(f'valid_ranks_values: {valid_ranks_values}')

            average_rank = np.average(valid_ranks_values, weights=weights_for_valid_ranks)

@@ -775,7 +825,7 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):

            # --- Print and store iteration results IMMEDIATELY after ranking for this model ---
            ranks_str = "[" + ", ".join(map(str, [ranks[m] for m in active_models if m in ranks])) + "]" if ranks else "[]" # Format ranks for CSV, ensure order
-
+            st.write(f"{topic}, {difficulty_mapping[difficulty]}, {model_id}, {cumulative_avg_rank[model_id]:.2f}, {average_rank:.5f}, {ranks_str}, {ranking_duration:.2f} sec")

            # Write iteration results to file (append mode) - write for each model right after ranking
            iteration_results_file_opened.write(f"{iteration+1},{topic}, {difficulty_mapping[difficulty]},{question_avg_rank:.2f},{question_ranking_duration_total:.2f},{model_id},{cumulative_avg_rank[model_id]:.2f},{average_rank:.2f},{ranks_str},{ranking_duration:.2f}\n")

@@ -802,7 +852,7 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):

    iteration_results_file_opened.close()

-
+    st.write(f"Unresponsive models during this run: {unresponsive_models}")
    return results, cumulative_avg_rank, s_t

def check_model_availability(models, token):
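The additions above amount to a small Streamlit logging and progress layer that the rest of the diff then calls from the retry, question-generation, ranking, and answer paths. The following is a minimal, self-contained sketch of that pattern, using only public Streamlit APIs (st.session_state, st.empty, st.expander, st.text_area); the helper names mirror the diff, while the button-driven demo at the end and its placeholder messages (including "model-a") are illustrative assumptions, not part of the benchmark app.

# Minimal sketch of the logging/progress pattern added in this commit.
# Run with `streamlit run sketch.py`. The demo button at the end is
# illustrative only and does not exist in the benchmark app.
import time
import streamlit as st

# Persist log lines across Streamlit reruns.
if "log_messages" not in st.session_state:
    st.session_state.log_messages = []

# Placeholders that later calls overwrite in place.
status_area = st.empty()    # current status line
progress_area = st.empty()  # progress messages

# Collapsible log view.
with st.expander("Execution Log", expanded=False):
    log_area = st.empty()

def update_log():
    """Re-render the collected log lines inside the expander."""
    log_area.text_area("System Log",
                       value="\n".join(st.session_state.log_messages),
                       height=300)

def log_message(message, level="INFO"):
    """Append a timestamped line, cap the buffer at 500 lines, refresh the view."""
    timestamp = time.strftime("%H:%M:%S")
    st.session_state.log_messages.append(f"[{timestamp}] {level}: {message}")
    st.session_state.log_messages = st.session_state.log_messages[-500:]
    update_log()

def log_info(message):
    log_message(message, "INFO")

def log_warning(message):
    log_message(message, "WARNING")

def log_error(message):
    log_message(message, "ERROR")

def update_status(message):
    status_area.write(message)

def update_progress(message):
    progress_area.write(message)

# Illustrative driver (hypothetical): one click simulates a benchmark step.
if st.button("Simulate one benchmark step"):
    update_status("--- Iteration 1/10: easy question on math ---")
    update_progress("Generating question...")
    log_info("Question accepted. Avg Question Rank: 4.20")
    log_warning("Model model-a returned non-integer rank: 'five'. Rank set to None.")

Because the helpers live at module level and write into st.empty() placeholders, each call overwrites the previous status or progress message in place, while the log itself accumulates in st.session_state across reruns.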
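The retry_api_request hunk only shows the inner try/except; the decorator scaffolding around it lies outside the diff's context lines. Purely for orientation, a decorator of roughly the following shape would be consistent with the visible lines. The wrapper, the while loop, and the retries initialisation are assumptions, and log_error/log_info are the helpers defined in the diff (or in the sketch above), not defined here.

# Sketch of a retry decorator consistent with the retry_api_request hunk.
# Assumes log_error / log_info from the diff are in scope.
import functools
import time

def retry_api_request(max_retries=3, wait_time=10):
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            retries = 0  # assumed initialisation, not shown in the diff
            while retries <= max_retries:
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    log_error(f"API error: {e}")
                    if retries < max_retries:
                        log_info(f"Waiting for {wait_time} seconds before retrying... "
                                 f"(Retry {retries + 1}/{max_retries})")
                        time.sleep(wait_time)
                        retries += 1
                    else:
                        log_error("Max retries reached. Request failed.")
                        return None
            return None
        return wrapper
    return decorator

Under these assumptions the wrapped call is attempted once plus up to max_retries retries, and the wrapper returns None once the retry budget is exhausted.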