PeterKruger committed on
Commit 3116207 · verified · 1 Parent(s): dfa358f

Update app.py

Files changed (1)
  1. app.py +115 -84
app.py CHANGED
@@ -12,13 +12,17 @@ import sys
 import io
 from contextlib import redirect_stdout, redirect_stderr
 
-# Initialize session state variables
+# Initialize session state variables properly at the very beginning
 if 'main_output' not in st.session_state:
-    st.session_state.main_output = []
+    st.session_state['main_output'] = []
 if 'debug_output' not in st.session_state:
-    st.session_state.debug_output = []
+    st.session_state['debug_output'] = []
 if 'progress' not in st.session_state:
-    st.session_state.progress = 0
+    st.session_state['progress'] = 0
+if 'results_df' not in st.session_state:
+    st.session_state['results_df'] = pd.DataFrame()
+if 'is_running' not in st.session_state:
+    st.session_state['is_running'] = False
 
 # FILES
 iteration_output_file = "llm_benchmark_iteration_results.csv" # File to store iteration results, defined as global
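
The widened initialization block above is what later lets the log tabs and the auto-refresh check read `results_df` and `is_running` safely on every rerun. For readers unfamiliar with the idiom, here is a minimal, self-contained sketch of the same guard-and-default pattern; the `init_state` helper and its default factories are illustrative assumptions, not code from app.py.

```python
import pandas as pd
import streamlit as st

def init_state(defaults: dict) -> None:
    """Create each session_state key only if it is missing (hypothetical helper)."""
    for key, factory in defaults.items():
        if key not in st.session_state:
            # Call the factory so every rerun reuses the same stored object.
            st.session_state[key] = factory()

init_state({
    "main_output": list,          # log lines shown in the Execution Log tab
    "debug_output": list,         # warnings/errors shown in the Debug Log tab
    "progress": lambda: 0.0,      # fraction displayed by st.progress
    "results_df": pd.DataFrame,   # last benchmark results
    "is_running": lambda: False,  # flag polled by the auto-refresh block
})
```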
@@ -55,21 +59,15 @@ def custom_print(*args, **kwargs):
     output = ' '.join(map(str, args))
 
     # Add to main output list
-    st.session_state.main_output.append(output)
+    st.session_state['main_output'].append(output)
 
     # Also print to standard output for console logging
     print(*args, **kwargs)
-
-    # Force an immediate update of the UI (when used inside a function)
-    st.session_state.update_counter = st.session_state.get('update_counter', 0) + 1
 
 # Custom function to capture warnings and errors
 def log_debug(message):
-    st.session_state.debug_output.append(message)
+    st.session_state['debug_output'].append(message)
     print(f"DEBUG: {message}", file=sys.stderr)
-
-    # Force an immediate update of the UI
-    st.session_state.update_counter = st.session_state.get('update_counter', 0) + 1
 
 def retry_api_request(max_retries=3, wait_time=10):
     """Decorator for retrying API requests with rate limit handling."""
@@ -511,6 +509,8 @@ def get_answer_from_model(model_id, question, consecutive_failures, failure_thre
 
 # --- Core Logic ---
 def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
+    st.session_state['is_running'] = True
+
     results = {
         "model_name": [],
         "topic": [],
@@ -561,7 +561,7 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
 
     for iteration in range(t): # Added iteration counter
         # Update progress in the Streamlit app
-        st.session_state.progress = (iteration + 1) / t
+        st.session_state['progress'] = (iteration + 1) / t
 
         if len(active_models) < 2:
             custom_print("Fewer than 2 active models remaining. Exiting benchmark.")
@@ -647,8 +647,11 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
                 question_ranking_futures.append(future)
 
         for future in concurrent.futures.as_completed(question_ranking_futures): # Collect ranks as they become available
-            ranking_model_id, rank = future.result() # Get model_id and rank
-            question_ranks[ranking_model_id] = rank # Store rank with model_id as key
+            try:
+                ranking_model_id, rank = future.result() # Get model_id and rank
+                question_ranks[ranking_model_id] = rank # Store rank with model_id as key
+            except Exception as e:
+                log_debug(f"Error getting question rank result: {e}")
 
         question_ranking_end_time = time.time()
         question_ranking_duration_total = question_ranking_end_time - question_ranking_start_time
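
The change above wraps `future.result()` in `try/except` because `concurrent.futures` re-raises a worker's exception at the `result()` call, so a single failed ranking request would otherwise abort the whole collection loop. A minimal, standalone sketch of the same pattern (the `get_rank` stub and model names are invented for illustration):

```python
import concurrent.futures
import random

def get_rank(model_id: str) -> tuple[str, int]:
    """Stand-in for a ranking API call: may raise, like a real request."""
    if random.random() < 0.2:
        raise RuntimeError(f"simulated API failure for {model_id}")
    return model_id, random.randint(1, 5)

models = ["model-a", "model-b", "model-c"]
ranks: dict[str, int] = {}

with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(get_rank, m) for m in models]
    for future in concurrent.futures.as_completed(futures):
        try:
            model_id, rank = future.result()  # the worker's exception re-raises here
            ranks[model_id] = rank
        except Exception as exc:              # one bad worker no longer kills the loop
            print(f"rank collection failed: {exc}")

print(ranks)
```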
@@ -674,11 +677,11 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
 
         if question_avg_rank >= question_treshold and all(rank > reject_rank for rank in valid_question_ranks_values): # Question acceptance criteria
             question_accepted = True
-            custom_print(f"Question accepted. Avg Question Rank: {question_avg_rank:.2f}, Min Rank: {min_question_rank}, Ranks: {[question_ranks[m] for m in active_models if m in question_ranks]}")
+            custom_print(f"Question accepted. Avg Question Rank: {question_avg_rank:.2f}, Min Rank: {min_question_rank}, Ranks: {[question_ranks.get(m, None) for m in active_models if m in question_ranks]}")
             s_t += 1
         else:
             question_accepted = False
-            custom_print(f"Question rejected. Avg Question Rank: {question_avg_rank:.2f}, Min Rank: {min_question_rank}, Ranks: {[question_ranks[m] for m in active_models if m in question_ranks]}")
+            custom_print(f"Question rejected. Avg Question Rank: {question_avg_rank:.2f}, Min Rank: {min_question_rank}, Ranks: {[question_ranks.get(m, None) for m in active_models if m in question_ranks]}")
 
         if not question_accepted:
             custom_print("Generated question was not accepted. Regenerating question.")
@@ -709,35 +712,43 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
                         token,
                         timeout=60
                     )
-                    answer_futures.append(future)
-                except TimeoutError as e:
-                    log_debug(f"Answer generation for model {model_id} timed out: {e}")
-                    answer = "I am struggling to answer this question" # Treat timeout as error
-                    duration = 120 # You can set a default duration or handle it differently if needed
-                    answers[model_id] = answer # Store error answer
-                    answer_durations[model_id] = duration # Store default duration
-
-        for future in concurrent.futures.as_completed(answer_futures):
-            model_id = active_models[answer_futures.index(future)] # Get model_id based on future index (order is preserved)
-            answer, duration = future.result() # Get both answer and duration - unpack the returned tuple
-            answers[model_id] = answer
-            answer_durations[model_id] = duration # Store duration - store the duration in the answer_durations dictionary
+                    answer_futures.append((model_id, future))
+                except Exception as e:
+                    log_debug(f"Error submitting answer task for {model_id}: {e}")
+                    answer = "Error answering - Task submission failed"
+                    duration = 0
+                    answers[model_id] = answer
+                    answer_durations[model_id] = duration
+
+        for model_id, future in answer_futures:
+            try:
+                answer, duration = future.result() # Get both answer and duration
+                answers[model_id] = answer
+                answer_durations[model_id] = duration
+            except Exception as e:
+                log_debug(f"Error getting answer from {model_id}: {e}")
+                answers[model_id] = "Error answering - Future result failed"
+                answer_durations[model_id] = 0
 
         # --- Ranking Process ---
 
         # Prepare to write to file (open in append mode outside the model loop but inside iteration loop)
-        iteration_results_file_opened = open(iteration_output_file, 'a')
-        if iteration == 0: # Write header only for the first iteration
-            iteration_results_file_opened.write("Iteration, Topic, Difficulty, Question Rank, QR Duration, Model,Cumulative Avg Rank,Iteration Avg Rank,Ranks,Ranking Duration (sec)\n") # Added Ranking Duration to header
+        try:
+            iteration_results_file_opened = open(iteration_output_file, 'a')
+            if iteration == 0: # Write header only for the first iteration
+                iteration_results_file_opened.write("Iteration, Topic, Difficulty, Question Rank, QR Duration, Model,Cumulative Avg Rank,Iteration Avg Rank,Ranks,Ranking Duration (sec)\n") # Added Ranking Duration to header
+        except Exception as e:
+            log_debug(f"Error opening results file: {e}")
+            iteration_results_file_opened = None
 
 
         for model_id in active_models:
-            answer = answers.get(model_id)
-            if not answer: # Add guard clause
+            if model_id not in answers:
                 log_debug(f"No answer found for model {model_id}. Skipping ranking.")
                 continue
 
-            if answer == "Error answering": # Handle answer generation errors
+            answer = answers[model_id]
+            if answer == "Error answering" or answer.startswith("Error answering -"): # Handle answer generation errors
                 consecutive_failures[model_id] += 1
                 if consecutive_failures[model_id] >= failure_threshold:
                     custom_print(f"Model {model_id} is consistently failing to answer. Removing from active models.")
@@ -759,24 +770,30 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
            for ranking_model_id in active_models:
                # --- Filter for ranking roles ("rank" or "both") ---
                if model_config[ranking_model_id].get("role", "both") in ["rank", "both"]:
-                    future = executor.submit(
-                        get_rank_from_model,
-                        ranking_model_id,
-                        question,
-                        answer,
-                        consecutive_failures,
-                        failure_threshold,
-                        unresponsive_models,
-                        model_config,
-                        topic,
-                        token,
-                        timeout=60
-                    )
-                    ranking_futures.append(future)
+                    try:
+                        future = executor.submit(
+                            get_rank_from_model,
+                            ranking_model_id,
+                            question,
+                            answer,
+                            consecutive_failures,
+                            failure_threshold,
+                            unresponsive_models,
+                            model_config,
+                            topic,
+                            token,
+                            timeout=60
+                        )
+                        ranking_futures.append(future)
+                    except Exception as e:
+                        log_debug(f"Error submitting ranking task for {ranking_model_id}: {e}")
 
            for future in concurrent.futures.as_completed(ranking_futures): # Collect ranks as they become available
-                ranking_model_id, rank = future.result() # Get model_id and rank
-                ranks[ranking_model_id] = rank # Store rank with model_id as key
+                try:
+                    ranking_model_id, rank = future.result() # Get model_id and rank
+                    ranks[ranking_model_id] = rank # Store rank with model_id as key
+                except Exception as e:
+                    log_debug(f"Error getting rank result: {e}")
 
            ranking_end_time = time.time() # Record end time of ranking
            ranking_duration = ranking_end_time - ranking_start_time # Calculate duration
@@ -807,20 +824,29 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
            results["answer"].append(answer)
            results["answer_generation_duration"].append(answer_durations.get(model_id, 0))
            results["average_rank"].append(average_rank)
-            results["ranks"].append([ranks[m] for m in active_models if m in ranks]) # Store raw ranks including Nones, ensure order
+            results["ranks"].append([ranks.get(m, None) for m in active_models if m in ranks]) # Store raw ranks including Nones, ensure order
            results["question_rank_average"].append(question_avg_rank) # Store question rank average
-            results["question_ranks"].append([question_ranks[m] for m in active_models if m in question_ranks]) # Store question ranks
+            results["question_ranks"].append([question_ranks.get(m, None) for m in active_models if m in question_ranks]) # Store question ranks
            results["question_rank_duration"].append(question_ranking_duration_total) # Store question ranking duration
 
-            cumulative_model_ranks[model_id].append(average_rank) # Append current iteration's average rank
-            cumulative_avg_rank[model_id] = np.nanmean(cumulative_model_ranks[model_id]) if cumulative_model_ranks[model_id] else np.nan
+            if model_id in cumulative_model_ranks:
+                cumulative_model_ranks[model_id].append(average_rank) # Append current iteration's average rank
+
+            if model_id in cumulative_model_ranks and cumulative_model_ranks[model_id]:
+                cumulative_avg_rank[model_id] = np.nanmean([r for r in cumulative_model_ranks[model_id] if not np.isnan(r)])
+            else:
+                cumulative_avg_rank[model_id] = np.nan
 
            # --- Print and store iteration results IMMEDIATELY after ranking for this model ---
-            ranks_str = "[" + ", ".join(map(str, [ranks[m] for m in active_models if m in ranks])) + "]" if ranks else "[]" # Format ranks for CSV, ensure order
-            custom_print(f"{topic}, {difficulty_mapping[difficulty]}, {model_id}, {cumulative_avg_rank[model_id]:.2f}, {average_rank:.5f}, {ranks_str}, {ranking_duration:.2f} sec")
+            ranks_str = "[" + ", ".join(map(str, [ranks.get(m, None) for m in active_models if m in ranks])) + "]" if ranks else "[]" # Format ranks for CSV, ensure order
+            custom_print(f"{topic}, {difficulty_mapping[difficulty]}, {model_id}, {cumulative_avg_rank.get(model_id, np.nan):.2f}, {average_rank:.5f}, {ranks_str}, {ranking_duration:.2f} sec")
 
            # Write iteration results to file (append mode) - write for each model right after ranking
-            iteration_results_file_opened.write(f"{iteration+1},{topic}, {difficulty_mapping[difficulty]},{question_avg_rank:.2f},{question_ranking_duration_total:.2f},{model_id},{cumulative_avg_rank[model_id]:.2f},{average_rank:.2f},{ranks_str},{ranking_duration:.2f}\n")
+            if iteration_results_file_opened:
+                try:
+                    iteration_results_file_opened.write(f"{iteration+1},{topic}, {difficulty_mapping[difficulty]},{question_avg_rank:.2f},{question_ranking_duration_total:.2f},{model_id},{cumulative_avg_rank.get(model_id, np.nan):.2f},{average_rank:.2f},{ranks_str},{ranking_duration:.2f}\n")
+                except Exception as e:
+                    log_debug(f"Error writing to results file: {e}")
 
            # Update model weights based on cumulative average ranks, handling NaNs
            temp_weights = {}
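
The cumulative average now filters NaNs before calling `np.nanmean`. Strictly, `np.nanmean` already ignores NaN entries; the extra guard mainly avoids the "Mean of empty slice" RuntimeWarning and the NaN result on an all-NaN list. A small sketch of both behaviors (the `safe_avg_rank` helper is illustrative, not taken from app.py):

```python
import warnings
import numpy as np

def safe_avg_rank(ranks: list[float]) -> float:
    """NaN-ignoring mean that returns NaN quietly when nothing is left (illustrative)."""
    finite = [r for r in ranks if not np.isnan(r)]
    return float(np.mean(finite)) if finite else float("nan")

print(np.nanmean([3.0, np.nan, 5.0]))     # 4.0 - NaNs are skipped automatically
print(safe_avg_rank([3.0, np.nan, 5.0]))  # 4.0 - same result

with warnings.catch_warnings():
    warnings.simplefilter("error")
    try:
        np.nanmean([np.nan, np.nan])      # all-NaN input triggers a RuntimeWarning
    except RuntimeWarning as w:
        print(f"np.nanmean warned: {w}")
print(safe_avg_rank([np.nan, np.nan]))    # nan, with no warning
```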
@@ -842,9 +868,11 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
            for m_id in active_models:
                model_weights[m_id] = 1.0 / len(active_models)
 
-        iteration_results_file_opened.close()
+        if iteration_results_file_opened:
+            iteration_results_file_opened.close()
 
    custom_print(f"Unresponsive models during this run: {unresponsive_models}")
+    st.session_state['is_running'] = False
    return results, cumulative_avg_rank, s_t
 
 def check_model_availability(models, token):
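
The commit guards `open`, `write`, and `close` with explicit checks. An alternative worth noting, and not what the commit does, is to let a `with` block own the file handle so it is closed even when a write fails. The sketch below reuses the CSV filename and header from the diff but writes a dummy row purely for illustration:

```python
import csv
from pathlib import Path

results_path = Path("llm_benchmark_iteration_results.csv")  # same name the app uses

header = ["Iteration", "Topic", "Difficulty", "Question Rank", "QR Duration",
          "Model", "Cumulative Avg Rank", "Iteration Avg Rank", "Ranks",
          "Ranking Duration (sec)"]
row = [1, "physics", "hard", 4.25, 1.93, "model-a", 4.10, 4.25, "[4, 5, 4]", 12.7]  # dummy data

write_header = not results_path.exists()
# The context manager closes the file even if a write raises,
# so no explicit close() or "is it open?" guard is needed.
with results_path.open("a", newline="") as fh:
    writer = csv.writer(fh)
    if write_header:
        writer.writerow(header)
    writer.writerow(row)
```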
@@ -970,14 +998,14 @@ with tab1:
         st.error("You need at least 2 available models to run the benchmark")
 
     # Progress bar
-    progress_bar = st.progress(st.session_state.progress)
+    progress_bar = st.progress(st.session_state['progress'])
 
     # Start benchmark button
     if st.sidebar.button("Start Benchmark"):
         # Clear previous outputs
-        st.session_state.main_output = []
-        st.session_state.debug_output = []
-        st.session_state.progress = 0
+        st.session_state['main_output'] = []
+        st.session_state['debug_output'] = []
+        st.session_state['progress'] = 0
 
         if not hf_token:
             st.error("Please enter your Hugging Face API token")
@@ -989,10 +1017,6 @@ with tab1:
         # Setup to capture results for display
         results_container = st.container()
 
-        # Create a global variable to store intermediate results
-        if 'results_df' not in st.session_state:
-            st.session_state.results_df = pd.DataFrame()
-
         # Run the benchmark
         try:
             # Run benchmark and get results
@@ -1003,19 +1027,19 @@ with tab1:
             )
 
             # Update progress to complete
-            st.session_state.progress = 1.0
+            st.session_state['progress'] = 1.0
             progress_bar.progress(1.0)
 
             # Display results
             if total_successful > 0:
                 results_df = pd.DataFrame(results)
-                st.session_state.results_df = results_df
+                st.session_state['results_df'] = results_df
 
                 # Show model rankings
                 st.subheader("Model Rankings")
                 ranking_df = pd.DataFrame({
                     "Model": list(cumulative_avg_rank.keys()),
-                    "Average Rank": [round(r, 2) for r in cumulative_avg_rank.values()]
+                    "Average Rank": [round(r, 2) if not np.isnan(r) else 'N/A' for r in cumulative_avg_rank.values()]
                 })
                 ranking_df = ranking_df.sort_values("Average Rank", ascending=False)
                 st.dataframe(ranking_df)
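
One caveat with the new 'N/A' fallback: mixing strings and floats turns "Average Rank" into an object-dtype column, which can make `sort_values` behave unexpectedly. If that ever becomes an issue, a possible alternative (not what the commit does) is to keep the column numeric and control NaN placement explicitly; the values below are dummies, not app data:

```python
import numpy as np
import pandas as pd

cumulative_avg_rank = {"model-a": 4.2, "model-b": np.nan, "model-c": 3.7}  # dummy values

ranking_df = pd.DataFrame({
    "Model": list(cumulative_avg_rank.keys()),
    "Average Rank": [round(r, 2) if not np.isnan(r) else np.nan
                     for r in cumulative_avg_rank.values()],
})

# Numeric NaN keeps the column float-typed, so the sort stays numeric
# and models without a rank can be pushed to the bottom explicitly.
ranking_df = ranking_df.sort_values("Average Rank", ascending=False, na_position="last")
print(ranking_df.to_string(index=False))
```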
@@ -1039,35 +1063,42 @@ with tab1:
             st.exception(e)
 
     # Show previous results if available
-    elif 'results_df' in st.session_state and not st.session_state.results_df.empty:
+    elif 'results_df' in st.session_state and not st.session_state['results_df'].empty:
         st.subheader("Previous Results")
-        st.dataframe(st.session_state.results_df)
+        st.dataframe(st.session_state['results_df'])
 
 with tab2:
     # Display main output log
     st.subheader("Execution Log")
 
     # Display logs
-    log_text = "\n".join(st.session_state.main_output)
-    st.text_area("Progress Log", log_text, height=400)
+    if 'main_output' in st.session_state:
+        log_text = "\n".join(st.session_state['main_output'])
+        st.text_area("Progress Log", log_text, height=400)
+    else:
+        st.text_area("Progress Log", "No progress logs yet.", height=400)
 
     # Add a refresh button for the log
     if st.button("Refresh Progress Log"):
-        pass # The rerun happens automatically at the end
+        st.experimental_rerun()
 
 with tab3:
     # Display debug output
     st.subheader("Debug Log")
 
     # Display debug logs
-    debug_text = "\n".join(st.session_state.debug_output)
-    st.text_area("Debug Information", debug_text, height=400)
+    if 'debug_output' in st.session_state:
+        debug_text = "\n".join(st.session_state['debug_output'])
+        st.text_area("Debug Information", debug_text, height=400)
+    else:
+        st.text_area("Debug Information", "No debug logs yet.", height=400)
 
     # Add a refresh button for the debug log
     if st.button("Refresh Debug Log"):
-        pass # The rerun happens automatically at the end
-
-# Auto-refresh mechanism
-if st.session_state.get('update_counter', 0) > 0:
-    time.sleep(0.1) # Brief pause to allow UI to update
-    st.experimental_rerun()
+        st.experimental_rerun()
+
+# Auto-update while benchmark is running
+if st.session_state.get('is_running', False):
+    st.empty()
+    time.sleep(5) # Update every 5 seconds while running
+    st.rerun()
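
The new tail polls `st.session_state['is_running']` and calls `st.rerun()` (the current name for the older `st.experimental_rerun()`) every five seconds. A related Streamlit idiom for live logs, shown here only as an optional sketch, is to create one placeholder with `st.empty()` and overwrite it in place rather than forcing full-page reruns:

```python
import time
import streamlit as st

st.subheader("Live log (sketch)")

log_placeholder = st.empty()   # reserve one slot in the page and overwrite it in place
lines: list[str] = []

for i in range(5):
    lines.append(f"step {i + 1} finished")
    # st.code is not a stateful widget, so re-rendering it in a loop is safe.
    log_placeholder.code("\n".join(lines))
    time.sleep(0.5)

st.caption("Loop finished - the placeholder holds the final log without any rerun.")
```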
 
 