Update app.py
app.py (CHANGED)
@@ -12,6 +12,14 @@ import sys
 import io
 from contextlib import redirect_stdout, redirect_stderr
 
+# Initialize session state variables
+if 'main_output' not in st.session_state:
+    st.session_state.main_output = []
+if 'debug_output' not in st.session_state:
+    st.session_state.debug_output = []
+if 'progress' not in st.session_state:
+    st.session_state.progress = 0
+
 # FILES
 iteration_output_file = "llm_benchmark_iteration_results.csv" # File to store iteration results, defined as global
 results_file = "llm_benchmark_results.csv" # all data
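Note: Streamlit re-executes the whole script on every widget interaction, and st.session_state is the dictionary that survives those reruns, so each key is created behind an "not in st.session_state" guard to keep the initialization idempotent. A minimal standalone sketch of the pattern (illustrative only, not taken from this app):

import streamlit as st

# Guarded initialization: only runs on the very first execution of the script
if 'main_output' not in st.session_state:
    st.session_state.main_output = []

# Later reruns append to the same list instead of starting from scratch
st.session_state.main_output.append("one entry per rerun")
st.write(st.session_state.main_output)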
@@ -41,12 +49,6 @@ difficulty_probabilities = {
     "a very difficult": 0.6
 }
 
-# Create output displays for main log and debug log
-if 'main_output' not in st.session_state:
-    st.session_state.main_output = []
-if 'debug_output' not in st.session_state:
-    st.session_state.debug_output = []
-
 # Custom print function to capture output
 def custom_print(*args, **kwargs):
     # Convert args to string and join with spaces
@@ -57,11 +59,17 @@ def custom_print(*args, **kwargs):
 
     # Also print to standard output for console logging
     print(*args, **kwargs)
+
+    # Force an immediate update of the UI (when used inside a function)
+    st.session_state.update_counter = st.session_state.get('update_counter', 0) + 1
 
 # Custom function to capture warnings and errors
 def log_debug(message):
     st.session_state.debug_output.append(message)
     print(f"DEBUG: {message}", file=sys.stderr)
+
+    # Force an immediate update of the UI
+    st.session_state.update_counter = st.session_state.get('update_counter', 0) + 1
 
 def retry_api_request(max_retries=3, wait_time=10):
     """Decorator for retrying API requests with rate limit handling."""
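Note: only the signature and docstring of retry_api_request appear here as unchanged context; its body is not part of this diff. A decorator with this shape typically wraps the call in a bounded retry loop and sleeps between attempts. A rough, hypothetical sketch (parameter names taken from the signature above, everything else assumed; the real implementation may differ):

import time
import functools

def retry_api_request(max_retries=3, wait_time=10):
    """Decorator for retrying API requests with rate limit handling."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(1, max_retries + 1):
                try:
                    return func(*args, **kwargs)
                except Exception:  # the real app likely catches specific HTTP / rate-limit errors
                    if attempt == max_retries:
                        raise
                    time.sleep(wait_time)  # back off before the next attempt
        return wrapper
    return decorator

# Hypothetical usage:
@retry_api_request(max_retries=3, wait_time=2)
def fetch_answer(prompt):
    ...  # would call the model API here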
@@ -724,8 +732,11 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
 
 
     for model_id in active_models:
-        answer = answers
-
+        answer = answers.get(model_id)
+        if not answer: # Add guard clause
+            log_debug(f"No answer found for model {model_id}. Skipping ranking.")
+            continue
+
         if answer == "Error answering": # Handle answer generation errors
             consecutive_failures[model_id] += 1
             if consecutive_failures[model_id] >= failure_threshold:
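Note: answers is keyed by model id, so answers.get(model_id) returns None instead of raising KeyError when a model produced nothing, and the guard also skips empty-string answers because "if not answer" is a truthiness test. A small illustration with hypothetical data:

answers = {"model-a": "Paris", "model-b": ""}   # hypothetical answers keyed by model id

for model_id in ["model-a", "model-b", "model-c"]:
    answer = answers.get(model_id)   # None for the missing "model-c", no KeyError
    if not answer:                   # catches both None and ""
        print(f"No answer found for model {model_id}. Skipping ranking.")
        continue
    print(f"Ranking {model_id}: {answer}")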
@@ -794,7 +805,7 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
         results["question_prompt"].append(question_prompt)
         results["question"].append(question)
         results["answer"].append(answer)
-        results["answer_generation_duration"].append(
+        results["answer_generation_duration"].append(answer_durations.get(model_id, 0))
         results["average_rank"].append(average_rank)
         results["ranks"].append([ranks[m] for m in active_models if m in ranks]) # Store raw ranks including Nones, ensure order
         results["question_rank_average"].append(question_avg_rank) # Store question rank average
@@ -816,7 +827,7 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
     total_valid_rank = 0 # Keep track of the sum of valid (non-NaN) ranks
 
     for m_id in active_models:
-        if cumulative_avg_rank[m_id]:
+        if m_id in cumulative_avg_rank and not np.isnan(cumulative_avg_rank[m_id]):
             temp_weights[m_id] = cumulative_avg_rank[m_id]
             total_valid_rank += cumulative_avg_rank[m_id]
         else: # if cumulative is empty, keep original
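Note: the old "if cumulative_avg_rank[m_id]:" was a plain truthiness test, which raises KeyError for a model with no entry yet, would treat a rank of 0.0 as missing, and does not filter NaN at all (bool(float('nan')) is True). The membership check plus np.isnan covers all three cases. A small illustration with hypothetical values:

import numpy as np

cumulative_avg_rank = {"model-a": 2.5, "model-b": float("nan")}   # hypothetical ranks

for m_id in ["model-a", "model-b", "model-c"]:
    if m_id in cumulative_avg_rank and not np.isnan(cumulative_avg_rank[m_id]):
        print(m_id, "has a usable rank:", cumulative_avg_rank[m_id])
    else:
        print(m_id, "is missing or NaN, original weight kept")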
@@ -884,10 +895,6 @@ def check_model_availability(models, token):
 # Streamlit UI
 st.title("LLM Benchmark")
 
-# Initialize session state variables for progress tracking
-if 'progress' not in st.session_state:
-    st.session_state.progress = 0
-
 # Setup sidebar for configuration
 st.sidebar.header("Configuration")
 
@@ -970,6 +977,7 @@ with tab1:
         # Clear previous outputs
         st.session_state.main_output = []
         st.session_state.debug_output = []
+        st.session_state.progress = 0
 
         if not hf_token:
             st.error("Please enter your Hugging Face API token")
@@ -1038,21 +1046,28 @@ with tab1:
 with tab2:
     # Display main output log
     st.subheader("Execution Log")
-    log_container = st.container()
 
     # Display logs
     log_text = "\n".join(st.session_state.main_output)
-
+    st.text_area("Progress Log", log_text, height=400)
 
     # Add a refresh button for the log
-    if st.button("Refresh Log"):
-
+    if st.button("Refresh Progress Log"):
+        pass # The rerun happens automatically at the end
 
 with tab3:
     # Display debug output
     st.subheader("Debug Log")
-    debug_container = st.container()
 
     # Display debug logs
     debug_text = "\n".join(st.session_state.debug_output)
-
+    st.text_area("Debug Information", debug_text, height=400)
+
+    # Add a refresh button for the debug log
+    if st.button("Refresh Debug Log"):
+        pass # The rerun happens automatically at the end
+
+# Auto-refresh mechanism
+if st.session_state.get('update_counter', 0) > 0:
+    time.sleep(0.1) # Brief pause to allow UI to update
+    st.experimental_rerun()
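Note: the new auto-refresh block reruns the script whenever custom_print() or log_debug() has bumped update_counter. As far as this diff shows, the counter is never reset, so once it becomes positive every pass through the script would trigger another rerun; resetting it before rerunning avoids that. Also, st.experimental_rerun() has been superseded by st.rerun() in current Streamlit releases. A hypothetical variant combining both points (assumes the same update_counter convention as this commit, not the code actually shipped here):

import time
import streamlit as st

if st.session_state.get('update_counter', 0) > 0:
    st.session_state.update_counter = 0   # consume the pending update so reruns stop
    time.sleep(0.1)                       # brief pause to let the current widgets render
    st.rerun()                            # st.experimental_rerun() on older Streamlit versions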