Update app.py
app.py CHANGED
@@ -9,14 +9,16 @@ import time
 import random
 import functools
 
-# Create a logging system for errors and warnings
-if 'log_messages' not in st.session_state:
-    st.session_state.log_messages = []
-
 # Create areas for different outputs
 status_area = st.empty()  # For current status
 progress_area = st.empty()  # For progress updates
 
+# Initialize session state variables
+if 'log_messages' not in st.session_state:
+    st.session_state.log_messages = []
+if 'results_df' not in st.session_state:
+    st.session_state.results_df = pd.DataFrame()
+
 # Collapsible section for logs
 with st.expander("Execution Log", expanded=False):
     log_area = st.empty()
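
The hunk above moves the session-state setup into a single guarded block, so both `log_messages` and `results_df` are created once and then survive the reruns Streamlit performs on every widget interaction. A minimal standalone sketch of that pattern (the keys mirror the ones in the diff; the append is illustrative):

import pandas as pd
import streamlit as st

# Guarded initialization: runs only on the first execution of the script,
# so the values are not reset when Streamlit reruns the file.
if 'log_messages' not in st.session_state:
    st.session_state.log_messages = []
if 'results_df' not in st.session_state:
    st.session_state.results_df = pd.DataFrame()

# Appends now accumulate across reruns instead of starting from an empty list.
st.session_state.log_messages.append("benchmark started")
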
@@ -743,7 +745,8 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
 
     update_progress("Ranking all answers...")
     for model_id in active_models:
-        answer = answers
+        answer = answers.get(model_id, "Error answering")  # Retrieve pre-generated answer, default if not found
+        duration = answer_durations.get(model_id, 0)  # Get duration with default
 
         if answer == "Error answering":  # Handle answer generation errors
             consecutive_failures[model_id] += 1
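
The old `answer = answers` bound the whole answers dict instead of a single model's answer, so the "Error answering" check below could never behave as intended. The replacement uses `dict.get` with defaults, which never raises `KeyError` and routes missing entries into the existing error branch. A rough illustration, assuming `answers` and `answer_durations` are dicts keyed by model id (the ids below are made up):

answers = {"model-a": "Paris", "model-b": "Error answering"}
answer_durations = {"model-a": 1.7}

for model_id in ["model-a", "model-b", "model-c"]:
    # .get() falls back to the sentinel / zero instead of raising KeyError.
    answer = answers.get(model_id, "Error answering")
    duration = answer_durations.get(model_id, 0)
    print(model_id, repr(answer), duration)
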
@@ -821,7 +824,10 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
     results["question_rank_duration"].append(question_ranking_duration_total)  # Store question ranking duration
 
     cumulative_model_ranks[model_id].append(average_rank)  # Append current iteration's average rank
-
+    if cumulative_model_ranks[model_id]:  # Check if the list is not empty
+        cumulative_avg_rank[model_id] = np.nanmean(cumulative_model_ranks[model_id])
+    else:
+        cumulative_avg_rank[model_id] = np.nan
 
     # --- Print and store iteration results IMMEDIATELY after ranking for this model ---
     ranks_str = "[" + ", ".join(map(str, [ranks[m] for m in active_models if m in ranks])) + "]" if ranks else "[]"  # Format ranks for CSV, ensure order
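
The added branch keeps a cumulative average of each model's per-iteration ranks. `np.nanmean` ignores NaN entries (iterations where ranking failed), and the empty-list guard assigns NaN directly rather than triggering the "mean of empty slice" warning. A small sketch of the behaviour this relies on:

import numpy as np

ranks = [2.0, np.nan, 4.0]               # one iteration produced no rank
print(np.nanmean(ranks))                 # 3.0 -- NaN entries are skipped
print(np.mean(ranks))                    # nan -- a plain mean would be poisoned

history = []                             # no iterations ranked yet
avg = np.nanmean(history) if history else np.nan   # same guard as in the diff
print(avg)                               # nan
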
@@ -835,10 +841,10 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
     total_valid_rank = 0  # Keep track of the sum of valid (non-NaN) ranks
 
     for m_id in active_models:
-        if cumulative_avg_rank[m_id]:
+        if m_id in cumulative_avg_rank and not np.isnan(cumulative_avg_rank[m_id]):
             temp_weights[m_id] = cumulative_avg_rank[m_id]
             total_valid_rank += cumulative_avg_rank[m_id]
-        else:  # if cumulative is empty, keep original
+        else:  # if cumulative is empty or NaN, keep original
             temp_weights[m_id] = model_weights.get(m_id, 1.0 / len(active_models))
 
     # Normalize the weights so they sum to 1, handling cases where total_valid_rank might be zero
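
The weight update now only trusts a cumulative average rank that actually exists and is not NaN; anything else falls back to the model's previous weight (or a uniform 1/N default). A standalone sketch of the loop together with one way the normalization mentioned in the comment could look (the normalization itself is not shown in the diff, so treat that part as an assumption):

import numpy as np

active_models = ["m1", "m2", "m3"]
cumulative_avg_rank = {"m1": 1.5, "m2": np.nan}      # "m3" has no entry yet
model_weights = {m: 1.0 / len(active_models) for m in active_models}

temp_weights, total_valid_rank = {}, 0.0
for m_id in active_models:
    if m_id in cumulative_avg_rank and not np.isnan(cumulative_avg_rank[m_id]):
        temp_weights[m_id] = cumulative_avg_rank[m_id]
        total_valid_rank += cumulative_avg_rank[m_id]
    else:
        # No usable cumulative rank yet: keep the existing weight.
        temp_weights[m_id] = model_weights.get(m_id, 1.0 / len(active_models))

# Normalize so the weights sum to 1, guarding the zero-sum case.
total = sum(temp_weights.values())
model_weights = {m: w / total for m, w in temp_weights.items()} if total > 0 else model_weights
print(model_weights)
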
@@ -992,10 +998,6 @@ if st.sidebar.button("Start Benchmark"):
    results_placeholder = st.empty()
    iterations_table = st.empty()
 
-   # Create a global variable to store intermediate results
-   if 'results_df' not in st.session_state:
-       st.session_state.results_df = pd.DataFrame()
-
    # Run the benchmark
    try:
        # Update status