PeterKruger committed
Commit 130972f · verified · 1 Parent(s): 2ee78d6

Update app.py

Files changed (1): app.py (+14 −12)
app.py CHANGED
```diff
@@ -9,14 +9,16 @@ import time
 import random
 import functools
 
-# Create a logging system for errors and warnings
-if 'log_messages' not in st.session_state:
-    st.session_state.log_messages = []
-
 # Create areas for different outputs
 status_area = st.empty()  # For current status
 progress_area = st.empty()  # For progress updates
 
+# Initialize session state variables
+if 'log_messages' not in st.session_state:
+    st.session_state.log_messages = []
+if 'results_df' not in st.session_state:
+    st.session_state.results_df = pd.DataFrame()
+
 # Collapsible section for logs
 with st.expander("Execution Log", expanded=False):
     log_area = st.empty()
```
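This hunk consolidates the `st.session_state` initialization in one place near the top of the script, before any code path reads the keys. Streamlit re-executes the whole script on every interaction, so the guard-then-assign pattern is what makes these values survive reruns. A minimal sketch of the pattern, using a hypothetical `init_state` helper that is not part of app.py:

```python
import pandas as pd
import streamlit as st

def init_state(defaults: dict) -> None:
    """Create each session-state key once; later reruns keep existing values."""
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value

# The two keys this commit initializes at the top of app.py:
init_state({
    "log_messages": [],            # lines shown in the "Execution Log" expander
    "results_df": pd.DataFrame(),  # intermediate benchmark results
})
```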
```diff
@@ -743,7 +745,8 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
 
     update_progress("Ranking all answers...")
     for model_id in active_models:
-        answer = answers[model_id]  # Retrieve pre-generated answer
+        answer = answers.get(model_id, "Error answering")  # Retrieve pre-generated answer, default if not found
+        duration = answer_durations.get(model_id, 0)  # Get duration with default
 
         if answer == "Error answering":  # Handle answer generation errors
             consecutive_failures[model_id] += 1
```
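The switch from `answers[model_id]` to `answers.get(...)` matters because a model whose generation failed may have no entry in `answers` at all: plain indexing would raise a `KeyError` and abort the whole ranking loop, whereas `.get()` with the `"Error answering"` sentinel routes the failure into the existing error-handling branch. A standalone illustration (model names and answer text are made up):

```python
answers = {"model-a": "Photosynthesis converts light into chemical energy."}
# "model-b" failed during generation, so it never got an entry.

try:
    answer = answers["model-b"]          # plain indexing
except KeyError:
    print("KeyError would have aborted the ranking loop")

answer = answers.get("model-b", "Error answering")  # sentinel default
print(answer)  # -> "Error answering", handled by the existing failure branch
```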
```diff
@@ -821,7 +824,10 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
     results["question_rank_duration"].append(question_ranking_duration_total)  # Store question ranking duration
 
     cumulative_model_ranks[model_id].append(average_rank)  # Append current iteration's average rank
-    cumulative_avg_rank[model_id] = np.nanmean(cumulative_model_ranks[model_id]) if cumulative_model_ranks[model_id] else np.nan
+    if cumulative_model_ranks[model_id]:  # Check if the list is not empty
+        cumulative_avg_rank[model_id] = np.nanmean(cumulative_model_ranks[model_id])
+    else:
+        cumulative_avg_rank[model_id] = np.nan
 
     # --- Print and store iteration results IMMEDIATELY after ranking for this model ---
     ranks_str = "[" + ", ".join(map(str, [ranks[m] for m in active_models if m in ranks])) + "]" if ranks else "[]"  # Format ranks for CSV, ensure order
```
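Behaviorally this hunk only unrolls the former conditional expression into an explicit `if`/`else`; in both versions the empty-list guard avoids calling `np.nanmean` on an empty sequence, which returns `nan` but also emits a "Mean of empty slice" `RuntimeWarning`. A quick check of those edge cases, independent of app.py:

```python
import warnings
import numpy as np

print(np.nanmean([2.0, np.nan, 4.0]))  # 3.0 -- NaN entries are ignored

# Empty (or all-NaN) input still yields nan, but with a RuntimeWarning,
# which is why the guard assigns np.nan directly instead:
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    print(np.nanmean([]))               # nan, plus "Mean of empty slice"
    print(caught[0].category.__name__)  # RuntimeWarning
```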
```diff
@@ -835,10 +841,10 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
     total_valid_rank = 0  # Keep track of the sum of valid (non-NaN) ranks
 
     for m_id in active_models:
-        if cumulative_avg_rank[m_id]:
+        if m_id in cumulative_avg_rank and not np.isnan(cumulative_avg_rank[m_id]):
             temp_weights[m_id] = cumulative_avg_rank[m_id]
             total_valid_rank += cumulative_avg_rank[m_id]
-        else:  # if cumulative is empty, keep original
+        else:  # if cumulative is empty or NaN, keep original
             temp_weights[m_id] = model_weights.get(m_id, 1.0 / len(active_models))
 
     # Normalize the weights so they sum to 1, handling cases where total_valid_rank might be zero
```
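This is the substantive bug fix of the commit: `if cumulative_avg_rank[m_id]:` raises `KeyError` when the key is missing, and even when the key exists it does not filter out NaN, because NaN is truthy in Python. The explicit membership test plus `np.isnan` is what keeps NaN out of `total_valid_rank`. A small demonstration (the dictionary contents are invented):

```python
import numpy as np

print(bool(float("nan")))  # True -- NaN is truthy, so `if d[k]:` does NOT skip it

cumulative_avg_rank = {"model-a": 2.5, "model-b": np.nan}

for m_id in ["model-a", "model-b", "model-c"]:
    valid = m_id in cumulative_avg_rank and not np.isnan(cumulative_avg_rank[m_id])
    print(m_id, "-> use rank" if valid else "-> keep fallback weight")
# model-a -> use rank
# model-b -> keep fallback weight  (NaN caught by np.isnan, not by truthiness)
# model-c -> keep fallback weight  (missing key caught by the membership test)
```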
```diff
@@ -992,10 +998,6 @@ if st.sidebar.button("Start Benchmark"):
     results_placeholder = st.empty()
     iterations_table = st.empty()
 
-    # Create a global variable to store intermediate results
-    if 'results_df' not in st.session_state:
-        st.session_state.results_df = pd.DataFrame()
-
     # Run the benchmark
     try:
         # Update status
```
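This deletion is the counterpart of the first hunk: the `results_df` initialization moves out of the "Start Benchmark" button handler and up to the top of the script, so the DataFrame exists on every rerun rather than only after the button has been pressed.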
 