Update app.py
app.py CHANGED
@@ -9,14 +9,16 @@ import time
 import random
 import functools
 
-# Create a logging system for errors and warnings
-if 'log_messages' not in st.session_state:
-    st.session_state.log_messages = []
-
 # Create areas for different outputs
 status_area = st.empty()  # For current status
 progress_area = st.empty()  # For progress updates
 
+# Initialize session state variables
+if 'log_messages' not in st.session_state:
+    st.session_state.log_messages = []
+if 'results_df' not in st.session_state:
+    st.session_state.results_df = pd.DataFrame()
+
 # Collapsible section for logs
 with st.expander("Execution Log", expanded=False):
     log_area = st.empty()
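
The hunk above moves the session-state setup into a single guarded block, so both `log_messages` and `results_df` are created once and then survive the reruns Streamlit performs on every widget interaction. A minimal standalone sketch of that pattern (the keys mirror the ones in the diff; the append is illustrative):

import pandas as pd
import streamlit as st

# Guarded initialization: runs only on the first execution of the script,
# so the values are not reset when Streamlit reruns the file.
if 'log_messages' not in st.session_state:
    st.session_state.log_messages = []
if 'results_df' not in st.session_state:
    st.session_state.results_df = pd.DataFrame()

# Appends now accumulate across reruns instead of starting from an empty list.
st.session_state.log_messages.append("benchmark started")
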
@@ -743,7 +745,8 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
 
     update_progress("Ranking all answers...")
     for model_id in active_models:
-        answer = answers
+        answer = answers.get(model_id, "Error answering")  # Retrieve pre-generated answer, default if not found
+        duration = answer_durations.get(model_id, 0)  # Get duration with default
 
         if answer == "Error answering":  # Handle answer generation errors
             consecutive_failures[model_id] += 1
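
The old `answer = answers` bound the whole answers dict instead of a single model's answer, so the "Error answering" check below could never behave as intended. The replacement uses `dict.get` with defaults, which never raises `KeyError` and routes missing entries into the existing error branch. A rough illustration, assuming `answers` and `answer_durations` are dicts keyed by model id (the ids below are made up):

answers = {"model-a": "Paris", "model-b": "Error answering"}
answer_durations = {"model-a": 1.7}

for model_id in ["model-a", "model-b", "model-c"]:
    # .get() falls back to the sentinel / zero instead of raising KeyError.
    answer = answers.get(model_id, "Error answering")
    duration = answer_durations.get(model_id, 0)
    print(model_id, repr(answer), duration)
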
@@ -821,7 +824,10 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
     results["question_rank_duration"].append(question_ranking_duration_total)  # Store question ranking duration
 
     cumulative_model_ranks[model_id].append(average_rank)  # Append current iteration's average rank
-
+    if cumulative_model_ranks[model_id]:  # Check if the list is not empty
+        cumulative_avg_rank[model_id] = np.nanmean(cumulative_model_ranks[model_id])
+    else:
+        cumulative_avg_rank[model_id] = np.nan
 
     # --- Print and store iteration results IMMEDIATELY after ranking for this model ---
     ranks_str = "[" + ", ".join(map(str, [ranks[m] for m in active_models if m in ranks])) + "]" if ranks else "[]"  # Format ranks for CSV, ensure order
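
The added branch keeps a cumulative average of each model's per-iteration ranks. `np.nanmean` ignores NaN entries (iterations where ranking failed), and the empty-list guard assigns NaN directly rather than triggering the "mean of empty slice" warning. A small sketch of the behaviour this relies on:

import numpy as np

ranks = [2.0, np.nan, 4.0]               # one iteration produced no rank
print(np.nanmean(ranks))                 # 3.0 -- NaN entries are skipped
print(np.mean(ranks))                    # nan -- a plain mean would be poisoned

history = []                             # no iterations ranked yet
avg = np.nanmean(history) if history else np.nan   # same guard as in the diff
print(avg)                               # nan
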
@@ -835,10 +841,10 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
     total_valid_rank = 0  # Keep track of the sum of valid (non-NaN) ranks
 
     for m_id in active_models:
-        if cumulative_avg_rank[m_id]:
+        if m_id in cumulative_avg_rank and not np.isnan(cumulative_avg_rank[m_id]):
             temp_weights[m_id] = cumulative_avg_rank[m_id]
             total_valid_rank += cumulative_avg_rank[m_id]
-        else:  # if cumulative is empty, keep original
+        else:  # if cumulative is empty or NaN, keep original
             temp_weights[m_id] = model_weights.get(m_id, 1.0 / len(active_models))
 
     # Normalize the weights so they sum to 1, handling cases where total_valid_rank might be zero
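
The weight update now only trusts a cumulative average rank that actually exists and is not NaN; anything else falls back to the model's previous weight (or a uniform 1/N default). A standalone sketch of the loop together with one way the normalization mentioned in the comment could look (the normalization itself is not shown in the diff, so treat that part as an assumption):

import numpy as np

active_models = ["m1", "m2", "m3"]
cumulative_avg_rank = {"m1": 1.5, "m2": np.nan}      # "m3" has no entry yet
model_weights = {m: 1.0 / len(active_models) for m in active_models}

temp_weights, total_valid_rank = {}, 0.0
for m_id in active_models:
    if m_id in cumulative_avg_rank and not np.isnan(cumulative_avg_rank[m_id]):
        temp_weights[m_id] = cumulative_avg_rank[m_id]
        total_valid_rank += cumulative_avg_rank[m_id]
    else:
        # No usable cumulative rank yet: keep the existing weight.
        temp_weights[m_id] = model_weights.get(m_id, 1.0 / len(active_models))

# Normalize so the weights sum to 1, guarding the zero-sum case.
total = sum(temp_weights.values())
model_weights = {m: w / total for m, w in temp_weights.items()} if total > 0 else model_weights
print(model_weights)
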
@@ -992,10 +998,6 @@ if st.sidebar.button("Start Benchmark"):
    results_placeholder = st.empty()
    iterations_table = st.empty()
 
-   # Create a global variable to store intermediate results
-   if 'results_df' not in st.session_state:
-       st.session_state.results_df = pd.DataFrame()
-
    # Run the benchmark
    try:
        # Update status