Update app.py
app.py CHANGED
@@ -469,6 +469,7 @@ def get_answer_from_model(model_id, question, consecutive_failures, failure_thre
     time.sleep(time_sleep)  # Small delay
     duration = time.time() - start_time  # Calculate duration
     print(f"Answer generation by \"{model_id}\": {duration:.2f}s")  # Print answer generation duration separately as requested
+    st.write(f"Answer generation by \"{model_id}\": {duration:.2f}s")  # Print answer generation duration separately as requested

     return answer, duration  # Return answer and duration

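The pattern repeated throughout this commit is the same: each console print is now mirrored with an st.write so the message also shows up in the Streamlit page. A minimal sketch of how the two calls could be collapsed into one helper follows; the name log_status is hypothetical and not part of app.py.

import streamlit as st

def log_status(message: str) -> None:
    # Hypothetical helper (not in app.py): send the same status message
    # to the console/Space logs and to the Streamlit UI in one call.
    print(message)
    st.write(message)

# Usage, mirroring the line added in this hunk:
# log_status(f"Answer generation by \"{model_id}\": {duration:.2f}s")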
@@ -531,6 +532,7 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
     # --- Select difficulty with probabilities ---
     difficulty = random.choices(difficulty_choices, weights=probability_values, k=1)[0]  # Weighted random choice
     print(f"--- Iteration {s_t + 1}/{t}: {difficulty} question ({difficulty_mapping[difficulty]}) on {topic} ---")  # Print iteration number
+    st.write(f"--- Iteration {s_t + 1}/{t}: {difficulty} question ({difficulty_mapping[difficulty]}) on {topic} ---")  # Print iteration number

     # --- Question Generation ---
     question = None
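For reference, a minimal, self-contained sketch of the weighted difficulty selection shown above; the labels and weights are illustrative assumptions, not the values defined in app.py.

import random

difficulty_choices = ["easy", "medium", "hard"]  # assumed labels
probability_values = [0.2, 0.5, 0.3]             # assumed weights; they need not sum to 1

# random.choices draws k samples with replacement according to the weights.
difficulty = random.choices(difficulty_choices, weights=probability_values, k=1)[0]
print(difficulty)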
@@ -635,10 +637,12 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
     if question_avg_rank >= question_treshold and all(rank > reject_rank for rank in valid_question_ranks_values):  # Question acceptance criteria
         question_accepted = True
         print(f"Question accepted. Avg Question Rank: {question_avg_rank:.2f}, Min Rank: {min_question_rank}, Ranks: {[question_ranks[m] for m in active_models if m in question_ranks]}")
+        st.write(f"Question accepted. Avg Question Rank: {question_avg_rank:.2f}, Min Rank: {min_question_rank}, Ranks: {[question_ranks[m] for m in active_models if m in question_ranks]}")
         s_t += 1
     else:
         question_accepted = False
         print(f"Question rejected. Avg Question Rank: {question_avg_rank:.2f}, Min Rank: {min_question_rank}, Ranks: {[question_ranks[m] for m in active_models if m in question_ranks]}")
+        st.write(f"Question rejected. Avg Question Rank: {question_avg_rank:.2f}, Min Rank: {min_question_rank}, Ranks: {[question_ranks[m] for m in active_models if m in question_ranks]}")

     if not question_accepted:
         print("Generated question was not accepted. Regenerating question.")
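The acceptance test above keeps a question only when its average rank clears question_treshold and no single rank falls at or below reject_rank. A small sketch under assumed threshold values (the spelling question_treshold follows app.py):

question_treshold = 3.0  # assumed minimum acceptable average rank
reject_rank = 1          # assumed hard floor for any single rank

def question_is_accepted(ranks: list[float]) -> bool:
    # Accept only if the mean rank meets the threshold AND every rank is above the reject floor.
    if not ranks:
        return False
    avg = sum(ranks) / len(ranks)
    return avg >= question_treshold and all(r > reject_rank for r in ranks)

print(question_is_accepted([4, 3, 5]))  # True with the assumed thresholds
print(question_is_accepted([5, 5, 1]))  # False: one rank hits the reject floor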
@@ -775,7 +779,8 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):

     # --- Print and store iteration results IMMEDIATELY after ranking for this model ---
     ranks_str = "[" + ", ".join(map(str, [ranks.get(m, None) for m in active_models])) + "]" if ranks else "[]"  # Format ranks for CSV, ensure order, use .get() for safety
-    print(f"{topic}, {difficulty_mapping[difficulty]}, {model_id}, {cumulative_avg_rank.get(model_id, np.nan):.2f}, {average_rank:.5f}, {ranks_str}, {ranking_duration:.2f} sec")
+    print(f"{topic}, {difficulty_mapping[difficulty]}, {model_id}, Avg Rank: {cumulative_avg_rank.get(model_id, np.nan):.2f}, Avg Rank for run: {average_rank:.5f}, Ranks: {ranks_str}, {ranking_duration:.2f} sec")
+    st.write(f"{topic}, {difficulty_mapping[difficulty]}, {model_id}, Avg Rank: {cumulative_avg_rank.get(model_id, np.nan):.2f}, Avg Rank for run: {average_rank:.5f}, Ranks: {ranks_str}, {ranking_duration:.2f} sec")

     # Write iteration results to file (append mode) - write for each model right after ranking
     iteration_results_file_opened.write(f"{iteration+1},{topic}, {difficulty_mapping[difficulty]},{question_avg_rank:.2f},{question_ranking_duration_total:.2f},{model_id},{cumulative_avg_rank.get(model_id, np.nan):.2f},{average_rank:.2f},{ranks_str},{ranking_duration:.2f}\n")
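For reference, a sketch of the ranks_str formatting above with illustrative model ids; ranks.get(m, None) preserves the column order of active_models and yields None for any model that was not ranked in this round. Worth noting: ranks_str itself contains commas, so the CSV line written below keeps its column alignment only if that field is quoted by whatever later parses the file.

active_models = ["model-a", "model-b", "model-c"]  # illustrative ids
ranks = {"model-a": 2, "model-c": 1}               # "model-b" deliberately missing

# Same expression as in the hunk above: one entry per active model, in order.
ranks_str = "[" + ", ".join(map(str, [ranks.get(m, None) for m in active_models])) + "]" if ranks else "[]"
print(ranks_str)  # -> [2, None, 1]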
@@ -972,15 +977,12 @@ if st.sidebar.button("Start Benchmark"):

     # Show model rankings
     st.subheader("Model Rankings")
-
+    ranking_df = pd.DataFrame({
         "Model": list(cumulative_avg_rank.keys()),
         "Average Rank": [round(r, 2) if not np.isnan(r) else 'N/A' for r in cumulative_avg_rank.values()]
     })
-    ranking_df = ranking_df.sort_values("Average Rank", ascending=False)
+    #ranking_df = ranking_df.sort_values("Average Rank", ascending=False)
     st.dataframe(ranking_df)  # Use st.dataframe directly instead of results_placeholder.dataframe
-    '''
-    averages_df = pd.DataFrame(cumulative_avg_rank)
-    st.dataframe(averages_df)

     # Show detailed results
     st.subheader("Detailed Results")
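The rankings table is now built directly from cumulative_avg_rank, with NaN averages rendered as 'N/A'. A minimal sketch with illustrative values follows; because the "Average Rank" column then mixes floats with the string 'N/A', calling sort_values on it can raise a TypeError, which may be why the sort line is left commented out in the new version.

import numpy as np
import pandas as pd

cumulative_avg_rank = {"model-a": 2.4, "model-b": np.nan, "model-c": 1.8}  # illustrative values

# Same construction as in the hunk above: one row per model, NaN shown as 'N/A'.
ranking_df = pd.DataFrame({
    "Model": list(cumulative_avg_rank.keys()),
    "Average Rank": [round(r, 2) if not np.isnan(r) else "N/A" for r in cumulative_avg_rank.values()],
})
print(ranking_df)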