PeterKruger committed
Commit 9e5f02a · verified · 1 Parent(s): 816a15e

Update app.py

Files changed (1):
  1. app.py (+8 -6)
app.py CHANGED
@@ -469,6 +469,7 @@ def get_answer_from_model(model_id, question, consecutive_failures, failure_thre
     time.sleep(time_sleep) # Small delay
     duration = time.time() - start_time # Calculate duration
     print(f"Answer generation by \"{model_id}\": {duration:.2f}s") # Print answer generation duration separately as requested
+    st.write(f"Answer generation by \"{model_id}\": {duration:.2f}s") # Show the same duration message in the Streamlit UI
 
     return answer, duration # Return answer and duration
 
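Note: every status message touched by this commit is emitted twice, once via print() for the console log and once via st.write() for the Streamlit page. A minimal sketch of a helper that would keep the two outputs in sync; the log_status name is hypothetical and not part of this commit:

    import streamlit as st

    def log_status(message: str) -> None:
        # Send the same status line to both the console and the Streamlit page.
        print(message)
        st.write(message)

    # Hypothetical usage, replacing a paired print()/st.write() call such as the one above:
    # log_status(f'Answer generation by "{model_id}": {duration:.2f}s')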
 
@@ -531,6 +532,7 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
     # --- Select difficulty with probabilities ---
     difficulty = random.choices(difficulty_choices, weights=probability_values, k=1)[0] # Weighted random choice
     print(f"--- Iteration {s_t + 1}/{t}: {difficulty} question ({difficulty_mapping[difficulty]}) on {topic} ---") # Print iteration number
+    st.write(f"--- Iteration {s_t + 1}/{t}: {difficulty} question ({difficulty_mapping[difficulty]}) on {topic} ---") # Print iteration number
 
     # --- Question Generation ---
     question = None
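Note: the difficulty is drawn with random.choices, whose weights argument gives each entry a relative (not necessarily normalized) probability and which returns a list of k samples, hence the trailing [0]. A small self-contained illustration; the choice list and weights below are made up for the example and are not the app's configuration:

    import random

    difficulty_choices = ["easy", "medium", "hard"]   # illustrative values only
    probability_values = [2, 5, 3]                    # relative weights; they need not sum to 1

    difficulty = random.choices(difficulty_choices, weights=probability_values, k=1)[0]
    print(difficulty)  # e.g. "medium" about half of the time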
@@ -635,10 +637,12 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
     if question_avg_rank >= question_treshold and all(rank > reject_rank for rank in valid_question_ranks_values): # Question acceptance criteria
         question_accepted = True
         print(f"Question accepted. Avg Question Rank: {question_avg_rank:.2f}, Min Rank: {min_question_rank}, Ranks: {[question_ranks[m] for m in active_models if m in question_ranks]}")
+        st.write(f"Question accepted. Avg Question Rank: {question_avg_rank:.2f}, Min Rank: {min_question_rank}, Ranks: {[question_ranks[m] for m in active_models if m in question_ranks]}")
         s_t += 1
     else:
         question_accepted = False
         print(f"Question rejected. Avg Question Rank: {question_avg_rank:.2f}, Min Rank: {min_question_rank}, Ranks: {[question_ranks[m] for m in active_models if m in question_ranks]}")
+        st.write(f"Question rejected. Avg Question Rank: {question_avg_rank:.2f}, Min Rank: {min_question_rank}, Ranks: {[question_ranks[m] for m in active_models if m in question_ranks]}")
 
     if not question_accepted:
         print("Generated question was not accepted. Regenerating question.")
@@ -775,7 +779,8 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
 
     # --- Print and store iteration results IMMEDIATELY after ranking for this model ---
     ranks_str = "[" + ", ".join(map(str, [ranks.get(m, None) for m in active_models])) + "]" if ranks else "[]" # Format ranks for CSV, ensure order, use .get() for safety
-    print(f"{topic}, {difficulty_mapping[difficulty]}, {model_id}, {cumulative_avg_rank.get(model_id, np.nan):.2f}, {average_rank:.5f}, {ranks_str}, {ranking_duration:.2f} sec")
+    print(f"{topic}, {difficulty_mapping[difficulty]}, {model_id}, Avg Rank: {cumulative_avg_rank.get(model_id, np.nan):.2f}, Avg Rank for run: {average_rank:.5f}, Ranks: {ranks_str}, {ranking_duration:.2f} sec")
+    st.write(f"{topic}, {difficulty_mapping[difficulty]}, {model_id}, Avg Rank: {cumulative_avg_rank.get(model_id, np.nan):.2f}, Avg Rank for run: {average_rank:.5f}, Ranks: {ranks_str}, {ranking_duration:.2f} sec")
 
     # Write iteration results to file (append mode) - write for each model right after ranking
     iteration_results_file_opened.write(f"{iteration+1},{topic}, {difficulty_mapping[difficulty]},{question_avg_rank:.2f},{question_ranking_duration_total:.2f},{model_id},{cumulative_avg_rank.get(model_id, np.nan):.2f},{average_rank:.2f},{ranks_str},{ranking_duration:.2f}\n")
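Note: ranks_str is built as "[" + ", ".join(...) + "]", so with more than one active model it contains commas, and the row above is written as a plain comma-separated f-string; a naive comma split of the results file would break on that field. A small self-contained illustration, not part of this commit, of how the csv module quotes such a field so a CSV reader keeps it intact:

    import csv
    import io

    ranks_str = "[1, 2, 3]"                 # example of a field that itself contains commas
    buf = io.StringIO()
    csv.writer(buf).writerow(["topic", "model_id", ranks_str, "0.42"])
    print(buf.getvalue())                   # the bracketed field comes out quoted: "[1, 2, 3]"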
@@ -972,15 +977,12 @@ if st.sidebar.button("Start Benchmark"):
 
     # Show model rankings
     st.subheader("Model Rankings")
-    '''ranking_df = pd.DataFrame({
+    ranking_df = pd.DataFrame({
        "Model": list(cumulative_avg_rank.keys()),
        "Average Rank": [round(r, 2) if not np.isnan(r) else 'N/A' for r in cumulative_avg_rank.values()]
    })
-    ranking_df = ranking_df.sort_values("Average Rank", ascending=False)
+    #ranking_df = ranking_df.sort_values("Average Rank", ascending=False)
     st.dataframe(ranking_df) # Use st.dataframe directly instead of results_placeholder.dataframe
-    '''
-    averages_df = pd.DataFrame(cumulative_avg_rank)
-    st.dataframe(averages_df)
 
     # Show detailed results
     st.subheader("Detailed Results")
 