import time
import traceback
import pandas as pd
import gradio as gr
import spaces
from mmlu_pro_eval_adapted import evaluate_mmlu_pro
from configs.dataset_config import get_subject_mode_param, get_subject_names


def run_mmlu_evaluation(subject_selection_mode, num_subjects, selected_subjects,
                        all_questions, num_questions, model_configs, progress=gr.Progress()):
    """
    Runs the MMLU-Pro evaluation with the specified parameters.

    Args:
        subject_selection_mode (str): Mode of subject selection ("all", "number", or "specific")
        num_subjects (int): Number of subjects to evaluate (1-14)
        selected_subjects (list): List of specific subjects to evaluate
        all_questions (bool): Whether to evaluate all questions per subject
        num_questions (int): Number of questions per subject (1-100); ignored when all_questions is True
        model_configs (dict): Configuration for both models
        progress (gr.Progress): Progress indicator

    Returns:
        dict: 'report' (markdown summary), 'comparison_df' (per-subject comparison
            DataFrame, or None on error), and 'success' (bool).
    """
    try:
        # Convert parameters if needed
        if subject_selection_mode == "all":
            num_subjects = -1
            selected_subjects = []
        elif subject_selection_mode == "specific":
            num_subjects = len(selected_subjects) if selected_subjects else -1

        if all_questions:
            num_questions = -1

        # Extract model configurations
        model1_config = model_configs["model1"]
        model2_config = model_configs["model2"]
        # Run evaluation for Model 1
        start_time_model1 = time.time()
        model1_results = evaluate_mmlu_pro(
            model1_config["name"],
            num_subjects=num_subjects,
            num_questions=num_questions,
            num_shots=model1_config["shots"],
            specific_subjects=selected_subjects if subject_selection_mode == "specific" else None,
            flash_attention=model1_config["flash_attention"],
            regex_pattern=model1_config["regex"] if model1_config["regex"] else None
        )
        model1_elapsed_time = time.time() - start_time_model1

        # Run evaluation for Model 2
        start_time_model2 = time.time()
        model2_results = evaluate_mmlu_pro(
            model2_config["name"],
            num_subjects=num_subjects,
            num_questions=num_questions,
            num_shots=model2_config["shots"],
            specific_subjects=selected_subjects if subject_selection_mode == "specific" else None,
            flash_attention=model2_config["flash_attention"],
            regex_pattern=model2_config["regex"] if model2_config["regex"] else None
        )
        model2_elapsed_time = time.time() - start_time_model2
        # Format summary results
        model1_overall_acc = model1_results["overall_accuracy"]
        model1_min_subject, model1_min_acc = model1_results["min_accuracy_subject"]
        model1_max_subject, model1_max_acc = model1_results["max_accuracy_subject"]

        model2_overall_acc = model2_results["overall_accuracy"]
        model2_min_subject, model2_min_acc = model2_results["min_accuracy_subject"]
        model2_max_subject, model2_max_acc = model2_results["max_accuracy_subject"]

        # Create merged results DataFrame
        results_df1 = pd.DataFrame(model1_results["full_accuracy_table"])
        results_df2 = pd.DataFrame(model2_results["full_accuracy_table"])

        # Ensure both dataframes have the same subjects
        subjects = sorted(set(results_df1['Subject'].tolist() + results_df2['Subject'].tolist()))

        # Create comparison DataFrame
        comparison_data = []
        for subject in subjects:
            model1_row = results_df1[results_df1['Subject'] == subject]
            model2_row = results_df2[results_df2['Subject'] == subject]

            model1_acc = model1_row['Accuracy'].iloc[0] if not model1_row.empty else 0
            model2_acc = model2_row['Accuracy'].iloc[0] if not model2_row.empty else 0

            # Calculate the difference and determine the winner
            diff = model1_acc - model2_acc
            winner = "Model 1" if diff > 0 else ("Model 2" if diff < 0 else "Tie")

            comparison_data.append({
                'Subject': subject,
                'Model 1 Accuracy': model1_acc,
                'Model 2 Accuracy': model2_acc,
                'Difference': abs(diff),
                'Winner': winner
            })

        # Add overall row
        model1_total_samples = results_df1['Num_samples'].sum()
        model1_total_correct = results_df1['Num_correct'].sum()
        model2_total_samples = results_df2['Num_samples'].sum()
        model2_total_correct = results_df2['Num_correct'].sum()

        overall_diff = model1_overall_acc - model2_overall_acc
        overall_winner = "Model 1" if overall_diff > 0 else ("Model 2" if overall_diff < 0 else "Tie")

        comparison_data.insert(0, {
            'Subject': '**Overall**',
            'Model 1 Accuracy': model1_overall_acc,
            'Model 2 Accuracy': model2_overall_acc,
            'Difference': abs(overall_diff),
            'Winner': overall_winner
        })
        # Format the report
        report = (
            f"### Head-to-Head Comparison Results\n\n"
            f"#### Model 1: {model1_config['name']}\n"
            f"* Overall Accuracy: {model1_overall_acc:.3f}\n"
            f"* Best Performance: {model1_max_subject} ({model1_max_acc:.3f})\n"
            f"* Worst Performance: {model1_min_subject} ({model1_min_acc:.3f})\n"
            f"* Evaluation completed in {model1_elapsed_time:.2f} seconds\n\n"
            f"#### Model 2: {model2_config['name']}\n"
            f"* Overall Accuracy: {model2_overall_acc:.3f}\n"
            f"* Best Performance: {model2_max_subject} ({model2_max_acc:.3f})\n"
            f"* Worst Performance: {model2_min_subject} ({model2_min_acc:.3f})\n"
            f"* Evaluation completed in {model2_elapsed_time:.2f} seconds\n\n"
            f"#### Overall Winner: {overall_winner}\n"
            f"* Margin: {abs(overall_diff):.3f}\n"
        )

        # Build the comparison DataFrame and return the results
        comparison_df = pd.DataFrame(comparison_data)

        return {
            'report': report,
            'comparison_df': comparison_df,
            'success': True
        }
    except Exception:
        # Handle errors gracefully: return the traceback as the report
        error_trace = traceback.format_exc()
        error_message = f"### Error during evaluation\n```\n{error_trace}\n```"

        return {
            'report': error_message,
            'comparison_df': None,
            'success': False
        }
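

if __name__ == "__main__":
    # Minimal local usage sketch, assuming the module is run directly rather than
    # through the Gradio UI. The model names and settings below are hypothetical
    # placeholders; substitute real Hugging Face model ids before running.
    example_configs = {
        "model1": {"name": "org/model-a", "shots": 5, "flash_attention": False, "regex": ""},
        "model2": {"name": "org/model-b", "shots": 5, "flash_attention": False, "regex": ""},
    }
    result = run_mmlu_evaluation(
        subject_selection_mode="number",
        num_subjects=1,
        selected_subjects=[],
        all_questions=False,
        num_questions=5,
        model_configs=example_configs,
    )
    print(result["report"])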