# H2H-eval-comparator / run_evaluation.py
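"""Runs a head-to-head MMLU-Pro evaluation of two models and builds a
per-subject accuracy comparison report."""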
import time
import traceback
import pandas as pd
import gradio as gr
import spaces
from mmlu_pro_eval_adapted import evaluate_mmlu_pro
from configs.dataset_config import get_subject_mode_param, get_subject_names


def run_mmlu_evaluation(subject_selection_mode, num_subjects, selected_subjects,
all_questions, num_questions, model_configs, progress=gr.Progress()):
"""
    Runs the MMLU evaluation with the specified parameters.

    Args:
subject_selection_mode (str): Mode of subject selection ("all", "number", or "specific")
num_subjects (int): Number of subjects to evaluate (1-14)
selected_subjects (list): List of specific subjects to evaluate
all_questions (bool): Whether to evaluate all questions per subject
num_questions (int): Number of examples per subject (1-100 or all)
        model_configs (dict): Configurations for both models, keyed "model1" and "model2";
            each entry provides "name", "shots", "flash_attention", and "regex"
        progress (gr.Progress): Gradio progress indicator
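
    Returns:
        dict: 'report' (markdown summary string), 'comparison_df' (per-subject
        comparison DataFrame, or None on error), and 'success' (bool).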
"""
try:
        # Normalize selection parameters; -1 is the sentinel for "evaluate everything"
if subject_selection_mode == "all":
num_subjects = -1
selected_subjects = []
elif subject_selection_mode == "specific":
num_subjects = len(selected_subjects) if selected_subjects else -1
if all_questions:
num_questions = -1
# Extract model configurations
model1_config = model_configs["model1"]
model2_config = model_configs["model2"]
# Run evaluation for Model 1
start_time_model1 = time.time()
model1_results = evaluate_mmlu_pro(
model1_config["name"],
num_subjects=num_subjects,
num_questions=num_questions,
num_shots=model1_config["shots"],
specific_subjects=selected_subjects if subject_selection_mode == "specific" else None,
flash_attention=model1_config["flash_attention"],
regex_pattern=model1_config["regex"] if model1_config["regex"] else None
)
model1_elapsed_time = time.time() - start_time_model1
# Run evaluation for Model 2
start_time_model2 = time.time()
model2_results = evaluate_mmlu_pro(
model2_config["name"],
num_subjects=num_subjects,
num_questions=num_questions,
num_shots=model2_config["shots"],
specific_subjects=selected_subjects if subject_selection_mode == "specific" else None,
flash_attention=model2_config["flash_attention"],
regex_pattern=model2_config["regex"] if model2_config["regex"] else None
)
model2_elapsed_time = time.time() - start_time_model2
        # Extract summary statistics from each model's results
model1_overall_acc = model1_results["overall_accuracy"]
model1_min_subject, model1_min_acc = model1_results["min_accuracy_subject"]
model1_max_subject, model1_max_acc = model1_results["max_accuracy_subject"]
model2_overall_acc = model2_results["overall_accuracy"]
model2_min_subject, model2_min_acc = model2_results["min_accuracy_subject"]
model2_max_subject, model2_max_acc = model2_results["max_accuracy_subject"]
# Create merged results DataFrame
results_df1 = pd.DataFrame(model1_results["full_accuracy_table"])
results_df2 = pd.DataFrame(model2_results["full_accuracy_table"])
        # Build the union of subjects covered by either model
subjects = sorted(set(results_df1['Subject'].tolist() + results_df2['Subject'].tolist()))
# Create comparison DataFrame
comparison_data = []
for subject in subjects:
model1_row = results_df1[results_df1['Subject'] == subject]
model2_row = results_df2[results_df2['Subject'] == subject]
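            # Fall back to 0 accuracy when a model has no row for this subject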
model1_acc = model1_row['Accuracy'].iloc[0] if not model1_row.empty else 0
model2_acc = model2_row['Accuracy'].iloc[0] if not model2_row.empty else 0
# Calculate the difference and determine the winner
diff = model1_acc - model2_acc
winner = "Model 1" if diff > 0 else ("Model 2" if diff < 0 else "Tie")
comparison_data.append({
'Subject': subject,
'Model 1 Accuracy': model1_acc,
'Model 2 Accuracy': model2_acc,
'Difference': abs(diff),
'Winner': winner
})
# Add overall row
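        # Aggregate sample/correct counts per model (computed from the per-subject
        # tables; not used in the report below)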
model1_total_samples = results_df1['Num_samples'].sum()
model1_total_correct = results_df1['Num_correct'].sum()
model2_total_samples = results_df2['Num_samples'].sum()
model2_total_correct = results_df2['Num_correct'].sum()
overall_diff = model1_overall_acc - model2_overall_acc
overall_winner = "Model 1" if overall_diff > 0 else ("Model 2" if overall_diff < 0 else "Tie")
comparison_data.insert(0, {
'Subject': '**Overall**',
'Model 1 Accuracy': model1_overall_acc,
'Model 2 Accuracy': model2_overall_acc,
'Difference': abs(overall_diff),
'Winner': overall_winner
})
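        # Build the markdown summary report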
report = (
f"### Head-to-Head Comparison Results\n\n"
f"#### Model 1: {model1_config['name']}\n"
f"* Overall Accuracy: {model1_overall_acc:.3f}\n"
f"* Best Performance: {model1_max_subject} ({model1_max_acc:.3f})\n"
f"* Worst Performance: {model1_min_subject} ({model1_min_acc:.3f})\n"
f"* Evaluation completed in {model1_elapsed_time:.2f} seconds\n\n"
f"#### Model 2: {model2_config['name']}\n"
f"* Overall Accuracy: {model2_overall_acc:.3f}\n"
f"* Best Performance: {model2_max_subject} ({model2_max_acc:.3f})\n"
f"* Worst Performance: {model2_min_subject} ({model2_min_acc:.3f})\n"
f"* Evaluation completed in {model2_elapsed_time:.2f} seconds\n\n"
f"#### Overall Winner: {overall_winner}\n"
f"* Margin: {abs(overall_diff):.3f}\n"
)
comparison_df = pd.DataFrame(comparison_data)
        # Return the report and comparison table to the caller
return {
'report': report,
'comparison_df': comparison_df,
'success': True
}
except Exception as e:
# Handle errors gracefully
error_trace = traceback.format_exc()
error_message = f"### Error during evaluation\n```\n{error_trace}\n```"
# Return error information
return {
'report': error_message,
'comparison_df': None,
'success': False
}
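

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the Gradio app wiring). The model
# names, shot counts, and question counts below are hypothetical placeholders;
# model_configs mirrors the structure consumed above: a dict with "model1" and
# "model2" entries, each carrying "name", "shots", "flash_attention", and
# "regex".
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    example_configs = {
        "model1": {"name": "org/model-a", "shots": 5, "flash_attention": False, "regex": ""},
        "model2": {"name": "org/model-b", "shots": 5, "flash_attention": False, "regex": ""},
    }
    result = run_mmlu_evaluation(
        subject_selection_mode="number",  # "all", "number", or "specific"
        num_subjects=2,
        selected_subjects=[],
        all_questions=False,
        num_questions=10,
        model_configs=example_configs,
    )
    print(result["report"])
    if result["success"] and result["comparison_df"] is not None:
        print(result["comparison_df"].to_string(index=False))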