Update mmlu_pro_eval_adapted.py

mmlu_pro_eval_adapted.py  CHANGED  (+20 -14)
@@ -310,17 +310,18 @@ def eval_cot(subject, model, tokenizer, val_df, test_df, num_shots=5, debug_mode
 
     return correctness, accuracy
 
-
-@spaces.GPU(duration=240)  # Extended to 4 minutes for larger evaluations
-def evaluate_mmlu_pro(model_name, num_subjects=-1, num_questions=10, num_shots=5):
+def evaluate_mmlu_pro(model_name, num_subjects=-1, num_questions=10, num_shots=5, specific_subjects=None, flash_attention=False, regex_pattern=None):
     """
     Main evaluation function for MMLU-Pro benchmark.
 
     Args:
         model_name: Name/path of model to evaluate
         num_subjects: Number of subjects to test (-1 for all)
-        num_questions: Number of questions per subject
+        num_questions: Number of questions per subject (-1 for all)
         num_shots: Number of examples to include in prompts
+        specific_subjects: List of specific subjects to evaluate (overrides num_subjects)
+        flash_attention: Whether to use flash attention (currently ignored)
+        regex_pattern: Regex pattern for answer extraction (currently ignored)
     """
     print(f"Is CUDA available: {torch.cuda.is_available()}")
     print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
@@ -332,12 +333,13 @@ def evaluate_mmlu_pro(model_name, num_subjects=-1, num_questions=10, num_shots=5
     # Sort DataFrames
     test_df = test_df.sort_values(['category', 'question_id'])
     val_df = val_df.sort_values(['category', 'question_id'])
-
     # Get unique subjects
     all_subjects = sorted(test_df['category'].unique())
 
-    # Select subjects based on num_subjects
-    if num_subjects == -1 or num_subjects >= len(all_subjects):
+    # Select subjects based on parameters
+    if specific_subjects is not None:
+        selected_subjects = [subject for subject in specific_subjects if subject in all_subjects]
+    elif num_subjects == -1 or num_subjects >= len(all_subjects):
         selected_subjects = all_subjects
     else:
         selected_subjects = all_subjects[:num_subjects]
@@ -348,13 +350,17 @@ def evaluate_mmlu_pro(model_name, num_subjects=-1, num_questions=10, num_shots=5
     results = {}
     all_correctness = []
     results_table = []
-
     # Process each subject
     for subject in tqdm(selected_subjects, desc="Processing Selected Categories"):
         # Filter data for current subject
-        test_samples = test_df[test_df['category'] == subject].head(num_questions)
+        if num_questions == -1:
+            # Use all questions for this subject
+            test_samples = test_df[test_df['category'] == subject]
+        else:
+            # Use specified number of questions
+            test_samples = test_df[test_df['category'] == subject].head(num_questions)
+
         val_samples = val_df[val_df['category'] == subject].head(num_shots)
-
         # Run evaluation
         correctness, acc = eval_cot(
             subject,
@@ -377,13 +383,13 @@ def evaluate_mmlu_pro(model_name, num_subjects=-1, num_questions=10, num_shots=5
 
     # Calculate overall metrics
     weighted_acc = np.mean(all_correctness)
-    min_acc_subject = min(results.items(), key=lambda x: x[1])
-    max_acc_subject = max(results.items(), key=lambda x: x[1])
+    min_acc_subject = min(results.items(), key=lambda x: x[1])
+    max_acc_subject = max(results.items(), key=lambda x: x[1])
 
     # Return results summary
     return {
         "overall_accuracy": weighted_acc,
-        "min_accuracy_subject":
-        "max_accuracy_subject":
+        "min_accuracy_subject": min_acc_subject,
+        "max_accuracy_subject": max_acc_subject,
         "full_accuracy_table": results_table,
     }
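For context, a minimal usage sketch of the updated signature, assuming the module is importable as mmlu_pro_eval_adapted; the model id and subject names below are illustrative placeholders, not values taken from this Space, and the min/max entries are read as (subject, accuracy) tuples because they come from min()/max() over results.items().

from mmlu_pro_eval_adapted import evaluate_mmlu_pro  # assumes the module is on the import path

summary = evaluate_mmlu_pro(
    model_name="HuggingFaceTB/SmolLM2-1.7B-Instruct",  # placeholder model id
    num_questions=-1,                                  # new: -1 now means all questions per subject
    num_shots=5,
    specific_subjects=["biology", "physics"],          # new: overrides num_subjects; names illustrative
    flash_attention=False,                             # new: accepted but currently ignored
    regex_pattern=None,                                # new: accepted but currently ignored
)

print(summary["overall_accuracy"])
# (subject, accuracy) tuples for the weakest and strongest categories
print(summary["min_accuracy_subject"], summary["max_accuracy_subject"])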