rohansampath committed · Commit e6f8dd1 · verified · 1 Parent(s): d739d62

Update mmlu_pro_eval_adapted.py

Files changed (1):
  1. mmlu_pro_eval_adapted.py  +20 -14
mmlu_pro_eval_adapted.py CHANGED
@@ -310,17 +310,18 @@ def eval_cot(subject, model, tokenizer, val_df, test_df, num_shots=5, debug_mode
 
     return correctness, accuracy
 
-
-@spaces.GPU(duration=240)  # Extended to 4 minutes for larger evaluations
-def evaluate_mmlu_pro(model_name, num_subjects=-1, num_questions=10, num_shots=5):
+def evaluate_mmlu_pro(model_name, num_subjects=-1, num_questions=10, num_shots=5, specific_subjects=None, flash_attention=False, regex_pattern=None):
     """
     Main evaluation function for MMLU-Pro benchmark.
 
     Args:
         model_name: Name/path of model to evaluate
         num_subjects: Number of subjects to test (-1 for all)
-        num_questions: Number of questions per subject
+        num_questions: Number of questions per subject (-1 for all)
         num_shots: Number of examples to include in prompts
+        specific_subjects: List of specific subjects to evaluate (overrides num_subjects)
+        flash_attention: Whether to use flash attention (currently ignored)
+        regex_pattern: Regex pattern for answer extraction (currently ignored)
     """
     print(f"Is CUDA available: {torch.cuda.is_available()}")
     print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
@@ -332,12 +333,13 @@ def evaluate_mmlu_pro(model_name, num_subjects=-1, num_questions=10, num_shots=5
     # Sort DataFrames
     test_df = test_df.sort_values(['category', 'question_id'])
     val_df = val_df.sort_values(['category', 'question_id'])
-
     # Get unique subjects
     all_subjects = sorted(test_df['category'].unique())
 
-    # Select subjects based on num_subjects parameter
-    if num_subjects == -1 or num_subjects >= len(all_subjects):
+    # Select subjects based on parameters
+    if specific_subjects is not None:
+        selected_subjects = [subject for subject in specific_subjects if subject in all_subjects]
+    elif num_subjects == -1 or num_subjects >= len(all_subjects):
         selected_subjects = all_subjects
     else:
         selected_subjects = all_subjects[:num_subjects]
@@ -348,13 +350,17 @@ def evaluate_mmlu_pro(model_name, num_subjects=-1, num_questions=10, num_shots=5
     results = {}
     all_correctness = []
     results_table = []
-
     # Process each subject
     for subject in tqdm(selected_subjects, desc="Processing Selected Categories"):
         # Filter data for current subject
-        test_samples = test_df[test_df['category'] == subject].head(num_questions)
+        if num_questions == -1:
+            # Use all questions for this subject
+            test_samples = test_df[test_df['category'] == subject]
+        else:
+            # Use specified number of questions
+            test_samples = test_df[test_df['category'] == subject].head(num_questions)
+
         val_samples = val_df[val_df['category'] == subject].head(num_shots)
-
         # Run evaluation
         correctness, acc = eval_cot(
             subject,
@@ -377,13 +383,13 @@ def evaluate_mmlu_pro(model_name, num_subjects=-1, num_questions=10, num_shots=5
 
     # Calculate overall metrics
     weighted_acc = np.mean(all_correctness)
-    min_acc_subject = min(results.items(), key=lambda x: x[1])[0]
-    max_acc_subject = max(results.items(), key=lambda x: x[1])[0]
+    min_acc_subject = min(results.items(), key=lambda x: x[1])
+    max_acc_subject = max(results.items(), key=lambda x: x[1])
 
     # Return results summary
     return {
         "overall_accuracy": weighted_acc,
-        "min_accuracy_subject": (min_acc_subject, results[min_acc_subject]),
-        "max_accuracy_subject": (max_acc_subject, results[max_acc_subject]),
+        "min_accuracy_subject": min_acc_subject,
+        "max_accuracy_subject": max_acc_subject,
         "full_accuracy_table": results_table,
     }
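
For reference, a minimal sketch of how the updated signature might be called after this commit. The model name and subject names below are illustrative placeholders, not values taken from this repository; the return keys and the (subject, accuracy) tuple shape of min_accuracy_subject / max_accuracy_subject follow from the diff above, since the min/max now come straight from results.items().

# Usage sketch (model name and subject names are hypothetical examples)
results = evaluate_mmlu_pro(
    model_name="some-org/some-model",        # placeholder; any model path the loader accepts
    num_questions=-1,                        # -1 now means "use all questions per subject"
    num_shots=5,
    specific_subjects=["math", "physics"],   # overrides num_subjects when provided
)
print(results["overall_accuracy"])
print(results["min_accuracy_subject"])       # (subject, accuracy) tuple from results.items()
print(results["max_accuracy_subject"])       # (subject, accuracy) tuple from results.items()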