Update mmlu_eval_original.py
Changed file: mmlu_eval_original.py (+18 -18)
@@ -93,7 +93,7 @@ def gen_prompt(df, subject, k=-1):
 
 
 @torch.no_grad()
-def eval_batched(subject, model, tokenizer, dev_df, test_df, num_questions_per_subject=5, train_shots=5, batch_size=
+def eval_batched(subject, model, tokenizer, dev_df, test_df, num_questions_per_subject=5, train_shots=5, batch_size=8):
     """
     Improved eval function that uses batched processing on GPU
     """
@@ -290,15 +290,26 @@ def evaluate_mmlu_batched(model, tokenizer, num_subjects=10, num_questions=10, n
         batch_size (int): Batch size for processing multiple examples at once
         auto_batch_size (bool): If True, automatically determine the optimal batch size
     """
-
+    model.eval()  # Ensure Dropout and BatchNorm behave appropriately for inference
+
+    if tokenizer.pad_token is None:
+        logger.info("NO TOKENIZER PAD TOKEN")
+        tokenizer.pad_token = tokenizer.eos_token
+    if model.config.pad_token_id is None:
+        logger.info("NO PAD TOKEN ID")
+        model.config.pad_token_id = tokenizer.pad_token_id
+
+
+    dataset = load_dataset_from_hf(verbose=True)
+    test_df = pd.DataFrame(dataset['test'])
+    dev_df = pd.DataFrame(dataset['dev'])
+    test_df = test_df.sort_values(['subject', 'question'])
+    dev_df = dev_df.sort_values(['subject', 'question'])
+
+
     # If auto_batch_size is enabled, estimate the optimal batch size
     if auto_batch_size:
         # Get a sample prompt
-        dataset = load_dataset_from_hf(verbose=False)
-        test_df = pd.DataFrame(dataset['test'])
-        dev_df = pd.DataFrame(dataset['dev'])
-        test_df = test_df.sort_values(['subject', 'question'])
-        dev_df = dev_df.sort_values(['subject', 'question'])
         subject = test_df['subject'].iloc[0]
         test_sample = test_df[test_df['subject'] == subject].head(1)
         dev_sample = dev_df[dev_df['subject'] == subject].head(num_shots)
@@ -311,18 +322,7 @@ def evaluate_mmlu_batched(model, tokenizer, num_subjects=10, num_questions=10, n
         batch_size = get_max_batch_size(model, tokenizer, sample_prompt)
         logger.info(f"Auto-adjusted batch size: {batch_size}")
 
-    model.eval()  # Ensure Dropout and BatchNorm behave appropriately for inference
 
-    dataset = load_dataset_from_hf(verbose=True)
-
-    # Convert dataset partitions to pandas DataFrames
-    test_df = pd.DataFrame(dataset['test'])
-    dev_df = pd.DataFrame(dataset['dev'])
-
-    # Sort datasets by subject and other relevant columns
-    test_df = test_df.sort_values(['subject', 'question'])
-    dev_df = dev_df.sort_values(['subject', 'question'])
-
     # Get all unique subjects
     all_subjects = sorted(test_df['subject'].unique())
 
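
For readers following the change: the commit hoists model.eval() and the pad-token guard to the top of evaluate_mmlu_batched(). Below is a minimal, self-contained sketch of why that guard matters for batched evaluation; the model name "gpt2" and the prompts are illustrative assumptions, not the Space's actual configuration.

# Sketch: batched tokenization needs a pad token; GPT-style checkpoints ship without one.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"  # assumption: any causal LM whose tokenizer lacks a pad token
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()  # inference behavior for Dropout/BatchNorm, as in the patched function

# Without these two guards, tokenizer(..., padding=True) raises
# "Asking to pad but the tokenizer does not have a padding token".
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id

prompts = ["2 + 2 =", "The capital of France is"]
batch = tokenizer(prompts, return_tensors="pt", padding=True)
print(batch["input_ids"].shape)  # (2, longest_prompt_length)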
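
The patch also moves dataset loading and sorting out of the auto_batch_size branch so it runs exactly once. load_dataset_from_hf() is defined elsewhere in the Space; the sketch below is only an assumption about what it roughly does (using the "cais/mmlu" dataset as a stand-in) and mirrors the DataFrame conversion and sorting visible in the diff.

# Hypothetical stand-in for load_dataset_from_hf(); the real helper lives elsewhere in this repo.
import pandas as pd
from datasets import load_dataset

def load_dataset_from_hf_sketch(verbose=False):
    dataset = load_dataset("cais/mmlu", "all")  # assumption about the source dataset
    if verbose:
        print({split: len(dataset[split]) for split in dataset})
    return dataset

dataset = load_dataset_from_hf_sketch(verbose=True)
test_df = pd.DataFrame(dataset["test"]).sort_values(["subject", "question"])
dev_df = pd.DataFrame(dataset["dev"]).sort_values(["subject", "question"])
print(sorted(test_df["subject"].unique())[:3])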
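
Finally, the auto_batch_size path calls get_max_batch_size(model, tokenizer, sample_prompt), whose implementation is not part of this diff. One common way such a helper is written (a sketch under that assumption, not the repository's actual code) is to double the batch until a forward pass hits CUDA out-of-memory and keep the last size that succeeded:

import torch

def get_max_batch_size_sketch(model, tokenizer, sample_prompt, start=1, cap=256):
    """Hypothetical probe: grow the batch geometrically until the GPU runs out of memory."""
    device = next(model.parameters()).device
    encoded = tokenizer(sample_prompt, return_tensors="pt").to(device)
    best, size = start, start
    while size <= cap:
        try:
            batch = {k: v.repeat(size, 1) for k, v in encoded.items()}
            with torch.no_grad():
                model(**batch)           # probe forward pass only
            best, size = size, size * 2  # it fit; try twice as many
        except torch.cuda.OutOfMemoryError:
            torch.cuda.empty_cache()     # free the failed allocation and stop
            break
    return best

On a CPU-only box the out-of-memory branch never triggers, so the probe simply walks up to the cap; in practice the Space's helper may use a more conservative heuristic, for example one based on free GPU memory.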