Spaces:
Sleeping
Sleeping
Update mmlu_eval.py
Browse files - mmlu_eval.py +2 -2
mmlu_eval.py
CHANGED
@@ -45,7 +45,7 @@ def evaluate_mmlu(model, tokenizer, num_questions_per_task=5):
     incorrect_examples = []
 
     for task_name in mmlu_dataset.keys():
-
+        print ("TASK NAME: ", task_name)
         dataset = mmlu_dataset[task_name]
         sampled_questions = random.sample(list(dataset), min(num_questions_per_task, len(dataset)))
 
@@ -53,7 +53,7 @@ def evaluate_mmlu(model, tokenizer, num_questions_per_task=5):
         references = []
 
         for sample in sampled_questions:
-
+            print ("SAMPLE", sample)
             question = sample["question"]
             correct_answer = str(sample["answer"]).strip().lower()
             model_output = generate_answer(model, tokenizer, question).strip().lower()