Update app.py
Browse files
app.py
CHANGED
@@ -195,17 +195,34 @@ def run_evaluation():
|
|
195 |
def run_mmlu_evaluation(num_questions):
|
196 |
"""
|
197 |
Runs the MMLU evaluation with the specified number of questions per task.
|
|
|
198 |
"""
|
199 |
results = evaluate_mmlu(model, tokenizer, num_questions)
|
200 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
201 |
report = (
|
202 |
-
f"Overall Accuracy: {
|
203 |
-
f"Min Accuracy
|
204 |
-
f"Max Accuracy
|
|
|
|
|
|
|
205 |
)
|
206 |
-
|
207 |
-
return report
|
208 |
|
|
|
209 |
|
210 |
# ---------------------------------------------------------------------------
|
211 |
# 6. Gradio Interface
|
|
|
195 |
def run_mmlu_evaluation(num_questions):
    """
    Run the MMLU evaluation and return a Markdown-formatted summary string.

    ``num_questions`` is forwarded unchanged to ``evaluate_mmlu`` together
    with the module-level ``model`` and ``tokenizer`` (per the original
    docstring it is the number of questions per task).

    The summary lists the overall accuracy, the lowest- and highest-scoring
    tasks, and every correct / incorrect example found in the results.
    No limit on the number of examples is applied here; presumably
    ``evaluate_mmlu`` returns two of each -- TODO confirm against it.
    """
    outcome = evaluate_mmlu(model, tokenizer, num_questions)

    # Read the fields in a fixed order so a missing key surfaces the same way.
    accuracy = outcome["overall_accuracy"]
    worst_task, worst_acc = outcome["min_accuracy_task"]
    best_task, best_acc = outcome["max_accuracy_task"]
    hits = outcome["correct_examples"]
    misses = outcome["incorrect_examples"]

    def render(sample):
        # Each example is unpacked as a 4-item sequence.
        task, question, model_output, correct_answer = sample
        return (
            f"**Task:** {task}\n"
            f"**Question:** {question}\n"
            f"**Model Output:** {model_output}\n"
            f"**Correct Answer:** {correct_answer}\n"
        )

    # Both example lists are rendered eagerly, even when they are empty.
    hits_md = "\n\n".join(map(render, hits))
    misses_md = "\n\n".join(map(render, misses))

    # List items are evaluated top to bottom, mirroring the order in which
    # the pieces of the report are formatted.
    pieces = [
        f"### Overall Accuracy: {accuracy:.2f}\n",
        f"**Min Accuracy:** {worst_acc:.2f} on `{worst_task}`\n",
        f"**Max Accuracy:** {best_acc:.2f} on `{best_task}`\n\n",
        "---\n\n",
        "### ✅ Correct Examples\n",
        hits_md if hits else "No correct examples available.",
        "\n\n",
        "### ❌ Incorrect Examples\n",
        misses_md if misses else "No incorrect examples available.",
    ]
    return "".join(pieces)
|
226 |
|
227 |
# ---------------------------------------------------------------------------
|
228 |
# 6. Gradio Interface
|