rohansampath committed on
Commit
9190bb9
·
verified ·
1 Parent(s): 1b7636f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -6
app.py CHANGED
@@ -195,17 +195,34 @@ def run_evaluation():
195
  def run_mmlu_evaluation(num_questions):
196
  """
197
  Runs the MMLU evaluation with the specified number of questions per task.
 
198
  """
199
  results = evaluate_mmlu(model, tokenizer, num_questions)
200
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  report = (
202
- f"Overall Accuracy: {results['overall_accuracy']:.2f}\n"
203
- f"Min Accuracy: {results['min_accuracy_task'][1]:.2f} on {results['min_accuracy_task'][0]}\n"
204
- f"Max Accuracy: {results['max_accuracy_task'][1]:.2f} on {results['max_accuracy_task'][0]}"
 
 
 
205
  )
206
-
207
- return report
208
 
 
209
 
210
  # ---------------------------------------------------------------------------
211
  # 6. Gradio Interface
 
195
  def run_mmlu_evaluation(num_questions):
196
  """
197
  Runs the MMLU evaluation with the specified number of questions per task.
198
+ Also displays two correct and two incorrect examples.
199
  """
200
  results = evaluate_mmlu(model, tokenizer, num_questions)
201
+
202
+ overall_accuracy = results["overall_accuracy"]
203
+ min_task, min_acc = results["min_accuracy_task"]
204
+ max_task, max_acc = results["max_accuracy_task"]
205
+ correct_examples = results["correct_examples"]
206
+ incorrect_examples = results["incorrect_examples"]
207
+
208
+ # Format examples for readability
209
+ def format_example(example):
210
+ task, question, model_output, correct_answer = example
211
+ return f"**Task:** {task}\n**Question:** {question}\n**Model Output:** {model_output}\n**Correct Answer:** {correct_answer}\n"
212
+
213
+ correct_text = "\n\n".join(format_example(ex) for ex in correct_examples)
214
+ incorrect_text = "\n\n".join(format_example(ex) for ex in incorrect_examples)
215
+
216
  report = (
217
+ f"### Overall Accuracy: {overall_accuracy:.2f}\n"
218
+ f"**Min Accuracy:** {min_acc:.2f} on `{min_task}`\n"
219
+ f"**Max Accuracy:** {max_acc:.2f} on `{max_task}`\n\n"
220
+ f"---\n\n"
221
+ f"### ✅ Correct Examples\n{correct_text if correct_examples else 'No correct examples available.'}\n\n"
222
+ f"### ❌ Incorrect Examples\n{incorrect_text if incorrect_examples else 'No incorrect examples available.'}"
223
  )
 
 
224
 
225
+ return report
226
 
227
  # ---------------------------------------------------------------------------
228
  # 6. Gradio Interface