Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -68,7 +68,7 @@ def run_toy_evaluation():
|
|
68 |
# 3. MMLU Evaluation call
|
69 |
# ---------------------------------------------------------------------------
|
70 |
@spaces.GPU(duration=120) # Allow up to 2 minutes for full evaluation
|
71 |
-
def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, num_questions):
|
72 |
"""
|
73 |
Runs the MMLU evaluation with the specified parameters.
|
74 |
|
@@ -78,13 +78,15 @@ def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, nu
|
|
78 |
num_shots (int): Number of few-shot examples (0-5)
|
79 |
all_questions (bool): Whether to evaluate all questions per subject
|
80 |
num_questions (int): Number of examples per subject (1-20 or -1 for all)
|
|
|
81 |
"""
|
82 |
|
83 |
if not model_loaded:
|
84 |
load_model()
|
85 |
|
86 |
if not model_loaded:
|
87 |
-
return "⚠️ Model not loaded. Please load the model first.", None
|
|
|
88 |
|
89 |
# Convert num_subjects to -1 if all_subjects is True
|
90 |
if all_subjects:
|
@@ -142,7 +144,9 @@ def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, nu
|
|
142 |
f"* Evaluation completed in {elapsed_time:.2f} seconds\n"
|
143 |
)
|
144 |
|
145 |
-
|
|
|
|
|
146 |
|
147 |
# ---------------------------------------------------------------------------
|
148 |
# 4. Gradio Interface
|
@@ -200,20 +204,31 @@ with gr.Blocks() as demo:
|
|
200 |
value=False, # Default is unchecked
|
201 |
info="When checked, evaluates all available questions for each subject"
|
202 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
203 |
num_questions_slider = gr.Slider(
|
204 |
minimum=1,
|
205 |
maximum=20,
|
206 |
value=10, # Default is 10 questions
|
207 |
step=1,
|
208 |
label="Questions per Subject",
|
209 |
-
info="Choose a subset of questions (1-20)
|
210 |
interactive=True
|
211 |
)
|
212 |
|
213 |
with gr.Row():
|
214 |
-
|
|
|
|
|
215 |
results_output = gr.Markdown(label="Evaluation Results")
|
216 |
-
|
|
|
|
|
217 |
|
218 |
# Connect components
|
219 |
load_button.click(fn=load_model, inputs=None, outputs=load_status)
|
@@ -238,21 +253,61 @@ with gr.Blocks() as demo:
|
|
238 |
outputs=[num_subjects_slider]
|
239 |
)
|
240 |
|
241 |
-
# Update
|
242 |
-
def
|
243 |
if checked:
|
244 |
-
return gr.update(
|
245 |
else:
|
246 |
-
return gr.update(
|
247 |
|
248 |
all_questions_checkbox.change(
|
249 |
-
fn=
|
250 |
inputs=[all_questions_checkbox],
|
251 |
-
outputs=[
|
252 |
)
|
253 |
|
254 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
255 |
eval_mmlu_button.click(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
256 |
fn=run_mmlu_evaluation,
|
257 |
inputs=[
|
258 |
all_subjects_checkbox,
|
@@ -261,7 +316,30 @@ with gr.Blocks() as demo:
|
|
261 |
all_questions_checkbox,
|
262 |
num_questions_slider
|
263 |
],
|
264 |
-
outputs=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
265 |
)
|
266 |
|
267 |
-
demo.launch()
|
|
|
68 |
# 3. MMLU Evaluation call
|
69 |
# ---------------------------------------------------------------------------
|
70 |
@spaces.GPU(duration=120) # Allow up to 2 minutes for full evaluation
|
71 |
+
def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, num_questions, progress=gr.Progress()):
|
72 |
"""
|
73 |
Runs the MMLU evaluation with the specified parameters.
|
74 |
|
|
|
78 |
num_shots (int): Number of few-shot examples (0-5)
|
79 |
all_questions (bool): Whether to evaluate all questions per subject
|
80 |
num_questions (int): Number of examples per subject (1-20 or -1 for all)
|
81 |
+
progress (gr.Progress): Progress indicator
|
82 |
"""
|
83 |
|
84 |
if not model_loaded:
|
85 |
load_model()
|
86 |
|
87 |
if not model_loaded:
|
88 |
+
return "⚠️ Model not loaded. Please load the model first.", None, gr.update(interactive=True), gr.update(visible=False), \
|
89 |
+
[gr.update(interactive=True) for _ in range(5)]
|
90 |
|
91 |
# Convert num_subjects to -1 if all_subjects is True
|
92 |
if all_subjects:
|
|
|
144 |
f"* Evaluation completed in {elapsed_time:.2f} seconds\n"
|
145 |
)
|
146 |
|
147 |
+
# Return values that re-enable UI components after completion
|
148 |
+
return report, results_df, gr.update(interactive=True), gr.update(visible=False), \
|
149 |
+
[gr.update(interactive=True) for _ in range(5)]
|
150 |
|
151 |
# ---------------------------------------------------------------------------
|
152 |
# 4. Gradio Interface
|
|
|
204 |
value=False, # Default is unchecked
|
205 |
info="When checked, evaluates all available questions for each subject"
|
206 |
)
|
207 |
+
questions_info_text = gr.Markdown(visible=False, value="**All 14,042 questions across all subjects will be evaluated**")
|
208 |
+
|
209 |
+
with gr.Row(elem_id="questions_selection_row"):
|
210 |
+
questions_container = gr.Column(scale=1, elem_id="questions_slider_container")
|
211 |
+
|
212 |
+
# Move the slider into the container for easier visibility toggling
|
213 |
+
with questions_container:
|
214 |
num_questions_slider = gr.Slider(
|
215 |
minimum=1,
|
216 |
maximum=20,
|
217 |
value=10, # Default is 10 questions
|
218 |
step=1,
|
219 |
label="Questions per Subject",
|
220 |
+
info="Choose a subset of questions (1-20)",
|
221 |
interactive=True
|
222 |
)
|
223 |
|
224 |
with gr.Row():
|
225 |
+
with gr.Column(scale=1):
|
226 |
+
eval_mmlu_button = gr.Button("Run MMLU Evaluation", variant="primary", interactive=True)
|
227 |
+
cancel_mmlu_button = gr.Button("Cancel MMLU Evaluation", variant="stop", visible=False)
|
228 |
results_output = gr.Markdown(label="Evaluation Results")
|
229 |
+
|
230 |
+
with gr.Row():
|
231 |
+
results_table = gr.DataFrame(interactive=True, label="Detailed Results (Sortable)", visible=True)
|
232 |
|
233 |
# Connect components
|
234 |
load_button.click(fn=load_model, inputs=None, outputs=load_status)
|
|
|
253 |
outputs=[num_subjects_slider]
|
254 |
)
|
255 |
|
256 |
+
# Update interface based on all_questions checkbox
|
257 |
+
def update_questions_interface(checked):
    """
    Switches between the per-subject question slider and the "all questions" notice.

    Args:
        checked (bool): Current state of the "all questions" checkbox.

    Returns:
        tuple: Two gr.update() payloads, in the order the checkbox's change
        handler lists its outputs: questions_container first, then
        questions_info_text.
    """
    # Exactly one of the two is visible at a time: the slider container while
    # the box is unchecked, the info text while it is checked.
    evaluate_everything = bool(checked)
    return (
        gr.update(visible=not evaluate_everything),
        gr.update(visible=evaluate_everything),
    )
|
262 |
|
263 |
all_questions_checkbox.change(
|
264 |
+
fn=update_questions_interface,
|
265 |
inputs=[all_questions_checkbox],
|
266 |
+
outputs=[questions_container, questions_info_text]
|
267 |
)
|
268 |
|
269 |
+
# Function to disable UI components during evaluation
|
270 |
+
def disable_ui_for_evaluation():
    """
    Builds the component updates that lock the MMLU controls during an evaluation.

    Returns:
        list: Seven gr.update() payloads, ordered as all_subjects_checkbox,
        num_subjects_slider, num_shots_slider, all_questions_checkbox,
        num_questions_slider, eval_mmlu_button, cancel_mmlu_button.
    """
    busy_note = "MMLU Evaluation currently in progress"

    # The five input controls all get the same update: non-interactive, with
    # their info text replaced by the busy notice.
    locked_inputs = [
        gr.update(interactive=False, info=busy_note) for _ in range(5)
    ]

    button_states = [
        gr.update(interactive=False),  # eval_mmlu_button
        gr.update(visible=True),  # cancel_mmlu_button
    ]
    return locked_inputs + button_states
|
280 |
+
|
281 |
+
# Function to handle cancel button click
|
282 |
+
def cancel_evaluation():
    """
    Builds the updates that put the MMLU controls back into an interactive state.

    This only resets the UI. It does not actually cancel the GPU job, which
    would require more backend support.

    Returns:
        list: Nine values, ordered as all_subjects_checkbox,
        num_subjects_slider, num_shots_slider, all_questions_checkbox,
        num_questions_slider, eval_mmlu_button, cancel_mmlu_button,
        results_output (the cancellation message) and results_table (None).
    """
    # Info text applied to each input control, in output order.
    idle_info = (
        "When checked, evaluates all 57 MMLU subjects",  # all_subjects_checkbox
        "Number of subjects to evaluate (1-57). They will be loaded in alphabetical order.",  # num_subjects_slider
        "Number of examples to use for few-shot learning (0-5). They will be loaded in alphabetical order.",  # num_shots_slider
        "When checked, evaluates all available questions for each subject",  # all_questions_checkbox
        "Choose a subset of questions (1-20)",  # num_questions_slider
    )
    reenabled_inputs = [
        gr.update(interactive=True, info=text) for text in idle_info
    ]

    return reenabled_inputs + [
        gr.update(interactive=True),  # eval_mmlu_button
        gr.update(visible=False),  # cancel_mmlu_button
        "⚠️ Evaluation canceled by user",  # results_output
        None,  # results_table
    ]
|
296 |
+
|
297 |
+
# Connect MMLU evaluation button - now disables UI and shows cancel button
|
298 |
eval_mmlu_button.click(
|
299 |
+
fn=disable_ui_for_evaluation,
|
300 |
+
inputs=None,
|
301 |
+
outputs=[
|
302 |
+
all_subjects_checkbox,
|
303 |
+
num_subjects_slider,
|
304 |
+
num_shots_slider,
|
305 |
+
all_questions_checkbox,
|
306 |
+
num_questions_slider,
|
307 |
+
eval_mmlu_button,
|
308 |
+
cancel_mmlu_button
|
309 |
+
]
|
310 |
+
).then(
|
311 |
fn=run_mmlu_evaluation,
|
312 |
inputs=[
|
313 |
all_subjects_checkbox,
|
|
|
316 |
all_questions_checkbox,
|
317 |
num_questions_slider
|
318 |
],
|
319 |
+
outputs=[
|
320 |
+
results_output,
|
321 |
+
results_table,
|
322 |
+
eval_mmlu_button,
|
323 |
+
cancel_mmlu_button,
|
324 |
+
[all_subjects_checkbox, num_subjects_slider, num_shots_slider, all_questions_checkbox, num_questions_slider]
|
325 |
+
]
|
326 |
+
)
|
327 |
+
|
328 |
+
# Connect cancel button
|
329 |
+
cancel_mmlu_button.click(
|
330 |
+
fn=cancel_evaluation,
|
331 |
+
inputs=None,
|
332 |
+
outputs=[
|
333 |
+
all_subjects_checkbox,
|
334 |
+
num_subjects_slider,
|
335 |
+
num_shots_slider,
|
336 |
+
all_questions_checkbox,
|
337 |
+
num_questions_slider,
|
338 |
+
eval_mmlu_button,
|
339 |
+
cancel_mmlu_button,
|
340 |
+
results_output,
|
341 |
+
results_table
|
342 |
+
]
|
343 |
)
|
344 |
|
345 |
+
demo.launch()
|