rohansampath committed on
Commit
a5202a8
·
verified ·
1 Parent(s): 37aa822

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +131 -43
app.py CHANGED
@@ -24,13 +24,14 @@ model_name = "mistralai/Mistral-7B-v0.1"
24
  # 2. MMLU-Pro Evaluation
25
  # ---------------------------------------------------------------------------
26
  @spaces.GPU(duration=240)
27
- def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, num_questions, progress=gr.Progress()):
28
  """
29
  Runs the MMLU evaluation with the specified parameters.
30
 
31
  Args:
32
- all_subjects (bool): Whether to evaluate all subjects
33
  num_subjects (int): Number of subjects to evaluate (1-14)
 
34
  num_shots (int): Number of few-shot examples (0-5)
35
  all_questions (bool): Whether to evaluate all questions per subject
36
  num_questions (int): Number of examples per subject (1-100 or all)
@@ -38,8 +39,11 @@ def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, nu
38
  """
39
  try:
40
  # Convert parameters if needed
41
- if all_subjects:
42
  num_subjects = -1
 
 
 
43
 
44
  if all_questions:
45
  num_questions = -1
@@ -50,7 +54,8 @@ def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, nu
50
  model_name,
51
  num_subjects=num_subjects,
52
  num_questions=num_questions,
53
- num_shots=num_shots,
 
54
  )
55
  elapsed_time = time.time() - start_time
56
 
@@ -131,6 +136,17 @@ with gr.Blocks(css="""
131
  h1 {
132
  text-align: center;
133
  }
 
 
 
 
 
 
 
 
 
 
 
134
  """) as demo:
135
  gr.Markdown("# Head-to-Head Model Evaluation Comparator")
136
  gr.Markdown("""
@@ -142,7 +158,7 @@ with gr.Blocks(css="""
142
  """)
143
 
144
  # Dataset Selection Section
145
- gr.Markdown("## (A) Select Dataset for Evaluation")
146
 
147
  with gr.Row():
148
  dataset_dropdown = gr.Dropdown(
@@ -165,35 +181,74 @@ with gr.Blocks(css="""
165
  gr.Markdown(" ")
166
  gr.Markdown(" ")
167
 
 
 
 
 
168
  # MMLU Config Container - Initially hidden until dataset is selected
169
  with gr.Column(visible=False) as mmlu_config_container:
170
- gr.Markdown("## (B) Select Dataset Configuration Options")
171
 
172
- with gr.Row():
173
- all_subjects_checkbox = gr.Checkbox(
174
- label="Evaluate All Subjects",
175
- value=False,
176
- info="When checked, evaluates all 14 MMLU-Pro subjects"
177
- )
178
- num_subjects_slider = gr.Slider(
179
- minimum=1,
180
- maximum=14,
181
- value=14,
182
- step=1,
183
- label="Number of Subjects",
184
- info="Number of subjects to evaluate (1-14). They will be loaded in alphabetical order.",
185
- interactive=True
186
- )
187
 
188
  with gr.Row():
189
- num_shots_slider = gr.Slider(
190
- minimum=0,
191
- maximum=5,
192
- value=5,
193
- step=1,
194
- label="Number of Few-shot Examples",
195
- info="Number of examples to use for few-shot learning (0-5)."
196
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
 
198
  with gr.Row():
199
  all_questions_checkbox = gr.Checkbox(
@@ -296,14 +351,20 @@ with gr.Blocks(css="""
296
  outputs=[preview_visibility, dataset_preview_container, preview_output, preview_toggle]
297
  )
298
 
299
- # Update num_subjects_slider interactivity based on all_subjects checkbox
300
- def update_subjects_slider(checked):
301
- return gr.update(interactive=not checked)
 
 
 
 
 
302
 
303
- all_subjects_checkbox.change(
304
- fn=update_subjects_slider,
305
- inputs=[all_subjects_checkbox],
306
- outputs=[num_subjects_slider]
 
307
  )
308
 
309
  # Update interface based on all_questions checkbox
@@ -319,6 +380,20 @@ with gr.Blocks(css="""
319
  outputs=[questions_container, questions_info_text]
320
  )
321
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
322
  # Function to disable UI components during evaluation
323
  def start_evaluation(state):
324
  if state["running"]:
@@ -341,8 +416,9 @@ with gr.Blocks(css="""
341
 
342
  return [
343
  state,
344
- gr.update(interactive=False), # all_subjects_checkbox
345
  gr.update(interactive=False), # num_subjects_slider
 
346
  gr.update(interactive=False), # num_shots_slider
347
  gr.update(interactive=False), # all_questions_checkbox
348
  gr.update(interactive=False), # num_questions_slider
@@ -365,8 +441,9 @@ with gr.Blocks(css="""
365
  state["running"] = False
366
  return [
367
  state,
368
- gr.update(interactive=True), # all_subjects_checkbox
369
  gr.update(interactive=True), # num_subjects_slider
 
370
  gr.update(interactive=True), # num_shots_slider
371
  gr.update(interactive=True), # all_questions_checkbox
372
  gr.update(interactive=True), # num_questions_slider
@@ -383,8 +460,9 @@ with gr.Blocks(css="""
383
  inputs=[evaluation_state],
384
  outputs=[
385
  evaluation_state,
386
- all_subjects_checkbox,
387
  num_subjects_slider,
 
388
  num_shots_slider,
389
  all_questions_checkbox,
390
  num_questions_slider,
@@ -395,10 +473,19 @@ with gr.Blocks(css="""
395
  results_table_container
396
  ]
397
  ).then(
398
- fn=run_mmlu_evaluation,
 
 
 
 
 
 
 
 
399
  inputs=[
400
- all_subjects_checkbox,
401
  num_subjects_slider,
 
402
  num_shots_slider,
403
  all_questions_checkbox,
404
  num_questions_slider
@@ -408,7 +495,7 @@ with gr.Blocks(css="""
408
  results_table,
409
  eval_mmlu_button,
410
  cancel_mmlu_button,
411
- all_subjects_checkbox,
412
  num_subjects_slider,
413
  num_shots_slider,
414
  all_questions_checkbox,
@@ -427,8 +514,9 @@ with gr.Blocks(css="""
427
  inputs=[evaluation_state],
428
  outputs=[
429
  evaluation_state,
430
- all_subjects_checkbox,
431
  num_subjects_slider,
 
432
  num_shots_slider,
433
  all_questions_checkbox,
434
  num_questions_slider,
 
24
  # 2. MMLU-Pro Evaluation
25
  # ---------------------------------------------------------------------------
26
  @spaces.GPU(duration=240)
27
+ def run_mmlu_evaluation(subject_selection_mode, num_subjects, selected_subjects, num_shots, all_questions, num_questions, progress=gr.Progress()):
28
  """
29
  Runs the MMLU evaluation with the specified parameters.
30
 
31
  Args:
32
+ subject_selection_mode (str): Mode of subject selection ("all", "number", or "specific")
33
  num_subjects (int): Number of subjects to evaluate (1-14)
34
+ selected_subjects (list): List of specific subjects to evaluate
35
  num_shots (int): Number of few-shot examples (0-5)
36
  all_questions (bool): Whether to evaluate all questions per subject
37
  num_questions (int): Number of examples per subject (1-100 or all)
 
39
  """
40
  try:
41
  # Convert parameters if needed
42
+ if subject_selection_mode == "all":
43
  num_subjects = -1
44
+ selected_subjects = []
45
+ elif subject_selection_mode == "specific":
46
+ num_subjects = len(selected_subjects) if selected_subjects else -1
47
 
48
  if all_questions:
49
  num_questions = -1
 
54
  model_name,
55
  num_subjects=num_subjects,
56
  num_questions=num_questions,
57
+ num_shots=num_shots,
58
+ specific_subjects=selected_subjects if subject_selection_mode == "specific" else None
59
  )
60
  elapsed_time = time.time() - start_time
61
 
 
136
  h1 {
137
  text-align: center;
138
  }
139
+ .section-spacing {
140
+ margin-top: 30px;
141
+ margin-bottom: 30px;
142
+ }
143
+ .config-box {
144
+ border: 1px solid #ddd;
145
+ border-radius: 8px;
146
+ padding: 15px;
147
+ margin: 10px;
148
+ background-color: #f9f9f9;
149
+ }
150
  """) as demo:
151
  gr.Markdown("# Head-to-Head Model Evaluation Comparator")
152
  gr.Markdown("""
 
158
  """)
159
 
160
  # Dataset Selection Section
161
+ gr.Markdown("## (A) Select Dataset for Evaluation", elem_classes=["section-spacing"])
162
 
163
  with gr.Row():
164
  dataset_dropdown = gr.Dropdown(
 
181
  gr.Markdown(" ")
182
  gr.Markdown(" ")
183
 
184
+ # Add more spacing between sections
185
+ gr.Markdown(" ", elem_classes=["section-spacing"])
186
+ gr.Markdown(" ", elem_classes=["section-spacing"])
187
+
188
  # MMLU Config Container - Initially hidden until dataset is selected
189
  with gr.Column(visible=False) as mmlu_config_container:
190
+ gr.Markdown("## (B) Select Dataset Configuration Options", elem_classes=["section-spacing"])
191
 
192
+ # Add more spacing
193
+ gr.Markdown(" ")
 
 
 
 
 
 
 
 
 
 
 
 
 
194
 
195
  with gr.Row():
196
+ # Left column for subject selection
197
+ with gr.Column(scale=1):
198
+ with gr.Box(elem_classes=["config-box"]):
199
+ gr.Markdown("### Choose Subjects")
200
+
201
+ subject_selection_mode = gr.Radio(
202
+ choices=["Evaluate All Subjects", "Choose Number of Subjects", "Specify which Subjects to Evaluate"],
203
+ value="Evaluate All Subjects",
204
+ label="Subject Selection Mode"
205
+ )
206
+
207
+ # Subject number slider - initially hidden, shown when "Choose Number of Subjects" is selected
208
+ with gr.Column(visible=False) as num_subjects_container:
209
+ num_subjects_slider = gr.Slider(
210
+ minimum=1,
211
+ maximum=14,
212
+ value=14,
213
+ step=1,
214
+ label="Number of Subjects",
215
+ info="Number of subjects to evaluate (1-14). They will be loaded in alphabetical order."
216
+ )
217
+
218
+ # Subject checkboxes - initially hidden, shown when "Specify which Subjects to Evaluate" is selected
219
+ with gr.Column(visible=False) as specific_subjects_container:
220
+ # We'll populate this with checkboxes for each subject
221
+ # The actual subjects will come from the dataset preview
222
+ specific_subjects = gr.CheckboxGroup(
223
+ choices=[
224
+ "Biology (n=717)",
225
+ "Chemistry (n=500)",
226
+ "Physics (n=650)",
227
+ "Mathematics (n=800)",
228
+ "Computer Science (n=450)",
229
+ "History (n=300)",
230
+ "Literature (n=250)"
231
+ ],
232
+ label="Select Specific Subjects",
233
+ info="Select which specific subjects to evaluate"
234
+ )
235
+
236
+ # Right column for few-shot examples
237
+ with gr.Column(scale=1):
238
+ with gr.Box(elem_classes=["config-box"]):
239
+ gr.Markdown("### Few-shot Configuration")
240
+
241
+ num_shots_slider = gr.Slider(
242
+ minimum=0,
243
+ maximum=5,
244
+ value=5,
245
+ step=1,
246
+ label="Number of Few-shot Examples",
247
+ info="Number of examples to use for few-shot learning (0-5)."
248
+ )
249
+
250
+ # Add spacing
251
+ gr.Markdown(" ")
252
 
253
  with gr.Row():
254
  all_questions_checkbox = gr.Checkbox(
 
351
  outputs=[preview_visibility, dataset_preview_container, preview_output, preview_toggle]
352
  )
353
 
354
+ # Function to update UI based on subject selection mode
355
def update_subject_selection_ui(mode):
    """Compute visibility updates for the two subject-selection sub-panels.

    Args:
        mode: The label currently chosen in the subject selection radio.

    Returns:
        A pair of ``gr.update`` objects: the first controls the visibility
        of the number-of-subjects container, the second the visibility of
        the specific-subjects container. At most one of them is visible.
    """
    show_number_panel = mode == "Choose Number of Subjects"
    # Any label other than the two named modes is treated as
    # "Specify which Subjects to Evaluate".
    show_specific_panel = (
        not show_number_panel and mode != "Evaluate All Subjects"
    )
    return (
        gr.update(visible=show_number_panel),
        gr.update(visible=show_specific_panel),
    )
362
 
363
+ # Connect subject selection mode to UI updates
364
+ subject_selection_mode.change(
365
+ fn=update_subject_selection_ui,
366
+ inputs=[subject_selection_mode],
367
+ outputs=[num_subjects_container, specific_subjects_container]
368
  )
369
 
370
  # Update interface based on all_questions checkbox
 
380
  outputs=[questions_container, questions_info_text]
381
  )
382
 
383
+ # Function to convert subject selection mode to parameters
384
def get_subject_mode_param(mode):
    """Translate a subject-selection radio label into its short mode key.

    Args:
        mode: The label chosen in the subject selection radio.

    Returns:
        ``"all"`` for "Evaluate All Subjects", ``"number"`` for
        "Choose Number of Subjects", and ``"specific"`` for anything else.
    """
    label_to_key = (
        ("Evaluate All Subjects", "all"),
        ("Choose Number of Subjects", "number"),
    )
    for label, key in label_to_key:
        if mode == label:
            return key
    # Fallback covers "Specify which Subjects to Evaluate".
    return "specific"
391
+
392
+ # Function to extract subject names from checkboxes
393
def get_subject_names(selected_subjects):
    """Strip the trailing count suffix from the selected checkbox labels.

    Labels look like ``"Biology (n=717)"``; everything from the first
    ``" ("`` onward is dropped so only the subject name remains. Labels
    without that suffix are returned unchanged.

    Args:
        selected_subjects: Iterable of checkbox labels, or ``None`` when
            nothing has been selected.

    Returns:
        A list of bare subject names, in the same order as the input.
        An empty list is returned for ``None`` or an empty selection.
    """
    # Guard against a missing selection so the caller never hits a
    # TypeError while iterating over None.
    if not selected_subjects:
        return []
    return [label.split(" (")[0] for label in selected_subjects]
396
+
397
  # Function to disable UI components during evaluation
398
  def start_evaluation(state):
399
  if state["running"]:
 
416
 
417
  return [
418
  state,
419
+ gr.update(interactive=False), # subject_selection_mode
420
  gr.update(interactive=False), # num_subjects_slider
421
+ gr.update(interactive=False), # specific_subjects
422
  gr.update(interactive=False), # num_shots_slider
423
  gr.update(interactive=False), # all_questions_checkbox
424
  gr.update(interactive=False), # num_questions_slider
 
441
  state["running"] = False
442
  return [
443
  state,
444
+ gr.update(interactive=True), # subject_selection_mode
445
  gr.update(interactive=True), # num_subjects_slider
446
+ gr.update(interactive=True), # specific_subjects
447
  gr.update(interactive=True), # num_shots_slider
448
  gr.update(interactive=True), # all_questions_checkbox
449
  gr.update(interactive=True), # num_questions_slider
 
460
  inputs=[evaluation_state],
461
  outputs=[
462
  evaluation_state,
463
+ subject_selection_mode,
464
  num_subjects_slider,
465
+ specific_subjects,
466
  num_shots_slider,
467
  all_questions_checkbox,
468
  num_questions_slider,
 
473
  results_table_container
474
  ]
475
  ).then(
476
+ fn=lambda mode, num, subjects, shots, all_q, num_q:
477
+ run_mmlu_evaluation(
478
+ get_subject_mode_param(mode),
479
+ num,
480
+ get_subject_names(subjects),
481
+ shots,
482
+ all_q,
483
+ num_q
484
+ ),
485
  inputs=[
486
+ subject_selection_mode,
487
  num_subjects_slider,
488
+ specific_subjects,
489
  num_shots_slider,
490
  all_questions_checkbox,
491
  num_questions_slider
 
495
  results_table,
496
  eval_mmlu_button,
497
  cancel_mmlu_button,
498
+ subject_selection_mode,
499
  num_subjects_slider,
500
  num_shots_slider,
501
  all_questions_checkbox,
 
514
  inputs=[evaluation_state],
515
  outputs=[
516
  evaluation_state,
517
+ subject_selection_mode,
518
  num_subjects_slider,
519
+ specific_subjects,
520
  num_shots_slider,
521
  all_questions_checkbox,
522
  num_questions_slider,