rohansampath committed on
Commit 0b21fdd · verified · 1 Parent(s): 7d6f6c7

Update app.py

Files changed (1)
  1. app.py +162 -485
app.py CHANGED
@@ -1,12 +1,19 @@
-import gradio as gr
 import os
+import gradio as gr
 from huggingface_hub import login
-from mmlu_pro_eval_adapted import evaluate_mmlu_pro
-import spaces
-import pandas as pd
-import time
-import traceback
-from dataset_previews import mmlupro_dataset_preview, format_preview_for_display
+
+# Import modules
+from modules.ui_components import CSS, create_header, create_results_section, create_action_buttons
+from modules.dataset_config import (create_dataset_section, create_mmlu_config_section,
+                                    load_dataset_preview, update_interface_based_on_dataset,
+                                    toggle_preview, update_subject_selection_ui,
+                                    update_questions_interface, get_subject_mode_param,
+                                    get_subject_names)
+from modules.app_models_config import (create_model_config_section, update_eval_button_state,
+                                       get_model_configs)
+from modules.evaluation import run_mmlu_evaluation
+from utils.state_management import (start_evaluation, finish_evaluation,
+                                    cancel_evaluation, handle_evaluation_results)
 
 # Read token and login
 hf_token = os.getenv("HF_READ_WRITE_TOKEN")
@@ -16,516 +23,184 @@ else:
     print("⚠️ No HF_READ_WRITE_TOKEN found in environment")
 
 # ---------------------------------------------------------------------------
-# 1. Model configuration
+# Gradio Interface
 # ---------------------------------------------------------------------------
-model_name = "mistralai/Mistral-7B-v0.1"
-
-# ---------------------------------------------------------------------------
-# 2. MMLU-Pro Evaluation
-# ---------------------------------------------------------------------------
-@spaces.GPU(duration=240)
-def run_mmlu_evaluation(subject_selection_mode, num_subjects, selected_subjects, num_shots, all_questions, num_questions, progress=gr.Progress()):
-    """
-    Runs the MMLU evaluation with the specified parameters.
+with gr.Blocks(css=CSS) as demo:
+    # Create header
+    header_components = create_header()
 
-    Args:
-        subject_selection_mode (str): Mode of subject selection ("all", "number", or "specific")
-        num_subjects (int): Number of subjects to evaluate (1-14)
-        selected_subjects (list): List of specific subjects to evaluate
-        num_shots (int): Number of few-shot examples (0-5)
-        all_questions (bool): Whether to evaluate all questions per subject
-        num_questions (int): Number of examples per subject (1-100 or all)
-        progress (gr.Progress): Progress indicator
-    """
-    try:
-        # Convert parameters if needed
-        if subject_selection_mode == "all":
-            num_subjects = -1
-            selected_subjects = []
-        elif subject_selection_mode == "specific":
-            num_subjects = len(selected_subjects) if selected_subjects else -1
-
-        if all_questions:
-            num_questions = -1
-
-        # Run evaluation with timing
-        start_time = time.time()
-        results = evaluate_mmlu_pro(
-            model_name,
-            num_subjects=num_subjects,
-            num_questions=num_questions,
-            num_shots=num_shots,
-            specific_subjects=selected_subjects if subject_selection_mode == "specific" else None
-        )
-        elapsed_time = time.time() - start_time
-
-        # Format results
-        overall_acc = results["overall_accuracy"]
-        min_subject, min_acc = results["min_accuracy_subject"]
-        max_subject, max_acc = results["max_accuracy_subject"]
-
-        # Create DataFrame from results table
-        results_df = pd.DataFrame(results["full_accuracy_table"])
-
-        # Calculate totals for the overall row
-        total_samples = results_df['Num_samples'].sum()
-        total_correct = results_df['Num_correct'].sum()
-
-        # Create overall row
-        overall_row = pd.DataFrame({
-            'Subject': ['**Overall**'],
-            'Num_samples': [total_samples],
-            'Num_correct': [total_correct],
-            'Accuracy': [overall_acc]
-        })
-
-        # Concatenate overall row with results
-        results_df = pd.concat([overall_row, results_df], ignore_index=True)
-
-        # Format the report
-        report = (
-            f"### Overall Results\n"
-            f"* Overall Accuracy: {overall_acc:.3f}\n"
-            f"* Best Performance: {max_subject} ({max_acc:.3f})\n"
-            f"* Worst Performance: {min_subject} ({min_acc:.3f})\n"
-            f"* Evaluation completed in {elapsed_time:.2f} seconds\n"
-        )
-
-        # Return values that re-enable UI components after completion
-        return (report,
-                results_df,
-                gr.update(interactive=True),
-                gr.update(visible=False),
-                gr.update(interactive=True),
-                gr.update(interactive=True),
-                gr.update(interactive=True),
-                gr.update(interactive=True),
-                gr.update(interactive=True),
-                gr.update(visible=True))
-
-    except Exception as e:
-        # Handle errors gracefully
-        error_trace = traceback.format_exc()
-        error_message = f"### Error during evaluation\n```\n{error_trace}\n```"
-
-        # Re-enable UI components on error
-        return (error_message,
-                None,
-                gr.update(interactive=True),
-                gr.update(visible=False),
-                gr.update(interactive=True),
-                gr.update(interactive=True),
-                gr.update(interactive=True),
-                gr.update(interactive=True),
-                gr.update(interactive=True),
-                gr.update(visible=False))
-
-# ---------------------------------------------------------------------------
-# 3. Gradio Interface
-# ---------------------------------------------------------------------------
-with gr.Blocks(css="""
-    #preview_header {
-        margin-bottom: 10px;
-        margin-top: 5px;
-    }
-    #preview_table {
-        background-color: #f8f9fa;
-        border-radius: 8px;
-        padding: 10px;
-    }
-    h1 {
-        text-align: center;
-    }
-    .section-divider {
-        border-top: 1px solid #ddd;
-        margin: 12px 0;
-    }
-    .config-box {
-        border: 1px solid #ddd;
-        border-radius: 8px;
-        padding: 15px;
-        margin: 10px;
-        background-color: #f9f9f9;
-    }
-""") as demo:
-    gr.Markdown("# Head-to-Head Model Evaluation Comparator")
-    gr.Markdown("""
-    This demo evaluates two models (or one model with two different configs), head-to-head, on a benchmark dataset.
+    # Create dataset section (Section A)
+    dataset_components = create_dataset_section()
 
-    Available Datasets: [MMLU-Pro](https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro)
+    # Create MMLU config section (Section B)
+    mmlu_components = create_mmlu_config_section()
 
-    Available Models: [Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
-    """)
+    # Create model config section (Section C)
+    model_components = create_model_config_section()
 
-    # Dataset Selection Section
-    gr.Markdown("## (A) Select Dataset for Evaluation")
+    # Create results section
+    results_components = create_results_section()
 
-    with gr.Row():
-        dataset_dropdown = gr.Dropdown(
-            choices=["(Select Dataset)", "MMLU-Pro"],
-            value="(Select Dataset)",
-            label="Dataset",
-            info="Select a dataset to perform the Head-to-Head Evaluation on. Available Datasets: [MMLU-Pro](https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro)"
-        )
-        preview_toggle = gr.Button("Show Dataset Preview", interactive=False, variant="secondary")
+    # Create action buttons
+    action_components = create_action_buttons()
 
-    # Dataset Preview Container - Initially hidden
-    with gr.Column(visible=False) as dataset_preview_container:
-        gr.Markdown("## Dataset Preview", elem_id="preview_header")
-        preview_output = gr.DataFrame(
-            interactive=False,
-            wrap=True,
-            elem_id="preview_table"
-        )
-        # Add a divider instead of lots of space
-        gr.Markdown("<div class='section-divider'></div>")
-
-    # Preview data state to store the loaded preview data
-    preview_data_state = gr.State(None)
-
-    # MMLU Config Container - Initially hidden until dataset is selected
-    with gr.Column(visible=False) as mmlu_config_container:
-        gr.Markdown("## (B) Select Dataset Configuration Options")
-
-        with gr.Row():
-            # Left column for subject selection
-            with gr.Column(scale=1):
-                with gr.Group(elem_classes=["config-box"]):
-                    gr.Markdown("### Choose Subjects")
-
-                    subject_selection_mode = gr.Radio(
-                        choices=["Evaluate All Subjects", "Choose Number of Subjects", "Specify which Subjects to Evaluate"],
-                        value="Evaluate All Subjects",
-                        label="Subject Selection Mode"
-                    )
-
-                    # Subject number slider - initially hidden, shown when "Choose Number of Subjects" is selected
-                    with gr.Column(visible=False) as num_subjects_container:
-                        num_subjects_slider = gr.Slider(
-                            minimum=1,
-                            maximum=14, # Will be updated dynamically based on preview data
-                            value=1,
-                            step=1,
-                            label="Number of Subjects",
-                            info="Number of subjects to evaluate. They will be loaded in alphabetical order."
-                        )
-
-                    # Subject checkboxes - initially hidden, shown when "Specify which Subjects to Evaluate" is selected
-                    with gr.Column(visible=False) as specific_subjects_container:
-                        # Will be populated dynamically from the preview data
-                        specific_subjects = gr.CheckboxGroup(
-                            choices=[], # Will be populated from preview data
-                            label="Select Specific Subjects",
-                            info="Select which specific subjects to evaluate"
-                        )
-
-            # Right column for few-shot examples
-            with gr.Column(scale=1):
-                with gr.Group(elem_classes=["config-box"]):
-                    gr.Markdown("### Few-shot Configuration")
-
-                    num_shots_slider = gr.Slider(
-                        minimum=0,
-                        maximum=5,
-                        value=5,
-                        step=1,
-                        label="Number of Few-shot Examples",
-                        info="Number of examples to use for few-shot learning (0-5). They will be loaded in order of question_id."
-                    )
-
-    # Add a small space
-    gr.Markdown("&nbsp;")
-
-    with gr.Row():
-        all_questions_checkbox = gr.Checkbox(
-            label="Evaluate All Questions",
-            value=False,
-            info="When checked, evaluates all available questions for each subject"
-        )
-        questions_info_text = gr.Markdown(visible=False, value="**All questions across the selected subjects will be evaluated**")
-
-    with gr.Row(elem_id="questions_selection_row"):
-        questions_container = gr.Column(scale=1, elem_id="questions_slider_container")
-
-        with questions_container:
-            num_questions_slider = gr.Slider(
-                minimum=1,
-                maximum=100,
-                value=20,
-                step=1,
-                label="Questions per Subject",
-                info="Choose a subset of questions (1-100) per subject. They will be loaded in order of question_id.",
-                interactive=True
-            )
-
-    with gr.Row():
-        with gr.Column(scale=1):
-            eval_mmlu_button = gr.Button("Run MMLU-Pro Evaluation", variant="primary", interactive=True)
-            cancel_mmlu_button = gr.Button("Cancel Evaluation", variant="stop", visible=False)
-
-    # Results Section - Initially hidden
-    with gr.Column(visible=False) as results_container:
-        results_output = gr.Markdown(label="Evaluation Results")
-
-    # Results table - Initially hidden until evaluation completes
-    with gr.Column(visible=False) as results_table_container:
-        with gr.Row():
-            results_table = gr.DataFrame(
-                interactive=True,
-                label="Detailed Results (Sortable)",
-                visible=True
-            )
-
-    # Track evaluation state
+    # State tracking
     evaluation_state = gr.State({"running": False})
-
-    # Track preview visibility state
     preview_visibility = gr.State(False)
 
-    # Function to process dataset preview data
-    def get_subject_choices_from_preview(preview_data):
-        if not preview_data or 'subject_counts' not in preview_data:
-            print("FAILURE HERE")
-            return [], 0
-
-        # Get subject counts from preview data
-        subject_counts = preview_data['subject_counts']
-
-        # Sort subjects alphabetically
-        subjects = sorted(subject_counts.keys())
-
-        # Format as "Subject (n=count)"
-        formatted_subjects = [f"{subject} (n={subject_counts[subject]})" for subject in subjects]
-        print("Formatted Subjects", formatted_subjects)
-
-
-        return formatted_subjects, len(subjects)
-
-    # Function to load preview data and update UI
-    def load_dataset_preview(dataset):
-        if dataset == "MMLU-Pro":
-            # Load the preview data
-            preview_data = mmlupro_dataset_preview(regenerate_preview=True)
-
-            # Extract subject choices and count
-            subject_choices, subject_count = get_subject_choices_from_preview(preview_data)
-
-            # Update the UI components
-            return (
-                preview_data, # Store the preview data
-                gr.update(choices=subject_choices), # Update checkbox choices
-                gr.update(maximum=subject_count, value=1) # Update slider max
-            )
-        return None, gr.update(), gr.update()
-
-    # Function to show/hide configuration based on selected dataset
-    def update_interface_based_on_dataset(dataset, current_visibility):
-        if dataset == "MMLU-Pro":
-            return (
-                gr.update(visible=True), # mmlu_config_container
-                gr.update(visible=True), # results_container
-                gr.update(interactive=True), # preview_toggle
-                gr.update(visible=False), # dataset_preview_container - hide it initially
-                False, # Reset preview_visibility to False
-                gr.update(value="Show Dataset Preview") # Reset button text
-            )
-        else:
-            return (
-                gr.update(visible=False), # mmlu_config_container
-                gr.update(visible=False), # results_container
-                gr.update(interactive=False), # preview_toggle
-                gr.update(visible=False), # dataset_preview_container - hide when no dataset
-                False, # Reset preview_visibility to False
-                gr.update(value="Show Dataset Preview") # Reset button text
-            )
-
     # Connect dataset dropdown to show/hide appropriate configuration and load preview data
-    dataset_dropdown.change(
+    dataset_components['dropdown'].change(
         fn=load_dataset_preview,
-        inputs=[dataset_dropdown],
-        outputs=[preview_data_state, specific_subjects, num_subjects_slider],
+        inputs=[dataset_components['dropdown']],
+        outputs=[
+            dataset_components['preview_data_state'],
+            mmlu_components['specific_subjects'],
+            mmlu_components['num_subjects_slider']
+        ],
     ).then(
         fn=update_interface_based_on_dataset,
-        inputs=[dataset_dropdown, preview_visibility],
-        outputs=[mmlu_config_container, results_container, preview_toggle, dataset_preview_container, preview_visibility, preview_toggle]
+        inputs=[
+            dataset_components['dropdown'],
+            preview_visibility
+        ],
+        outputs=[
+            mmlu_components['container'],
+            model_components['container'],
+            results_components['container'],
+            dataset_components['preview_toggle'],
+            dataset_components['preview_container'],
+            preview_visibility,
+            dataset_components['preview_toggle']
+        ]
     )
 
-    # Function to toggle dataset preview visibility
-    def toggle_preview(dataset, preview_visibility, preview_data):
-        # Toggle the visibility state
-        is_visible = not preview_visibility
-
-        # Update button text based on new state
-        button_text = "Hide Dataset Preview" if is_visible else "Show Dataset Preview"
-
-        # Format and show preview if becoming visible
-        if is_visible and dataset == "MMLU-Pro":
-            formatted_preview = format_preview_for_display(preview_data)
-            return is_visible, gr.update(visible=True), formatted_preview, gr.update(value=button_text)
-        elif is_visible:
-            # For other datasets (not implemented yet)
-            return is_visible, gr.update(visible=True), None, gr.update(value=button_text)
-        else:
-            # Hiding the preview
-            return is_visible, gr.update(visible=False), None, gr.update(value=button_text)
-
     # Connect preview toggle to show/hide dataset information
-    preview_toggle.click(
+    dataset_components['preview_toggle'].click(
         fn=toggle_preview,
-        inputs=[dataset_dropdown, preview_visibility, preview_data_state],
-        outputs=[preview_visibility, dataset_preview_container, preview_output, preview_toggle]
+        inputs=[
+            dataset_components['dropdown'],
+            preview_visibility,
+            dataset_components['preview_data_state']
+        ],
+        outputs=[
+            preview_visibility,
+            dataset_components['preview_container'],
+            dataset_components['preview_output'],
+            dataset_components['preview_toggle']
+        ]
    )
 
-    # Function to update UI based on subject selection mode
-    def update_subject_selection_ui(mode):
-        if mode == "Evaluate All Subjects":
-            return gr.update(visible=False), gr.update(visible=False)
-        elif mode == "Choose Number of Subjects":
-            return gr.update(visible=True), gr.update(visible=False)
-        else: # "Specify which Subjects to Evaluate"
-            return gr.update(visible=False), gr.update(visible=True)
-
     # Connect subject selection mode to UI updates
-    subject_selection_mode.change(
+    mmlu_components['subject_selection_mode'].change(
         fn=update_subject_selection_ui,
-        inputs=[subject_selection_mode],
-        outputs=[num_subjects_container, specific_subjects_container]
+        inputs=[
+            mmlu_components['subject_selection_mode']
+        ],
+        outputs=[
+            mmlu_components['num_subjects_container'],
+            mmlu_components['specific_subjects_container']
+        ]
    )
 
     # Update interface based on all_questions checkbox
-    def update_questions_interface(checked):
-        if checked:
-            return gr.update(visible=False), gr.update(visible=True)
-        else:
-            return gr.update(visible=True), gr.update(visible=False)
-
-    all_questions_checkbox.change(
+    mmlu_components['all_questions_checkbox'].change(
         fn=update_questions_interface,
-        inputs=[all_questions_checkbox],
-        outputs=[questions_container, questions_info_text]
+        inputs=[
+            mmlu_components['all_questions_checkbox']
+        ],
+        outputs=[
+            mmlu_components['num_questions_slider'],
+            mmlu_components['questions_info_text']
+        ]
    )
 
-    # Function to convert subject selection mode to parameters
-    def get_subject_mode_param(mode):
-        if mode == "Evaluate All Subjects":
-            return "all"
-        elif mode == "Choose Number of Subjects":
-            return "number"
-        else: # "Specify which Subjects to Evaluate"
-            return "specific"
-
-    # Function to extract subject names from checkboxes
-    def get_subject_names(selected_subjects):
-        # Extract just the subject name without the count
-        return [subject.split(" (")[0] for subject in selected_subjects]
-
-    # Function to disable UI components during evaluation
-    def start_evaluation(state):
-        if state["running"]:
-            return [
-                state,
-                gr.update(interactive=False),
-                gr.update(interactive=False),
-                gr.update(interactive=False),
-                gr.update(interactive=False),
-                gr.update(interactive=False),
-                gr.update(interactive=False),
-                gr.update(visible=True),
-                "Evaluation already in progress. Please wait.",
-                None,
-                gr.update(visible=False)
+    # Connect model config changes to validation
+    for component in [
+        model_components['model1_dropdown'],
+        model_components['model1_shots'],
+        model_components['model1_regex'],
+        model_components['model1_flash_attn'],
+        model_components['model2_dropdown'],
+        model_components['model2_shots'],
+        model_components['model2_regex'],
+        model_components['model2_flash_attn']
+    ]:
+        component.change(
+            fn=update_eval_button_state,
+            inputs=[
+                model_components['model1_dropdown'],
+                model_components['model1_shots'],
+                model_components['model1_regex'],
+                model_components['model1_flash_attn'],
+                model_components['model2_dropdown'],
+                model_components['model2_shots'],
+                model_components['model2_regex'],
+                model_components['model2_flash_attn']
+            ],
+            outputs=[
+                model_components['error_message'],
+                action_components['eval_button']
             ]
-
-        # Update state to running
-        state["running"] = True
-
-        return [
-            state,
-            gr.update(interactive=False), # subject_selection_mode
-            gr.update(interactive=False), # num_subjects_slider
-            gr.update(interactive=False), # specific_subjects
-            gr.update(interactive=False), # num_shots_slider
-            gr.update(interactive=False), # all_questions_checkbox
-            gr.update(interactive=False), # num_questions_slider
-            gr.update(interactive=False), # eval_mmlu_button
-            gr.update(visible=True), # cancel_mmlu_button
-            "Starting evaluation...", # results_output
-            None, # results_table
-            gr.update(visible=False) # results_table_container
-        ]
-
-    # Function to reset UI after evaluation
-    def finish_evaluation(state):
-        state["running"] = False
-        return state
-
-    # Function to handle cancel button click
-    def cancel_evaluation(state):
-        # Note: This doesn't actually stop the evaluation process
-        # It only updates the UI state to appear canceled
-        state["running"] = False
-        return [
-            state,
-            gr.update(interactive=True), # subject_selection_mode
-            gr.update(interactive=True), # num_subjects_slider
-            gr.update(interactive=True), # specific_subjects
-            gr.update(interactive=True), # num_shots_slider
-            gr.update(interactive=True), # all_questions_checkbox
-            gr.update(interactive=True), # num_questions_slider
-            gr.update(interactive=True), # eval_mmlu_button
-            gr.update(visible=False), # cancel_mmlu_button
-            "⚠️ Evaluation canceled by user (note: backend process may continue running)", # results_output
-            None, # results_table
-            gr.update(visible=False) # results_table_container
-        ]
+        )
 
-    # Connect MMLU evaluation button with state tracking
-    eval_mmlu_button.click(
+    # Connect evaluation button with state tracking
+    action_components['eval_button'].click(
         fn=start_evaluation,
         inputs=[evaluation_state],
         outputs=[
             evaluation_state,
-            subject_selection_mode,
-            num_subjects_slider,
-            specific_subjects,
-            num_shots_slider,
-            all_questions_checkbox,
-            num_questions_slider,
-            eval_mmlu_button,
-            cancel_mmlu_button,
-            results_output,
-            results_table,
-            results_table_container
+            mmlu_components['subject_selection_mode'],
+            mmlu_components['num_subjects_slider'],
+            mmlu_components['specific_subjects'],
+            mmlu_components['all_questions_checkbox'],
+            mmlu_components['num_questions_slider'],
+            model_components['model1_dropdown'],
+            model_components['model2_dropdown'],
+            action_components['eval_button'],
+            action_components['cancel_button'],
+            results_components['output'],
+            results_components['table'],
+            results_components['table_container']
         ]
     ).then(
-        fn=lambda mode, num, subjects, shots, all_q, num_q:
+        fn=lambda mode, num, subjects, all_q, num_q, m1, m1_shots, m1_regex, m1_flash, m2, m2_shots, m2_regex, m2_flash:
             run_mmlu_evaluation(
                 get_subject_mode_param(mode),
                 num,
                 get_subject_names(subjects),
-                shots,
                 all_q,
-                num_q
+                num_q,
+                get_model_configs(
+                    m1, m1_shots, m1_regex, m1_flash,
+                    m2, m2_shots, m2_regex, m2_flash
+                )
             ),
         inputs=[
-            subject_selection_mode,
-            num_subjects_slider,
-            specific_subjects,
-            num_shots_slider,
-            all_questions_checkbox,
-            num_questions_slider
+            mmlu_components['subject_selection_mode'],
+            mmlu_components['num_subjects_slider'],
+            mmlu_components['specific_subjects'],
+            mmlu_components['all_questions_checkbox'],
+            mmlu_components['num_questions_slider'],
+            model_components['model1_dropdown'],
+            model_components['model1_shots'],
+            model_components['model1_regex'],
+            model_components['model1_flash_attn'],
+            model_components['model2_dropdown'],
+            model_components['model2_shots'],
+            model_components['model2_regex'],
+            model_components['model2_flash_attn']
        ],
         outputs=[
-            results_output,
-            results_table,
-            eval_mmlu_button,
-            cancel_mmlu_button,
-            subject_selection_mode,
-            num_subjects_slider,
-            num_shots_slider,
-            all_questions_checkbox,
-            num_questions_slider,
-            results_table_container
+            results_components['output'],
+            results_components['table'],
+            action_components['eval_button'],
+            action_components['cancel_button'],
+            mmlu_components['subject_selection_mode'],
+            mmlu_components['num_subjects_slider'],
+            mmlu_components['all_questions_checkbox'],
+            mmlu_components['num_questions_slider'],
+            model_components['model1_dropdown'],
+            results_components['table_container']
        ]
     ).then(
         fn=finish_evaluation,
@@ -534,23 +209,25 @@ with gr.Blocks(css="""
     )
 
     # Connect cancel button
-    cancel_mmlu_button.click(
+    action_components['cancel_button'].click(
         fn=cancel_evaluation,
         inputs=[evaluation_state],
         outputs=[
             evaluation_state,
-            subject_selection_mode,
-            num_subjects_slider,
-            specific_subjects,
-            num_shots_slider,
-            all_questions_checkbox,
-            num_questions_slider,
-            eval_mmlu_button,
-            cancel_mmlu_button,
-            results_output,
-            results_table,
-            results_table_container
+            mmlu_components['subject_selection_mode'],
+            mmlu_components['num_subjects_slider'],
+            mmlu_components['specific_subjects'],
+            mmlu_components['all_questions_checkbox'],
+            mmlu_components['num_questions_slider'],
+            model_components['model1_dropdown'],
+            model_components['model2_dropdown'],
+            action_components['eval_button'],
+            action_components['cancel_button'],
+            results_components['output'],
+            results_components['table'],
+            results_components['table_container']
        ]
     )
 
-demo.launch()
+if __name__ == "__main__":
+    demo.launch()
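The refactored app.py keeps only the event wiring; all layout construction moved into factory functions that return plain dicts of Gradio components (create_header(), create_dataset_section(), create_action_buttons(), and so on). Those modules are not included in this commit's diff, so the sketch below is only a guess at the pattern, inferred from the dictionary keys app.py indexes (action_components['eval_button'], results_components['table_container'], ...) and from the widgets the old inline version built. Every name and widget setting in it is an assumption, not the actual module code.

```python
# Hypothetical sketch of modules/ui_components.py. This file is NOT part of
# the commit above; keys and widget arguments are inferred from how the new
# app.py consumes the returned dictionaries. The real module may differ.
import gradio as gr

# CSS constant that app.py passes to gr.Blocks(css=CSS); contents assumed.
CSS = """
h1 { text-align: center; }
.config-box { border: 1px solid #ddd; border-radius: 8px; padding: 15px; }
"""

def create_header():
    """Page title block; returning a dict keeps all sections uniform."""
    title = gr.Markdown("# Head-to-Head Model Evaluation Comparator")
    return {"title": title}

def create_action_buttons():
    """Run/cancel buttons that app.py wires to the start/cancel handlers."""
    with gr.Row():
        eval_button = gr.Button("Run MMLU-Pro Evaluation", variant="primary")
        cancel_button = gr.Button("Cancel Evaluation", variant="stop", visible=False)
    # app.py indexes exactly these keys: action_components['eval_button'], ...
    return {"eval_button": eval_button, "cancel_button": cancel_button}

def create_results_section():
    """Initially hidden results area, revealed once a dataset is selected."""
    with gr.Column(visible=False) as container:
        output = gr.Markdown()
        with gr.Column(visible=False) as table_container:
            table = gr.DataFrame(interactive=True, label="Detailed Results (Sortable)")
    return {
        "container": container,
        "output": output,
        "table": table,
        "table_container": table_container,
    }
```

The same caveat applies to utils/state_management.py: only its call sites are visible here. What the wiring does pin down is the contract: start_evaluation receives the evaluation_state dict and must return one value per entry in the 13-item outputs list of action_components['eval_button'].click(), in order (the state first, then updates for the eight config/model widgets, then the cancel button, results text, results table, and table container). A minimal sketch under that assumption:

```python
# Hypothetical sketch of utils/state_management.py -- also not in this diff.
# The return order mirrors the 13-entry outputs list that app.py passes to
# action_components['eval_button'].click(...) above.
import gradio as gr

def start_evaluation(state):
    """Mark the run as started and lock the configuration widgets."""
    state["running"] = True
    # 8 widgets: 5 MMLU config controls, 2 model dropdowns, the eval button
    locked = [gr.update(interactive=False)] * 8
    return [state, *locked,
            gr.update(visible=True),     # show the cancel button
            "Starting evaluation...",    # placeholder results text
            None,                        # clear the results table
            gr.update(visible=False)]    # hide the table container

def finish_evaluation(state):
    """Clear the running flag after the evaluation chain completes."""
    state["running"] = False
    return state
```

Returning plain dicts keeps app.py free of layout code: the .click()/.change() plumbing only needs stable keys, so each section can be rearranged inside its module without touching the wiring.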