import gradio as gr
import os
from huggingface_hub import login
from mmlu_pro_eval_adapted import evaluate_mmlu_pro
import spaces
import pandas as pd
import time
import traceback
from dataset_previews import mmlupro_dataset_preview, format_preview_for_display

# Read token and login
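# On Spaces the token is typically provided as a repository secret; locally, export
# HF_READ_WRITE_TOKEN in your shell. Without it, gated or private models cannot be fetched.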
hf_token = os.getenv("HF_READ_WRITE_TOKEN")
if hf_token:
    login(hf_token)
else:
    print("⚠️ No HF_READ_WRITE_TOKEN found in environment")

# ---------------------------------------------------------------------------
# 1. Model configuration
# ---------------------------------------------------------------------------
model_name = "mistralai/Mistral-7B-v0.1"

# ---------------------------------------------------------------------------
# 2. MMLU-Pro Evaluation
# ---------------------------------------------------------------------------
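# @spaces.GPU is the ZeroGPU decorator: on a ZeroGPU Space it attaches a GPU to each
# call for at most `duration` seconds, and it is a no-op on other hardware.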
@spaces.GPU(duration=240)
def run_mmlu_evaluation(subject_selection_mode, num_subjects, selected_subjects, num_shots, all_questions, num_questions, progress=gr.Progress()):
    """
    Runs the MMLU evaluation with the specified parameters.
    
    Args:
        subject_selection_mode (str): Mode of subject selection ("all", "number", or "specific")
        num_subjects (int): Number of subjects to evaluate (1-14)
        selected_subjects (list): List of specific subjects to evaluate
        num_shots (int): Number of few-shot examples (0-5)
        all_questions (bool): Whether to evaluate all questions per subject
        num_questions (int): Number of examples per subject (1-100 or all)
        progress (gr.Progress): Progress indicator
    """
    try:
        # Convert parameters if needed
        if subject_selection_mode == "all":
            num_subjects = -1
            selected_subjects = []
        elif subject_selection_mode == "specific":
            num_subjects = len(selected_subjects) if selected_subjects else -1
            
        if all_questions:
            num_questions = -1

        # Run evaluation with timing
        start_time = time.time()
        results = evaluate_mmlu_pro(
            model_name,
            num_subjects=num_subjects,
            num_questions=num_questions,
            num_shots=num_shots,
            specific_subjects=selected_subjects if subject_selection_mode == "specific" else None
        )
        elapsed_time = time.time() - start_time

        # Format results
        overall_acc = results["overall_accuracy"]
        min_subject, min_acc = results["min_accuracy_subject"]
        max_subject, max_acc = results["max_accuracy_subject"]
        
        # Create DataFrame from results table
        results_df = pd.DataFrame(results["full_accuracy_table"])
        
        # Calculate totals for the overall row
        total_samples = results_df['Num_samples'].sum()
        total_correct = results_df['Num_correct'].sum()
        
        # Create overall row
        overall_row = pd.DataFrame({
            'Subject': ['**Overall**'],
            'Num_samples': [total_samples],
            'Num_correct': [total_correct],
            'Accuracy': [overall_acc]
        })
        
        # Concatenate overall row with results
        results_df = pd.concat([overall_row, results_df], ignore_index=True)
        
        # Format the report
        report = (
            f"### Overall Results\n"
            f"* Overall Accuracy: {overall_acc:.3f}\n"
            f"* Best Performance: {max_subject} ({max_acc:.3f})\n"
            f"* Worst Performance: {min_subject} ({min_acc:.3f})\n"
            f"* Evaluation completed in {elapsed_time:.2f} seconds\n"
        )

        # Return values that re-enable the UI components after completion
        return (report,                        # results_output
                results_df,                    # results_table
                gr.update(interactive=True),   # eval_mmlu_button
                gr.update(visible=False),      # cancel_mmlu_button
                gr.update(interactive=True),   # subject_selection_mode
                gr.update(interactive=True),   # num_subjects_slider
                gr.update(interactive=True),   # specific_subjects
                gr.update(interactive=True),   # num_shots_slider
                gr.update(interactive=True),   # all_questions_checkbox
                gr.update(interactive=True),   # num_questions_slider
                gr.update(visible=True))       # results_table_container
                
    except Exception as e:
        # Handle errors gracefully
        error_trace = traceback.format_exc()
        error_message = f"### Error during evaluation\n```\n{error_trace}\n```"
        
        # Re-enable UI components on error
        return (error_message,                 # results_output
                None,                          # results_table
                gr.update(interactive=True),   # eval_mmlu_button
                gr.update(visible=False),      # cancel_mmlu_button
                gr.update(interactive=True),   # subject_selection_mode
                gr.update(interactive=True),   # num_subjects_slider
                gr.update(interactive=True),   # specific_subjects
                gr.update(interactive=True),   # num_shots_slider
                gr.update(interactive=True),   # all_questions_checkbox
                gr.update(interactive=True),   # num_questions_slider
                gr.update(visible=False))      # results_table_container
        
# ---------------------------------------------------------------------------
# 3. Gradio Interface
# ---------------------------------------------------------------------------
with gr.Blocks(css="""
    #preview_header {
        margin-bottom: 10px;
        margin-top: 5px;
    }
    #preview_table {
        background-color: #f8f9fa;
        border-radius: 8px;
        padding: 10px;
    }
    h1 {
        text-align: center;
    }
    .section-spacing {
        margin-top: 30px;
        margin-bottom: 30px;
    }
    .config-box {
        border: 1px solid #ddd;
        border-radius: 8px;
        padding: 15px;
        margin: 10px;
        background-color: #f9f9f9;
    }
""") as demo:
    gr.Markdown("# Head-to-Head Model Evaluation Comparator")
    gr.Markdown("""
    This demo evaluates two models (or one model with two different configs), head-to-head, on a benchmark dataset.
    
    Available Datasets: [MMLU-Pro](https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro)
    
    Available Models: [Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
    """)
    
    # Dataset Selection Section
    gr.Markdown("## (A) Select Dataset for Evaluation", elem_classes=["section-spacing"])
    
    with gr.Row():
        dataset_dropdown = gr.Dropdown(
            choices=["(Select Dataset)", "MMLU-Pro"],
            value="(Select Dataset)",
            label="Dataset",
            info="Select a dataset to perform the Head-to-Head Evaluation on. Available Datasets: [MMLU-Pro](https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro)"
        )
        preview_toggle = gr.Button("Show Dataset Preview", interactive=False, variant="secondary")
    
    # Dataset Preview Container - Initially hidden
    with gr.Column(visible=False) as dataset_preview_container:
        gr.Markdown("## Dataset Preview", elem_id="preview_header")
        preview_output = gr.DataFrame(
            interactive=False,
            wrap=True,
            elem_id="preview_table"
        )
        # Add vertical space after the preview
        gr.Markdown(" ")
        gr.Markdown(" ")
    
    # Add more spacing between sections
    gr.Markdown(" ", elem_classes=["section-spacing"])
    gr.Markdown(" ", elem_classes=["section-spacing"])
    
    # MMLU Config Container - Initially hidden until dataset is selected
    with gr.Column(visible=False) as mmlu_config_container:
        gr.Markdown("## (B) Select Dataset Configuration Options", elem_classes=["section-spacing"])
        
        # Add more spacing
        gr.Markdown(" ")
        
        with gr.Row():
            # Left column for subject selection
            with gr.Column(scale=1):
                with gr.Group(elem_classes=["config-box"]):  # gr.Group: gr.Box was removed in Gradio 4
                    gr.Markdown("### Choose Subjects")
                    
                    subject_selection_mode = gr.Radio(
                        choices=["Evaluate All Subjects", "Choose Number of Subjects", "Specify which Subjects to Evaluate"],
                        value="Evaluate All Subjects",
                        label="Subject Selection Mode"
                    )
                    
                    # Subject number slider - initially hidden, shown when "Choose Number of Subjects" is selected
                    with gr.Column(visible=False) as num_subjects_container:
                        num_subjects_slider = gr.Slider(
                            minimum=1,
                            maximum=14,
                            value=14,
                            step=1,
                            label="Number of Subjects",
                            info="Number of subjects to evaluate (1-14). They will be loaded in alphabetical order."
                        )
                    
                    # Subject checkboxes - initially hidden, shown when "Specify which Subjects to Evaluate" is selected
                    with gr.Column(visible=False) as specific_subjects_container:
                        # We'll populate this with checkboxes for each subject
                        # The actual subjects will come from the dataset preview
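                        # NOTE: the choices below are placeholders; MMLU-Pro has 14
                        # subjects, so the real names and counts should come from the dataset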
                        specific_subjects = gr.CheckboxGroup(
                            choices=[
                                "Biology (n=717)",
                                "Chemistry (n=500)",
                                "Physics (n=650)",
                                "Mathematics (n=800)",
                                "Computer Science (n=450)",
                                "History (n=300)",
                                "Literature (n=250)"
                            ],
                            label="Select Specific Subjects",
                            info="Select which specific subjects to evaluate"
                        )
            
            # Right column for few-shot examples
            with gr.Column(scale=1):
                with gr.Group(elem_classes=["config-box"]):  # gr.Group: gr.Box was removed in Gradio 4
                    gr.Markdown("### Few-shot Configuration")
                    
                    num_shots_slider = gr.Slider(
                        minimum=0,
                        maximum=5,
                        value=5,
                        step=1,
                        label="Number of Few-shot Examples",
                        info="Number of examples to use for few-shot learning (0-5)."
                    )
        
        # Add spacing
        gr.Markdown(" ")
        
        with gr.Row():
            all_questions_checkbox = gr.Checkbox(
                label="Evaluate All Questions",
                value=False,
                info="When checked, evaluates all available questions for each subject"
            )
            questions_info_text = gr.Markdown(visible=False, value="**All 12,032 questions across all subjects will be evaluated**")
        
        with gr.Row(elem_id="questions_selection_row"):
            questions_container = gr.Column(scale=1, elem_id="questions_slider_container")
        
        with questions_container:
            num_questions_slider = gr.Slider(
                minimum=1,
                maximum=100,
                value=20,
                step=1,
                label="Questions per Subject",
                info="Choose a subset of questions (1-100) per subject. They will be loaded in order of question_id.",
                interactive=True
            )
        
        with gr.Row():
            with gr.Column(scale=1):
                eval_mmlu_button = gr.Button("Run MMLU-Pro Evaluation", variant="primary", interactive=True)
                cancel_mmlu_button = gr.Button("Cancel Evaluation", variant="stop", visible=False)
    
    # Results Section - Initially hidden
    with gr.Column(visible=False) as results_container:
        results_output = gr.Markdown(label="Evaluation Results")
        
        # Results table - Initially hidden until evaluation completes
        with gr.Column(visible=False) as results_table_container:
            with gr.Row():
                results_table = gr.DataFrame(
                    interactive=True, 
                    label="Detailed Results (Sortable)",
                    visible=True
                )
    
    # Track evaluation state 
    evaluation_state = gr.State({"running": False})
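    # gr.State is scoped to the browser session, so this flag only guards against
    # duplicate launches from the same tab, not across concurrent users.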
    
    # Track preview visibility state
    preview_visibility = gr.State(False)
    
    # Function to show/hide configuration based on selected dataset
    def update_interface_based_on_dataset(dataset, current_visibility):
        if dataset == "MMLU-Pro":
            return (
                gr.update(visible=True),   # mmlu_config_container
                gr.update(visible=True),   # results_container
                gr.update(interactive=True, value="Show Dataset Preview"),  # preview_toggle: enable and reset label
                gr.update(visible=False),  # dataset_preview_container - hide it initially
                False                      # Reset preview_visibility to False
            )
        else:
            return (
                gr.update(visible=False),  # mmlu_config_container
                gr.update(visible=False),  # results_container
                gr.update(interactive=False, value="Show Dataset Preview"),  # preview_toggle: disable and reset label
                gr.update(visible=False),  # dataset_preview_container - hide when no dataset
                False                      # Reset preview_visibility to False
            )
    
    # Connect dataset dropdown to show/hide appropriate configuration
    dataset_dropdown.change(
        fn=update_interface_based_on_dataset,
        inputs=[dataset_dropdown, preview_visibility],
        outputs=[mmlu_config_container, results_container, preview_toggle, dataset_preview_container, preview_visibility]
    )
    
    # Function to toggle dataset preview visibility
    def toggle_preview(dataset, preview_visibility):
        # Toggle the visibility state
        is_visible = not preview_visibility
        
        # Update button text based on new state
        button_text = "Hide Dataset Preview" if is_visible else "Show Dataset Preview"
        
        # Get preview data if becoming visible
        if is_visible and dataset == "MMLU-Pro":
            preview_data = mmlupro_dataset_preview(regenerate_preview=False)  # Set regenerate_preview=True to rebuild the cached preview
            formatted_preview = format_preview_for_display(preview_data)
            return is_visible, gr.update(visible=True), formatted_preview, gr.update(value=button_text)
        elif is_visible:
            # For other datasets (not implemented yet)
            return is_visible, gr.update(visible=True), None, gr.update(value=button_text)
        else:
            # Hiding the preview
            return is_visible, gr.update(visible=False), None, gr.update(value=button_text)
    
    # Connect preview toggle to show/hide dataset information
    preview_toggle.click(
        fn=toggle_preview,
        inputs=[dataset_dropdown, preview_visibility],
        outputs=[preview_visibility, dataset_preview_container, preview_output, preview_toggle]
    )
    
    # Function to update UI based on subject selection mode
    def update_subject_selection_ui(mode):
        if mode == "Evaluate All Subjects":
            return gr.update(visible=False), gr.update(visible=False)
        elif mode == "Choose Number of Subjects":
            return gr.update(visible=True), gr.update(visible=False)
        else:  # "Specify which Subjects to Evaluate"
            return gr.update(visible=False), gr.update(visible=True)
    
    # Connect subject selection mode to UI updates
    subject_selection_mode.change(
        fn=update_subject_selection_ui,
        inputs=[subject_selection_mode],
        outputs=[num_subjects_container, specific_subjects_container]
    )
    
    # Update interface based on all_questions checkbox
    def update_questions_interface(checked):
        if checked:
            return gr.update(visible=False), gr.update(visible=True)
        else:
            return gr.update(visible=True), gr.update(visible=False)
    
    all_questions_checkbox.change(
        fn=update_questions_interface,
        inputs=[all_questions_checkbox],
        outputs=[questions_container, questions_info_text]
    )
    
    # Function to convert subject selection mode to parameters
    def get_subject_mode_param(mode):
        if mode == "Evaluate All Subjects":
            return "all"
        elif mode == "Choose Number of Subjects":
            return "number"
        else:  # "Specify which Subjects to Evaluate"
            return "specific"
    
    # Function to extract subject names from checkboxes
    def get_subject_names(selected_subjects):
        # Extract just the subject name without the count
        return [subject.split(" (")[0] for subject in selected_subjects]
    
    # Function to disable UI components during evaluation
    def start_evaluation(state):
        if state["running"]:
            return [
                state,
                gr.update(interactive=False),
                gr.update(interactive=False),
                gr.update(interactive=False),
                gr.update(interactive=False),
                gr.update(interactive=False),
                gr.update(interactive=False),
                gr.update(visible=True),
                "Evaluation already in progress. Please wait.",
                None,
                gr.update(visible=False)
            ]
        
        # Update state to running
        state["running"] = True
        
        return [
            state,
            gr.update(interactive=False),  # subject_selection_mode
            gr.update(interactive=False),  # num_subjects_slider
            gr.update(interactive=False),  # specific_subjects
            gr.update(interactive=False),  # num_shots_slider
            gr.update(interactive=False),  # all_questions_checkbox
            gr.update(interactive=False),  # num_questions_slider
            gr.update(interactive=False),  # eval_mmlu_button
            gr.update(visible=True),       # cancel_mmlu_button
            "Starting evaluation...",      # results_output
            None,                          # results_table
            gr.update(visible=False)       # results_table_container
        ]
    
    # Function to reset UI after evaluation
    def finish_evaluation(state):
        state["running"] = False
        return state
    
    # Function to handle cancel button click
    def cancel_evaluation(state):
        # Note: This doesn't actually stop the evaluation process
        # It only updates the UI state to appear canceled
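        # A stronger approach would be to pass cancels=[...] to eval_mmlu_button.click()
        # below so that Gradio's queue aborts the pending run as well.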
        state["running"] = False
        return [
            state,
            gr.update(interactive=True),  # subject_selection_mode
            gr.update(interactive=True),  # num_subjects_slider
            gr.update(interactive=True),  # specific_subjects
            gr.update(interactive=True),  # num_shots_slider
            gr.update(interactive=True),  # all_questions_checkbox
            gr.update(interactive=True),  # num_questions_slider
            gr.update(interactive=True),  # eval_mmlu_button
            gr.update(visible=False),     # cancel_mmlu_button
            "⚠️ Evaluation canceled by user (note: backend process may continue running)", # results_output
            None,                         # results_table
            gr.update(visible=False)      # results_table_container
        ]
    
    # Connect MMLU evaluation button with state tracking
    eval_mmlu_button.click(
        fn=start_evaluation,
        inputs=[evaluation_state],
        outputs=[
            evaluation_state,
            subject_selection_mode,
            num_subjects_slider,
            specific_subjects,
            num_shots_slider,
            all_questions_checkbox,
            num_questions_slider,
            eval_mmlu_button,
            cancel_mmlu_button,
            results_output,
            results_table,
            results_table_container
        ]
    ).then(
        fn=lambda mode, num, subjects, shots, all_q, num_q: 
            run_mmlu_evaluation(
                get_subject_mode_param(mode),
                num,
                get_subject_names(subjects),
                shots,
                all_q,
                num_q
            ),
        inputs=[
            subject_selection_mode,
            num_subjects_slider,
            specific_subjects,
            num_shots_slider,
            all_questions_checkbox,
            num_questions_slider
        ],
        outputs=[
            results_output,
            results_table,
            eval_mmlu_button,
            cancel_mmlu_button,
            subject_selection_mode,
            num_subjects_slider,
            specific_subjects,
            num_shots_slider,
            all_questions_checkbox,
            num_questions_slider,
            results_table_container
        ]
    ).then(
        fn=finish_evaluation,
        inputs=[evaluation_state],
        outputs=[evaluation_state]
    )
    
    # Connect cancel button
    cancel_mmlu_button.click(
        fn=cancel_evaluation,
        inputs=[evaluation_state],
        outputs=[
            evaluation_state,
            subject_selection_mode,
            num_subjects_slider,
            specific_subjects,
            num_shots_slider,
            all_questions_checkbox,
            num_questions_slider,
            eval_mmlu_button,
            cancel_mmlu_button,
            results_output,
            results_table,
            results_table_container
        ]
    )

demo.launch()
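
# Note (runtime assumption): gr.Progress updates and event cancellation rely on
# Gradio's request queue, which Gradio 4 enables by default; on Gradio 3, call
# demo.queue() before launch().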