File size: 12,468 Bytes
d093a4d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
import gradio as gr
import pandas as pd
from dataset_previews import mmlupro_dataset_preview, format_preview_for_display

def create_dataset_section():
    """
    Builds Section A of the UI: the dataset picker and its preview area.

    Components are registered in the result dictionary as they are created.

    Returns:
        dict: Created components keyed by 'header', 'dropdown',
            'preview_toggle', 'preview_container', 'preview_output'
            and 'preview_data_state'.
    """
    components = {}

    # Section title
    components['header'] = gr.Markdown("## (A) Select Dataset for Evaluation")

    # The dataset picker and the preview button share a single row
    with gr.Row():
        components['dropdown'] = gr.Dropdown(
            choices=["(Select Dataset)", "MMLU-Pro"],
            value="(Select Dataset)",
            label="Dataset",
            info="Select a dataset to perform the Head-to-Head Evaluation on. Available Datasets: [MMLU-Pro](https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro)"
        )
        # The button is created with interactive=False
        components['preview_toggle'] = gr.Button(
            "Show Dataset Preview",
            interactive=False,
            variant="secondary"
        )

    # Preview area, created with visible=False
    with gr.Column(visible=False) as preview_column:
        components['preview_container'] = preview_column
        gr.Markdown("## Dataset Preview", elem_id="preview_header")
        components['preview_output'] = gr.DataFrame(
            interactive=False,
            wrap=True,
            elem_id="preview_table"
        )
        # Divider rendered below the preview table
        gr.Markdown("<div class='section-divider'></div>")

    # State slot for the loaded preview data; starts out as None
    components['preview_data_state'] = gr.State(None)

    return components

def create_mmlu_config_section():
    """
    Builds Section B of the UI: the dataset configuration options.

    The whole section sits in a column created with visible=False. It holds
    two side-by-side boxes: subject selection on the left and sample
    configuration on the right.

    Returns:
        dict: Created components keyed by 'container',
            'subject_selection_mode', 'num_subjects_container',
            'num_subjects_slider', 'specific_subjects_container',
            'specific_subjects', 'all_questions_checkbox',
            'questions_info_text' and 'num_questions_slider'.
    """
    ui = {}

    with gr.Column(visible=False) as section:
        ui['container'] = section
        gr.Markdown("## (B) Select Dataset Configuration Options")

        with gr.Row():
            # Left box: which subjects to evaluate
            with gr.Column(scale=1), gr.Group(elem_classes=["config-box"]):
                gr.Markdown("### Choose Subjects")

                ui['subject_selection_mode'] = gr.Radio(
                    choices=["Evaluate All Subjects", "Choose Number of Subjects", "Specify which Subjects to Evaluate"],
                    value="Evaluate All Subjects",
                    label="Subject Selection Mode"
                )

                # Slider for the number of subjects; its wrapper starts hidden
                with gr.Column(visible=False) as slider_wrapper:
                    ui['num_subjects_container'] = slider_wrapper
                    ui['num_subjects_slider'] = gr.Slider(
                        minimum=1,
                        maximum=14,  # Placeholder; meant to be updated from the preview data
                        value=1,
                        step=1,
                        label="Number of Subjects",
                        info="Number of subjects to evaluate. They will be loaded in alphabetical order."
                    )

                # Checkboxes for specific subjects; the wrapper starts hidden
                with gr.Column(visible=False) as checkbox_wrapper:
                    ui['specific_subjects_container'] = checkbox_wrapper
                    ui['specific_subjects'] = gr.CheckboxGroup(
                        choices=[],  # Empty for now; meant to be filled from the preview data
                        label="Select Specific Subjects",
                        info="Select which specific subjects to evaluate"
                    )

            # Right box: how many questions to evaluate
            with gr.Column(scale=1), gr.Group(elem_classes=["config-box"]):
                gr.Markdown("### Sample Configuration")

                ui['all_questions_checkbox'] = gr.Checkbox(
                    label="Evaluate All Questions",
                    value=False,
                    info="When checked, evaluates all available questions for each subject"
                )

                # Note created with visible=False
                ui['questions_info_text'] = gr.Markdown(
                    visible=False,
                    value="**All questions across the selected subjects will be evaluated**"
                )

                # Per-subject question count
                ui['num_questions_slider'] = gr.Slider(
                    minimum=1,
                    maximum=100,
                    value=20,
                    step=1,
                    label="Questions per Subject",
                    info="Choose a subset of questions (1-100) per subject. They will be loaded in order of question_id.",
                    interactive=True
                )

    return ui

# Utility functions for dataset section
def get_subject_choices_from_preview(preview_data):
    """
    Builds the display labels for every subject found in the preview data.

    Args:
        preview_data (dict): Preview data; its 'subject_counts' entry maps
            each subject name to a count.

    Returns:
        tuple: (labels, subject_count) where labels are
            "Subject (n=count)" strings ordered by sorted subject name.
            Returns ([], 0) when the preview data is empty or has no
            'subject_counts' entry.
    """
    # Nothing loaded yet
    if not preview_data:
        return [], 0

    # Loaded, but without the per-subject counts
    if 'subject_counts' not in preview_data:
        return [], 0

    counts_by_subject = preview_data['subject_counts']
    ordered_names = sorted(counts_by_subject.keys())

    labels = [
        f"{name} (n={counts_by_subject[name]})"
        for name in ordered_names
    ]
    return labels, len(ordered_names)

def load_dataset_preview(dataset):
    """
    Loads the preview for the chosen dataset and derives the subject widgets' updates.

    Args:
        dataset (str): Selected dataset name.

    Returns:
        tuple: (preview_data, specific_subjects_update, num_subjects_slider_update).
            For any dataset other than "MMLU-Pro" this is None followed by
            two empty updates.
    """
    # Only MMLU-Pro has a preview; everything else leaves the widgets alone
    if dataset != "MMLU-Pro":
        return None, gr.update(), gr.update()

    # The preview is regenerated on every load
    preview = mmlupro_dataset_preview(regenerate_preview=True)

    # Checkbox labels and the number of distinct subjects
    labels, total_subjects = get_subject_choices_from_preview(preview)

    checkbox_update = gr.update(choices=labels)
    # Slider maximum follows the subject count; the value is reset to 1
    slider_update = gr.update(maximum=total_subjects, value=1)

    return preview, checkbox_update, slider_update

def update_interface_based_on_dataset(dataset, current_visibility, 
                                     mmlu_config_container, model_config_container, 
                                     results_container, preview_toggle, 
                                     dataset_preview_container):
    """
    Computes the UI updates that follow a change of the selected dataset.

    Only `dataset` is read; the remaining parameters are accepted but not
    used by this function.

    Args:
        dataset (str): Selected dataset name.
        current_visibility (bool): Current preview visibility state (unused).
        mmlu_config_container: MMLU config container component (unused).
        model_config_container: Model config container component (unused).
        results_container: Results container component (unused).
        preview_toggle: Preview toggle button (unused).
        dataset_preview_container: Dataset preview container (unused).

    Returns:
        tuple: Seven items, in order: updates for the MMLU config, model
            config and results containers, the preview button's
            interactivity, the preview container (always hidden), the new
            preview visibility flag (always False), and the button text
            reset.
    """
    # Selecting MMLU-Pro reveals the sections and enables the preview button
    dataset_ready = dataset == "MMLU-Pro"

    return (
        gr.update(visible=dataset_ready),  # mmlu_config_container
        gr.update(visible=dataset_ready),  # model_config_container
        gr.update(visible=dataset_ready),  # results_container
        gr.update(interactive=dataset_ready),  # preview_toggle
        gr.update(visible=False),  # dataset_preview_container is hidden in both cases
        False,  # preview visibility flag is reset
        gr.update(value="Show Dataset Preview"),  # button text is reset
    )

def toggle_preview(dataset, preview_visibility, preview_data):
    """
    Flips the dataset preview between shown and hidden.

    Args:
        dataset (str): Selected dataset name.
        preview_visibility (bool): Current preview visibility state.
        preview_data (dict): Preview data.

    Returns:
        tuple: (new_visibility, preview_container_update, preview_output_update, button_text_update).
            The preview output is the formatted preview only when the
            preview becomes visible for "MMLU-Pro"; otherwise it is None.
    """
    now_visible = not preview_visibility

    # The button label names the action the next click will perform
    label = "Hide Dataset Preview" if now_visible else "Show Dataset Preview"

    # Only MMLU-Pro has preview content; hiding or any other dataset yields None
    table = None
    if now_visible and dataset == "MMLU-Pro":
        table = format_preview_for_display(preview_data)

    return now_visible, gr.update(visible=now_visible), table, gr.update(value=label)

def update_subject_selection_ui(mode, num_subjects_container, specific_subjects_container):
    """
    Chooses which subject sub-panel is visible for the given selection mode.

    Only `mode` is read; the two container parameters are accepted but not
    used by this function.

    Args:
        mode (str): Selected subject selection mode.
        num_subjects_container: Container for number of subjects slider (unused).
        specific_subjects_container: Container for specific subjects checkboxes (unused).

    Returns:
        tuple: (num_subjects_container_update, specific_subjects_container_update)
    """
    # The slider shows only in "Choose Number of Subjects" mode
    show_slider = mode == "Choose Number of Subjects"
    # The checkboxes show for any mode other than the two named ones,
    # i.e. "Specify which Subjects to Evaluate"
    show_checkboxes = mode not in ("Evaluate All Subjects", "Choose Number of Subjects")

    return gr.update(visible=show_slider), gr.update(visible=show_checkboxes)

def update_questions_interface(checked, num_questions_slider, questions_info_text):
    """
    Swaps the questions slider and the info note based on "Evaluate All Questions".

    Only `checked` is read; the two component parameters are accepted but
    not used by this function.

    Args:
        checked (bool): Whether "Evaluate All Questions" is checked.
        num_questions_slider: Questions per subject slider component (unused).
        questions_info_text: Questions info text component (unused).

    Returns:
        tuple: (num_questions_slider_update, questions_info_text_update)
    """
    # Exactly one of the two is visible: the note when checked, the slider otherwise
    evaluate_all = bool(checked)
    return gr.update(visible=not evaluate_all), gr.update(visible=evaluate_all)

def get_subject_mode_param(mode):
    """
    Maps a subject selection mode label to its parameter string.

    Args:
        mode (str): Subject selection mode.

    Returns:
        str: "all", "number", or "specific". Any label other than the two
            listed below maps to "specific".
    """
    known_modes = (
        ("Evaluate All Subjects", "all"),
        ("Choose Number of Subjects", "number"),
    )
    for label, param in known_modes:
        if mode == label:
            return param

    # Everything else, including "Specify which Subjects to Evaluate"
    return "specific"

def get_subject_names(selected_subjects):
    """
    Extracts subject names from checkbox values.

    Checkbox labels have the form "Subject (n=count)", as produced by
    get_subject_choices_from_preview. Only that trailing count suffix is
    removed, so a subject whose own name contains " (" is kept intact.

    Args:
        selected_subjects (list): Selected subjects with counts. May be
            empty or None when nothing is selected.

    Returns:
        list: Clean subject names without count information. Labels with
            no count suffix are returned unchanged.
    """
    # No selection yields an empty list instead of failing on None
    if not selected_subjects:
        return []

    # Split once from the right so only the final " (n=..." suffix is dropped
    return [subject.rsplit(" (n=", 1)[0] for subject in selected_subjects]