rohansampath committed on
Commit
5ea0bec
·
verified ·
1 Parent(s): 9d09e02

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +94 -58
app.py CHANGED
@@ -103,79 +103,115 @@ def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, nu
103
  gr.update(interactive=True), gr.update(interactive=True),
104
  gr.update(interactive=True), gr.update(interactive=True),
105
  gr.update(interactive=True))
106
-
107
  # ---------------------------------------------------------------------------
108
  # 3. Gradio Interface
109
  # ---------------------------------------------------------------------------
110
  with gr.Blocks() as demo:
111
- gr.Markdown("# Mistral-7B on MMLU-Pro Evaluation Demo")
112
  gr.Markdown("""
113
- This demo evaluates [Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the [MMLU-Pro Dataset](https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro).
 
 
 
 
 
 
 
 
 
114
  """)
 
 
115
 
116
- # MMLU Evaluation Section
117
- gr.Markdown("### MMLU Evaluation")
118
-
119
- with gr.Row():
120
- all_subjects_checkbox = gr.Checkbox(
121
- label="Evaluate All Subjects",
122
- value=False,
123
- info="When checked, evaluates all 14 MMLU-Pro subjects"
124
- )
125
- num_subjects_slider = gr.Slider(
126
- minimum=1,
127
- maximum=14,
128
- value=14,
129
- step=1,
130
- label="Number of Subjects",
131
- info="Number of subjects to evaluate (1-14). They will be loaded in alphabetical order.",
132
- interactive=True
133
- )
134
-
135
- with gr.Row():
136
- num_shots_slider = gr.Slider(
137
- minimum=0,
138
- maximum=5,
139
- value=5,
140
- step=1,
141
- label="Number of Few-shot Examples",
142
- info="Number of examples to use for few-shot learning (0-5)."
143
- )
144
-
145
- with gr.Row():
146
- all_questions_checkbox = gr.Checkbox(
147
- label="Evaluate All Questions",
148
- value=False,
149
- info="When checked, evaluates all available questions for each subject"
150
- )
151
- questions_info_text = gr.Markdown(visible=False, value="**All 12,032 questions across all subjects will be evaluated**")
152
-
153
- with gr.Row(elem_id="questions_selection_row"):
154
- questions_container = gr.Column(scale=1, elem_id="questions_slider_container")
155
 
156
- with questions_container:
157
- num_questions_slider = gr.Slider(
158
- minimum=1,
159
- maximum=100,
160
- value=20,
161
- step=1,
162
- label="Questions per Subject",
163
- info="Choose a subset of questions (1-100) per subject. They will be loaded in order of question_id.",
164
- interactive=True
165
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
 
167
- with gr.Row():
168
- with gr.Column(scale=1):
169
- eval_mmlu_button = gr.Button("Run MMLU-Pro Evaluation", variant="primary", interactive=True)
170
- cancel_mmlu_button = gr.Button("Cancel Evaluation", variant="stop", visible=False)
171
  results_output = gr.Markdown(label="Evaluation Results")
172
 
173
- with gr.Row():
174
- results_table = gr.DataFrame(interactive=True, label="Detailed Results (Sortable)", visible=True)
175
 
176
  # Track evaluation state - used to prevent multiple evaluations
177
  evaluation_state = gr.State({"running": False})
178
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  # Update num_subjects_slider interactivity based on all_subjects checkbox
180
  def update_subjects_slider(checked):
181
  return gr.update(interactive=not checked)
 
103
  gr.update(interactive=True), gr.update(interactive=True),
104
  gr.update(interactive=True), gr.update(interactive=True),
105
  gr.update(interactive=True))
106
+
107
  # ---------------------------------------------------------------------------
108
  # 3. Gradio Interface
109
  # ---------------------------------------------------------------------------
110
  with gr.Blocks() as demo:
111
+ gr.Markdown("# Head to Head Evaluation Comparator")
112
  gr.Markdown("""
113
+ This demo evaluates two models (or the same model with different configs) on a single dataset.
114
+
115
+ Available Datasets: [MMLU-Pro](https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro), MMLU, MMLU-Redux.
116
+ Available Models: [Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) .
117
+
118
+ Available Configs:
119
+ - Model Configs: a) Different quantizations, b) Using Flash Attention etc.
120
+ - Eval Dataset Configs include: i) subset to specific questions or subjects
121
+ - Eval Method Configs include: i) different prompts, ii) use different regexes for capturing the "answer" (especially for multiple-choice evals), iii) different eval scripts,
122
+ iv) different eval metrics.
123
  """)
124
+ # Dataset Selection Section
125
+ gr.Markdown("### (A) Select Dataset for evaluation")
126
 
127
+ dataset_dropdown = gr.Dropdown(
128
+ choices=["MMLU-Pro"],
129
+ value=None,
130
+ label="Dataset",
131
+ info="Select a dataset to evaluate the model on"
132
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
+ # MMLU Config Container - Initially hidden until dataset is selected
135
+ with gr.Group(visible=False) as mmlu_config_container:
136
+ gr.Markdown("### MMLU-Pro Evaluation Configuration")
137
+
138
+ with gr.Row():
139
+ all_subjects_checkbox = gr.Checkbox(
140
+ label="Evaluate All Subjects",
141
+ value=False,
142
+ info="When checked, evaluates all 14 MMLU-Pro subjects"
143
+ )
144
+ num_subjects_slider = gr.Slider(
145
+ minimum=1,
146
+ maximum=14,
147
+ value=14,
148
+ step=1,
149
+ label="Number of Subjects",
150
+ info="Number of subjects to evaluate (1-14). They will be loaded in alphabetical order.",
151
+ interactive=True
152
+ )
153
+
154
+ with gr.Row():
155
+ num_shots_slider = gr.Slider(
156
+ minimum=0,
157
+ maximum=5,
158
+ value=5,
159
+ step=1,
160
+ label="Number of Few-shot Examples",
161
+ info="Number of examples to use for few-shot learning (0-5)."
162
+ )
163
+
164
+ with gr.Row():
165
+ all_questions_checkbox = gr.Checkbox(
166
+ label="Evaluate All Questions",
167
+ value=False,
168
+ info="When checked, evaluates all available questions for each subject"
169
+ )
170
+ questions_info_text = gr.Markdown(visible=False, value="**All 12,032 questions across all subjects will be evaluated**")
171
+
172
+ with gr.Row(elem_id="questions_selection_row"):
173
+ questions_container = gr.Column(scale=1, elem_id="questions_slider_container")
174
+
175
+ with questions_container:
176
+ num_questions_slider = gr.Slider(
177
+ minimum=1,
178
+ maximum=100,
179
+ value=20,
180
+ step=1,
181
+ label="Questions per Subject",
182
+ info="Choose a subset of questions (1-100) per subject. They will be loaded in order of question_id.",
183
+ interactive=True
184
+ )
185
+
186
+ with gr.Row():
187
+ with gr.Column(scale=1):
188
+ eval_mmlu_button = gr.Button("Run MMLU-Pro Evaluation", variant="primary", interactive=True)
189
+ cancel_mmlu_button = gr.Button("Cancel Evaluation", variant="stop", visible=False)
190
 
191
+ # Results Section - Initially hidden
192
+ with gr.Group(visible=False) as results_container:
 
 
193
  results_output = gr.Markdown(label="Evaluation Results")
194
 
195
+ with gr.Row():
196
+ results_table = gr.DataFrame(interactive=True, label="Detailed Results (Sortable)", visible=True)
197
 
198
  # Track evaluation state - used to prevent multiple evaluations
199
  evaluation_state = gr.State({"running": False})
200
 
201
+ # Function to show configuration based on selected dataset
202
+ def update_interface_based_on_dataset(dataset):
203
+ if dataset == "MMLU-Pro":
204
+ return gr.update(visible=True), gr.update(visible=True)
205
+ else:
206
+ return gr.update(visible=False), gr.update(visible=False)
207
+
208
+ # Connect dataset dropdown to show/hide appropriate configuration
209
+ dataset_dropdown.change(
210
+ fn=update_interface_based_on_dataset,
211
+ inputs=[dataset_dropdown],
212
+ outputs=[mmlu_config_container, results_container]
213
+ )
214
+
215
  # Update num_subjects_slider interactivity based on all_subjects checkbox
216
  def update_subjects_slider(checked):
217
  return gr.update(interactive=not checked)