H2H-eval-comparator

Sleeping

App Files Community

rohansampath committed on Feb 26

Commit

0b21fdd

verified ·

1 Parent(s): 7d6f6c7

Update app.py

Browse files

Files changed (1) hide show

app.py +162 -485

app.py CHANGED Viewed

@@ -1,12 +1,19 @@
-import gradio as gr
 import os
 from huggingface_hub import login
-from mmlu_pro_eval_adapted import evaluate_mmlu_pro
-import spaces
-import pandas as pd
-import time
-import traceback
-from dataset_previews import mmlupro_dataset_preview, format_preview_for_display
 # Read token and login
 hf_token = os.getenv("HF_READ_WRITE_TOKEN")
@@ -16,516 +23,184 @@ else:
     print("⚠️ No HF_READ_WRITE_TOKEN found in environment")
 # ---------------------------------------------------------------------------
-# 1. Model configuration
 # ---------------------------------------------------------------------------
-model_name = "mistralai/Mistral-7B-v0.1"
-# ---------------------------------------------------------------------------
-# 2. MMLU-Pro Evaluation
-# ---------------------------------------------------------------------------
-@spaces.GPU(duration=240)
-def run_mmlu_evaluation(subject_selection_mode, num_subjects, selected_subjects, num_shots, all_questions, num_questions, progress=gr.Progress()):
-    """
-    Runs the MMLU evaluation with the specified parameters.
-    Args:
-        subject_selection_mode (str): Mode of subject selection ("all", "number", or "specific")
-        num_subjects (int): Number of subjects to evaluate (1-14)
-        selected_subjects (list): List of specific subjects to evaluate
-        num_shots (int): Number of few-shot examples (0-5)
-        all_questions (bool): Whether to evaluate all questions per subject
-        num_questions (int): Number of examples per subject (1-100 or all)
-        progress (gr.Progress): Progress indicator
-    """
-    try:
-        # Convert parameters if needed
-        if subject_selection_mode == "all":
-            num_subjects = -1
-            selected_subjects = []
-        elif subject_selection_mode == "specific":
-            num_subjects = len(selected_subjects) if selected_subjects else -1
-        if all_questions:
-            num_questions = -1
-        # Run evaluation with timing
-        start_time = time.time()
-        results = evaluate_mmlu_pro(
-            model_name,
-            num_subjects=num_subjects,
-            num_questions=num_questions,
-            num_shots=num_shots,
-            specific_subjects=selected_subjects if subject_selection_mode == "specific" else None
-        )
-        elapsed_time = time.time() - start_time
-        # Format results
-        overall_acc = results["overall_accuracy"]
-        min_subject, min_acc = results["min_accuracy_subject"]
-        max_subject, max_acc = results["max_accuracy_subject"]
-        # Create DataFrame from results table
-        results_df = pd.DataFrame(results["full_accuracy_table"])
-        # Calculate totals for the overall row
-        total_samples = results_df['Num_samples'].sum()
-        total_correct = results_df['Num_correct'].sum()
-        # Create overall row
-        overall_row = pd.DataFrame({
-            'Subject': ['**Overall**'],
-            'Num_samples': [total_samples],
-            'Num_correct': [total_correct],
-            'Accuracy': [overall_acc]
-        })
-        # Concatenate overall row with results
-        results_df = pd.concat([overall_row, results_df], ignore_index=True)
-        # Format the report
-        report = (
-            f"### Overall Results\n"
-            f"* Overall Accuracy: {overall_acc:.3f}\n"
-            f"* Best Performance: {max_subject} ({max_acc:.3f})\n"
-            f"* Worst Performance: {min_subject} ({min_acc:.3f})\n"
-            f"* Evaluation completed in {elapsed_time:.2f} seconds\n"
-        )
-        # Return values that re-enable UI components after completion
-        return (report,
-                results_df,
-                gr.update(interactive=True),
-                gr.update(visible=False),
-                gr.update(interactive=True),
-                gr.update(interactive=True),
-                gr.update(interactive=True),
-                gr.update(interactive=True),
-                gr.update(interactive=True),
-                gr.update(visible=True))
-    except Exception as e:
-        # Handle errors gracefully
-        error_trace = traceback.format_exc()
-        error_message = f"### Error during evaluation\n```\n{error_trace}\n```"
-        # Re-enable UI components on error
-        return (error_message,
-                None,
-                gr.update(interactive=True),
-                gr.update(visible=False),
-                gr.update(interactive=True),
-                gr.update(interactive=True),
-                gr.update(interactive=True),
-                gr.update(interactive=True),
-                gr.update(interactive=True),
-                gr.update(visible=False))
-# ---------------------------------------------------------------------------
-# 3. Gradio Interface
-# ---------------------------------------------------------------------------
-with gr.Blocks(css="""
-    #preview_header {
-        margin-bottom: 10px;
-        margin-top: 5px;
-    }
-    #preview_table {
-        background-color: #f8f9fa;
-        border-radius: 8px;
-        padding: 10px;
-    }
-    h1 {
-        text-align: center;
-    }
-    .section-divider {
-        border-top: 1px solid #ddd;
-        margin: 12px 0;
-    }
-    .config-box {
-        border: 1px solid #ddd;
-        border-radius: 8px;
-        padding: 15px;
-        margin: 10px;
-        background-color: #f9f9f9;
-    }
-""") as demo:
-    gr.Markdown("# Head-to-Head Model Evaluation Comparator")
-    gr.Markdown("""
-    This demo evaluates two models (or one model with two different configs), head-to-head, on a benchmark dataset.
-    Available Datasets: [MMLU-Pro](https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro)
-    Available Models: [Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
-    """)
-    # Dataset Selection Section
-    gr.Markdown("## (A) Select Dataset for Evaluation")
-    with gr.Row():
-        dataset_dropdown = gr.Dropdown(
-            choices=["(Select Dataset)", "MMLU-Pro"],
-            value="(Select Dataset)",
-            label="Dataset",
-            info="Select a dataset to perform the Head-to-Head Evaluation on. Available Datasets: [MMLU-Pro](https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro)"
-        )
-        preview_toggle = gr.Button("Show Dataset Preview", interactive=False, variant="secondary")
-    # Dataset Preview Container - Initially hidden
-    with gr.Column(visible=False) as dataset_preview_container:
-        gr.Markdown("## Dataset Preview", elem_id="preview_header")
-        preview_output = gr.DataFrame(
-            interactive=False,
-            wrap=True,
-            elem_id="preview_table"
-        )
-        # Add a divider instead of lots of space
-        gr.Markdown("<div class='section-divider'></div>")
-    # Preview data state to store the loaded preview data
-    preview_data_state = gr.State(None)
-    # MMLU Config Container - Initially hidden until dataset is selected
-    with gr.Column(visible=False) as mmlu_config_container:
-        gr.Markdown("## (B) Select Dataset Configuration Options")
-        with gr.Row():
-            # Left column for subject selection
-            with gr.Column(scale=1):
-                with gr.Group(elem_classes=["config-box"]):
-                    gr.Markdown("### Choose Subjects")
-                    subject_selection_mode = gr.Radio(
-                        choices=["Evaluate All Subjects", "Choose Number of Subjects", "Specify which Subjects to Evaluate"],
-                        value="Evaluate All Subjects",
-                        label="Subject Selection Mode"
-                    )
-                    # Subject number slider - initially hidden, shown when "Choose Number of Subjects" is selected
-                    with gr.Column(visible=False) as num_subjects_container:
-                        num_subjects_slider = gr.Slider(
-                            minimum=1,
-                            maximum=14,  # Will be updated dynamically based on preview data
-                            value=1,
-                            step=1,
-                            label="Number of Subjects",
-                            info="Number of subjects to evaluate. They will be loaded in alphabetical order."
-                        )
-                    # Subject checkboxes - initially hidden, shown when "Specify which Subjects to Evaluate" is selected
-                    with gr.Column(visible=False) as specific_subjects_container:
-                        # Will be populated dynamically from the preview data
-                        specific_subjects = gr.CheckboxGroup(
-                            choices=[],  # Will be populated from preview data
-                            label="Select Specific Subjects",
-                            info="Select which specific subjects to evaluate"
-                        )
-            # Right column for few-shot examples
-            with gr.Column(scale=1):
-                with gr.Group(elem_classes=["config-box"]):
-                    gr.Markdown("### Few-shot Configuration")
-                    num_shots_slider = gr.Slider(
-                        minimum=0,
-                        maximum=5,
-                        value=5,
-                        step=1,
-                        label="Number of Few-shot Examples",
-                        info="Number of examples to use for few-shot learning (0-5). They will be loaded in order of question_id."
-                    )
-        # Add a small space
-        gr.Markdown("&nbsp;")
-        with gr.Row():
-            all_questions_checkbox = gr.Checkbox(
-                label="Evaluate All Questions",
-                value=False,
-                info="When checked, evaluates all available questions for each subject"
-            )
-            questions_info_text = gr.Markdown(visible=False, value="**All questions across the selected subjects will be evaluated**")
-        with gr.Row(elem_id="questions_selection_row"):
-            questions_container = gr.Column(scale=1, elem_id="questions_slider_container")
-        with questions_container:
-            num_questions_slider = gr.Slider(
-                minimum=1,
-                maximum=100,
-                value=20,
-                step=1,
-                label="Questions per Subject",
-                info="Choose a subset of questions (1-100) per subject. They will be loaded in order of question_id.",
-                interactive=True
-            )
-        with gr.Row():
-            with gr.Column(scale=1):
-                eval_mmlu_button = gr.Button("Run MMLU-Pro Evaluation", variant="primary", interactive=True)
-                cancel_mmlu_button = gr.Button("Cancel Evaluation", variant="stop", visible=False)
-    # Results Section - Initially hidden
-    with gr.Column(visible=False) as results_container:
-        results_output = gr.Markdown(label="Evaluation Results")
-        # Results table - Initially hidden until evaluation completes
-        with gr.Column(visible=False) as results_table_container:
-            with gr.Row():
-                results_table = gr.DataFrame(
-                    interactive=True,
-                    label="Detailed Results (Sortable)",
-                    visible=True
-                )
-    # Track evaluation state
     evaluation_state = gr.State({"running": False})
-    # Track preview visibility state
     preview_visibility = gr.State(False)
-    # Function to process dataset preview data
-    def get_subject_choices_from_preview(preview_data):
-        if not preview_data or 'subject_counts' not in preview_data:
-            print("FAILURE HERE")
-            return [], 0
-        # Get subject counts from preview data
-        subject_counts = preview_data['subject_counts']
-        # Sort subjects alphabetically
-        subjects = sorted(subject_counts.keys())
-        # Format as "Subject (n=count)"
-        formatted_subjects = [f"{subject} (n={subject_counts[subject]})" for subject in subjects]
-        print("Formatted Subjects", formatted_subjects)
-        return formatted_subjects, len(subjects)
-    # Function to load preview data and update UI
-    def load_dataset_preview(dataset):
-        if dataset == "MMLU-Pro":
-            # Load the preview data
-            preview_data = mmlupro_dataset_preview(regenerate_preview=True)
-            # Extract subject choices and count
-            subject_choices, subject_count = get_subject_choices_from_preview(preview_data)
-            # Update the UI components
-            return (
-                preview_data,  # Store the preview data
-                gr.update(choices=subject_choices),  # Update checkbox choices
-                gr.update(maximum=subject_count, value=1)  # Update slider max
-            )
-        return None, gr.update(), gr.update()
-    # Function to show/hide configuration based on selected dataset
-    def update_interface_based_on_dataset(dataset, current_visibility):
-        if dataset == "MMLU-Pro":
-            return (
-                gr.update(visible=True),  # mmlu_config_container
-                gr.update(visible=True),  # results_container
-                gr.update(interactive=True),  # preview_toggle
-                gr.update(visible=False),  # dataset_preview_container - hide it initially
-                False,  # Reset preview_visibility to False
-                gr.update(value="Show Dataset Preview")  # Reset button text
-            )
-        else:
-            return (
-                gr.update(visible=False),  # mmlu_config_container
-                gr.update(visible=False),  # results_container
-                gr.update(interactive=False),  # preview_toggle
-                gr.update(visible=False),  # dataset_preview_container - hide when no dataset
-                False,  # Reset preview_visibility to False
-                gr.update(value="Show Dataset Preview")  # Reset button text
-            )
     # Connect dataset dropdown to show/hide appropriate configuration and load preview data
-    dataset_dropdown.change(
         fn=load_dataset_preview,
-        inputs=[dataset_dropdown],
-        outputs=[preview_data_state, specific_subjects, num_subjects_slider],
     ).then(
         fn=update_interface_based_on_dataset,
-        inputs=[dataset_dropdown, preview_visibility],
-        outputs=[mmlu_config_container, results_container, preview_toggle, dataset_preview_container, preview_visibility, preview_toggle]
     )
-    # Function to toggle dataset preview visibility
-    def toggle_preview(dataset, preview_visibility, preview_data):
-        # Toggle the visibility state
-        is_visible = not preview_visibility
-        # Update button text based on new state
-        button_text = "Hide Dataset Preview" if is_visible else "Show Dataset Preview"
-        # Format and show preview if becoming visible
-        if is_visible and dataset == "MMLU-Pro":
-            formatted_preview = format_preview_for_display(preview_data)
-            return is_visible, gr.update(visible=True), formatted_preview, gr.update(value=button_text)
-        elif is_visible:
-            # For other datasets (not implemented yet)
-            return is_visible, gr.update(visible=True), None, gr.update(value=button_text)
-        else:
-            # Hiding the preview
-            return is_visible, gr.update(visible=False), None, gr.update(value=button_text)
     # Connect preview toggle to show/hide dataset information
-    preview_toggle.click(
         fn=toggle_preview,
-        inputs=[dataset_dropdown, preview_visibility, preview_data_state],
-        outputs=[preview_visibility, dataset_preview_container, preview_output, preview_toggle]
     )
-    # Function to update UI based on subject selection mode
-    def update_subject_selection_ui(mode):
-        if mode == "Evaluate All Subjects":
-            return gr.update(visible=False), gr.update(visible=False)
-        elif mode == "Choose Number of Subjects":
-            return gr.update(visible=True), gr.update(visible=False)
-        else:  # "Specify which Subjects to Evaluate"
-            return gr.update(visible=False), gr.update(visible=True)
     # Connect subject selection mode to UI updates
-    subject_selection_mode.change(
         fn=update_subject_selection_ui,
-        inputs=[subject_selection_mode],
-        outputs=[num_subjects_container, specific_subjects_container]
     )
     # Update interface based on all_questions checkbox
-    def update_questions_interface(checked):
-        if checked:
-            return gr.update(visible=False), gr.update(visible=True)
-        else:
-            return gr.update(visible=True), gr.update(visible=False)
-    all_questions_checkbox.change(
         fn=update_questions_interface,
-        inputs=[all_questions_checkbox],
-        outputs=[questions_container, questions_info_text]
     )
-    # Function to convert subject selection mode to parameters
-    def get_subject_mode_param(mode):
-        if mode == "Evaluate All Subjects":
-            return "all"
-        elif mode == "Choose Number of Subjects":
-            return "number"
-        else:  # "Specify which Subjects to Evaluate"
-            return "specific"
-    # Function to extract subject names from checkboxes
-    def get_subject_names(selected_subjects):
-        # Extract just the subject name without the count
-        return [subject.split(" (")[0] for subject in selected_subjects]
-    # Function to disable UI components during evaluation
-    def start_evaluation(state):
-        if state["running"]:
-            return [
-                state,
-                gr.update(interactive=False),
-                gr.update(interactive=False),
-                gr.update(interactive=False),
-                gr.update(interactive=False),
-                gr.update(interactive=False),
-                gr.update(interactive=False),
-                gr.update(visible=True),
-                "Evaluation already in progress. Please wait.",
-                None,
-                gr.update(visible=False)
             ]
-        # Update state to running
-        state["running"] = True
-        return [
-            state,
-            gr.update(interactive=False),  # subject_selection_mode
-            gr.update(interactive=False),  # num_subjects_slider
-            gr.update(interactive=False),  # specific_subjects
-            gr.update(interactive=False),  # num_shots_slider
-            gr.update(interactive=False),  # all_questions_checkbox
-            gr.update(interactive=False),  # num_questions_slider
-            gr.update(interactive=False),  # eval_mmlu_button
-            gr.update(visible=True),       # cancel_mmlu_button
-            "Starting evaluation...",      # results_output
-            None,                          # results_table
-            gr.update(visible=False)       # results_table_container
-        ]
-    # Function to reset UI after evaluation
-    def finish_evaluation(state):
-        state["running"] = False
-        return state
-    # Function to handle cancel button click
-    def cancel_evaluation(state):
-        # Note: This doesn't actually stop the evaluation process
-        # It only updates the UI state to appear canceled
-        state["running"] = False
-        return [
-            state,
-            gr.update(interactive=True),  # subject_selection_mode
-            gr.update(interactive=True),  # num_subjects_slider
-            gr.update(interactive=True),  # specific_subjects
-            gr.update(interactive=True),  # num_shots_slider
-            gr.update(interactive=True),  # all_questions_checkbox
-            gr.update(interactive=True),  # num_questions_slider
-            gr.update(interactive=True),  # eval_mmlu_button
-            gr.update(visible=False),     # cancel_mmlu_button
-            "⚠️ Evaluation canceled by user (note: backend process may continue running)", # results_output
-            None,                         # results_table
-            gr.update(visible=False)      # results_table_container
-        ]
-    # Connect MMLU evaluation button with state tracking
-    eval_mmlu_button.click(
         fn=start_evaluation,
         inputs=[evaluation_state],
         outputs=[
             evaluation_state,
-            subject_selection_mode,
-            num_subjects_slider,
-            specific_subjects,
-            num_shots_slider,
-            all_questions_checkbox,
-            num_questions_slider,
-            eval_mmlu_button,
-            cancel_mmlu_button,
-            results_output,
-            results_table,
-            results_table_container
         ]
     ).then(
-        fn=lambda mode, num, subjects, shots, all_q, num_q:
             run_mmlu_evaluation(
                 get_subject_mode_param(mode),
                 num,
                 get_subject_names(subjects),
-                shots,
                 all_q,
-                num_q
             ),
         inputs=[
-            subject_selection_mode,
-            num_subjects_slider,
-            specific_subjects,
-            num_shots_slider,
-            all_questions_checkbox,
-            num_questions_slider
         ],
         outputs=[
-            results_output,
-            results_table,
-            eval_mmlu_button,
-            cancel_mmlu_button,
-            subject_selection_mode,
-            num_subjects_slider,
-            num_shots_slider,
-            all_questions_checkbox,
-            num_questions_slider,
-            results_table_container
         ]
     ).then(
         fn=finish_evaluation,
@@ -534,23 +209,25 @@ with gr.Blocks(css="""
     )
     # Connect cancel button
-    cancel_mmlu_button.click(
         fn=cancel_evaluation,
         inputs=[evaluation_state],
         outputs=[
             evaluation_state,
-            subject_selection_mode,
-            num_subjects_slider,
-            specific_subjects,
-            num_shots_slider,
-            all_questions_checkbox,
-            num_questions_slider,
-            eval_mmlu_button,
-            cancel_mmlu_button,
-            results_output,
-            results_table,
-            results_table_container
         ]
     )
-demo.launch()

 import os
+import gradio as gr
 from huggingface_hub import login
+# Import modules
+from modules.ui_components import CSS, create_header, create_results_section, create_action_buttons
+from modules.dataset_config import (create_dataset_section, create_mmlu_config_section,
+                                  load_dataset_preview, update_interface_based_on_dataset,
+                                  toggle_preview, update_subject_selection_ui,
+                                  update_questions_interface, get_subject_mode_param,
+                                  get_subject_names)
+from modules.app_models_config import (create_model_config_section, update_eval_button_state,
+                                      get_model_configs)
+from modules.evaluation import run_mmlu_evaluation
+from utils.state_management import (start_evaluation, finish_evaluation,
+                                   cancel_evaluation, handle_evaluation_results)
 # Read token and login
 hf_token = os.getenv("HF_READ_WRITE_TOKEN")
     print("⚠️ No HF_READ_WRITE_TOKEN found in environment")
 # ---------------------------------------------------------------------------
+# Gradio Interface
 # ---------------------------------------------------------------------------
+with gr.Blocks(css=CSS) as demo:
+    # Create header
+    header_components = create_header()
+    # Create dataset section (Section A)
+    dataset_components = create_dataset_section()
+    # Create MMLU config section (Section B)
+    mmlu_components = create_mmlu_config_section()
+    # Create model config section (Section C)
+    model_components = create_model_config_section()
+    # Create results section
+    results_components = create_results_section()
+    # Create action buttons
+    action_components = create_action_buttons()
+    # State tracking
     evaluation_state = gr.State({"running": False})
     preview_visibility = gr.State(False)
     # Connect dataset dropdown to show/hide appropriate configuration and load preview data
+    dataset_components['dropdown'].change(
         fn=load_dataset_preview,
+        inputs=[dataset_components['dropdown']],
+        outputs=[
+            dataset_components['preview_data_state'],
+            mmlu_components['specific_subjects'],
+            mmlu_components['num_subjects_slider']
+        ],
     ).then(
         fn=update_interface_based_on_dataset,
+        inputs=[
+            dataset_components['dropdown'],
+            preview_visibility
+        ],
+        outputs=[
+            mmlu_components['container'],
+            model_components['container'],
+            results_components['container'],
+            dataset_components['preview_toggle'],
+            dataset_components['preview_container'],
+            preview_visibility,
+            dataset_components['preview_toggle']
+        ]
     )
     # Connect preview toggle to show/hide dataset information
+    dataset_components['preview_toggle'].click(
         fn=toggle_preview,
+        inputs=[
+            dataset_components['dropdown'],
+            preview_visibility,
+            dataset_components['preview_data_state']
+        ],
+        outputs=[
+            preview_visibility,
+            dataset_components['preview_container'],
+            dataset_components['preview_output'],
+            dataset_components['preview_toggle']
+        ]
     )
     # Connect subject selection mode to UI updates
+    mmlu_components['subject_selection_mode'].change(
         fn=update_subject_selection_ui,
+        inputs=[
+            mmlu_components['subject_selection_mode']
+        ],
+        outputs=[
+            mmlu_components['num_subjects_container'],
+            mmlu_components['specific_subjects_container']
+        ]
     )
     # Update interface based on all_questions checkbox
+    mmlu_components['all_questions_checkbox'].change(
         fn=update_questions_interface,
+        inputs=[
+            mmlu_components['all_questions_checkbox']
+        ],
+        outputs=[
+            mmlu_components['num_questions_slider'],
+            mmlu_components['questions_info_text']
+        ]
     )
+    # Connect model config changes to validation
+    for component in [
+        model_components['model1_dropdown'],
+        model_components['model1_shots'],
+        model_components['model1_regex'],
+        model_components['model1_flash_attn'],
+        model_components['model2_dropdown'],
+        model_components['model2_shots'],
+        model_components['model2_regex'],
+        model_components['model2_flash_attn']
+    ]:
+        component.change(
+            fn=update_eval_button_state,
+            inputs=[
+                model_components['model1_dropdown'],
+                model_components['model1_shots'],
+                model_components['model1_regex'],
+                model_components['model1_flash_attn'],
+                model_components['model2_dropdown'],
+                model_components['model2_shots'],
+                model_components['model2_regex'],
+                model_components['model2_flash_attn']
+            ],
+            outputs=[
+                model_components['error_message'],
+                action_components['eval_button']
             ]
+        )
+    # Connect evaluation button with state tracking
+    action_components['eval_button'].click(
         fn=start_evaluation,
         inputs=[evaluation_state],
         outputs=[
             evaluation_state,
+            mmlu_components['subject_selection_mode'],
+            mmlu_components['num_subjects_slider'],
+            mmlu_components['specific_subjects'],
+            mmlu_components['all_questions_checkbox'],
+            mmlu_components['num_questions_slider'],
+            model_components['model1_dropdown'],
+            model_components['model2_dropdown'],
+            action_components['eval_button'],
+            action_components['cancel_button'],
+            results_components['output'],
+            results_components['table'],
+            results_components['table_container']
         ]
     ).then(
+        fn=lambda mode, num, subjects, all_q, num_q, m1, m1_shots, m1_regex, m1_flash, m2, m2_shots, m2_regex, m2_flash:
             run_mmlu_evaluation(
                 get_subject_mode_param(mode),
                 num,
                 get_subject_names(subjects),
                 all_q,
+                num_q,
+                get_model_configs(
+                    m1, m1_shots, m1_regex, m1_flash,
+                    m2, m2_shots, m2_regex, m2_flash
+                )
             ),
         inputs=[
+            mmlu_components['subject_selection_mode'],
+            mmlu_components['num_subjects_slider'],
+            mmlu_components['specific_subjects'],
+            mmlu_components['all_questions_checkbox'],
+            mmlu_components['num_questions_slider'],
+            model_components['model1_dropdown'],
+            model_components['model1_shots'],
+            model_components['model1_regex'],
+            model_components['model1_flash_attn'],
+            model_components['model2_dropdown'],
+            model_components['model2_shots'],
+            model_components['model2_regex'],
+            model_components['model2_flash_attn']
         ],
         outputs=[
+            results_components['output'],
+            results_components['table'],
+            action_components['eval_button'],
+            action_components['cancel_button'],
+            mmlu_components['subject_selection_mode'],
+            mmlu_components['num_subjects_slider'],
+            mmlu_components['all_questions_checkbox'],
+            mmlu_components['num_questions_slider'],
+            model_components['model1_dropdown'],
+            results_components['table_container']
         ]
     ).then(
         fn=finish_evaluation,
     )
     # Connect cancel button
+    action_components['cancel_button'].click(
         fn=cancel_evaluation,
         inputs=[evaluation_state],
         outputs=[
             evaluation_state,
+            mmlu_components['subject_selection_mode'],
+            mmlu_components['num_subjects_slider'],
+            mmlu_components['specific_subjects'],
+            mmlu_components['all_questions_checkbox'],
+            mmlu_components['num_questions_slider'],
+            model_components['model1_dropdown'],
+            model_components['model2_dropdown'],
+            action_components['eval_button'],
+            action_components['cancel_button'],
+            results_components['output'],
+            results_components['table'],
+            results_components['table_container']
         ]
     )
+if __name__ == "__main__":
+    demo.launch()