import gradio as gr
import os
from huggingface_hub import login
from mmlu_pro_eval_adapted import evaluate_mmlu_pro
import spaces
import pandas as pd
import time
import traceback
from dataset_previews import mmlupro_dataset_preview, format_preview_for_display

# Read token and login
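# On Spaces the token is typically provided as a repository secret; locally, export
# HF_READ_WRITE_TOKEN in your shell. Without it, gated or private models cannot be fetched.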
hf_token = os.getenv("HF_READ_WRITE_TOKEN")
if hf_token:
    login(hf_token)
else:
    print("⚠️ No HF_READ_WRITE_TOKEN found in environment")

# ---------------------------------------------------------------------------
# 1. Model configuration
# ---------------------------------------------------------------------------
model_name = "mistralai/Mistral-7B-v0.1"

# ---------------------------------------------------------------------------
# 2. MMLU-Pro Evaluation
# ---------------------------------------------------------------------------
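# @spaces.GPU is the ZeroGPU decorator: on a ZeroGPU Space it attaches a GPU to each
# call for at most `duration` seconds, and it is a no-op on other hardware.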
@spaces.GPU(duration=240)
def run_mmlu_evaluation(subject_selection_mode, num_subjects, selected_subjects, num_shots, all_questions, num_questions, progress=gr.Progress()):
    """
    Runs the MMLU evaluation with the specified parameters.
    
    Args:
        subject_selection_mode (str): Mode of subject selection ("all", "number", or "specific")
        num_subjects (int): Number of subjects to evaluate (1-14)
        selected_subjects (list): List of specific subjects to evaluate
        num_shots (int): Number of few-shot examples (0-5)
        all_questions (bool): Whether to evaluate all questions per subject
        num_questions (int): Number of examples per subject (1-100 or all)
        progress (gr.Progress): Progress indicator
    """
    try:
        # Convert parameters if needed
        if subject_selection_mode == "all":
            num_subjects = -1
            selected_subjects = []
        elif subject_selection_mode == "specific":
            num_subjects = len(selected_subjects) if selected_subjects else -1
            
        if all_questions:
            num_questions = -1

        # Run evaluation with timing
        start_time = time.time()
        results = evaluate_mmlu_pro(
            model_name,
            num_subjects=num_subjects,
            num_questions=num_questions,
            num_shots=num_shots,
            specific_subjects=selected_subjects if subject_selection_mode == "specific" else None
        )
        elapsed_time = time.time() - start_time

        # Format results
        overall_acc = results["overall_accuracy"]
        min_subject, min_acc = results["min_accuracy_subject"]
        max_subject, max_acc = results["max_accuracy_subject"]
        
        # Create DataFrame from results table
        results_df = pd.DataFrame(results["full_accuracy_table"])
        
        # Calculate totals for the overall row
        total_samples = results_df['Num_samples'].sum()
        total_correct = results_df['Num_correct'].sum()
        
        # Create overall row
        overall_row = pd.DataFrame({
            'Subject': ['**Overall**'],
            'Num_samples': [total_samples],
            'Num_correct': [total_correct],
            'Accuracy': [overall_acc]
        })
        
        # Concatenate overall row with results
        results_df = pd.concat([overall_row, results_df], ignore_index=True)
        
        # Format the report
        report = (
            f"### Overall Results\n"
            f"* Overall Accuracy: {overall_acc:.3f}\n"
            f"* Best Performance: {max_subject} ({max_acc:.3f})\n"
            f"* Worst Performance: {min_subject} ({min_acc:.3f})\n"
            f"* Evaluation completed in {elapsed_time:.2f} seconds\n"
        )

        # Return values that re-enable the UI components after completion
        return (report,                        # results_output
                results_df,                    # results_table
                gr.update(interactive=True),   # eval_mmlu_button
                gr.update(visible=False),      # cancel_mmlu_button
                gr.update(interactive=True),   # subject_selection_mode
                gr.update(interactive=True),   # num_subjects_slider
                gr.update(interactive=True),   # specific_subjects
                gr.update(interactive=True),   # num_shots_slider
                gr.update(interactive=True),   # all_questions_checkbox
                gr.update(interactive=True),   # num_questions_slider
                gr.update(visible=True))       # results_table_container
                
    except Exception as e:
        # Handle errors gracefully
        error_trace = traceback.format_exc()
        error_message = f"### Error during evaluation\n```\n{error_trace}\n```"
        
        # Re-enable UI components on error
        return (error_message,                 # results_output
                None,                          # results_table
                gr.update(interactive=True),   # eval_mmlu_button
                gr.update(visible=False),      # cancel_mmlu_button
                gr.update(interactive=True),   # subject_selection_mode
                gr.update(interactive=True),   # num_subjects_slider
                gr.update(interactive=True),   # specific_subjects
                gr.update(interactive=True),   # num_shots_slider
                gr.update(interactive=True),   # all_questions_checkbox
                gr.update(interactive=True),   # num_questions_slider
                gr.update(visible=False))      # results_table_container
        
# ---------------------------------------------------------------------------
# 3. Gradio Interface
# ---------------------------------------------------------------------------
with gr.Blocks(css="""
    #preview_header {
        margin-bottom: 10px;
        margin-top: 5px;
    }
    #preview_table {
        background-color: #f8f9fa;
        border-radius: 8px;
        padding: 10px;
    }
    h1 {
        text-align: center;
    }
    .section-spacing {
        margin-top: 30px;
        margin-bottom: 30px;
    }
    .config-box {
        border: 1px solid #ddd;
        border-radius: 8px;
        padding: 15px;
        margin: 10px;
        background-color: #f9f9f9;
    }
""") as demo:
    gr.Markdown("# Head-to-Head Model Evaluation Comparator")
    gr.Markdown("""
    This demo evaluates two models (or one model with two different configs), head-to-head, on a benchmark dataset.
    
    Available Datasets: [MMLU-Pro](https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro)
    
    Available Models: [Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
    """)
    
    # Dataset Selection Section
    gr.Markdown("## (A) Select Dataset for Evaluation", elem_classes=["section-spacing"])
    
    with gr.Row():
        dataset_dropdown = gr.Dropdown(
            choices=["(Select Dataset)", "MMLU-Pro"],
            value="(Select Dataset)",
            label="Dataset",
            info="Select a dataset to perform the Head-to-Head Evaluation on. Available Datasets: [MMLU-Pro](https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro)"
        )
        preview_toggle = gr.Button("Show Dataset Preview", interactive=False, variant="secondary")
    
    # Dataset Preview Container - Initially hidden
    with gr.Column(visible=False) as dataset_preview_container:
        gr.Markdown("## Dataset Preview", elem_id="preview_header")
        preview_output = gr.DataFrame(
            interactive=False,
            wrap=True,
            elem_id="preview_table"
        )
        # Add vertical space after the preview
        gr.Markdown(" ")
        gr.Markdown(" ")
    
    # Add more spacing between sections
    gr.Markdown(" ", elem_classes=["section-spacing"])
    gr.Markdown(" ", elem_classes=["section-spacing"])
    
    # MMLU Config Container - Initially hidden until dataset is selected
    with gr.Column(visible=False) as mmlu_config_container:
        gr.Markdown("## (B) Select Dataset Configuration Options", elem_classes=["section-spacing"])
        
        # Add more spacing
        gr.Markdown(" ")
        
        with gr.Row():
            # Left column for subject selection
            with gr.Column(scale=1):
                with gr.Group(elem_classes=["config-box"]):  # gr.Group: gr.Box was removed in Gradio 4
                    gr.Markdown("### Choose Subjects")
                    
                    subject_selection_mode = gr.Radio(
                        choices=["Evaluate All Subjects", "Choose Number of Subjects", "Specify which Subjects to Evaluate"],
                        value="Evaluate All Subjects",
                        label="Subject Selection Mode"
                    )
                    
                    # Subject number slider - initially hidden, shown when "Choose Number of Subjects" is selected
                    with gr.Column(visible=False) as num_subjects_container:
                        num_subjects_slider = gr.Slider(
                            minimum=1,
                            maximum=14,
                            value=14,
                            step=1,
                            label="Number of Subjects",
                            info="Number of subjects to evaluate (1-14). They will be loaded in alphabetical order."
                        )
                    
                    # Subject checkboxes - initially hidden, shown when "Specify which Subjects to Evaluate" is selected
                    with gr.Column(visible=False) as specific_subjects_container:
                        # We'll populate this with checkboxes for each subject
                        # The actual subjects will come from the dataset preview
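                        # NOTE: the choices below are placeholders; MMLU-Pro has 14
                        # subjects, so the real names and counts should come from the dataset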
                        specific_subjects = gr.CheckboxGroup(
                            choices=[
                                "Biology (n=717)",
                                "Chemistry (n=500)",
                                "Physics (n=650)",
                                "Mathematics (n=800)",
                                "Computer Science (n=450)",
                                "History (n=300)",
                                "Literature (n=250)"
                            ],
                            label="Select Specific Subjects",
                            info="Select which specific subjects to evaluate"
                        )
            
            # Right column for few-shot examples
            with gr.Column(scale=1):
                with gr.Group(elem_classes=["config-box"]):  # gr.Group: gr.Box was removed in Gradio 4
                    gr.Markdown("### Few-shot Configuration")
                    
                    num_shots_slider = gr.Slider(
                        minimum=0,
                        maximum=5,
                        value=5,
                        step=1,
                        label="Number of Few-shot Examples",
                        info="Number of examples to use for few-shot learning (0-5)."
                    )
        
        # Add spacing
        gr.Markdown(" ")
        
        with gr.Row():
            all_questions_checkbox = gr.Checkbox(
                label="Evaluate All Questions",
                value=False,
                info="When checked, evaluates all available questions for each subject"
            )
            questions_info_text = gr.Markdown(visible=False, value="**All 12,032 questions across all subjects will be evaluated**")
        
        with gr.Row(elem_id="questions_selection_row"):
            questions_container = gr.Column(scale=1, elem_id="questions_slider_container")
        
        with questions_container:
            num_questions_slider = gr.Slider(
                minimum=1,
                maximum=100,
                value=20,
                step=1,
                label="Questions per Subject",
                info="Choose a subset of questions (1-100) per subject. They will be loaded in order of question_id.",
                interactive=True
            )
        
        with gr.Row():
            with gr.Column(scale=1):
                eval_mmlu_button = gr.Button("Run MMLU-Pro Evaluation", variant="primary", interactive=True)
                cancel_mmlu_button = gr.Button("Cancel Evaluation", variant="stop", visible=False)
    
    # Results Section - Initially hidden
    with gr.Column(visible=False) as results_container:
        results_output = gr.Markdown(label="Evaluation Results")
        
        # Results table - Initially hidden until evaluation completes
        with gr.Column(visible=False) as results_table_container:
            with gr.Row():
                results_table = gr.DataFrame(
                    interactive=True, 
                    label="Detailed Results (Sortable)",
                    visible=True
                )
    
    # Track evaluation state 
    evaluation_state = gr.State({"running": False})
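    # gr.State is scoped to the browser session, so this flag only guards against
    # duplicate launches from the same tab, not across concurrent users.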
    
    # Track preview visibility state
    preview_visibility = gr.State(False)
    
    # Function to show/hide configuration based on selected dataset
    def update_interface_based_on_dataset(dataset, current_visibility):
        if dataset == "MMLU-Pro":
            return (
                gr.update(visible=True),   # mmlu_config_container
                gr.update(visible=True),   # results_container
                gr.update(interactive=True, value="Show Dataset Preview"),  # preview_toggle: enable and reset label
                gr.update(visible=False),  # dataset_preview_container - hide it initially
                False                      # Reset preview_visibility to False
            )
        else:
            return (
                gr.update(visible=False),  # mmlu_config_container
                gr.update(visible=False),  # results_container
                gr.update(interactive=False, value="Show Dataset Preview"),  # preview_toggle: disable and reset label
                gr.update(visible=False),  # dataset_preview_container - hide when no dataset
                False                      # Reset preview_visibility to False
            )
    
    # Connect dataset dropdown to show/hide appropriate configuration
    dataset_dropdown.change(
        fn=update_interface_based_on_dataset,
        inputs=[dataset_dropdown, preview_visibility],
        outputs=[mmlu_config_container, results_container, preview_toggle, dataset_preview_container, preview_visibility]
    )
    
    # Function to toggle dataset preview visibility
    def toggle_preview(dataset, preview_visibility):
        # Toggle the visibility state
        is_visible = not preview_visibility
        
        # Update button text based on new state
        button_text = "Hide Dataset Preview" if is_visible else "Show Dataset Preview"
        
        # Get preview data if becoming visible
        if is_visible and dataset == "MMLU-Pro":
            preview_data = mmlupro_dataset_preview(regenerate_preview=False)  # Set regenerate_preview=True to rebuild the cached preview
            formatted_preview = format_preview_for_display(preview_data)
            return is_visible, gr.update(visible=True), formatted_preview, gr.update(value=button_text)
        elif is_visible:
            # For other datasets (not implemented yet)
            return is_visible, gr.update(visible=True), None, gr.update(value=button_text)
        else:
            # Hiding the preview
            return is_visible, gr.update(visible=False), None, gr.update(value=button_text)
    
    # Connect preview toggle to show/hide dataset information
    preview_toggle.click(
        fn=toggle_preview,
        inputs=[dataset_dropdown, preview_visibility],
        outputs=[preview_visibility, dataset_preview_container, preview_output, preview_toggle]
    )
    
    # Function to update UI based on subject selection mode
    def update_subject_selection_ui(mode):
        if mode == "Evaluate All Subjects":
            return gr.update(visible=False), gr.update(visible=False)
        elif mode == "Choose Number of Subjects":
            return gr.update(visible=True), gr.update(visible=False)
        else:  # "Specify which Subjects to Evaluate"
            return gr.update(visible=False), gr.update(visible=True)
    
    # Connect subject selection mode to UI updates
    subject_selection_mode.change(
        fn=update_subject_selection_ui,
        inputs=[subject_selection_mode],
        outputs=[num_subjects_container, specific_subjects_container]
    )
    
    # Update interface based on all_questions checkbox
    def update_questions_interface(checked):
        if checked:
            return gr.update(visible=False), gr.update(visible=True)
        else:
            return gr.update(visible=True), gr.update(visible=False)
    
    all_questions_checkbox.change(
        fn=update_questions_interface,
        inputs=[all_questions_checkbox],
        outputs=[questions_container, questions_info_text]
    )
    
    # Function to convert subject selection mode to parameters
    def get_subject_mode_param(mode):
        if mode == "Evaluate All Subjects":
            return "all"
        elif mode == "Choose Number of Subjects":
            return "number"
        else:  # "Specify which Subjects to Evaluate"
            return "specific"
    
    # Function to extract subject names from checkboxes
    def get_subject_names(selected_subjects):
        # Extract just the subject name without the count
        return [subject.split(" (")[0] for subject in selected_subjects]
    
    # Function to disable UI components during evaluation
    def start_evaluation(state):
        if state["running"]:
            return [
                state,
                gr.update(interactive=False),
                gr.update(interactive=False),
                gr.update(interactive=False),
                gr.update(interactive=False),
                gr.update(interactive=False),
                gr.update(interactive=False),
                gr.update(visible=True),
                "Evaluation already in progress. Please wait.",
                None,
                gr.update(visible=False)
            ]
        
        # Update state to running
        state["running"] = True
        
        return [
            state,
            gr.update(interactive=False),  # subject_selection_mode
            gr.update(interactive=False),  # num_subjects_slider
            gr.update(interactive=False),  # specific_subjects
            gr.update(interactive=False),  # num_shots_slider
            gr.update(interactive=False),  # all_questions_checkbox
            gr.update(interactive=False),  # num_questions_slider
            gr.update(interactive=False),  # eval_mmlu_button
            gr.update(visible=True),       # cancel_mmlu_button
            "Starting evaluation...",      # results_output
            None,                          # results_table
            gr.update(visible=False)       # results_table_container
        ]
    
    # Function to reset UI after evaluation
    def finish_evaluation(state):
        state["running"] = False
        return state
    
    # Function to handle cancel button click
    def cancel_evaluation(state):
        # Note: This doesn't actually stop the evaluation process
        # It only updates the UI state to appear canceled
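        # A stronger approach would be to pass cancels=[...] to eval_mmlu_button.click()
        # below so that Gradio's queue aborts the pending run as well.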
        state["running"] = False
        return [
            state,
            gr.update(interactive=True),  # subject_selection_mode
            gr.update(interactive=True),  # num_subjects_slider
            gr.update(interactive=True),  # specific_subjects
            gr.update(interactive=True),  # num_shots_slider
            gr.update(interactive=True),  # all_questions_checkbox
            gr.update(interactive=True),  # num_questions_slider
            gr.update(interactive=True),  # eval_mmlu_button
            gr.update(visible=False),     # cancel_mmlu_button
            "⚠️ Evaluation canceled by user (note: backend process may continue running)", # results_output
            None,                         # results_table
            gr.update(visible=False)      # results_table_container
        ]
    
    # Connect MMLU evaluation button with state tracking
    eval_mmlu_button.click(
        fn=start_evaluation,
        inputs=[evaluation_state],
        outputs=[
            evaluation_state,
            subject_selection_mode,
            num_subjects_slider,
            specific_subjects,
            num_shots_slider,
            all_questions_checkbox,
            num_questions_slider,
            eval_mmlu_button,
            cancel_mmlu_button,
            results_output,
            results_table,
            results_table_container
        ]
    ).then(
        fn=lambda mode, num, subjects, shots, all_q, num_q: 
            run_mmlu_evaluation(
                get_subject_mode_param(mode),
                num,
                get_subject_names(subjects),
                shots,
                all_q,
                num_q
            ),
        inputs=[
            subject_selection_mode,
            num_subjects_slider,
            specific_subjects,
            num_shots_slider,
            all_questions_checkbox,
            num_questions_slider
        ],
        outputs=[
            results_output,
            results_table,
            eval_mmlu_button,
            cancel_mmlu_button,
            subject_selection_mode,
            num_subjects_slider,
            specific_subjects,
            num_shots_slider,
            all_questions_checkbox,
            num_questions_slider,
            results_table_container
        ]
    ).then(
        fn=finish_evaluation,
        inputs=[evaluation_state],
        outputs=[evaluation_state]
    )
    
    # Connect cancel button
    cancel_mmlu_button.click(
        fn=cancel_evaluation,
        inputs=[evaluation_state],
        outputs=[
            evaluation_state,
            subject_selection_mode,
            num_subjects_slider,
            specific_subjects,
            num_shots_slider,
            all_questions_checkbox,
            num_questions_slider,
            eval_mmlu_button,
            cancel_mmlu_button,
            results_output,
            results_table,
            results_table_container
        ]
    )

demo.launch()
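
# Note (runtime assumption): gr.Progress updates and event cancellation rely on
# Gradio's request queue, which Gradio 4 enables by default; on Gradio 3, call
# demo.queue() before launch().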