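"""Gradio app for configuring and running MMLU-Pro evaluations of mistralai/Mistral-7B-v0.1 on Hugging Face Spaces GPU hardware."""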
import gradio as gr
import os
from huggingface_hub import login
from mmlu_pro_eval_adapted import evaluate_mmlu_pro
import spaces
import pandas as pd
import time
import traceback
from dataset_previews import mmlupro_dataset_preview, format_preview_for_display
# Read token and login
hf_token = os.getenv("HF_READ_WRITE_TOKEN")
if hf_token:
login(hf_token)
else:
print("⚠️ No HF_READ_WRITE_TOKEN found in environment")
# ---------------------------------------------------------------------------
# 1. Model configuration
# ---------------------------------------------------------------------------
model_name = "mistralai/Mistral-7B-v0.1"
# ---------------------------------------------------------------------------
# 2. MMLU-Pro Evaluation
# ---------------------------------------------------------------------------
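# spaces.GPU requests a Spaces GPU (ZeroGPU) worker for up to 240 seconds per call;
# long full-subject runs may need a larger duration.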
@spaces.GPU(duration=240)
def run_mmlu_evaluation(subject_selection_mode, num_subjects, selected_subjects, num_shots, all_questions, num_questions, progress=gr.Progress()):
"""
Runs the MMLU evaluation with the specified parameters.
Args:
subject_selection_mode (str): Mode of subject selection ("all", "number", or "specific")
num_subjects (int): Number of subjects to evaluate (1-14)
selected_subjects (list): List of specific subjects to evaluate
num_shots (int): Number of few-shot examples (0-5)
all_questions (bool): Whether to evaluate all questions per subject
num_questions (int): Number of examples per subject (1-100 or all)
progress (gr.Progress): Progress indicator
"""
try:
# Convert parameters if needed
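        # Note: -1 is the sentinel passed to evaluate_mmlu_pro for "use everything"
        # (all subjects / all questions per subject).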
if subject_selection_mode == "all":
num_subjects = -1
selected_subjects = []
elif subject_selection_mode == "specific":
num_subjects = len(selected_subjects) if selected_subjects else -1
if all_questions:
num_questions = -1
# Run evaluation with timing
start_time = time.time()
results = evaluate_mmlu_pro(
model_name,
num_subjects=num_subjects,
num_questions=num_questions,
num_shots=num_shots,
specific_subjects=selected_subjects if subject_selection_mode == "specific" else None
)
elapsed_time = time.time() - start_time
# Format results
overall_acc = results["overall_accuracy"]
min_subject, min_acc = results["min_accuracy_subject"]
max_subject, max_acc = results["max_accuracy_subject"]
# Create DataFrame from results table
results_df = pd.DataFrame(results["full_accuracy_table"])
# Calculate totals for the overall row
total_samples = results_df['Num_samples'].sum()
total_correct = results_df['Num_correct'].sum()
# Create overall row
overall_row = pd.DataFrame({
'Subject': ['**Overall**'],
'Num_samples': [total_samples],
'Num_correct': [total_correct],
'Accuracy': [overall_acc]
})
# Concatenate overall row with results
results_df = pd.concat([overall_row, results_df], ignore_index=True)
# Format the report
report = (
f"### Overall Results\n"
f"* Overall Accuracy: {overall_acc:.3f}\n"
f"* Best Performance: {max_subject} ({max_acc:.3f})\n"
f"* Worst Performance: {min_subject} ({min_acc:.3f})\n"
f"* Evaluation completed in {elapsed_time:.2f} seconds\n"
)
# Return values that re-enable UI components after completion
return (report,
results_df,
gr.update(interactive=True),
gr.update(visible=False),
gr.update(interactive=True),
gr.update(interactive=True),
gr.update(interactive=True),
gr.update(interactive=True),
gr.update(interactive=True),
gr.update(visible=True))
    except Exception:
# Handle errors gracefully
error_trace = traceback.format_exc()
error_message = f"### Error during evaluation\n```\n{error_trace}\n```"
# Re-enable UI components on error
return (error_message,
None,
gr.update(interactive=True),
gr.update(visible=False),
gr.update(interactive=True),
gr.update(interactive=True),
gr.update(interactive=True),
gr.update(interactive=True),
gr.update(interactive=True),
gr.update(visible=False))
# ---------------------------------------------------------------------------
# 3. Gradio Interface
# ---------------------------------------------------------------------------
with gr.Blocks(css="""
#preview_header {
margin-bottom: 10px;
margin-top: 5px;
}
#preview_table {
background-color: #f8f9fa;
border-radius: 8px;
padding: 10px;
}
h1 {
text-align: center;
}
.section-spacing {
margin-top: 30px;
margin-bottom: 30px;
}
.config-box {
border: 1px solid #ddd;
border-radius: 8px;
padding: 15px;
margin: 10px;
background-color: #f9f9f9;
}
""") as demo:
gr.Markdown("# Head-to-Head Model Evaluation Comparator")
gr.Markdown("""
This demo evaluates two models (or one model with two different configs), head-to-head, on a benchmark dataset.
Available Datasets: [MMLU-Pro](https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro)
Available Models: [Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
""")
# Dataset Selection Section
gr.Markdown("## (A) Select Dataset for Evaluation", elem_classes=["section-spacing"])
with gr.Row():
dataset_dropdown = gr.Dropdown(
choices=["(Select Dataset)", "MMLU-Pro"],
value="(Select Dataset)",
label="Dataset",
info="Select a dataset to perform the Head-to-Head Evaluation on. Available Datasets: [MMLU-Pro](https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro)"
)
preview_toggle = gr.Button("Show Dataset Preview", interactive=False, variant="secondary")
# Dataset Preview Container - Initially hidden
with gr.Column(visible=False) as dataset_preview_container:
gr.Markdown("## Dataset Preview", elem_id="preview_header")
preview_output = gr.DataFrame(
interactive=False,
wrap=True,
elem_id="preview_table"
)
# Add vertical space after the preview
gr.Markdown(" ")
gr.Markdown(" ")
# Add more spacing between sections
gr.Markdown(" ", elem_classes=["section-spacing"])
gr.Markdown(" ", elem_classes=["section-spacing"])
# MMLU Config Container - Initially hidden until dataset is selected
with gr.Column(visible=False) as mmlu_config_container:
gr.Markdown("## (B) Select Dataset Configuration Options", elem_classes=["section-spacing"])
# Add more spacing
gr.Markdown(" ")
with gr.Row():
# Left column for subject selection
with gr.Column(scale=1):
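                # gr.Box is a Gradio 3.x layout; on Gradio 4+ these boxes would need gr.Group instead.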
with gr.Box(elem_classes=["config-box"]):
gr.Markdown("### Choose Subjects")
subject_selection_mode = gr.Radio(
choices=["Evaluate All Subjects", "Choose Number of Subjects", "Specify which Subjects to Evaluate"],
value="Evaluate All Subjects",
label="Subject Selection Mode"
)
# Subject number slider - initially hidden, shown when "Choose Number of Subjects" is selected
with gr.Column(visible=False) as num_subjects_container:
num_subjects_slider = gr.Slider(
minimum=1,
maximum=14,
value=14,
step=1,
label="Number of Subjects",
info="Number of subjects to evaluate (1-14). They will be loaded in alphabetical order."
)
# Subject checkboxes - initially hidden, shown when "Specify which Subjects to Evaluate" is selected
with gr.Column(visible=False) as specific_subjects_container:
# We'll populate this with checkboxes for each subject
# The actual subjects will come from the dataset preview
specific_subjects = gr.CheckboxGroup(
choices=[
"Biology (n=717)",
"Chemistry (n=500)",
"Physics (n=650)",
"Mathematics (n=800)",
"Computer Science (n=450)",
"History (n=300)",
"Literature (n=250)"
],
label="Select Specific Subjects",
info="Select which specific subjects to evaluate"
)
# Right column for few-shot examples
with gr.Column(scale=1):
with gr.Box(elem_classes=["config-box"]):
gr.Markdown("### Few-shot Configuration")
num_shots_slider = gr.Slider(
minimum=0,
maximum=5,
value=5,
step=1,
label="Number of Few-shot Examples",
info="Number of examples to use for few-shot learning (0-5)."
)
# Add spacing
gr.Markdown(" ")
with gr.Row():
all_questions_checkbox = gr.Checkbox(
label="Evaluate All Questions",
value=False,
info="When checked, evaluates all available questions for each subject"
)
questions_info_text = gr.Markdown(visible=False, value="**All 12,032 questions across all subjects will be evaluated**")
with gr.Row(elem_id="questions_selection_row"):
questions_container = gr.Column(scale=1, elem_id="questions_slider_container")
with questions_container:
num_questions_slider = gr.Slider(
minimum=1,
maximum=100,
value=20,
step=1,
label="Questions per Subject",
info="Choose a subset of questions (1-100) per subject. They will be loaded in order of question_id.",
interactive=True
)
with gr.Row():
with gr.Column(scale=1):
eval_mmlu_button = gr.Button("Run MMLU-Pro Evaluation", variant="primary", interactive=True)
cancel_mmlu_button = gr.Button("Cancel Evaluation", variant="stop", visible=False)
# Results Section - Initially hidden
with gr.Column(visible=False) as results_container:
results_output = gr.Markdown(label="Evaluation Results")
# Results table - Initially hidden until evaluation completes
with gr.Column(visible=False) as results_table_container:
with gr.Row():
results_table = gr.DataFrame(
interactive=True,
label="Detailed Results (Sortable)",
visible=True
)
# Track evaluation state
evaluation_state = gr.State({"running": False})
# Track preview visibility state
preview_visibility = gr.State(False)
# Function to show/hide configuration based on selected dataset
    def update_interface_based_on_dataset(dataset, current_visibility):
        if dataset == "MMLU-Pro":
            return (
                gr.update(visible=True),   # mmlu_config_container
                gr.update(visible=True),   # results_container
                gr.update(interactive=True, value="Show Dataset Preview"),  # preview_toggle: enable and reset text
                gr.update(visible=False),  # dataset_preview_container - hidden until the user asks for it
                False                      # reset preview_visibility
            )
        else:
            return (
                gr.update(visible=False),  # mmlu_config_container
                gr.update(visible=False),  # results_container
                gr.update(interactive=False, value="Show Dataset Preview"),  # preview_toggle: disable and reset text
                gr.update(visible=False),  # dataset_preview_container
                False                      # reset preview_visibility
            )
    # Connect dataset dropdown to show/hide appropriate configuration
    dataset_dropdown.change(
        fn=update_interface_based_on_dataset,
        inputs=[dataset_dropdown, preview_visibility],
        outputs=[mmlu_config_container, results_container, preview_toggle, dataset_preview_container, preview_visibility]
    )
# Function to toggle dataset preview visibility
def toggle_preview(dataset, preview_visibility):
# Toggle the visibility state
is_visible = not preview_visibility
# Update button text based on new state
button_text = "Hide Dataset Preview" if is_visible else "Show Dataset Preview"
# Get preview data if becoming visible
if is_visible and dataset == "MMLU-Pro":
            preview_data = mmlupro_dataset_preview(regenerate_preview=False)  # Set regenerate_preview=True to regenerate the preview.
formatted_preview = format_preview_for_display(preview_data)
return is_visible, gr.update(visible=True), formatted_preview, gr.update(value=button_text)
elif is_visible:
# For other datasets (not implemented yet)
return is_visible, gr.update(visible=True), None, gr.update(value=button_text)
else:
# Hiding the preview
return is_visible, gr.update(visible=False), None, gr.update(value=button_text)
# Connect preview toggle to show/hide dataset information
preview_toggle.click(
fn=toggle_preview,
inputs=[dataset_dropdown, preview_visibility],
outputs=[preview_visibility, dataset_preview_container, preview_output, preview_toggle]
)
# Function to update UI based on subject selection mode
def update_subject_selection_ui(mode):
if mode == "Evaluate All Subjects":
return gr.update(visible=False), gr.update(visible=False)
elif mode == "Choose Number of Subjects":
return gr.update(visible=True), gr.update(visible=False)
else: # "Specify which Subjects to Evaluate"
return gr.update(visible=False), gr.update(visible=True)
# Connect subject selection mode to UI updates
subject_selection_mode.change(
fn=update_subject_selection_ui,
inputs=[subject_selection_mode],
outputs=[num_subjects_container, specific_subjects_container]
)
# Update interface based on all_questions checkbox
def update_questions_interface(checked):
if checked:
return gr.update(visible=False), gr.update(visible=True)
else:
return gr.update(visible=True), gr.update(visible=False)
all_questions_checkbox.change(
fn=update_questions_interface,
inputs=[all_questions_checkbox],
outputs=[questions_container, questions_info_text]
)
# Function to convert subject selection mode to parameters
def get_subject_mode_param(mode):
if mode == "Evaluate All Subjects":
return "all"
elif mode == "Choose Number of Subjects":
return "number"
else: # "Specify which Subjects to Evaluate"
return "specific"
# Function to extract subject names from checkboxes
def get_subject_names(selected_subjects):
# Extract just the subject name without the count
return [subject.split(" (")[0] for subject in selected_subjects]
# Function to disable UI components during evaluation
def start_evaluation(state):
if state["running"]:
return [
state,
gr.update(interactive=False),
gr.update(interactive=False),
gr.update(interactive=False),
gr.update(interactive=False),
gr.update(interactive=False),
gr.update(interactive=False),
gr.update(visible=True),
"Evaluation already in progress. Please wait.",
None,
gr.update(visible=False)
]
# Update state to running
state["running"] = True
return [
state,
gr.update(interactive=False), # subject_selection_mode
gr.update(interactive=False), # num_subjects_slider
gr.update(interactive=False), # specific_subjects
gr.update(interactive=False), # num_shots_slider
gr.update(interactive=False), # all_questions_checkbox
gr.update(interactive=False), # num_questions_slider
gr.update(interactive=False), # eval_mmlu_button
gr.update(visible=True), # cancel_mmlu_button
"Starting evaluation...", # results_output
None, # results_table
gr.update(visible=False) # results_table_container
]
# Function to reset UI after evaluation
def finish_evaluation(state):
state["running"] = False
return state
# Function to handle cancel button click
def cancel_evaluation(state):
# Note: This doesn't actually stop the evaluation process
# It only updates the UI state to appear canceled
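        # A hard stop would require passing cancels=[...] to the evaluation event listener.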
state["running"] = False
return [
state,
gr.update(interactive=True), # subject_selection_mode
gr.update(interactive=True), # num_subjects_slider
gr.update(interactive=True), # specific_subjects
gr.update(interactive=True), # num_shots_slider
gr.update(interactive=True), # all_questions_checkbox
gr.update(interactive=True), # num_questions_slider
gr.update(interactive=True), # eval_mmlu_button
gr.update(visible=False), # cancel_mmlu_button
"⚠️ Evaluation canceled by user (note: backend process may continue running)", # results_output
None, # results_table
gr.update(visible=False) # results_table_container
]
# Connect MMLU evaluation button with state tracking
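    # Note: because the second step wraps run_mmlu_evaluation in a lambda, Gradio cannot inject
    # its gr.Progress tracker into it; the progress argument falls back to its default instance.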
eval_mmlu_button.click(
fn=start_evaluation,
inputs=[evaluation_state],
outputs=[
evaluation_state,
subject_selection_mode,
num_subjects_slider,
specific_subjects,
num_shots_slider,
all_questions_checkbox,
num_questions_slider,
eval_mmlu_button,
cancel_mmlu_button,
results_output,
results_table,
results_table_container
]
).then(
fn=lambda mode, num, subjects, shots, all_q, num_q:
run_mmlu_evaluation(
get_subject_mode_param(mode),
num,
get_subject_names(subjects),
shots,
all_q,
num_q
),
inputs=[
subject_selection_mode,
num_subjects_slider,
specific_subjects,
num_shots_slider,
all_questions_checkbox,
num_questions_slider
],
outputs=[
results_output,
results_table,
eval_mmlu_button,
cancel_mmlu_button,
subject_selection_mode,
num_subjects_slider,
num_shots_slider,
all_questions_checkbox,
num_questions_slider,
results_table_container
]
).then(
fn=finish_evaluation,
inputs=[evaluation_state],
outputs=[evaluation_state]
)
# Connect cancel button
cancel_mmlu_button.click(
fn=cancel_evaluation,
inputs=[evaluation_state],
outputs=[
evaluation_state,
subject_selection_mode,
num_subjects_slider,
specific_subjects,
num_shots_slider,
all_questions_checkbox,
num_questions_slider,
eval_mmlu_button,
cancel_mmlu_button,
results_output,
results_table,
results_table_container
]
)
demo.launch()