# H2H-eval-comparator

# Sleeping

# File size: 2,759 Bytes

import gradio as gr

# Stylesheet for the application, kept as a single CSS string.
# Selectors beginning with "#" target element ids, those beginning with "."
# target classes, and the bare "h1" rule centres top-level headings.
# NOTE(review): presumably handed to the Gradio app (e.g. gr.Blocks(css=CSS))
# and matched against elem_id / elem_classes set elsewhere -- that wiring is
# not visible in this part of the file; confirm against the caller.
CSS = """
#preview_header {
    margin-bottom: 10px;
    margin-top: 5px;
}
#preview_table {
    background-color: #f8f9fa;
    border-radius: 8px;
    padding: 10px;
}
h1 {
    text-align: center;
}
.section-divider {
    border-top: 1px solid #ddd;
    margin: 12px 0;
}
.config-box {
    border: 1px solid #ddd;
    border-radius: 8px;
    padding: 15px;
    margin: 10px;
    background-color: #f9f9f9;
}
.center-divider {
    display: flex;
    justify-content: center;
    height: 100%;
}
.error-message {
    color: #d32f2f;
    background-color: #ffebee;
    padding: 10px;
    border-radius: 4px;
    margin: 10px 0;
}
"""

def create_header():
    """
    Build the application's title and description Markdown components.

    The title component is instantiated first and the description second.

    Returns:
        list: Two-element list, ``[title, description]``, both
            ``gr.Markdown`` instances.
    """
    # Body of the description; kept verbatim, including its indentation.
    intro_text = """
    This demo / proof of concept evaluates two models (or one model with two different configs), head-to-head, on a benchmark dataset.
    
    Available Datasets: [MMLU-Pro](https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro)
    Available Models: [Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
    """

    heading = gr.Markdown("# Head-to-Head Model Evaluation Comparator")
    blurb = gr.Markdown(intro_text)
    return [heading, blurb]

def create_results_section():
    """
    Build the results area: a Markdown output plus a detailed results table.

    Both the outer column and the nested column wrapping the table are
    created with ``visible=False``. A ``gr.State`` initialised to ``None`` is
    created alongside them for temporary data.

    Returns:
        dict: Components keyed by ``'container'``, ``'output'``,
            ``'table_container'``, ``'table'`` and ``'tmp_data'``.
    """
    components = {}

    with gr.Column(visible=False) as outer_column:
        components['container'] = outer_column
        components['output'] = gr.Markdown(label="Evaluation Results")

        # The table lives in its own hidden column, wrapped in a row.
        with gr.Column(visible=False) as table_column, gr.Row():
            components['table_container'] = table_column
            components['table'] = gr.DataFrame(
                interactive=True,
                label="Detailed Results (Sortable)",
                visible=True,
            )

    # State component for temporary data; starts out empty (None).
    components['tmp_data'] = gr.State(None)

    return components


def create_action_buttons():
    """
    Build the buttons that start and cancel an evaluation.

    The "Run Evaluation" button is created non-interactive, and the
    "Cancel Evaluation" button is created hidden. Both sit in a single
    column inside one row.

    Returns:
        dict: ``{'eval_button': ..., 'cancel_button': ...}`` mapping to the
            two ``gr.Button`` components.
    """
    with gr.Row(), gr.Column(scale=1):
        run_btn = gr.Button("Run Evaluation", variant="primary", interactive=False)
        stop_btn = gr.Button("Cancel Evaluation", variant="stop", visible=False)

    return dict(eval_button=run_btn, cancel_button=stop_btn)