evalita_llm_leaderboard

Running

File size: 25,265 Bytes

ad05cd8

import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
from functools import lru_cache
import logging

from src.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT, \
    LLM_BENCHMARKS_TEXT, TITLE
from src.tasks import TASK_DESCRIPTIONS, MEASURE_DESCRIPTION
from src.display.css_html_js import custom_css
from src.display.utils import BENCHMARK_COLS, COLS, EVAL_COLS, EVAL_TYPES, AutoEvalColumn, ModelType, fields, \
    WeightType, Precision
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval
import matplotlib.pyplot as plt
import re
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

# Logging: INFO level on the root logger, plus a logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# EVALITA results, keyed by task code. Passed to create_boxplot_task as the
# per-task baseline values.
BASELINES = {
    "TE": 71.00,
    "SA": 66.38,
    "HS": 80.88,
    "AT": 82.40,
    "WIC": 85.00,
    "LS": 38.82,
    "SU": 38.91,
    "NER": 88.00,
    "REL": 62.99,
}

# GPT-4o results, keyed by task code. Passed to create_boxplot_task as the
# per-task reference values.
REFERENCES = {
    "NER": 79.11,
    "REL": 63.32,
    "LS": 59.25,
    "SU": 33.04,
}

# Display metadata (tab icon, full name, tooltip) for the multiple-choice tasks.
TASK_METADATA_MULTIPLECHOICE = {
    "TE": {
        "icon": "📊",
        "name": "Textual Entailment",
        "tooltip": "",
    },
    "SA": {
        "icon": "😃",
        "name": "Sentiment Analysis",
        "tooltip": "",
    },
    "HS": {
        "icon": "⚠️",
        "name": "Hate Speech",
        "tooltip": "",
    },
    "AT": {
        "icon": "🏥",
        "name": "Admission Test",
        "tooltip": "",
    },
    "WIC": {
        "icon": "🔤",
        "name": "Word in Context",
        "tooltip": "",
    },
    "FAQ": {
        "icon": "❓",
        "name": "Frequently Asked Questions",
        "tooltip": "",
    },
}

# Display metadata (tab icon, full name, tooltip) for the generative tasks.
TASK_METADATA_GENERATIVE = {
    "LS": {
        "icon": "🔄",
        "name": "Lexical Substitution",
        "tooltip": "",
    },
    "SU": {
        "icon": "📝",
        "name": "Summarization",
        "tooltip": "",
    },
    "NER": {
        "icon": "🏷️",
        "name": "Named Entity Recognition",
        "tooltip": "",
    },
    "REL": {
        "icon": "🔗",
        "name": "Relation Extraction",
        "tooltip": "",
    },
}

def highlight_best_per_task(df):
    """Return a copy of ``df`` with task scores formatted and the best ones flagged.

    For every known task column present in ``df``, each value is rendered as a
    string with one decimal place, and every value equal to the column maximum
    gets a trailing 🔺 marker (ties are all marked). Columns that are not task
    columns are left untouched, and the input frame is not modified.

    Args:
        df: DataFrame whose task columns hold numeric scores.

    Returns:
        A new DataFrame in which the task columns contain formatted strings.
    """
    task_columns = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]

    df = df.copy()
    for col in task_columns:
        if col not in df.columns:
            continue
        max_val = df[col].max()
        # Bind the maximum as a default argument so the lambda never depends on
        # the loop variable's later value.
        df[col] = df[col].apply(
            lambda x, best=max_val: f"{x:.1f}🔺" if x == best else f"{x:.1f}"
        )
    return df

def theoretical_performance(df_hash):
    """Return the theoretical score of a model that is best on every task.

    This is currently a stub: ``df_hash`` is accepted but not used, and a fixed
    stand-in value is returned instead of a figure computed from leaderboard
    data.
    """
    # TODO: compute this from the real leaderboard dataframe instead of
    # returning a hard-coded stand-in.
    placeholder_score = 75.0
    return placeholder_score


def scale_sizes(values, min_size=8, max_size=30):
    """Map ``values`` linearly onto marker sizes in ``[min_size, max_size]``.

    An empty input yields an empty list. When all values are equal, every
    marker gets the midpoint between ``min_size`` and ``max_size``.
    """
    if not values:
        return []

    lowest = min(values)
    highest = max(values)

    if highest == lowest:
        midpoint = (min_size + max_size) / 2
        return [midpoint] * len(values)

    value_span = highest - lowest
    size_span = max_size - min_size
    sizes = []
    for item in values:
        sizes.append(min_size + (item - lowest) / value_span * size_span)
    return sizes


def extract_model_name(model_string):
    """Return the text of the first HTML element in ``model_string``.

    The first run of characters found between a ``>`` and the next ``<`` is
    returned. If there is no such run, the input is returned unchanged.
    """
    found = re.search(r'>([^<]+)<', model_string)
    if found is None:
        return model_string
    return found.group(1)


def create_line_chart(dataframe):
    """Build the scatter chart of average combined performance vs. model size.

    Two marker traces are drawn, split on the ``IS_FS`` column: 5-shot rows in
    blue and 0-shot rows in red. Marker sizes grow with ``#Params (B)``, the
    hover label shows the model name, and the row with the highest average
    combined performance is annotated with its name.

    Args:
        dataframe: DataFrame with the columns ``IS_FS``, ``#Params (B)``,
            ``Avg. Comb. Perf. ⬆️`` and ``Model``.

    Returns:
        The assembled ``plotly.graph_objects.Figure``.
    """
    params_col = "#Params (B)"
    perf_col = "Avg. Comb. Perf. ⬆️"

    def display_name(model):
        # Model cells normally hold an HTML anchor; anything else is shown as
        # plain text instead of raising.
        return extract_model_name(model) if isinstance(model, str) else str(model)

    fig = go.Figure()

    # One trace for 5-shot rows and one for 0-shot rows
    for shot, color in [(True, "blue"), (False, "red")]:
        subset = dataframe[dataframe["IS_FS"] == shot]

        x = subset[params_col].tolist()
        y = subset[perf_col].tolist()
        labels = [display_name(m) for m in subset["Model"].tolist()]

        fig.add_trace(go.Scatter(
            x=x,
            y=y,
            mode="markers",
            name="5-Shot" if shot else "0-Shot",
            # The module-level scale_sizes copes with an empty group, so a
            # leaderboard containing only one evaluation mode still renders.
            marker=dict(color=color, size=scale_sizes(x)),
            hovertemplate="<b>%{customdata}</b><br>#Params: %{x}<br>Performance: %{y}<extra></extra>",
            customdata=labels,
        ))

    # Annotate the best model
    all_y = dataframe[perf_col].tolist()
    if all_y:
        max_idx = all_y.index(max(all_y))
        max_x = dataframe[params_col].iloc[max_idx]
        max_y = all_y[max_idx]
        max_label = display_name(dataframe["Model"].iloc[max_idx])

        fig.add_annotation(
            x=max_x,
            y=max_y,
            text=max_label,
            showarrow=True,
            arrowhead=2,
            arrowsize=1,
            arrowwidth=2,
            arrowcolor="black",
            font=dict(size=11, color="black"),
            xshift=10, yshift=10,
            ax=-30, ay=-20,
            xanchor="right"
        )

    # Layout
    fig.update_layout(
        title="Average Combined Performance vs #Params",
        xaxis_title="#Params (B)", yaxis_title="Average Combined Performance",
        template="plotly_white", hovermode="closest",
        font=dict(family="Arial", size=10), dragmode=False,
        # ticktext must supply one label per entry in tickvals
        xaxis=dict(tickvals=[0, 25, 50, 75, 100, 125],
                   ticktext=["0", "25", "50", "75", "100", "125"]),
        yaxis=dict(tickvals=[0, 20, 40, 60, 80, 100], range=[0, 100])
    )

    # Caption
    fig.add_annotation(
        text="Accuracy generally rises with #Params, but smaller models <br>"
             "with 5-shot can outperform larger zero-shot models.",
        xref="paper", yref="paper", x=0.5, y=-0.3,
        showarrow=False, font=dict(size=11, color="gray"),
        align="center", xanchor="center"
    )

    # Disable zooming/panning on both axes
    fig.update_xaxes(fixedrange=True, rangeslider_visible=False)
    fig.update_yaxes(fixedrange=True)

    return fig


def create_boxplot_task(dataframe=None, baselines=None, references=None):
    """Build the per-task box plot of model scores.

    One box is drawn per task column found in ``dataframe``. For each plotted
    task, a dotted black line marks its baseline value and a dash-dot red line
    marks its reference value, when those are provided.

    Args:
        dataframe: DataFrame with one numeric column per task code. When
            ``None``, random demo data is generated (this reseeds NumPy's
            global random generator with 42).
        baselines: Mapping of task code to baseline score. When ``None``,
            random integer values are generated for every task.
        references: Mapping of task code to reference score. When ``None``,
            no reference lines are drawn.

    Returns:
        The assembled ``plotly.graph_objects.Figure``.
    """
    tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]

    # Default data when none is provided
    if dataframe is None:
        np.random.seed(42)
        dataframe = pd.DataFrame({task: np.random.uniform(0.4, 0.9, 20) * 100 for task in tasks})

    if baselines is None:
        baselines = {task: np.random.randint(50, 70) for task in tasks}

    if references is None:
        references = {}

    colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd",
              "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"]

    fig = go.Figure()

    # Boxes sit on a categorical x axis in the order they are added, so the
    # overlay lines must use the running count of plotted boxes. Using the
    # index into `tasks` would shift every line after a skipped task.
    position = 0

    for i, task in enumerate(tasks):
        if task not in dataframe.columns:
            continue

        y_data = dataframe[task].dropna().tolist()
        if not y_data:
            # Nothing to draw for this task, so it takes no slot on the axis.
            continue

        # Box plot (colour stays tied to the task, not to its position)
        fig.add_trace(go.Box(
            y=y_data,
            name=task,
            marker=dict(color=colors[i]),
            line=dict(color="black", width=2),
            fillcolor=colors[i],
            opacity=0.7,
            hovertemplate="<b>"+task+"</b><br>Accuracy: %{y:.2f}%<extra></extra>",
            hoverlabel=dict(bgcolor=colors[i], font_color="white"),
            width=0.6,
            whiskerwidth=0.2,
            quartilemethod="linear"
        ))

        # Baseline line
        baseline_value = baselines.get(task)
        if baseline_value is not None:
            fig.add_shape(
                type="line",
                x0=position - 0.3, x1=position + 0.3,
                y0=baseline_value, y1=baseline_value,
                line=dict(color="black", width=2, dash="dot"),
                xref="x", yref="y"
            )

        # GPT-4o reference line
        reference_value = references.get(task)
        if reference_value is not None:
            fig.add_shape(
                type="line",
                x0=position - 0.3, x1=position + 0.3,
                y0=reference_value, y1=reference_value,
                line=dict(color="red", width=2, dash="dashdot"),
                xref="x", yref="y"
            )

        position += 1

    # Layout
    fig.update_layout(
        title="Distribution of Model Accuracy by Task",
        xaxis_title="Task",
        yaxis_title="Combined Performance",
        template="plotly_white",
        boxmode="group",
        dragmode=False,
        font=dict(family="Arial", size=10),
        margin=dict(b=80),
    )

    # Caption
    fig.add_annotation(
        text=(
            "In tasks like TE and SA, models approach the accuracy of supervised <br>"
            "models at EVALITA (dashed black line); in NER and REL they remain lower. <br>"
            "Dashed red lines show GPT-4o reference results for generative tasks."
        ),
        xref="paper", yref="paper",
        x=0.5, y=-0.30,
        showarrow=False,
        font=dict(size=11, color="gray"),
        align="center"
    )

    # Fixed 0-100 score axis; zooming/panning disabled on both axes
    fig.update_yaxes(range=[0, 100], fixedrange=True)
    fig.update_xaxes(fixedrange=True)

    return fig


def create_medal_assignments(sorted_df):
    """Return the ``Model`` values of ``sorted_df`` with medals appended.

    Rows are visited in order. The first row seen for each combination of
    evaluation mode (``IS_FS`` truthy = few-shot, falsy = 0-shot) and size
    marker (``Size`` equal to 🔵🔵🔵, 🔵🔵 or 🔵) gets `` <size><medal>``
    appended to its name: 🏆 for few-shot rows, 🎖️ for 0-shot rows. All other
    names are returned unchanged.

    Args:
        sorted_df: DataFrame with ``Model``, ``Size`` and ``IS_FS`` columns,
            already ordered so that the row deserving each medal comes first.

    Returns:
        A list of model names, one per row, in row order.
    """
    size_tiers = ("🔵🔵🔵", "🔵🔵", "🔵")
    awarded_slots = set()  # (few_shot, size) pairs that already have a medal
    decorated_names = []

    for _, entry in sorted_df.iterrows():
        label = entry['Model']
        tier = entry["Size"]
        few_shot = bool(entry['IS_FS'])

        if tier in size_tiers and (few_shot, tier) not in awarded_slots:
            medal = "🏆" if few_shot else "🎖️"
            label = f"{label} {tier}{medal}"
            awarded_slots.add((few_shot, tier))

        decorated_names.append(label)

    return decorated_names


def create_leaderboard_base(sorted_dataframe, field_list, hidden_columns):
    """Build a read-only ``Leaderboard`` component with the shared settings.

    Args:
        sorted_dataframe: The data to display.
        field_list: Column descriptors; each one's ``type`` attribute becomes
            the datatype of the corresponding column.
        hidden_columns: Column names passed as ``hide_columns``.

    Returns:
        The configured ``Leaderboard``, searchable by model name and filterable
        by few-shot symbol and by parameter count (0-100 slider).
    """
    column_types = [column.type for column in field_list]

    shot_filter = ColumnFilter(
        AutoEvalColumn.fewshot_symbol.name,
        type="checkboxgroup",
        label="N-Shot Learning (FS)",
    )
    params_filter = ColumnFilter(
        AutoEvalColumn.params.name,
        type="slider",
        min=0,
        max=100,
        default=[0, 100],
        label="Select the number of parameters (B)",
    )

    return Leaderboard(
        value=sorted_dataframe,
        datatype=column_types,
        search_columns=[AutoEvalColumn.model.name],
        hide_columns=hidden_columns,
        filter_columns=[shot_filter, params_filter],
        bool_checkboxgroup_label="Evaluation Mode",
        interactive=False,
    )


def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
    """Build the main leaderboard component from ``dataframe``.

    Rows are ordered by ``Avg. Comb. Perf. ⬆️`` (best first), given a 1-based
    ``Rank``, decorated with medals, and have their best per-task scores
    highlighted before the component is created.

    Args:
        dataframe: Leaderboard data; must be non-empty.
        default_selection: Accepted for interface compatibility; not used here.
        hidden_columns: Column names to hide in the component.

    Raises:
        ValueError: If ``dataframe`` is ``None`` or empty.
    """
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    # Best average first, with a fresh 0..n-1 index so Rank is simply index + 1
    ranked = dataframe.sort_values(by="Avg. Comb. Perf. ⬆️", ascending=False)
    ranked = ranked.reset_index(drop=True)
    ranked["Rank"] = ranked.index + 1

    # Medals first, then per-task highlighting (which turns scores into strings)
    ranked["Model"] = create_medal_assignments(ranked)
    ranked = highlight_best_per_task(ranked)

    return create_leaderboard_base(ranked, fields(AutoEvalColumn), hidden_columns)


def update_task_leaderboard(dataframe, default_selection=None, hidden_columns=None):
    """Build the leaderboard component for a single task.

    Rows are ordered by ``Comb. Perf. ⬆️`` (best first), given a 1-based
    ``Rank`` and decorated with medals before the component is created.

    Args:
        dataframe: Task data containing a ``Comb. Perf. ⬆️`` column; must be
            non-empty.
        default_selection: Accepted for interface compatibility; not used here.
        hidden_columns: Column names to hide in the component.

    Raises:
        ValueError: If ``dataframe`` is ``None`` or empty.
    """
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    # Best task score first, with a fresh 0..n-1 index so Rank is index + 1
    ranked = dataframe.sort_values(by="Comb. Perf. ⬆️", ascending=False)
    ranked = ranked.reset_index(drop=True)
    ranked["Rank"] = ranked.index + 1

    ranked["Model"] = create_medal_assignments(ranked)

    # One datatype per declared column, plus one trailing int entry
    column_types = [column.type for column in fields(AutoEvalColumn)]
    column_types = column_types + [int]

    shot_filter = ColumnFilter(
        AutoEvalColumn.fewshot_symbol.name,
        type="checkboxgroup",
        label="N-Shot Learning (FS)",
    )
    params_filter = ColumnFilter(
        AutoEvalColumn.params.name,
        type="slider",
        min=0,
        max=100,
        default=[0, 100],
        label="Select the number of parameters (B)",
    )

    return Leaderboard(
        value=ranked,
        datatype=column_types,
        search_columns=[AutoEvalColumn.model.name],
        hide_columns=hidden_columns,
        filter_columns=[shot_filter, params_filter],
        bool_checkboxgroup_label="Evaluation Mode",
        interactive=False
    )


def download_snapshot(repo, local_dir, max_retries=3):
    """Download a dataset repository snapshot, retrying on failure.

    Args:
        repo: Repository id passed to ``snapshot_download``.
        local_dir: Destination directory for the snapshot.
        max_retries: Maximum number of attempts.

    Returns:
        ``True`` as soon as one attempt succeeds, ``False`` if every attempt
        raised (or if ``max_retries`` allows no attempt at all).
    """
    for attempt_no in range(1, max_retries + 1):
        try:
            logger.info(f"Downloading from {repo} to {local_dir} (attempt {attempt_no}/{max_retries})")
            snapshot_download(
                repo_id=repo,
                local_dir=local_dir,
                repo_type="dataset",
                tqdm_class=None,
                etag_timeout=30,
                token=TOKEN
            )
        except Exception as err:
            logger.error(f"Error downloading {repo} (attempt {attempt_no}): {err}")
            if attempt_no == max_retries:
                # That was the last allowed attempt
                logger.error(f"Failed to download {repo} after {max_retries} attempts")
        else:
            return True

    return False


def restart_space():
    """Ask the Hugging Face API to restart this space.

    Any exception raised by the request is logged and suppressed, so a failed
    restart never propagates to the caller.
    """
    try:
        logger.info("Restarting space... ")
        API.restart_space(repo_id=REPO_ID)
    except Exception as err:
        # Best effort: record the failure and carry on
        logger.error(f"Error restarting space: {err}")


def create_title_html():
    """Return the static HTML for the page header.

    The markup is a flex container holding the gradient-styled
    "EVALITA-LLM Leaderboard" heading and, pinned to the right edge, a link
    (with an inline SVG icon) to the Open Italian LLM Leaderboard space that
    opens in a new tab.
    """
    return """

    <div style="display: flex; align-items: center; position: relative; width: 100%; height: 60px; padding: 10px 0;">

        <h1 style="

            margin: 0 auto; 

            font-weight: 900; 

            font-size: 2.5em; 

            letter-spacing: 2px; 

            text-transform: uppercase; 

            background: linear-gradient(90deg, #1f77b4, #00c6ff); 

            -webkit-background-clip: text; 

            -webkit-text-fill-color: transparent; 

            text-shadow: 2px 2px 8px rgba(0,0,0,0.2);

        ">

            EVALITA-LLM Leaderboard

        </h1>

        <a href="https://huggingface.co/spaces/mii-llm/open_ita_llm_leaderboard" target="_blank" 

           style="position: absolute; right: 0; display: inline-flex; align-items: center; gap: 6px; text-decoration: none; color: #1f77b4; font-weight: 600;">

            <svg xmlns="http://www.w3.org/2000/svg" width="22" height="22" fill="#1f77b4" viewBox="0 0 24 24">

                <path d="M3.9 12a5 5 0 0 1 7.07-7.07l1.41 1.41-1.41 1.41-1.42-1.42a3 3 0 1 0 4.24 4.24l3.54-3.54a5 5 0 0 1-7.07 7.07l-1.41-1.41 1.41-1.41 1.42 1.42z"/>

                <path d="M20.1 12a5 5 0 0 1-7.07 7.07l-1.41-1.41 1.41-1.41 1.42 1.42a3 3 0 1 0-4.24-4.24l-3.54 3.54a5 5 0 0 1 7.07-7.07l1.41 1.41-1.41 1.41-1.42-1.42z"/>

            </svg>

            Open Italian LLM Leaderboard

        </a>

    </div>

    """


def create_credits_markdown():
    """Return the static Markdown for the credits section.

    The text is a bulleted list acknowledging the codebase the leaderboard is
    based on, the project funding, and the computing resources used.
    """
    return """

**This project has benefited from the following support:**



- 🧠 **Codebase**: Based on and extended from the Open Italian LLM Leaderboard, developed by **Alessandro Ercolani** and **Samuele Colombo**. We warmly thank them for their invaluable support and guidance in implementing this leaderboard.



- 💶 **Funding**: Partially supported by the PNRR project **FAIR - Future AI Research (PE00000013)**, under the NRRP MUR program funded by **NextGenerationEU**.



- 🖥️ **Computation**: We gratefully acknowledge **CINECA** for granting access to the **LEONARDO** supercomputer.

"""


# Main initialization
def initialize_app():
    """Download the evaluation data and load every dataframe the UI needs.

    Both the queue and results snapshots are downloaded first; the leaderboard
    and evaluation-queue dataframes are then loaded from the local copies, and
    the theoretical maximum performance is computed.

    Returns:
        A 5-tuple ``(leaderboard_df, finished_eval_queue_df,
        running_eval_queue_df, pending_eval_queue_df, theoretical_max)``.
        If a download fails or any step raises, all five items are ``None``.
    """
    failure = (None, None, None, None, None)

    try:
        # Download snapshots
        queue_success = download_snapshot(QUEUE_REPO, EVAL_REQUESTS_PATH)
        results_success = download_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)

        if not (queue_success and results_success):
            logger.error("Failed to download required data")
            return failure

        # Load leaderboard data
        leaderboard_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
        finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(
            EVAL_REQUESTS_PATH, EVAL_COLS)

        # Calculate theoretical max performance
        theoretical_max = theoretical_performance(hash(str(leaderboard_df.values.tobytes())))

        return leaderboard_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df, theoretical_max

    except Exception as e:
        # Top-level boundary: the app must still start, but keep the traceback
        # in the log so the cause of a failed start-up can be diagnosed.
        logger.exception(f"Error initializing app: {e}")
        return failure


# Initialize data
# Runs at import time: downloads the snapshots and loads all dataframes.
# Every name is None when initialization failed.
LEADERBOARD_DF, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df, theoretical_max_combined_perf = initialize_app()

if LEADERBOARD_DF is None:
    # Fallback behavior
    # Give the theoretical maximum a numeric value so later code never sees
    # None for it; the UI skips every data-dependent section in this case.
    logger.error("Failed to initialize app data")
    theoretical_max_combined_perf = 0.0


def _add_task_tabs(task_metadata):
    """Add one tab per task in ``task_metadata`` with its description and leaderboard."""
    visible_columns = ['Rank', 'Size', 'FS', 'Model', 'Comb. Perf. ⬆️', 'Prompt Average',
                       'Prompt Std', 'Best Prompt', 'Best Prompt Id']
    # Computed from the un-renamed frame, exactly as each tab did individually
    hidden_columns = [col for col in LEADERBOARD_DF.columns if col not in visible_columns]

    for task, metadata in task_metadata.items():
        with gr.TabItem(f"{metadata['icon']}{task}"):
            task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
            gr.Markdown(task_description, elem_classes="markdown-text")

            # Map the task-specific columns onto the generic names the
            # per-task leaderboard expects.
            update_task_leaderboard(
                LEADERBOARD_DF.rename(columns={
                    f"{task} Prompt Average": "Prompt Average",
                    f"{task} Prompt Std": "Prompt Std",
                    f"{task} Best Prompt": "Best Prompt",
                    f"{task} Best Prompt Id": "Best Prompt Id",
                    task: "Comb. Perf. ⬆️"
                }),
                default_selection=list(visible_columns),
                hidden_columns=list(hidden_columns)
            )


def create_gradio_interface():
    """Assemble the complete Gradio UI and return it.

    The page consists of the title and introduction, the two overview charts,
    a tab strip (overall benchmark, about page, then one tab per
    multiple-choice task and one per generative task, with non-interactive
    separator tabs in between), and collapsible citation and credits
    sections. Everything that needs leaderboard data is skipped when
    ``LEADERBOARD_DF`` is ``None``.

    Returns:
        The ``gr.Blocks`` application.
    """
    benchmark_columns = ['Rank', 'Size', 'FS', 'Model', "Avg. Comb. Perf. ⬆️", "TE", "SA", "HS", "AT",
                         "WIC", "FAQ", "LS", "SU", "NER", "REL"]

    demo = gr.Blocks(css=custom_css)

    with demo:
        # Title
        gr.HTML(create_title_html())
        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

        # Charts section
        with gr.Row():
            if LEADERBOARD_DF is not None:
                gr.Plot(value=create_line_chart(LEADERBOARD_DF), elem_id="line-chart")
                gr.Plot(value=create_boxplot_task(LEADERBOARD_DF, BASELINES, REFERENCES), elem_id="boxplot-task")

        # Tabs
        with gr.Tabs(elem_classes="tab-buttons"):
            # Main leaderboard tab
            with gr.TabItem("🏅 Benchmark"):
                if LEADERBOARD_DF is not None:
                    init_leaderboard(
                        LEADERBOARD_DF,
                        default_selection=list(benchmark_columns),
                        hidden_columns=[col for col in LEADERBOARD_DF.columns if
                                        col not in benchmark_columns]
                    )

                    gr.HTML(
                        f"""

                        <div style="

                            border: 2px solid #1f77b4;

                            border-radius: 10px;

                            padding: 10px;

                            background-color: #f0f8ff;

                            font-weight: bold;

                            font-size: 14px;

                            display: inline-block;

                        ">

                            Theoretical performance of a model that scores the highest on every individual task: 

                            <span style="color:#d62728; font-size:18px;">{theoretical_max_combined_perf:.2f}</span>

                        </div>

                        """
                    )

            # About tab
            with gr.TabItem("📝 About"):
                gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

            # Non-interactive tab used as a visual separator
            with gr.TabItem("║", interactive=False):
                gr.Markdown("", elem_classes="markdown-text")

            # Task-specific tabs
            if LEADERBOARD_DF is not None:
                # Multiple choice tasks
                _add_task_tabs(TASK_METADATA_MULTIPLECHOICE)

                # Non-interactive tab used as a visual separator
                with gr.TabItem("│", interactive=False):
                    gr.Markdown("", elem_classes="markdown-text")

                # Generative tasks
                _add_task_tabs(TASK_METADATA_GENERATIVE)

        # Citation and Credits sections
        with gr.Accordion("📙 Citation", open=False):
            gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True
            )

        with gr.Accordion("📙 Credits", open=False):
            gr.Markdown(create_credits_markdown())

    return demo


# Create and configure the demo
# Built at import time so `demo` exists as a module-level name.
demo = create_gradio_interface()

# Background scheduler for space restart
# restart_space runs every 1800 seconds (30 minutes) on a background thread.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()

# Launch configuration
# Only launch the server when this file is executed directly.
if __name__ == "__main__":
    demo.queue(default_concurrency_limit=40).launch(
        debug=True,
        show_error=True
    )