Spaces:

akera
/

leaderboard

Running

File size: 17,293 Bytes

# src/plotting.py
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import pandas as pd
import numpy as np
import json
from collections import defaultdict
from typing import Dict, List, Optional, Union
from config import (
    LANGUAGE_NAMES,
    ALL_UG40_LANGUAGES,
    GOOGLE_SUPPORTED_LANGUAGES,
    METRICS_CONFIG,
    EVALUATION_TRACKS,
    MODEL_CATEGORIES,
    CHART_CONFIG,
)


def create_leaderboard_plot(
    df: pd.DataFrame, track: str, metric: str = "quality", top_n: int = 15
) -> go.Figure:
    """Create a horizontal leaderboard bar chart with confidence intervals.

    Args:
        df: Leaderboard table. Reads the ``{track}_{metric}`` column and,
            when present, ``{track}_ci_lower``, ``{track}_ci_upper`` and
            ``{track}_samples``, plus ``model_name``, ``model_category`` and
            ``author``. The frame is never modified; numeric coercion is
            done on a private copy.
        track: Track key; used as the column prefix and to look up the
            display name in ``EVALUATION_TRACKS``.
        metric: Metric suffix of the score column.
        top_n: Maximum number of rows plotted.

    Returns:
        The bar chart, or a figure carrying only a text message when there
        is nothing to plot. Errors raised while building the chart are
        printed and reported inside the returned figure instead of being
        propagated.

    Notes:
        Rows are taken in the order supplied; nothing is sorted here.
        NOTE(review): presumably the caller passes ``df`` ranked best-first
        - confirm.
        NOTE(review): the interval columns are not metric-specific, so for a
        metric other than "quality" they presumably describe a different
        score - confirm.
    """

    def _message_figure(text: str) -> go.Figure:
        # Paper coordinates centre the message in the figure regardless of
        # whatever axis range Plotly picks for a figure without traces.
        placeholder = go.Figure()
        placeholder.add_annotation(
            text=text,
            xref="paper", yref="paper",
            x=0.5, y=0.5, showarrow=False,
        )
        return placeholder

    if df.empty:
        fig = go.Figure()
        fig.add_annotation(
            text="No models available for this track",
            xref="paper", yref="paper",
            x=0.5, y=0.5, showarrow=False,
            font=dict(size=16)
        )
        fig.update_layout(
            title=f"No Data Available - {track.title()} Track",
            paper_bgcolor="rgba(0,0,0,0)",
            plot_bgcolor="rgba(0,0,0,0)"
        )
        return fig

    try:
        metric_col = f"{track}_{metric}"
        ci_lower_col = f"{track}_ci_lower"
        ci_upper_col = f"{track}_ci_upper"

        if metric_col not in df.columns:
            return _message_figure(f"Metric {metric} not available for {track} track")

        # Coerce on a copy so the caller's DataFrame is left untouched.
        data = df.copy()
        data[metric_col] = pd.to_numeric(data[metric_col], errors="coerce").fillna(0.0)
        # Interval bounds keep NaN for "missing": filling them with 0 would
        # later be drawn as a whisker reaching down to zero.
        for col in (ci_lower_col, ci_upper_col):
            if col in data.columns:
                data[col] = pd.to_numeric(data[col], errors="coerce")

        # Keep rows with a positive score, in the order supplied.
        valid_models = data[data[metric_col] > 0].head(top_n)

        if valid_models.empty:
            return _message_figure("No valid models found")

        row_count = len(valid_models)
        scores = valid_models[metric_col]

        # One bar colour per row, grey for categories without a configured colour.
        colors = [
            MODEL_CATEGORIES.get(cat, {}).get("color", "#808080")
            for cat in valid_models["model_category"]
        ]

        # Error bars only when both interval columns exist.
        error_x = None
        if ci_lower_col in valid_models.columns and ci_upper_col in valid_models.columns:
            # Error-bar lengths must be non-negative; a missing or inverted
            # bound collapses to a zero-length whisker on that side.
            upper_len = (valid_models[ci_upper_col] - scores).clip(lower=0).fillna(0.0)
            lower_len = (scores - valid_models[ci_lower_col]).clip(lower=0).fillna(0.0)
            error_x = dict(
                type="data",
                array=upper_len.tolist(),
                arrayminus=lower_len.tolist(),
                visible=True,
                thickness=2,
                width=4,
            )

        # Scores are already floats here, so formatting cannot fail.
        text_values = [f"{score:.3f}" for score in scores]

        # Hover data is best-effort: each column falls back independently so
        # one missing column does not wipe out the others.
        try:
            samples_col = f"{track}_samples"
            if samples_col in valid_models.columns:
                samples = (
                    pd.to_numeric(valid_models[samples_col], errors="coerce")
                    .fillna(0)
                    .astype(int)
                    .tolist()
                )
            else:
                samples = [0] * row_count

            if "author" in valid_models.columns:
                authors = valid_models["author"].fillna("Anonymous").tolist()
            else:
                authors = ["Anonymous"] * row_count

            categories = valid_models["model_category"].fillna("unknown").tolist()
            customdata = list(zip(categories, authors, samples))
        except Exception as e:
            print(f"Error preparing custom data: {e}")
            customdata = [("unknown", "Anonymous", 0)] * row_count

        fig = go.Figure()
        fig.add_trace(go.Bar(
            y=valid_models["model_name"],
            x=scores,
            orientation="h",
            marker=dict(color=colors, line=dict(color="black", width=0.5)),
            error_x=error_x,
            text=text_values,
            textposition="auto",
            hovertemplate=(
                "<b>%{y}</b><br>" +
                f"{metric.title()}: %{{x:.4f}}<br>" +
                "Category: %{customdata[0]}<br>" +
                "Author: %{customdata[1]}<br>" +
                "Samples: %{customdata[2]}<br>" +
                "<extra></extra>"
            ),
            customdata=customdata,
        ))

        # Customize layout; the height grows with the number of bars.
        track_info = EVALUATION_TRACKS[track]
        fig.update_layout(
            title=f"🏆 {track_info['name']} - {metric.title()} Score",
            xaxis_title=f"{metric.title()} Score (with 95% CI)",
            yaxis_title="Models",
            height=max(400, row_count * 35 + 100),
            margin=dict(l=20, r=20, t=60, b=20),
            paper_bgcolor="rgba(0,0,0,0)",
            plot_bgcolor="rgba(0,0,0,0)",
            font=dict(size=12),
        )

        # Reverse the y-axis so the first row is drawn at the top.
        fig.update_yaxes(autorange="reversed")

        return fig

    except Exception as e:
        # Top-level boundary for a UI callback: report inside the figure.
        print(f"Error creating leaderboard plot: {e}")
        return _message_figure(f"Error creating plot: {str(e)}")


def create_language_pair_heatmap(
    model_results: Dict, track: str, metric: str = "quality_score"
) -> go.Figure:
    """Create a source-by-target heatmap of one metric for a single model.

    Args:
        model_results: Result dict for one model. Reads
            ``model_results["tracks"][track]["pair_metrics"]``, whose keys
            have the form ``"{src}_to_{tgt}"`` and whose values map metric
            names to dicts containing a ``"mean"`` entry.
        track: Track key; selects the track's results and its language list
            and display name in ``EVALUATION_TRACKS``.
        metric: Metric name looked up inside each pair entry.

    Returns:
        The heatmap, or a figure carrying only a text message when the
        results, the track, or its pair metrics are unavailable. Cells for
        the diagonal and for pairs without a mean value are left empty (NaN).
    """

    def _message_figure(text: str) -> go.Figure:
        # Paper coordinates centre the message in the figure regardless of
        # whatever axis range Plotly picks for a figure without traces.
        placeholder = go.Figure()
        placeholder.add_annotation(
            text=text,
            xref="paper", yref="paper",
            x=0.5, y=0.5, showarrow=False,
        )
        return placeholder

    if not model_results or "tracks" not in model_results:
        return _message_figure("No model results available")

    track_data = model_results["tracks"].get(track, {})
    if (
        track_data.get("error")
        or "pair_metrics" not in track_data
        or track not in EVALUATION_TRACKS  # unknown track: nothing to lay out
    ):
        return _message_figure(f"No data available for {track} track")

    pair_metrics = track_data["pair_metrics"]
    track_info = EVALUATION_TRACKS[track]
    track_languages = track_info["languages"]

    # Square matrix indexed [source, target]; NaN marks "no value".
    n_langs = len(track_languages)
    matrix = np.full((n_langs, n_langs), np.nan)

    for i, src_lang in enumerate(track_languages):
        for j, tgt_lang in enumerate(track_languages):
            if src_lang == tgt_lang:
                continue
            pair_stats = pair_metrics.get(f"{src_lang}_to_{tgt_lang}") or {}
            metric_stats = pair_stats.get(metric)
            # Tolerate entries that carry the metric but no "mean".
            mean_value = metric_stats.get("mean") if isinstance(metric_stats, dict) else None
            if mean_value is not None:
                matrix[i, j] = mean_value

    # Human-readable axis labels, falling back to the upper-cased code.
    lang_labels = [LANGUAGE_NAMES.get(lang, lang.upper()) for lang in track_languages]
    metric_label = metric.replace('_', ' ').title()

    fig = go.Figure(data=go.Heatmap(
        z=matrix,
        x=lang_labels,
        y=lang_labels,
        colorscale="Viridis",
        showscale=True,
        colorbar=dict(
            # Nested title form: the flat ``titleside`` attribute is
            # deprecated and rejected by newer Plotly releases.
            title=dict(text=f"{metric_label}", side="right"),
            len=0.8,
        ),
        hovertemplate=(
            "Source: %{y}<br>" +
            "Target: %{x}<br>" +
            f"{metric_label}: %{{z:.3f}}<br>" +
            "<extra></extra>"
        ),
        zmin=0,
        # Only the quality score gets a fixed upper bound; other metrics
        # let Plotly choose it from the data.
        zmax=1 if metric == "quality_score" else None,
    ))

    # Customize layout
    fig.update_layout(
        title=f"🗺️ {track_info['name']} - {metric_label} by Language Pair",
        xaxis_title="Target Language",
        yaxis_title="Source Language",
        height=600,
        width=700,
        font=dict(size=12),
        xaxis=dict(side="bottom"),
        yaxis=dict(autorange="reversed"),
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(0,0,0,0)",
    )

    return fig


def create_performance_comparison_plot(df: pd.DataFrame, track: str) -> go.Figure:
    """Create a dot-and-interval plot of quality scores with confidence intervals.

    Args:
        df: Leaderboard table. Reads ``{track}_quality``,
            ``{track}_ci_lower``, ``{track}_ci_upper``, ``model_name`` and
            ``model_category``. The frame is never modified; numeric
            coercion is done on a private copy.
        track: Track key; used as the column prefix and to look up the
            display name in ``EVALUATION_TRACKS``.

    Returns:
        A figure with one marker (score) and one horizontal line (interval)
        per model, for at most the first 10 rows that have a positive score
        and both interval bounds. A figure carrying only a text message is
        returned when there is nothing to plot. Errors raised while building
        the chart are printed and reported inside the returned figure.
    """

    def _message_figure(text: str) -> go.Figure:
        # Paper coordinates centre the message in the figure regardless of
        # whatever axis range Plotly picks for a figure without traces.
        placeholder = go.Figure()
        placeholder.add_annotation(
            text=text,
            xref="paper", yref="paper",
            x=0.5, y=0.5, showarrow=False,
        )
        return placeholder

    if df.empty:
        return _message_figure("No data available")

    try:
        metric_col = f"{track}_quality"
        ci_lower_col = f"{track}_ci_lower"
        ci_upper_col = f"{track}_ci_upper"

        # Without all three columns there is no interval to draw.
        if any(col not in df.columns for col in (metric_col, ci_lower_col, ci_upper_col)):
            return _message_figure("No models with confidence intervals")

        # Coerce on a copy so the caller's DataFrame is left untouched.
        data = df.copy()
        data[metric_col] = pd.to_numeric(data[metric_col], errors="coerce").fillna(0.0)
        # Interval bounds keep NaN for "missing" so the notna() filter below
        # really excludes models without an interval.
        for col in (ci_lower_col, ci_upper_col):
            data[col] = pd.to_numeric(data[col], errors="coerce")

        has_interval = data[ci_lower_col].notna() & data[ci_upper_col].notna()
        valid_models = data[(data[metric_col] > 0) & has_interval].head(10)

        if valid_models.empty:
            return _message_figure("No models with confidence intervals")

        fig = go.Figure()

        # Labels are collected only for rows that are actually plotted, so
        # tick positions and labels cannot drift apart when a row is skipped.
        tick_labels = []
        for row_number, (_, model) in enumerate(valid_models.iterrows()):
            try:
                category = str(model["model_category"])
                model_name = str(model["model_name"])
                quality_val = float(model[metric_col])
                ci_lower_val = float(model[ci_lower_col])
                ci_upper_val = float(model[ci_upper_col])
            except (KeyError, TypeError, ValueError) as e:
                print(f"Error adding model {row_number} to comparison plot: {e}")
                continue

            color = MODEL_CATEGORIES.get(category, {}).get("color", "#808080")
            position = len(tick_labels)
            tick_labels.append(model_name)

            # Main point
            fig.add_trace(go.Scatter(
                x=[quality_val],
                y=[position],
                mode="markers",
                marker=dict(
                    size=12,
                    color=color,
                    line=dict(color="black", width=1),
                ),
                name=model_name,
                showlegend=False,
                hovertemplate=(
                    f"<b>{model_name}</b><br>" +
                    f"Quality: {quality_val:.4f}<br>" +
                    f"95% CI: [{ci_lower_val:.4f}, {ci_upper_val:.4f}]<br>" +
                    f"Category: {category}<br>" +
                    "<extra></extra>"
                ),
            ))

            # Confidence interval line
            fig.add_trace(go.Scatter(
                x=[ci_lower_val, ci_upper_val],
                y=[position, position],
                mode="lines",
                line=dict(color=color, width=3),
                showlegend=False,
                hoverinfo="skip",
            ))

        # Customize layout; the height grows with the number of plotted rows.
        track_info = EVALUATION_TRACKS[track]
        fig.update_layout(
            title=f"📊 {track_info['name']} - Performance Comparison",
            xaxis_title="Quality Score",
            yaxis_title="Models",
            height=max(400, len(tick_labels) * 40 + 100),
            yaxis=dict(
                tickmode="array",
                tickvals=list(range(len(tick_labels))),
                ticktext=tick_labels,
                autorange="reversed",  # first row at the top
            ),
            showlegend=False,
            paper_bgcolor="rgba(0,0,0,0)",
            plot_bgcolor="rgba(0,0,0,0)",
        )

        return fig

    except Exception as e:
        # Top-level boundary for a UI callback: report inside the figure.
        print(f"Error creating performance comparison plot: {e}")
        return _message_figure(f"Error creating plot: {str(e)}")


def create_language_pair_comparison_plot(pairs_df: pd.DataFrame, track: str) -> go.Figure:
    """Build a two-row grouped bar chart comparing models per language pair.

    The top subplot shows the 'Quality Score' column and the bottom subplot
    the 'BLEU' column of ``pairs_df``; both share the x-axis of language
    pairs. Each model contributes one bar trace per subplot, coloured by the
    model's 'Category' via ``MODEL_CATEGORIES`` (grey when no colour is
    configured). The chart title uses the track name from
    ``EVALUATION_TRACKS[track]``.

    A figure containing only a text message is returned when ``pairs_df`` is
    empty or yields no language pairs or models.
    """

    def _notice(message: str) -> go.Figure:
        # Figure that carries nothing but a short text message.
        blank = go.Figure()
        blank.add_annotation(
            text=message,
            x=0.5, y=0.5, showarrow=False
        )
        return blank

    if pairs_df.empty:
        return _notice("No language pair data available")

    # Distinct language pairs and models, alphabetically ordered.
    pair_names = sorted(pairs_df['Language Pair'].unique())
    model_names = sorted(pairs_df['Model'].unique())

    if not pair_names or not model_names:
        return _notice("Insufficient data for comparison")

    # One subplot row per metric.
    fig = make_subplots(
        rows=2, cols=1,
        subplot_titles=('Quality Score by Language Pair', 'BLEU Score by Language Pair'),
        vertical_spacing=0.1,
        shared_xaxes=True
    )

    # (data column, which is also the hover label; y number format;
    #  subplot row; whether this trace owns the legend entry)
    panels = (
        ('Quality Score', '.4f', 1, True),
        ('BLEU', '.2f', 2, False),
    )

    for model_name in model_names:
        rows_for_model = pairs_df[pairs_df['Model'] == model_name]
        if rows_for_model.empty:
            category = 'community'
        else:
            category = rows_for_model['Category'].iloc[0]
        bar_color = MODEL_CATEGORIES.get(category, {}).get('color', '#808080')

        for column, y_format, subplot_row, in_legend in panels:
            hover_text = "".join([
                f"<b>{model_name}</b><br>",
                "Language Pair: %{x}<br>",
                f"{column}: %{{y:{y_format}}}<br>",
                f"Category: {category}<br>",
                "<extra></extra>",
            ])
            bars = go.Bar(
                name=model_name,
                x=rows_for_model['Language Pair'],
                y=rows_for_model[column],
                marker_color=bar_color,
                opacity=0.8,
                # Both traces of a model share a legend group, so toggling
                # the legend entry affects both subplots.
                legendgroup=model_name,
                showlegend=in_legend,
                hovertemplate=hover_text,
            )
            fig.add_trace(bars, row=subplot_row, col=1)

    # Overall layout: grouped bars, horizontal legend above the plot area.
    track_name = EVALUATION_TRACKS[track]['name']
    legend_style = dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    )
    fig.update_layout(
        title=f"📊 {track_name} - Language Pair Performance Comparison",
        height=800,
        barmode='group',
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(0,0,0,0)",
        legend=legend_style,
    )

    # Tilt the language-pair labels on the bottom axis and name both y-axes.
    fig.update_xaxes(tickangle=45, row=2, col=1)
    fig.update_yaxes(title_text="Quality Score", row=1, col=1)
    fig.update_yaxes(title_text="BLEU Score", row=2, col=1)

    return fig


def create_category_comparison_plot(df: pd.DataFrame, track: str) -> go.Figure:
    """Create a box plot of quality scores grouped by model category.

    Args:
        df: Leaderboard table. Reads ``{track}_quality``, ``model_category``
            and ``model_name``. The frame is never modified; numeric
            coercion is done on a private copy.
        track: Track key; used as the column prefix and to look up the
            display name in ``EVALUATION_TRACKS``.

    Returns:
        A figure with one box per category in ``MODEL_CATEGORIES`` that has
        at least one model with a positive score; models whose category is
        not listed there are not drawn. A figure carrying only a text
        message is returned when ``df`` is empty, lacks the required
        columns, or has no positive scores.

    Raises:
        KeyError: If ``track`` is not a key of ``EVALUATION_TRACKS``.
    """

    def _message_figure(text: str) -> go.Figure:
        # Paper coordinates centre the message in the figure regardless of
        # whatever axis range Plotly picks for a figure without traces.
        placeholder = go.Figure()
        placeholder.add_annotation(
            text=text,
            xref="paper", yref="paper",
            x=0.5, y=0.5, showarrow=False,
        )
        return placeholder

    if df.empty:
        return _message_figure("No data available")

    metric_col = f"{track}_quality"

    # Without these columns there is nothing to group or plot.
    if metric_col not in df.columns or "model_category" not in df.columns:
        return _message_figure("No valid models found")

    # Coerce on a copy so string-typed scores compare correctly and the
    # caller's DataFrame is left untouched; unparseable values become NaN
    # and are dropped by the ``> 0`` filter.
    data = df.copy()
    data[metric_col] = pd.to_numeric(data[metric_col], errors="coerce")
    valid_models = data[data[metric_col] > 0]

    if valid_models.empty:
        return _message_figure("No valid models found")

    fig = go.Figure()

    # One box per configured category that has at least one model.
    for category, info in MODEL_CATEGORIES.items():
        category_models = valid_models[valid_models["model_category"] == category]

        if len(category_models) > 0:
            fig.add_trace(go.Box(
                y=category_models[metric_col],
                name=info["name"],
                marker_color=info["color"],
                boxpoints="all",  # Show all points
                jitter=0.3,
                pointpos=-1.8,
                hovertemplate=(
                    f"<b>{info['name']}</b><br>" +
                    "Quality: %{y:.4f}<br>" +
                    "Model: %{customdata}<br>" +
                    "<extra></extra>"
                ),
                customdata=category_models["model_name"],
            ))

    # Customize layout
    track_info = EVALUATION_TRACKS[track]
    fig.update_layout(
        title=f"📈 {track_info['name']} - Performance by Category",
        xaxis_title="Model Category",
        yaxis_title="Quality Score",
        height=500,
        showlegend=False,
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(0,0,0,0)",
    )

    return fig