Spaces:

akera
/

leaderboard

Sleeping

App Files Community

akera committed on Jun 16

Commit

23201ae

verified ·

1 Parent(s): ad7599c

Update src/plotting.py

Browse files

Files changed (1) hide show

src/plotting.py +632 -489

src/plotting.py CHANGED Viewed

@@ -1,8 +1,6 @@
 # src/plotting.py
 import matplotlib.pyplot as plt
 import matplotlib.gridspec as gridspec
-import matplotlib.colors as mcolors
-from colorsys import rgb_to_hls, hls_to_rgb
 import plotly.graph_objects as go
 import plotly.express as px
 from plotly.subplots import make_subplots
@@ -10,587 +8,732 @@ import pandas as pd
 import numpy as np
 from collections import defaultdict
 from typing import Dict, List, Optional, Union
-from config import LANGUAGE_NAMES, ALL_UG40_LANGUAGES, GOOGLE_SUPPORTED_LANGUAGES, METRICS_CONFIG
-plt.style.use('default')
-plt.rcParams['figure.facecolor'] = 'white'
-plt.rcParams['axes.facecolor'] = 'white'
-def create_leaderboard_ranking_plot(df: pd.DataFrame, metric: str = 'quality_score', top_n: int = 15) -> go.Figure:
-    """Create interactive leaderboard ranking plot using Plotly."""
     if df.empty:
         fig = go.Figure()
         fig.add_annotation(
-            text="No data available",
-            xref="paper", yref="paper",
-            x=0.5, y=0.5, showarrow=False,
-            font=dict(size=16)
         )
-        fig.update_layout(title="No Data Available")
         return fig
-    # Get top N models
-    top_models = df.head(top_n)
-    # Create horizontal bar chart
-    fig = go.Figure(data=[
         go.Bar(
-            y=top_models['model_name'],
-            x=top_models[metric],
-            orientation='h',
-            marker=dict(
-                color=top_models[metric],
-                colorscale='Viridis',
-                showscale=True,
-                colorbar=dict(title=metric.replace('_', ' ').title())
-            ),
-            text=[f"{score:.3f}" for score in top_models[metric]],
-            textposition='auto',
             hovertemplate=(
-                "<b>%{y}</b><br>" +
-                f"{metric.replace('_', ' ').title()}: %{{x:.4f}}<br>" +
-                "Author: %{customdata[0]}<br>" +
-                "Coverage: %{customdata[1]:.1%}<br>" +
-                "<extra></extra>"
             ),
-            customdata=list(zip(top_models['author'], top_models['coverage_rate']))
         )
-    ])
     fig.update_layout(
-        title=f"🏆 SALT Translation Leaderboard - {metric.replace('_', ' ').title()}",
-        xaxis_title=f"{metric.replace('_', ' ').title()} Score",
         yaxis_title="Models",
-        height=max(400, len(top_models) * 30 + 100),
         margin=dict(l=20, r=20, t=60, b=20),
-        plot_bgcolor='white',
-        paper_bgcolor='white'
     )
     # Reverse y-axis to show best model at top
     fig.update_yaxes(autorange="reversed")
     return fig
-def create_metrics_comparison_plot(df: pd.DataFrame, models: List[str] = None, max_models: int = 8) -> go.Figure:
-    """Create radar chart comparing multiple metrics across models."""
     if df.empty:
         fig = go.Figure()
         fig.add_annotation(text="No data available", x=0.5, y=0.5, showarrow=False)
-        fig.update_layout(title="No Data Available")
         return fig
-    # Select models to compare
-    if models is None:
-        selected_models = df.head(max_models)
-    else:
-        selected_models = df[df['model_name'].isin(models)].head(max_models)
-    if len(selected_models) == 0:
         fig = go.Figure()
-        fig.add_annotation(text="No models found", x=0.5, y=0.5, showarrow=False)
-        fig.update_layout(title="No Models Found")
         return fig
-    # Metrics to include in radar chart
-    metrics = ['quality_score', 'bleu', 'chrf', 'rouge1', 'rougeL']
-    metric_labels = ['Quality Score', 'BLEU (/100)', 'ChrF', 'ROUGE-1', 'ROUGE-L']
     fig = go.Figure()
-    colors = px.colors.qualitative.Set1[:len(selected_models)]
-    for i, (_, model) in enumerate(selected_models.iterrows()):
-        # Normalize BLEU to 0-1 scale for radar chart
-        values = []
-        for metric in metrics:
-            value = model[metric]
-            if metric == 'bleu':
-                value = value / 100.0  # Normalize BLEU
-            values.append(value)
-        # Close the radar chart
-        values += values[:1]
-        metric_labels_closed = metric_labels + [metric_labels[0]]
-        fig.add_trace(go.Scatterpolar(
-            r=values,
-            theta=metric_labels_closed,
-            fill='toself',
-            name=model['model_name'],
-            line_color=colors[i % len(colors)],
-            fillcolor=colors[i % len(colors)],
-            opacity=0.6
-        ))
-    fig.update_layout(
-        polar=dict(
-            radialaxis=dict(
-                visible=True,
-                range=[0, 1]
             )
         ),
-        showlegend=True,
-        title="📊 Multi-Metric Model Comparison",
-        height=600
     )
     return fig
-def create_language_pair_heatmap(results_dict: Dict, metric: str = 'quality_score') -> go.Figure:
-    """Create heatmap showing performance across language pairs."""
-    if not results_dict or 'pair_metrics' not in results_dict:
         fig = go.Figure()
-        fig.add_annotation(text="No language pair data available", x=0.5, y=0.5, showarrow=False)
-        fig.update_layout(title="No Language Pair Data Available")
         return fig
-    pair_metrics = results_dict['pair_metrics']
-    # Create matrix for heatmap
-    languages = ALL_UG40_LANGUAGES
-    matrix = np.zeros((len(languages), len(languages)))
-    for i, src_lang in enumerate(languages):
-        for j, tgt_lang in enumerate(languages):
-            if src_lang != tgt_lang:
-                pair_key = f"{src_lang}_to_{tgt_lang}"
-                if pair_key in pair_metrics and metric in pair_metrics[pair_key]:
-                    matrix[i, j] = pair_metrics[pair_key][metric]
-                else:
-                    matrix[i, j] = np.nan
-            else:
-                matrix[i, j] = np.nan
-    # Create language labels
-    lang_labels = [LANGUAGE_NAMES.get(lang, lang) for lang in languages]
-    fig = go.Figure(data=go.Heatmap(
-        z=matrix,
-        x=lang_labels,
-        y=lang_labels,
-        colorscale='Viridis',
-        showscale=True,
-        colorbar=dict(title=metric.replace('_', ' ').title()),
-        hovertemplate=(
-            "Source: %{y}<br>" +
-            "Target: %{x}<br>" +
-            f"{metric.replace('_', ' ').title()}: %{{z:.3f}}<br>" +
-            "<extra></extra>"
         )
-    ))
     fig.update_layout(
-        title=f"🗺️ Language Pair Performance - {metric.replace('_', ' ').title()}",
-        xaxis_title="Target Language",
-        yaxis_title="Source Language",
-        height=600,
-        width=700
     )
     return fig
-def create_coverage_analysis_plot(df: pd.DataFrame) -> go.Figure:
-    """Create plot analyzing test set coverage across submissions."""
     if df.empty:
         fig = go.Figure()
         fig.add_annotation(text="No data available", x=0.5, y=0.5, showarrow=False)
-        fig.update_layout(title="No Data Available")
         return fig
     fig = make_subplots(
-        rows=2, cols=2,
         subplot_titles=(
-            "Coverage Distribution",
-            "Language Pairs Covered",
-            "Sample Count vs Quality",
-            "Google Comparable Coverage"
         ),
-        specs=[[{"type": "bar"}, {"type": "scatter"}],
-               [{"type": "scatter"}, {"type": "bar"}]]
     )
-    # Coverage distribution
-    coverage_bins = pd.cut(df['coverage_rate'],
-                          bins=[0, 0.5, 0.8, 0.9, 0.95, 1.0],
-                          labels=['<50%', '50-80%', '80-90%', '90-95%', '95-100%'])
-    coverage_counts = coverage_bins.value_counts()
-    fig.add_trace(
-        go.Bar(x=coverage_counts.index, y=coverage_counts.values, name="Coverage"),
-        row=1, col=1
     )
-    # Language pairs covered vs quality
     fig.add_trace(
-        go.Scatter(
-            x=df['language_pairs_covered'],
-            y=df['quality_score'],
-            mode='markers',
-            text=df['model_name'],
-            name="Quality vs Coverage"
         ),
-        row=1, col=2
     )
-    # Sample count vs quality
     fig.add_trace(
-        go.Scatter(
-            x=df['total_samples'],
-            y=df['quality_score'],
-            mode='markers',
-            text=df['model_name'],
-            name="Quality vs Samples"
         ),
-        row=2, col=1
     )
-    # Google comparable coverage
-    google_coverage = df['google_pairs_covered'].value_counts().sort_index()
-    fig.add_trace(
-        go.Bar(x=google_coverage.index, y=google_coverage.values, name="Google Coverage"),
-        row=2, col=2
-    )
     fig.update_layout(
-        title="📈 Test Set Coverage Analysis",
-        height=800,
-        showlegend=False
     )
     return fig
-def create_model_performance_timeline(df: pd.DataFrame) -> go.Figure:
-    """Create timeline showing model performance over time."""
     if df.empty:
         fig = go.Figure()
         fig.add_annotation(text="No data available", x=0.5, y=0.5, showarrow=False)
-        fig.update_layout(title="No Data Available")
         return fig
-    # Convert submission_date to datetime
-    df_copy = df.copy()
-    df_copy['submission_date'] = pd.to_datetime(df_copy['submission_date'])
-    df_copy = df_copy.sort_values('submission_date')
-    fig = go.Figure()
-    # Add scatter plot for each submission
-    fig.add_trace(go.Scatter(
-        x=df_copy['submission_date'],
-        y=df_copy['quality_score'],
-        mode='markers+lines',
-        marker=dict(
-            size=10,
-            color=df_copy['quality_score'],
-            colorscale='Viridis',
-            showscale=True,
-            colorbar=dict(title="Quality Score")
-        ),
-        text=df_copy['model_name'],
-        hovertemplate=(
-            "<b>%{text}</b><br>" +
-            "Date: %{x}<br>" +
-            "Quality Score: %{y:.4f}<br>" +
-            "<extra></extra>"
-        ),
-        name="Models"
-    ))
-    # Add trend line
-    if len(df_copy) > 1:
-        z = np.polyfit(range(len(df_copy)), df_copy['quality_score'], 1)
-        trend_line = np.poly1d(z)(range(len(df_copy)))
-        fig.add_trace(go.Scatter(
-            x=df_copy['submission_date'],
-            y=trend_line,
-            mode='lines',
-            line=dict(dash='dash', color='red'),
-            name="Trend",
-            hoverinfo='skip'
-        ))
-    fig.update_layout(
-        title="📅 Model Performance Timeline",
-        xaxis_title="Submission Date",
-        yaxis_title="Quality Score",
-        height=500
-    )
-    return fig
-def create_google_comparison_plot(df: pd.DataFrame) -> go.Figure:
-    """Create plot comparing models on Google Translate-comparable language pairs."""
-    # Filter models that have Google comparable results
-    google_models = df[df['google_pairs_covered'] > 0].copy()
-    if google_models.empty:
         fig = go.Figure()
         fig.add_annotation(
-            text="No models with Google Translate comparable results",
-            x=0.5, y=0.5, showarrow=False
         )
-        fig.update_layout(title="No Google Comparable Models")
         return fig
     fig = go.Figure()
-    # Create scatter plot
-    fig.add_trace(go.Scatter(
-        x=google_models['google_bleu'],
-        y=google_models['google_quality_score'],
-        mode='markers+text',
-        marker=dict(
-            size=12,
-            color=google_models['google_chrf'],
-            colorscale='Plasma',
-            showscale=True,
-            colorbar=dict(title="ChrF Score")
-        ),
-        text=google_models['model_name'],
-        textposition="top center",
-        hovertemplate=(
-            "<b>%{text}</b><br>" +
-            "BLEU: %{x:.2f}<br>" +
-            "Quality: %{y:.4f}<br>" +
-            "ChrF: %{marker.color:.4f}<br>" +
-            "<extra></extra>"
-        ),
-        name="Models"
-    ))
     fig.update_layout(
-        title="🤖 Google Translate Comparable Performance",
-        xaxis_title="BLEU Score",
-        yaxis_title="Quality Score",
-        height=500
     )
     return fig
-def create_detailed_model_analysis(model_results: Dict, model_name: str) -> go.Figure:
-    """Create detailed analysis plot for a specific model - FIXED version."""
-    if not model_results or 'pair_metrics' not in model_results:
         fig = go.Figure()
-        fig.add_annotation(text="No detailed results available", x=0.5, y=0.5, showarrow=False)
-        fig.update_layout(title=f"No Data for {model_name}")
         return fig
-    pair_metrics = model_results['pair_metrics']
-    # Extract language pair data
     pairs = []
-    bleu_scores = []
-    quality_scores = []
     sample_counts = []
-    google_comparable = []
-    for pair_key, metrics in pair_metrics.items():
-        if 'sample_count' in metrics and metrics['sample_count'] > 0:
-            src, tgt = pair_key.split('_to_')
-            pair_label = f"{LANGUAGE_NAMES.get(src, src)} → {LANGUAGE_NAMES.get(tgt, tgt)}"
-            pairs.append(pair_label)
-            bleu_scores.append(metrics.get('bleu', 0))
-            quality_scores.append(metrics.get('quality_score', 0))
-            sample_counts.append(metrics.get('sample_count', 0))
-            is_google = (src in GOOGLE_SUPPORTED_LANGUAGES and tgt in GOOGLE_SUPPORTED_LANGUAGES)
-            google_comparable.append(is_google)
     if not pairs:
         fig = go.Figure()
-        fig.add_annotation(text="No language pair data found", x=0.5, y=0.5, showarrow=False)
-        fig.update_layout(title=f"No Language Pair Data for {model_name}")
         return fig
-    # Create subplot with proper spacing and titles
     fig = make_subplots(
-        rows=2, cols=1,
         subplot_titles=(
-            f"BLEU Scores by Language Pair",
-            f"Quality Scores by Language Pair"
         ),
         vertical_spacing=0.15,
-        row_heights=[0.45, 0.45]
     )
-    # Color code by Google comparable
-    colors = ['#1f77b4' if gc else '#ff7f0e' for gc in google_comparable]
-    # BLEU scores (top subplot)
     fig.add_trace(
         go.Bar(
             x=pairs,
-            y=bleu_scores,
-            marker_color=colors,
-            name="BLEU",
-            text=[f"{score:.1f}" for score in bleu_scores],
-            textposition='outside',
-            textfont=dict(size=10),
-            showlegend=True
         ),
-        row=1, col=1
     )
-    # Quality scores (bottom subplot)
     fig.add_trace(
         go.Bar(
             x=pairs,
-            y=quality_scores,
-            marker_color=colors,
-            name="Quality",
-            text=[f"{score:.3f}" for score in quality_scores],
-            textposition='outside',
-            textfont=dict(size=10),
-            showlegend=False
         ),
-        row=2, col=1
     )
-    # Update layout
     fig.update_layout(
         height=900,
-        title=dict(
-            text=f"📊 Detailed Analysis: {model_name}",
-            x=0.5,
-            xanchor='center'
-        ),
-        showlegend=True,
-        margin=dict(l=50, r=50, t=100, b=150)
-    )
-    # Update x-axes to rotate labels properly
-    fig.update_xaxes(
-        tickangle=45,
-        tickfont=dict(size=10),
-        row=1, col=1
     )
-    fig.update_xaxes(
-        tickangle=45,
-        tickfont=dict(size=10),
-        row=2, col=1
-    )
-    # Update y-axes
-    fig.update_yaxes(title_text="BLEU Score", row=1, col=1)
-    fig.update_yaxes(title_text="Quality Score", row=2, col=1)
-    # Add legend manually for Google vs UG40 only
-    fig.add_trace(
-        go.Scatter(
-            x=[None], y=[None],
-            mode='markers',
-            marker=dict(size=15, color='#1f77b4', symbol='square'),
-            name="Google Comparable",
-            showlegend=True
-        )
-    )
-    fig.add_trace(
-        go.Scatter(
-            x=[None], y=[None],
-            mode='markers',
-            marker=dict(size=15, color='#ff7f0e', symbol='square'),
-            name="UG40 Only",
-            showlegend=True
-        )
-    )
-    return fig
-def create_submission_summary_plot(validation_info: Dict, evaluation_results: Dict) -> go.Figure:
-    """Create summary plot for a new submission."""
-    fig = make_subplots(
-        rows=2, cols=2,
-        subplot_titles=(
-            "Sample Distribution",
-            "Primary Metrics",
-            "Error Analysis",
-            "Coverage Summary"
-        ),
-        specs=[[{"type": "pie"}, {"type": "bar"}],
-               [{"type": "bar"}, {"type": "bar"}]]
-    )
-    # Sample distribution (pie chart)
-    coverage = validation_info.get('coverage', 0.8)
-    fig.add_trace(
-        go.Pie(
-            labels=["Evaluated", "Missing"],
-            values=[coverage * 100, (1 - coverage) * 100],
-            name="Samples"
-        ),
-        row=1, col=1
-    )
-    # Primary metrics
-    if 'summary' in evaluation_results:
-        metrics_data = evaluation_results['summary']['primary_metrics']
-        metric_names = list(metrics_data.keys())
-        metric_values = list(metrics_data.values())
-        fig.add_trace(
-            go.Bar(
-                x=metric_names,
-                y=metric_values,
-                name="Metrics",
-                text=[f"{val:.3f}" for val in metric_values],
-                textposition='auto'
-            ),
-            row=1, col=2
-        )
-    # Error analysis (CER, WER)
-    if 'averages' in evaluation_results:
-        error_metrics = ['cer', 'wer']
-        error_values = [evaluation_results['averages'].get(m, 0) for m in error_metrics]
-        fig.add_trace(
-            go.Bar(
-                x=error_metrics,
-                y=error_values,
-                name="Errors",
-                text=[f"{val:.3f}" for val in error_values],
-                textposition='auto'
-            ),
-            row=2, col=1
-        )
-    # Coverage summary
-    if 'summary' in evaluation_results:
-        summary = evaluation_results['summary']
-        coverage_labels = ["Total Samples", "Lang Pairs", "Google Pairs"]
-        coverage_values = [
-            summary.get('total_samples', 0),
-            summary.get('language_pairs_covered', 0),
-            summary.get('google_comparable_pairs', 0)
-        ]
-        fig.add_trace(
-            go.Bar(
-                x=coverage_labels,
-                y=coverage_values,
-                name="Coverage",
-                text=[f"{val}" for val in coverage_values],
-                textposition='auto'
-            ),
-            row=2, col=2
-        )
-    fig.update_layout(
-        title="📋 Submission Summary",
-        height=700,
-        showlegend=False
-    )
-    return fig

 # src/plotting.py
 import matplotlib.pyplot as plt
 import matplotlib.gridspec as gridspec
 import plotly.graph_objects as go
 import plotly.express as px
 from plotly.subplots import make_subplots
 import numpy as np
 from collections import defaultdict
 from typing import Dict, List, Optional, Union
+from config import (
+    LANGUAGE_NAMES,
+    ALL_UG40_LANGUAGES,
+    GOOGLE_SUPPORTED_LANGUAGES,
+    METRICS_CONFIG,
+    EVALUATION_TRACKS,
+    MODEL_CATEGORIES,
+    CHART_CONFIG,
+    STATISTICAL_CONFIG,
+)
+# Scientific plotting style
+plt.style.use("default")
+plt.rcParams["figure.facecolor"] = "white"
+plt.rcParams["axes.facecolor"] = "white"
+plt.rcParams["font.size"] = 10
+plt.rcParams["axes.labelsize"] = 12
+plt.rcParams["axes.titlesize"] = 14
+plt.rcParams["xtick.labelsize"] = 10
+plt.rcParams["ytick.labelsize"] = 10
+def create_scientific_leaderboard_plot(
+    df: pd.DataFrame, track: str, metric: str = "quality", top_n: int = 15
+) -> go.Figure:
+    """Create scientific leaderboard plot with confidence intervals."""
     if df.empty:
         fig = go.Figure()
         fig.add_annotation(
+            text="No models available for this track",
+            xref="paper",
+            yref="paper",
+            x=0.5,
+            y=0.5,
+            showarrow=False,
+            font=dict(size=16),
+        )
+        fig.update_layout(title=f"No Data Available - {track.title()} Track")
+        return fig
+    # Get top N models for this track
+    metric_col = f"{track}_{metric}"
+    ci_lower_col = f"{track}_ci_lower"
+    ci_upper_col = f"{track}_ci_upper"
+    if metric_col not in df.columns:
+        fig = go.Figure()
+        fig.add_annotation(
+            text=f"Metric {metric} not available for {track} track",
+            xref="paper",
+            yref="paper",
+            x=0.5,
+            y=0.5,
+            showarrow=False,
         )
         return fig
+    # Filter and sort
+    valid_models = df[(df[metric_col] > 0)].head(top_n)
+    if valid_models.empty:
+        fig = go.Figure()
+        fig.add_annotation(text="No valid models found", x=0.5, y=0.5, showarrow=False)
+        return fig
+    # Create color mapping by category
+    category_colors = {}
+    for i, category in enumerate(MODEL_CATEGORIES.keys()):
+        category_colors[category] = MODEL_CATEGORIES[category]["color"]
+    colors = [
+        category_colors.get(cat, "#808080") for cat in valid_models["model_category"]
+    ]
+    # Main bar plot
+    fig = go.Figure()
+    # Add bars with error bars if confidence intervals available
+    if ci_lower_col in valid_models.columns and ci_upper_col in valid_models.columns:
+        error_y = dict(
+            type="data",
+            array=valid_models[ci_upper_col] - valid_models[metric_col],
+            arrayminus=valid_models[metric_col] - valid_models[ci_lower_col],
+            visible=True,
+            thickness=2,
+            width=4,
+        )
+    else:
+        error_y = None
+    fig.add_trace(
         go.Bar(
+            y=valid_models["model_name"],
+            x=valid_models[metric_col],
+            orientation="h",
+            marker=dict(color=colors, line=dict(color="black", width=0.5)),
+            error_x=error_y,
+            text=[f"{score:.3f}" for score in valid_models[metric_col]],
+            textposition="auto",
             hovertemplate=(
+                "<b>%{y}</b><br>"
+                + f"{metric.title()}: %{{x:.4f}}<br>"
+                + "Category: %{customdata[0]}<br>"
+                + "Author: %{customdata[1]}<br>"
+                + "Samples: %{customdata[2]}<br>"
+                + "<extra></extra>"
+            ),
+            customdata=list(
+                zip(
+                    valid_models["model_category"],
+                    valid_models["author"],
+                    valid_models.get(f"{track}_samples", [0] * len(valid_models)),
+                )
             ),
         )
+    )
+    # Customize layout
+    track_info = EVALUATION_TRACKS[track]
     fig.update_layout(
+        title=f"🏆 {track_info['name']} - {metric.title()} Score",
+        xaxis_title=f"{metric.title()} Score (with 95% CI)",
         yaxis_title="Models",
+        height=max(400, len(valid_models) * 35 + 100),
         margin=dict(l=20, r=20, t=60, b=20),
+        plot_bgcolor="white",
+        paper_bgcolor="white",
+        font=dict(size=12),
     )
     # Reverse y-axis to show best model at top
     fig.update_yaxes(autorange="reversed")
+    # Add category legend
+    for category, info in MODEL_CATEGORIES.items():
+        if category in valid_models["model_category"].values:
+            fig.add_trace(
+                go.Scatter(
+                    x=[None],
+                    y=[None],
+                    mode="markers",
+                    marker=dict(size=10, color=info["color"]),
+                    name=info["name"],
+                    showlegend=True,
+                )
+            )
     return fig
+def create_language_pair_heatmap_scientific(
+    model_results: Dict, track: str, metric: str = "quality_score"
+) -> go.Figure:
+    """Create research-grade language pair heatmap with proper axes."""
+    if not model_results or "tracks" not in model_results:
+        fig = go.Figure()
+        fig.add_annotation(
+            text="No model results available", x=0.5, y=0.5, showarrow=False
+        )
+        return fig
+    track_data = model_results["tracks"].get(track, {})
+    if track_data.get("error") or "pair_metrics" not in track_data:
+        fig = go.Figure()
+        fig.add_annotation(
+            text=f"No data available for {track} track", x=0.5, y=0.5, showarrow=False
+        )
+        return fig
+    pair_metrics = track_data["pair_metrics"]
+    track_languages = EVALUATION_TRACKS[track]["languages"]
+    # Create matrix for heatmap
+    n_langs = len(track_languages)
+    matrix = np.full((n_langs, n_langs), np.nan)
+    for i, src_lang in enumerate(track_languages):
+        for j, tgt_lang in enumerate(track_languages):
+            if src_lang != tgt_lang:
+                pair_key = f"{src_lang}_to_{tgt_lang}"
+                if pair_key in pair_metrics and metric in pair_metrics[pair_key]:
+                    matrix[i, j] = pair_metrics[pair_key][metric]["mean"]
+    # Create language labels
+    lang_labels = [LANGUAGE_NAMES.get(lang, lang.upper()) for lang in track_languages]
+    # Create heatmap
+    fig = go.Figure(
+        data=go.Heatmap(
+            z=matrix,
+            x=lang_labels,
+            y=lang_labels,
+            colorscale="Viridis",
+            showscale=True,
+            colorbar=dict(
+                title=f"{metric.replace('_', ' ').title()}",
+                titleside="right",
+                len=0.8,
+            ),
+            hovertemplate=(
+                "Source: %{y}<br>"
+                + "Target: %{x}<br>"
+                + f"{metric.replace('_', ' ').title()}: %{{z:.3f}}<br>"
+                + "<extra></extra>"
+            ),
+            zmin=0,
+            zmax=1 if metric == "quality_score" else None,
+        )
+    )
+    # Customize layout
+    track_info = EVALUATION_TRACKS[track]
+    fig.update_layout(
+        title=f"🗺️ {track_info['name']} - {metric.replace('_', ' ').title()} by Language Pair",
+        xaxis_title="Target Language",
+        yaxis_title="Source Language",
+        height=600,
+        width=700,
+        font=dict(size=12),
+        xaxis=dict(side="bottom"),
+        yaxis=dict(autorange="reversed"),  # Source languages from top to bottom
+    )
+    return fig
+def create_statistical_comparison_plot(df: pd.DataFrame, track: str) -> go.Figure:
+    """Create statistical comparison plot showing confidence intervals."""
     if df.empty:
         fig = go.Figure()
         fig.add_annotation(text="No data available", x=0.5, y=0.5, showarrow=False)
         return fig
+    metric_col = f"{track}_quality"
+    ci_lower_col = f"{track}_ci_lower"
+    ci_upper_col = f"{track}_ci_upper"
+    # Filter to models with data for this track
+    valid_models = df[
+        (df[metric_col] > 0) & (df[ci_lower_col].notna()) & (df[ci_upper_col].notna())
+    ].head(10)
+    if valid_models.empty:
         fig = go.Figure()
+        fig.add_annotation(
+            text="No models with confidence intervals", x=0.5, y=0.5, showarrow=False
+        )
         return fig
     fig = go.Figure()
+    # Add confidence intervals as error bars
+    for i, (_, model) in enumerate(valid_models.iterrows()):
+        category = model["model_category"]
+        color = MODEL_CATEGORIES.get(category, {}).get("color", "#808080")
+        # Main point
+        fig.add_trace(
+            go.Scatter(
+                x=[model[metric_col]],
+                y=[i],
+                mode="markers",
+                marker=dict(
+                    size=12,
+                    color=color,
+                    line=dict(color="black", width=1),
+                ),
+                name=model["model_name"],
+                showlegend=False,
+                hovertemplate=(
+                    f"<b>{model['model_name']}</b><br>"
+                    + f"Quality: {model[metric_col]:.4f}<br>"
+                    + f"95% CI: [{model[ci_lower_col]:.4f}, {model[ci_upper_col]:.4f}]<br>"
+                    + f"Category: {category}<br>"
+                    + "<extra></extra>"
+                ),
+            )
+        )
+        # Confidence interval line
+        fig.add_trace(
+            go.Scatter(
+                x=[model[ci_lower_col], model[ci_upper_col]],
+                y=[i, i],
+                mode="lines",
+                line=dict(color=color, width=3),
+                showlegend=False,
+                hoverinfo="skip",
             )
+        )
+        # CI endpoints
+        fig.add_trace(
+            go.Scatter(
+                x=[model[ci_lower_col], model[ci_upper_col]],
+                y=[i, i],
+                mode="markers",
+                marker=dict(
+                    symbol="line-ns",
+                    size=10,
+                    color=color,
+                    line=dict(width=2),
+                ),
+                showlegend=False,
+                hoverinfo="skip",
+            )
+        )
+    # Customize layout
+    track_info = EVALUATION_TRACKS[track]
+    fig.update_layout(
+        title=f"📊 {track_info['name']} - Statistical Comparison",
+        xaxis_title="Quality Score",
+        yaxis_title="Models",
+        height=max(400, len(valid_models) * 40 + 100),
+        yaxis=dict(
+            tickmode="array",
+            tickvals=list(range(len(valid_models))),
+            ticktext=valid_models["model_name"].tolist(),
+            autorange="reversed",
         ),
+        showlegend=False,
+        plot_bgcolor="white",
+        paper_bgcolor="white",
     )
     return fig
+def create_category_comparison_plot(df: pd.DataFrame, track: str) -> go.Figure:
+    """Create category-wise comparison plot."""
+    if df.empty:
         fig = go.Figure()
+        fig.add_annotation(text="No data available", x=0.5, y=0.5, showarrow=False)
         return fig
+    metric_col = f"{track}_quality"
+    adequate_col = f"{track}_adequate"
+    # Filter to adequate models
+    valid_models = df[df[adequate_col] & (df[metric_col] > 0)]
+    if valid_models.empty:
+        fig = go.Figure()
+        fig.add_annotation(
+            text="No adequate models found", x=0.5, y=0.5, showarrow=False
         )
+        return fig
+    fig = go.Figure()
+    # Create box plot for each category
+    for category, info in MODEL_CATEGORIES.items():
+        category_models = valid_models[valid_models["model_category"] == category]
+        if len(category_models) > 0:
+            fig.add_trace(
+                go.Box(
+                    y=category_models[metric_col],
+                    name=info["name"],
+                    marker_color=info["color"],
+                    boxpoints="all",  # Show all points
+                    jitter=0.3,
+                    pointpos=-1.8,
+                    hovertemplate=(
+                        f"<b>{info['name']}</b><br>"
+                        + "Quality: %{y:.4f}<br>"
+                        + "Model: %{customdata}<br>"
+                        + "<extra></extra>"
+                    ),
+                    customdata=category_models["model_name"],
+                )
+            )
+    # Customize layout
+    track_info = EVALUATION_TRACKS[track]
     fig.update_layout(
+        title=f"📈 {track_info['name']} - Performance by Category",
+        xaxis_title="Model Category",
+        yaxis_title="Quality Score",
+        height=500,
+        showlegend=False,
+        plot_bgcolor="white",
+        paper_bgcolor="white",
     )
     return fig
def create_adequacy_analysis_plot(df: pd.DataFrame) -> go.Figure:
    """Create a 2x2 overview of sample sizes, adequacy scores and categories.

    Panels (row, col):
        (1, 1) bar chart of the summed positive ``{track}_samples`` values
               for every track in ``EVALUATION_TRACKS`` whose column exists;
        (1, 2) pie chart of ``scientific_adequacy_score`` bucketed into
               Poor / Fair / Good / Excellent;
        (2, 1) histogram of ``scientific_adequacy_score``;
        (2, 2) bar chart of ``model_category`` counts, coloured from
               ``MODEL_CATEGORIES`` (grey for unknown categories).

    Args:
        df: Leaderboard frame. Panels whose source column is absent are
            left empty instead of raising.

    Returns:
        The assembled figure, or a figure with a single centred annotation
        when ``df`` is empty.
    """
    if df.empty:
        fig = go.Figure()
        # Paper coordinates keep the message centred on the empty canvas.
        fig.add_annotation(
            text="No data available",
            xref="paper",
            yref="paper",
            x=0.5,
            y=0.5,
            showarrow=False,
        )
        return fig

    fig = make_subplots(
        rows=2,
        cols=2,
        subplot_titles=(
            "Sample Sizes by Track",
            "Statistical Adequacy Distribution",
            "Scientific Adequacy Scores",
            "Model Categories Distribution",
        ),
        specs=[
            [{"type": "bar"}, {"type": "pie"}],
            [{"type": "histogram"}, {"type": "bar"}],
        ],
    )

    # Sample sizes by track: sum the positive per-model sample counts.
    track_names = []
    sample_counts = []
    for track in EVALUATION_TRACKS:
        samples_col = f"{track}_samples"
        if samples_col not in df.columns:
            continue
        samples = df[samples_col]
        track_names.append(track.replace("_", " ").title())
        sample_counts.append(samples[samples > 0].sum())
    if track_names:
        fig.add_trace(
            go.Bar(x=track_names, y=sample_counts, name="Samples"), row=1, col=1
        )

    if "scientific_adequacy_score" in df.columns:
        scores = df["scientific_adequacy_score"]

        # include_lowest=True closes the first interval on the left so a
        # score of exactly 0 lands in "Poor" instead of being dropped.
        adequacy_bins = pd.cut(
            scores,
            bins=[0, 0.3, 0.6, 0.8, 1.0],
            labels=["Poor", "Fair", "Good", "Excellent"],
            include_lowest=True,
        )
        adequacy_counts = adequacy_bins.value_counts()
        # Counting a categorical lists every bucket (even with zero hits),
        # so test the total rather than emptiness.
        if adequacy_counts.sum() > 0:
            fig.add_trace(
                go.Pie(
                    labels=adequacy_counts.index,
                    values=adequacy_counts.values,
                    name="Adequacy",
                ),
                row=1,
                col=2,
            )

        # Raw score distribution.
        fig.add_trace(
            go.Histogram(x=scores, nbinsx=20, name="Adequacy Scores"),
            row=2,
            col=1,
        )

    if "model_category" in df.columns:
        category_counts = df["model_category"].value_counts()
        # Categories missing from MODEL_CATEGORIES get a neutral grey.
        category_colors = [
            MODEL_CATEGORIES.get(cat, {}).get("color", "#808080")
            for cat in category_counts.index
        ]
        fig.add_trace(
            go.Bar(
                x=category_counts.index,
                y=category_counts.values,
                marker_color=category_colors,
                name="Categories",
            ),
            row=2,
            col=2,
        )

    fig.update_layout(
        title="📊 Scientific Evaluation Analysis", height=800, showlegend=False
    )
    return fig
def create_cross_track_analysis_plot(df: pd.DataFrame) -> go.Figure:
    """Create a scatter plot comparing model quality between two tracks.

    Only models with a positive ``{track}_quality`` value in every track
    column present in ``df`` are kept. The first two available tracks (in
    ``EVALUATION_TRACKS`` order) are plotted against each other, one trace
    per ``MODEL_CATEGORIES`` entry, with a dashed y = x reference line.

    Args:
        df: Leaderboard frame. Reads the ``{track}_quality`` columns plus
            ``model_category`` and ``model_name``.

    Returns:
        The scatter figure, or a figure with a single centred annotation
        when the frame is empty, fewer than two track columns exist, or
        fewer than three models have data in every available track.
    """

    def _message_figure(message: str) -> go.Figure:
        # Paper coordinates keep the message centred no matter what ranges
        # the (otherwise empty) axes end up with.
        placeholder = go.Figure()
        placeholder.add_annotation(
            text=message,
            xref="paper",
            yref="paper",
            x=0.5,
            y=0.5,
            showarrow=False,
        )
        return placeholder

    if df.empty:
        return _message_figure("No data available")

    # Tracks whose quality column is actually present in the frame.
    tracks = [t for t in EVALUATION_TRACKS if f"{t}_quality" in df.columns]
    if len(tracks) < 2:
        return _message_figure("Need at least 2 tracks for comparison")
    available_cols = [f"{t}_quality" for t in tracks]

    # Keep models scoring above zero in every available track.
    multi_track_models = df[(df[available_cols] > 0).all(axis=1)]
    if len(multi_track_models) < 3:
        return _message_figure("Insufficient models for cross-track analysis")

    # At least two tracks are guaranteed here, so the first pair always
    # exists; only that pair is plotted.
    x_col, y_col = available_cols[:2]
    # Labels come from the track keys themselves rather than from stripping
    # "_quality" out of the column names.
    x_track, y_track = (t.replace("_", " ").title() for t in tracks[:2])

    fig = go.Figure()

    # One scatter trace per configured category, coloured accordingly.
    for category, info in MODEL_CATEGORIES.items():
        category_models = multi_track_models[
            multi_track_models["model_category"] == category
        ]
        if category_models.empty:
            continue
        fig.add_trace(
            go.Scatter(
                x=category_models[x_col],
                y=category_models[y_col],
                mode="markers",
                marker=dict(
                    size=10,
                    color=info["color"],
                    line=dict(color="black", width=1),
                ),
                name=info["name"],
                text=category_models["model_name"],
                hovertemplate=(
                    "<b>%{text}</b><br>"
                    + f"{x_track}: %{{x:.4f}}<br>"
                    + f"{y_track}: %{{y:.4f}}<br>"
                    + f"Category: {info['name']}<br>"
                    + "<extra></extra>"
                ),
            )
        )

    # Dashed y = x line spanning the observed score range: points on it
    # score the same in both tracks.
    min_val = min(multi_track_models[x_col].min(), multi_track_models[y_col].min())
    max_val = max(multi_track_models[x_col].max(), multi_track_models[y_col].max())
    fig.add_trace(
        go.Scatter(
            x=[min_val, max_val],
            y=[min_val, max_val],
            mode="lines",
            line=dict(dash="dash", color="gray", width=2),
            name="Perfect Correlation",
            showlegend=False,
            hoverinfo="skip",
        )
    )

    fig.update_layout(
        title=f"🔄 Cross-Track Performance: {x_track} vs {y_track}",
        xaxis_title=f"{x_track} Quality Score",
        yaxis_title=f"{y_track} Quality Score",
        height=600,
        width=600,
        plot_bgcolor="white",
        paper_bgcolor="white",
    )
    return fig
def create_scientific_model_detail_plot(
    model_results: Dict, model_name: str, track: str
) -> go.Figure:
    """Create per-language-pair quality and BLEU bar charts for one model.

    Args:
        model_results: Mapping with a ``"tracks"`` entry; the selected
            track's entry must provide ``"pair_metrics"`` keyed by
            ``"{src}_to_{tgt}"``. Each pair entry is plotted only if it has
            both ``"quality_score"`` (a mapping with ``"mean"`` and,
            optionally, ``"ci_lower"`` / ``"ci_upper"``) and
            ``"sample_count"``.
        model_name: Name shown in the figure title.
        track: Track key; its languages and display name are read from
            ``EVALUATION_TRACKS``.

    Returns:
        A two-row figure (quality with confidence-interval error bars on
        top, BLEU below), or a figure with a single centred annotation when
        no usable data is found.

    Raises:
        KeyError: If the track has pair metrics but is not present in
            ``EVALUATION_TRACKS``.
    """

    def _message_figure(message: str) -> go.Figure:
        # Paper coordinates keep the message centred no matter what ranges
        # the (otherwise empty) axes end up with.
        placeholder = go.Figure()
        placeholder.add_annotation(
            text=message,
            xref="paper",
            yref="paper",
            x=0.5,
            y=0.5,
            showarrow=False,
        )
        return placeholder

    if not model_results or "tracks" not in model_results:
        return _message_figure("No model results available")

    track_data = model_results["tracks"].get(track, {})
    if track_data.get("error") or "pair_metrics" not in track_data:
        return _message_figure(f"No data for {track} track")

    pair_metrics = track_data["pair_metrics"]
    track_info = EVALUATION_TRACKS[track]
    track_languages = track_info["languages"]

    # Parallel lists, one entry per plotted language pair.
    pairs = []
    quality_means = []
    ci_above = []  # distance from the mean up to the upper CI bound
    ci_below = []  # distance from the mean down to the lower CI bound
    bleu_means = []
    sample_counts = []

    for src in track_languages:
        for tgt in track_languages:
            if src == tgt:
                continue
            metrics = pair_metrics.get(f"{src}_to_{tgt}")
            if metrics is None:
                continue
            if "quality_score" not in metrics or "sample_count" not in metrics:
                continue

            quality_stats = metrics["quality_score"]
            mean = quality_stats["mean"]

            pairs.append(
                f"{LANGUAGE_NAMES.get(src, src)} → {LANGUAGE_NAMES.get(tgt, tgt)}"
            )
            quality_means.append(mean)
            # A missing CI bound collapses to the mean (zero-length bar)
            # rather than aborting the whole plot.
            ci_above.append(quality_stats.get("ci_upper", mean) - mean)
            ci_below.append(mean - quality_stats.get("ci_lower", mean))
            # BLEU is optional; default to 0 when the entry or its mean is
            # absent.
            bleu_means.append(metrics.get("bleu", {}).get("mean", 0))
            sample_counts.append(metrics["sample_count"])

    if not pairs:
        return _message_figure("No language pair data available")

    fig = make_subplots(
        rows=2,
        cols=1,
        subplot_titles=(
            "Quality Scores by Language Pair (with 95% CI)",
            "BLEU Scores by Language Pair",
        ),
        vertical_spacing=0.15,
    )

    # Top panel: quality means with asymmetric error bars.
    fig.add_trace(
        go.Bar(
            x=pairs,
            y=quality_means,
            error_y=dict(
                type="data",
                array=ci_above,
                arrayminus=ci_below,
                visible=True,
                thickness=2,
                width=4,
            ),
            name="Quality Score",
            marker_color="steelblue",
            text=[f"{score:.3f}" for score in quality_means],
            textposition="outside",
            hovertemplate=(
                "<b>%{x}</b><br>"
                + "Quality: %{y:.4f}<br>"
                + "Samples: %{customdata}<br>"
                + "<extra></extra>"
            ),
            customdata=sample_counts,
        ),
        row=1,
        col=1,
    )

    # Bottom panel: BLEU means.
    fig.add_trace(
        go.Bar(
            x=pairs,
            y=bleu_means,
            name="BLEU Score",
            marker_color="coral",
            text=[f"{score:.1f}" for score in bleu_means],
            textposition="outside",
        ),
        row=2,
        col=1,
    )

    fig.update_layout(
        title=f"🔬 Detailed Analysis: {model_name} - {track_info['name']}",
        height=900,
        showlegend=False,
        margin=dict(l=50, r=50, t=100, b=150),
    )

    # Slant the pair labels on both panels.
    fig.update_xaxes(tickangle=45, row=1, col=1)
    fig.update_xaxes(tickangle=45, row=2, col=1)
    return fig