Update src/plotting.py
src/plotting.py
CHANGED (+276, -338)
[The left-hand side of this diff (the 338 deleted lines) was lost in extraction; the hunk headers and the updated right-hand side are reconstructed below, with added lines marked "+".]

@@ -17,6 +17,7 @@ from config import (
     MODEL_CATEGORIES,
     CHART_CONFIG,
     STATISTICAL_CONFIG,
+    SAMPLE_SIZE_RECOMMENDATIONS,
 )
 
 # Scientific plotting style
@@ -34,58 +35,50 @@ def create_scientific_leaderboard_plot(
     df: pd.DataFrame, track: str, metric: str = "quality", top_n: int = 15
 ) -> go.Figure:
     """Create scientific leaderboard plot with confidence intervals."""
+
     if df.empty:
         fig = go.Figure()
         fig.add_annotation(
             text="No models available for this track",
+            xref="paper", yref="paper",
+            x=0.5, y=0.5, showarrow=False,
+            font=dict(size=16)
         )
         fig.update_layout(title=f"No Data Available - {track.title()} Track")
         return fig
+
     # Get top N models for this track
     metric_col = f"{track}_{metric}"
     ci_lower_col = f"{track}_ci_lower"
     ci_upper_col = f"{track}_ci_upper"
+
     if metric_col not in df.columns:
         fig = go.Figure()
         fig.add_annotation(
             text=f"Metric {metric} not available for {track} track",
+            xref="paper", yref="paper",
+            x=0.5, y=0.5, showarrow=False,
         )
         return fig
+
     # Filter and sort
     valid_models = df[(df[metric_col] > 0)].head(top_n)
+
     if valid_models.empty:
         fig = go.Figure()
         fig.add_annotation(text="No valid models found", x=0.5, y=0.5, showarrow=False)
         return fig
+
     # Create color mapping by category
     category_colors = {}
     for i, category in enumerate(MODEL_CATEGORIES.keys()):
         category_colors[category] = MODEL_CATEGORIES[category]["color"]
+
+    colors = [category_colors.get(cat, "#808080") for cat in valid_models["model_category"]]
+
     # Main bar plot
     fig = go.Figure()
+
     # Add bars with error bars if confidence intervals available
     if ci_lower_col in valid_models.columns and ci_upper_col in valid_models.columns:
         error_y = dict(
@@ -98,34 +91,30 @@ def create_scientific_leaderboard_plot(
         )
     else:
         error_y = None
+
+    fig.add_trace(go.Bar(
+        y=valid_models["model_name"],
+        x=valid_models[metric_col],
+        orientation="h",
+        marker=dict(color=colors, line=dict(color="black", width=0.5)),
+        error_x=error_y,
+        text=[f"{score:.3f}" for score in valid_models[metric_col]],
+        textposition="auto",
+        hovertemplate=(
+            "<b>%{y}</b><br>" +
+            f"{metric.title()}: %{{x:.4f}}<br>" +
+            "Category: %{customdata[0]}<br>" +
+            "Author: %{customdata[1]}<br>" +
+            "Samples: %{customdata[2]}<br>" +
+            "<extra></extra>"
+        ),
+        customdata=list(zip(
+            valid_models["model_category"],
+            valid_models["author"],
+            valid_models.get(f"{track}_samples", [0] * len(valid_models))
+        )),
+    ))
+
     # Customize layout
     track_info = EVALUATION_TRACKS[track]
     fig.update_layout(
@@ -138,24 +127,21 @@ def create_scientific_leaderboard_plot(
         paper_bgcolor="white",
         font=dict(size=12),
     )
+
     # Reverse y-axis to show best model at top
     fig.update_yaxes(autorange="reversed")
+
     # Add category legend
     for category, info in MODEL_CATEGORIES.items():
         if category in valid_models["model_category"].values:
+            fig.add_trace(go.Scatter(
+                x=[None], y=[None],
+                mode="markers",
+                marker=dict(size=10, color=info["color"]),
+                name=info["name"],
+                showlegend=True,
+            ))
+
     return fig
 
 
@@ -163,63 +149,57 @@ def create_language_pair_heatmap_scientific(
     model_results: Dict, track: str, metric: str = "quality_score"
 ) -> go.Figure:
     """Create research-grade language pair heatmap with proper axes."""
+
     if not model_results or "tracks" not in model_results:
         fig = go.Figure()
+        fig.add_annotation(text="No model results available", x=0.5, y=0.5, showarrow=False)
         return fig
+
     track_data = model_results["tracks"].get(track, {})
     if track_data.get("error") or "pair_metrics" not in track_data:
         fig = go.Figure()
+        fig.add_annotation(text=f"No data available for {track} track", x=0.5, y=0.5, showarrow=False)
         return fig
+
     pair_metrics = track_data["pair_metrics"]
     track_languages = EVALUATION_TRACKS[track]["languages"]
+
     # Create matrix for heatmap
     n_langs = len(track_languages)
     matrix = np.full((n_langs, n_langs), np.nan)
+
     for i, src_lang in enumerate(track_languages):
         for j, tgt_lang in enumerate(track_languages):
             if src_lang != tgt_lang:
                 pair_key = f"{src_lang}_to_{tgt_lang}"
                 if pair_key in pair_metrics and metric in pair_metrics[pair_key]:
                     matrix[i, j] = pair_metrics[pair_key][metric]["mean"]
+
     # Create language labels
     lang_labels = [LANGUAGE_NAMES.get(lang, lang.upper()) for lang in track_languages]
+
     # Create heatmap
+    fig = go.Figure(data=go.Heatmap(
+        z=matrix,
+        x=lang_labels,
+        y=lang_labels,
+        colorscale="Viridis",
+        showscale=True,
+        colorbar=dict(
+            title=f"{metric.replace('_', ' ').title()}",
+            titleside="right",
+            len=0.8,
+        ),
+        hovertemplate=(
+            "Source: %{y}<br>" +
+            "Target: %{x}<br>" +
+            f"{metric.replace('_', ' ').title()}: %{{z:.3f}}<br>" +
+            "<extra></extra>"
+        ),
+        zmin=0,
+        zmax=1 if metric == "quality_score" else None,
+    ))
+
     # Customize layout
     track_info = EVALUATION_TRACKS[track]
     fig.update_layout(
@@ -232,93 +212,87 @@ def create_language_pair_heatmap_scientific(
         xaxis=dict(side="bottom"),
         yaxis=dict(autorange="reversed"),  # Source languages from top to bottom
     )
+
     return fig
 
 
 def create_statistical_comparison_plot(df: pd.DataFrame, track: str) -> go.Figure:
     """Create statistical comparison plot showing confidence intervals."""
+
     if df.empty:
         fig = go.Figure()
         fig.add_annotation(text="No data available", x=0.5, y=0.5, showarrow=False)
         return fig
+
     metric_col = f"{track}_quality"
     ci_lower_col = f"{track}_ci_lower"
     ci_upper_col = f"{track}_ci_upper"
+
     # Filter to models with data for this track
     valid_models = df[
+        (df[metric_col] > 0) &
+        (df[ci_lower_col].notna()) &
+        (df[ci_upper_col].notna())
     ].head(10)
+
     if valid_models.empty:
         fig = go.Figure()
+        fig.add_annotation(text="No models with confidence intervals", x=0.5, y=0.5, showarrow=False)
         return fig
+
     fig = go.Figure()
+
     # Add confidence intervals as error bars
     for i, (_, model) in enumerate(valid_models.iterrows()):
         category = model["model_category"]
         color = MODEL_CATEGORIES.get(category, {}).get("color", "#808080")
+
         # Main point
+        fig.add_trace(go.Scatter(
+            x=[model[metric_col]],
+            y=[i],
+            mode="markers",
+            marker=dict(
+                size=12,
+                color=color,
+                line=dict(color="black", width=1),
+            ),
+            name=model["model_name"],
+            showlegend=False,
+            hovertemplate=(
+                f"<b>{model['model_name']}</b><br>" +
+                f"Quality: {model[metric_col]:.4f}<br>" +
+                f"95% CI: [{model[ci_lower_col]:.4f}, {model[ci_upper_col]:.4f}]<br>" +
+                f"Category: {category}<br>" +
+                "<extra></extra>"
+            ),
+        ))
+
         # Confidence interval line
+        fig.add_trace(go.Scatter(
+            x=[model[ci_lower_col], model[ci_upper_col]],
+            y=[i, i],
+            mode="lines",
+            line=dict(color=color, width=3),
+            showlegend=False,
+            hoverinfo="skip",
+        ))
+
        # CI endpoints
+        fig.add_trace(go.Scatter(
+            x=[model[ci_lower_col], model[ci_upper_col]],
+            y=[i, i],
+            mode="markers",
+            marker=dict(
+                symbol="line-ns",
+                size=10,
+                color=color,
+                line=dict(width=2),
+            ),
+            showlegend=False,
+            hoverinfo="skip",
+        ))
+
     # Customize layout
     track_info = EVALUATION_TRACKS[track]
     fig.update_layout(
@@ -336,56 +310,52 @@ def create_statistical_comparison_plot(df: pd.DataFrame, track: str) -> go.Figure:
         plot_bgcolor="white",
         paper_bgcolor="white",
     )
+
     return fig
 
 
 def create_category_comparison_plot(df: pd.DataFrame, track: str) -> go.Figure:
     """Create category-wise comparison plot."""
+
     if df.empty:
         fig = go.Figure()
         fig.add_annotation(text="No data available", x=0.5, y=0.5, showarrow=False)
         return fig
+
     metric_col = f"{track}_quality"
     adequate_col = f"{track}_adequate"
+
     # Filter to adequate models
     valid_models = df[df[adequate_col] & (df[metric_col] > 0)]
+
     if valid_models.empty:
         fig = go.Figure()
+        fig.add_annotation(text="No adequate models found", x=0.5, y=0.5, showarrow=False)
         return fig
+
     fig = go.Figure()
+
     # Create box plot for each category
     for category, info in MODEL_CATEGORIES.items():
         category_models = valid_models[valid_models["model_category"] == category]
+
         if len(category_models) > 0:
+            fig.add_trace(go.Box(
+                y=category_models[metric_col],
+                name=info["name"],
+                marker_color=info["color"],
+                boxpoints="all",  # Show all points
+                jitter=0.3,
+                pointpos=-1.8,
+                hovertemplate=(
+                    f"<b>{info['name']}</b><br>" +
+                    "Quality: %{y:.4f}<br>" +
+                    "Model: %{customdata}<br>" +
+                    "<extra></extra>"
+                ),
+                customdata=category_models["model_name"],
+            ))
+
     # Customize layout
     track_info = EVALUATION_TRACKS[track]
     fig.update_layout(
@@ -397,202 +367,183 @@ def create_category_comparison_plot(df: pd.DataFrame, track: str) -> go.Figure:
         plot_bgcolor="white",
         paper_bgcolor="white",
     )
+
     return fig
 
 
 def create_adequacy_analysis_plot(df: pd.DataFrame) -> go.Figure:
     """Create analysis plot for statistical adequacy across tracks."""
+
     if df.empty:
         fig = go.Figure()
         fig.add_annotation(text="No data available", x=0.5, y=0.5, showarrow=False)
         return fig
+
     fig = make_subplots(
+        rows=2, cols=2,
         subplot_titles=(
             "Sample Sizes by Track",
+            "Statistical Adequacy Distribution",
             "Scientific Adequacy Scores",
+            "Model Categories Distribution"
         ),
         specs=[
             [{"type": "bar"}, {"type": "pie"}],
+            [{"type": "histogram"}, {"type": "bar"}]
+        ]
     )
+
     # Sample sizes by track
     track_names = []
     sample_counts = []
+
     for track in EVALUATION_TRACKS.keys():
         samples_col = f"{track}_samples"
         if samples_col in df.columns:
             total_samples = df[df[samples_col] > 0][samples_col].sum()
             track_names.append(track.replace("_", " ").title())
             sample_counts.append(total_samples)
+
     if track_names:
         fig.add_trace(
+            go.Bar(x=track_names, y=sample_counts, name="Samples"),
+            row=1, col=1
         )
+
     # Statistical adequacy distribution
     adequacy_bins = pd.cut(
+        df["scientific_adequacy_score"],
         bins=[0, 0.3, 0.6, 0.8, 1.0],
+        labels=["Poor", "Fair", "Good", "Excellent"]
     )
     adequacy_counts = adequacy_bins.value_counts()
+
     if not adequacy_counts.empty:
         fig.add_trace(
             go.Pie(
                 labels=adequacy_counts.index,
                 values=adequacy_counts.values,
+                name="Adequacy"
             ),
+            row=1, col=2
         )
+
     # Scientific adequacy scores histogram
     fig.add_trace(
         go.Histogram(
+            x=df["scientific_adequacy_score"],
+            nbinsx=20,
+            name="Adequacy Scores"
         ),
+        row=2, col=1
     )
+
     # Model categories distribution
     category_counts = df["model_category"].value_counts()
+    category_colors = [MODEL_CATEGORIES.get(cat, {}).get("color", "#808080") for cat in category_counts.index]
+
     fig.add_trace(
         go.Bar(
             x=category_counts.index,
             y=category_counts.values,
             marker_color=category_colors,
+            name="Categories"
        ),
+        row=2, col=2
     )
+
     fig.update_layout(
+        title="📊 Scientific Evaluation Analysis",
+        height=800,
+        showlegend=False
     )
+
     return fig
 
 
 def create_cross_track_analysis_plot(df: pd.DataFrame) -> go.Figure:
     """Create cross-track performance correlation analysis."""
+
     if df.empty:
         fig = go.Figure()
         fig.add_annotation(text="No data available", x=0.5, y=0.5, showarrow=False)
         return fig
+
     # Get models with data in multiple tracks
     quality_cols = [f"{track}_quality" for track in EVALUATION_TRACKS.keys()]
     available_cols = [col for col in quality_cols if col in df.columns]
+
     if len(available_cols) < 2:
         fig = go.Figure()
+        fig.add_annotation(text="Need at least 2 tracks for comparison", x=0.5, y=0.5, showarrow=False)
         return fig
+
     # Filter to models with data in multiple tracks
     multi_track_models = df.copy()
     for col in available_cols:
         multi_track_models = multi_track_models[multi_track_models[col] > 0]
+
     if len(multi_track_models) < 3:
         fig = go.Figure()
+        fig.add_annotation(text="Insufficient models for cross-track analysis", x=0.5, y=0.5, showarrow=False)
         return fig
+
     # Create scatter plot matrix
+    track_pairs = [(available_cols[i], available_cols[j])
+                   for i in range(len(available_cols))
+                   for j in range(i+1, len(available_cols))]
+
     if not track_pairs:
         fig = go.Figure()
+        fig.add_annotation(text="No track pairs available", x=0.5, y=0.5, showarrow=False)
         return fig
+
     # Use first pair for demonstration
     x_col, y_col = track_pairs[0]
     x_track = x_col.replace("_quality", "").replace("_", " ").title()
     y_track = y_col.replace("_quality", "").replace("_", " ").title()
+
     fig = go.Figure()
+
     # Color by category
     for category, info in MODEL_CATEGORIES.items():
+        category_models = multi_track_models[multi_track_models["model_category"] == category]
+
         if len(category_models) > 0:
+            fig.add_trace(go.Scatter(
+                x=category_models[x_col],
+                y=category_models[y_col],
+                mode="markers",
+                marker=dict(
+                    size=10,
+                    color=info["color"],
+                    line=dict(color="black", width=1),
+                ),
+                name=info["name"],
+                text=category_models["model_name"],
+                hovertemplate=(
+                    "<b>%{text}</b><br>" +
+                    f"{x_track}: %{{x:.4f}}<br>" +
+                    f"{y_track}: %{{y:.4f}}<br>" +
+                    f"Category: {info['name']}<br>" +
+                    "<extra></extra>"
+                ),
+            ))
+
     # Add diagonal line for reference
     min_val = min(multi_track_models[x_col].min(), multi_track_models[y_col].min())
     max_val = max(multi_track_models[x_col].max(), multi_track_models[y_col].max())
+
+    fig.add_trace(go.Scatter(
+        x=[min_val, max_val],
+        y=[min_val, max_val],
+        mode="lines",
+        line=dict(dash="dash", color="gray", width=2),
+        name="Perfect Correlation",
+        showlegend=False,
+        hoverinfo="skip",
+    ))
+
     fig.update_layout(
         title=f"🔄 Cross-Track Performance: {x_track} vs {y_track}",
         xaxis_title=f"{x_track} Quality Score",
@@ -602,82 +553,71 @@ def create_cross_track_analysis_plot(df: pd.DataFrame) -> go.Figure:
         plot_bgcolor="white",
         paper_bgcolor="white",
     )
+
     return fig
 
 
+def create_scientific_model_detail_plot(model_results: Dict, model_name: str, track: str) -> go.Figure:
     """Create detailed scientific analysis for a specific model."""
+
     if not model_results or "tracks" not in model_results:
         fig = go.Figure()
+        fig.add_annotation(text="No model results available", x=0.5, y=0.5, showarrow=False)
         return fig
+
     track_data = model_results["tracks"].get(track, {})
     if track_data.get("error") or "pair_metrics" not in track_data:
         fig = go.Figure()
+        fig.add_annotation(text=f"No data for {track} track", x=0.5, y=0.5, showarrow=False)
         return fig
+
     pair_metrics = track_data["pair_metrics"]
     track_languages = EVALUATION_TRACKS[track]["languages"]
+
     # Extract data for plotting
     pairs = []
     quality_means = []
     quality_cis = []
     bleu_means = []
     sample_counts = []
+
     for src in track_languages:
         for tgt in track_languages:
             if src == tgt:
                 continue
+
             pair_key = f"{src}_to_{tgt}"
             if pair_key in pair_metrics:
                 metrics = pair_metrics[pair_key]
+
                 if "quality_score" in metrics and "sample_count" in metrics:
                     pair_label = f"{LANGUAGE_NAMES.get(src, src)} → {LANGUAGE_NAMES.get(tgt, tgt)}"
                     pairs.append(pair_label)
+
                     quality_stats = metrics["quality_score"]
                     quality_means.append(quality_stats["mean"])
+                    quality_cis.append([quality_stats["ci_lower"], quality_stats["ci_upper"]])
+
                     bleu_stats = metrics.get("bleu", {"mean": 0})
                     bleu_means.append(bleu_stats["mean"])
+
                     sample_counts.append(metrics["sample_count"])
+
     if not pairs:
         fig = go.Figure()
+        fig.add_annotation(text="No language pair data available", x=0.5, y=0.5, showarrow=False)
         return fig
+
     # Create subplots
     fig = make_subplots(
+        rows=2, cols=1,
         subplot_titles=(
             "Quality Scores by Language Pair (with 95% CI)",
+            "BLEU Scores by Language Pair"
         ),
         vertical_spacing=0.15,
     )
+
     # Quality scores with confidence intervals
     error_y = dict(
         type="data",
@@ -687,7 +627,7 @@ def create_scientific_model_detail_plot(
         thickness=2,
         width=4,
     )
+
     fig.add_trace(
         go.Bar(
             x=pairs,
@@ -698,17 +638,16 @@ def create_scientific_model_detail_plot(
             text=[f"{score:.3f}" for score in quality_means],
             textposition="outside",
             hovertemplate=(
+                "<b>%{x}</b><br>" +
+                "Quality: %{y:.4f}<br>" +
+                "Samples: %{customdata}<br>" +
+                "<extra></extra>"
             ),
             customdata=sample_counts,
         ),
+        row=1, col=1
     )
+
     # BLEU scores
     fig.add_trace(
         go.Bar(
@@ -719,10 +658,9 @@ def create_scientific_model_detail_plot(
             text=[f"{score:.1f}" for score in bleu_means],
             textposition="outside",
         ),
+        row=2, col=1
    )
+
     # Customize layout
     track_info = EVALUATION_TRACKS[track]
     fig.update_layout(
@@ -731,9 +669,9 @@ def create_scientific_model_detail_plot(
         showlegend=False,
         margin=dict(l=50, r=50, t=100, b=150),
     )
+
     # Rotate x-axis labels
     fig.update_xaxes(tickangle=45, row=1, col=1)
     fig.update_xaxes(tickangle=45, row=2, col=1)
+
+    return fig
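
Note on dependencies: plotting.py imports MODEL_CATEGORIES, CHART_CONFIG, STATISTICAL_CONFIG, SAMPLE_SIZE_RECOMMENDATIONS, and (elsewhere) EVALUATION_TRACKS and LANGUAGE_NAMES from config, but config.py is not part of this diff. Judging only from the access patterns in the code above (MODEL_CATEGORIES[category]["color"], EVALUATION_TRACKS[track]["languages"], LANGUAGE_NAMES.get(lang, lang.upper())), a minimal stand-in would look like the sketch below; every key, track, language, and color is a placeholder for illustration, not a value from this repo.

# Hypothetical stand-in for src/config.py; field names are inferred from
# how plotting.py reads them, and all concrete values are invented.
MODEL_CATEGORIES = {
    "open_source": {"name": "Open Source", "color": "#2ca02c"},
    "commercial": {"name": "Commercial", "color": "#1f77b4"},
}
EVALUATION_TRACKS = {
    # Each track needs a "languages" list; other fields (read via track_info)
    # are used in layout code this diff does not show.
    "flores": {"name": "FLORES", "languages": ["en", "fr", "sw"]},
}
LANGUAGE_NAMES = {"en": "English", "fr": "French", "sw": "Swahili"}
# The code shown above never inspects the contents of these three.
CHART_CONFIG = {}
STATISTICAL_CONFIG = {}
SAMPLE_SIZE_RECOMMENDATIONS = {}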
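
For reference, a minimal driver for the leaderboard plot under the stand-in config above. Column names follow the f"{track}_{metric}" convention the module uses throughout; rows are listed best-first because create_scientific_leaderboard_plot only filters and takes .head(top_n), it does not sort. The import path and all numbers are illustrative.

import pandas as pd
from plotting import create_scientific_leaderboard_plot  # path depends on the Space layout

# Toy leaderboard frame; every value is invented.
df = pd.DataFrame({
    "model_name": ["model-a", "model-b"],
    "author": ["alice", "bob"],
    "model_category": ["open_source", "commercial"],
    "flores_quality": [0.71, 0.64],   # f"{track}_{metric}" columns
    "flores_ci_lower": [0.68, 0.60],
    "flores_ci_upper": [0.74, 0.68],
    "flores_samples": [1012, 1012],
})

fig = create_scientific_leaderboard_plot(df, track="flores")
fig.show()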
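
The heatmap and model-detail plots take a nested results dict rather than a DataFrame. A toy instance consistent with the lookups in the code (tracks -> track -> pair_metrics -> "src_to_tgt", with mean/ci_lower/ci_upper under each metric and a sample_count per pair); again, every number is invented and the track and languages assume the stand-in config:

from plotting import (
    create_language_pair_heatmap_scientific,
    create_scientific_model_detail_plot,
)

model_results = {
    "tracks": {
        "flores": {
            "pair_metrics": {
                "en_to_sw": {
                    "quality_score": {"mean": 0.62, "ci_lower": 0.58, "ci_upper": 0.66},
                    "bleu": {"mean": 24.1},
                    "sample_count": 506,
                },
                "sw_to_en": {
                    "quality_score": {"mean": 0.70, "ci_lower": 0.66, "ci_upper": 0.74},
                    "bleu": {"mean": 31.5},
                    "sample_count": 506,
                },
            }
        }
    }
}

heatmap = create_language_pair_heatmap_scientific(model_results, track="flores")
detail = create_scientific_model_detail_plot(model_results, "model-a", track="flores")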