"""Logic helpers for the **Overview** tab.""" from typing import List from .state import app_state from .utils import compute_model_rankings_new, create_model_summary_card_new __all__ = ["create_overview"] def create_overview( selected_models: List[str], top_n: int, score_significant_only: bool = False, quality_significant_only: bool = False, sort_by: str = "quality_asc", min_cluster_size: int = 1, ) -> str: """Return the HTML snippet that summarises model performance.""" if not app_state["metrics"]: return "Please load data first using the 'Load Data' tab." if not selected_models: return "Please select at least one model to display." # 1. Compute global rankings and filter to selection model_rankings = compute_model_rankings_new(app_state["metrics"]) filtered_rankings = [ (name, stats) for name, stats in model_rankings if name in selected_models ] # Sort so "all" appears first, then the rest by their rankings all_models = [(name, stats) for name, stats in filtered_rankings if name == "all"] other_models = [(name, stats) for name, stats in filtered_rankings if name != "all"] filtered_rankings = all_models + other_models if not filtered_rankings: return "No data available for selected models." # 2. Assemble HTML overview_html = """

Top distinctive clusters where each model shows unique behavioural patterns. Frequency shows what percentage of a model's battles resulted in that behavioural pattern.

ℹ️ What do "proportion delta", "Quality Δ", and significance tags mean?
Proportion Delta
For each cluster we compute how often this model appears in that cluster compared with the average across all models.
• A positive value (e.g. +0.15) means the model hits the behaviour more often than average.
• A negative value (e.g. -0.08) means it appears less often.
It is derived from the proportion_delta field in model_cluster_scores.json.

Quality Δ
The difference between the cluster's quality score(s) for this model and the model's overall quality baseline, shown for each individual metric (e.g., helpfulness, accuracy).
Positive values (green) indicate the model performs better than its average in that behaviour; negative values (red) indicate worse.
This is derived from the quality_delta metric dictionary in model_cluster_scores.json.

Significance Tags (FREQ/QUAL)
The FREQ and QUAL tags indicate statistical significance based on confidence intervals:
FREQ (green): The proportion delta is statistically significant (confidence interval doesn't include zero)
QUAL (blue): At least one quality metric delta is statistically significant
These tags help identify which behavioral patterns are reliably different from the model's baseline performance.
""" for model_name, _ in filtered_rankings: card_html = create_model_summary_card_new( model_name, app_state["metrics"], # top_n etc. top_n, score_significant_only=score_significant_only, quality_significant_only=quality_significant_only, sort_by=sort_by, min_cluster_size=min_cluster_size, ) overview_html += card_html overview_html += "
" return overview_html