File size: 4,467 Bytes
4862c84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
"""Logic helpers for the **Overview** tab."""
from typing import List

from .state import app_state
from .utils import compute_model_rankings_new, create_model_summary_card_new

# Public API: ``create_overview`` is the only name exported by a star-import.
__all__ = ["create_overview"]


def create_overview(
    selected_models: List[str],
    top_n: int,
    score_significant_only: bool = False,
    quality_significant_only: bool = False,
    sort_by: str = "quality_asc",
    min_cluster_size: int = 1,
) -> str:
    """Build the Overview tab markup: an explanatory header plus one card per model.

    The rankings produced by ``compute_model_rankings_new`` are narrowed to
    the models in *selected_models*. An entry named ``"all"`` (if selected) is
    placed first; the remaining models keep the order the ranking helper
    returned them in. Each surviving model is rendered with
    ``create_model_summary_card_new``.

    Args:
        selected_models: Names of the models to include.
        top_n: Forwarded positionally to ``create_model_summary_card_new``.
        score_significant_only: Forwarded to the card helper unchanged.
        quality_significant_only: Forwarded to the card helper unchanged.
        sort_by: Forwarded to the card helper unchanged.
        min_cluster_size: Forwarded to the card helper unchanged.

    Returns:
        The assembled HTML string, or a plain-text message when no metrics
        are loaded, no model is selected, or none of the selected models
        appear in the rankings.
    """
    metrics = app_state["metrics"]
    if not metrics:
        return "Please load data first using the 'Load Data' tab."
    if not selected_models:
        return "Please select at least one model to display."

    # Only the names are needed below; the per-model stats are not used here.
    chosen_names = [
        name
        for name, _stats in compute_model_rankings_new(metrics)
        if name in selected_models
    ]
    if not chosen_names:
        return "No data available for selected models."

    # Stable sort: False (< True) floats "all" to the front while every other
    # model keeps its ranking order.
    chosen_names.sort(key=lambda name: name != "all")

    intro_html = """
    <div style="max-width: 1600px; margin: 0 auto;">
        <p style="color: #666; margin-bottom: 10px;">
            Top distinctive clusters where each model shows unique behavioural patterns.
            Frequency shows what percentage of a model's battles resulted in that behavioural pattern.
        </p>

        <details style="margin-bottom:25px;">
            <summary style="cursor:pointer; color:#4c6ef5; font-weight:600;">ℹ️  What do "proportion delta", "Quality Δ", and significance tags mean?</summary>
            <div style="margin-top:12px; font-size:14px; line-height:1.5; color:#333;">
                <strong>Proportion Delta</strong><br>
                For each cluster we compute how often <em>this model</em> appears in that cluster compared with the average across all models.<br>
                • A positive value (e.g. <code>+0.15</code>) means the model hits the behaviour more often than average.<br>
                • A negative value (e.g. <code>-0.08</code>) means it appears less often.<br>
                It is derived from the&nbsp;<code>proportion_delta</code>&nbsp;field in <code>model_cluster_scores.json</code>.<br><br>
                <strong>Quality Δ</strong><br>
                The difference between the cluster's quality score(s) for this model and the model's <em>overall</em> quality baseline, shown for each individual metric (e.g., helpfulness, accuracy).<br>
                Positive values (green) indicate the model performs better than its average in that behaviour; negative values (red) indicate worse.<br>
                This is derived from the <code>quality_delta</code> metric dictionary in <code>model_cluster_scores.json</code>.<br><br>
                <strong>Significance Tags (FREQ/QUAL)</strong><br>
                The <span style="background: #28a745; color: white; padding: 2px 6px; border-radius: 4px; font-size: 10px; font-weight: bold;">FREQ</span> and <span style="background: #007bff; color: white; padding: 2px 6px; border-radius: 4px; font-size: 10px; font-weight: bold;">QUAL</span> tags indicate <em>statistical significance</em> based on confidence intervals:<br>
                • <strong>FREQ</strong> (green): The proportion delta is statistically significant (confidence interval doesn't include zero)<br>
                • <strong>QUAL</strong> (blue): At least one quality metric delta is statistically significant<br>
                These tags help identify which behavioral patterns are reliably different from the model's baseline performance.
            </div>
        </details>
    """

    # One card per model, rendered in display order.
    cards = (
        create_model_summary_card_new(
            name,
            metrics,
            top_n,
            score_significant_only=score_significant_only,
            quality_significant_only=quality_significant_only,
            sort_by=sort_by,
            min_cluster_size=min_cluster_size,
        )
        for name in chosen_names
    )

    # Header, then the cards, then the tag closing the wrapper <div>.
    return "".join([intro_html, *cards, "</div>"])