Spaces:
Sleeping
Sleeping
File size: 4,467 Bytes
4862c84 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 |
"""Logic helpers for the **Overview** tab."""
from typing import List
from .state import app_state
from .utils import compute_model_rankings_new, create_model_summary_card_new
__all__ = ["create_overview"]
# Static lead-in of the overview page. It opens the wrapper <div> (closed by
# create_overview after the last model card) and carries the legend that
# explains the figures shown on each card.
_OVERVIEW_INTRO_HTML = """
<div style="max-width: 1600px; margin: 0 auto;">
<p style="color: #666; margin-bottom: 10px;">
Top distinctive clusters where each model shows unique behavioural patterns.
Frequency shows what percentage of a model's battles resulted in that behavioural pattern.
</p>
<details style="margin-bottom:25px;">
<summary style="cursor:pointer; color:#4c6ef5; font-weight:600;">ℹ️ What do "proportion delta", "Quality Δ", and significance tags mean?</summary>
<div style="margin-top:12px; font-size:14px; line-height:1.5; color:#333;">
<strong>Proportion Delta</strong><br>
For each cluster we compute how often <em>this model</em> appears in that cluster compared with the average across all models.<br>
• A positive value (e.g. <code>+0.15</code>) means the model hits the behaviour more often than average.<br>
• A negative value (e.g. <code>-0.08</code>) means it appears less often.<br>
It is derived from the <code>proportion_delta</code> field in <code>model_cluster_scores.json</code>.<br><br>
<strong>Quality Δ</strong><br>
The difference between the cluster's quality score(s) for this model and the model's <em>overall</em> quality baseline, shown for each individual metric (e.g., helpfulness, accuracy).<br>
Positive values (green) indicate the model performs better than its average in that behaviour; negative values (red) indicate worse.<br>
This is derived from the <code>quality_delta</code> metric dictionary in <code>model_cluster_scores.json</code>.<br><br>
<strong>Significance Tags (FREQ/QUAL)</strong><br>
The <span style="background: #28a745; color: white; padding: 2px 6px; border-radius: 4px; font-size: 10px; font-weight: bold;">FREQ</span> and <span style="background: #007bff; color: white; padding: 2px 6px; border-radius: 4px; font-size: 10px; font-weight: bold;">QUAL</span> tags indicate <em>statistical significance</em> based on confidence intervals:<br>
• <strong>FREQ</strong> (green): The proportion delta is statistically significant (confidence interval doesn't include zero)<br>
• <strong>QUAL</strong> (blue): At least one quality metric delta is statistically significant<br>
These tags help identify which behavioral patterns are reliably different from the model's baseline performance.
</div>
</details>
"""


def create_overview(
    selected_models: List[str],
    top_n: int,
    score_significant_only: bool = False,
    quality_significant_only: bool = False,
    sort_by: str = "quality_asc",
    min_cluster_size: int = 1,
) -> str:
    """Return the HTML snippet that summarises model performance.

    The snippet is the static legend followed by one summary card per
    selected model. Cards follow the order produced by
    ``compute_model_rankings_new``, except that the entry named ``"all"``
    (when selected) is always placed first.

    Args:
        selected_models: Names of the models to include. Names that do not
            occur in the computed rankings are ignored.
        top_n: Forwarded unchanged to ``create_model_summary_card_new``.
        score_significant_only: Forwarded unchanged to the card builder.
        quality_significant_only: Forwarded unchanged to the card builder.
        sort_by: Forwarded unchanged to the card builder.
        min_cluster_size: Forwarded unchanged to the card builder.

    Returns:
        The assembled HTML, or a plain-text message when no metrics are
        loaded, no model is selected, or none of the selected models appear
        in the rankings.
    """
    metrics = app_state["metrics"]
    if not metrics:
        return "Please load data first using the 'Load Data' tab."
    if not selected_models:
        return "Please select at least one model to display."

    # 1. Compute global rankings and keep only the selected models.
    #    A set makes each membership test O(1) instead of a list scan.
    wanted = set(selected_models)
    ranked_names = [
        name
        for name, _stats in compute_model_rankings_new(metrics)
        if name in wanted
    ]
    if not ranked_names:
        return "No data available for selected models."

    # "all" goes first; the sort is stable, so every other model keeps its
    # ranking order (False sorts before True).
    ordered_names = sorted(ranked_names, key=lambda name: name != "all")

    # 2. Assemble HTML: legend, one card per model, closing wrapper tag.
    cards = [
        create_model_summary_card_new(
            model_name,
            metrics,
            top_n,
            score_significant_only=score_significant_only,
            quality_significant_only=quality_significant_only,
            sort_by=sort_by,
            min_cluster_size=min_cluster_size,
        )
        for model_name in ordered_names
    ]
    return "".join([_OVERVIEW_INTRO_HTML, *cards, "</div>"])