import gradio as gr
from typing import Tuple


def render_score_card(
    ttr_output: float,
    ttr_train: float,
    similarity: float,
    mad_score: float,
    risk_level: str,
) -> Tuple[str, str, str]:
    """Renders the evaluation summary and score details."""
    color = {"High": "#e57373", "Medium": "#ffb74d", "Low": "#81c784"}[risk_level]

    risk_explanations = {
        "High": """
🚨 **High Risk Detected**

Your model outputs are **very similar** to your planned training data.
This suggests a **strong feedback loop**, meaning the model is likely to
reinforce existing patterns rather than learning new behaviors.

**What You Can Do**:
- Replace synthetic data with more **diverse real user input**
- Use **paraphrasing techniques** before reuse
- Add **augmentation or filtering** before retraining
""",
        "Medium": """
🟠 **Moderate Risk Identified**

There is some overlap between your outputs and training content.
Your model may partially reinforce existing phrasing patterns.

**Suggestions**:
- Mix synthetic and real inputs carefully
- Monitor training logs for semantic redundancy
""",
        "Low": """
🟒 **Low Risk Score**

Your model output and training data appear **diverse** and distinct.
This is a good sign that your model is learning from **new and varied sources**.

**You're on the right track!**
""",
    }

    summary = f"""
### πŸ” Evaluation Summary

**Lexical Diversity (Output):** {ttr_output:.2f}
TTR = unique words / total words

**Lexical Diversity (Training Set):** {ttr_train:.2f}
Broader content = higher TTR

**Semantic Similarity (Cosine):** {similarity:.2f}
Cosine similarity between embeddings

**MAD Risk Score:** {mad_score:.2f} β†’ <span style="color:{color}">**{risk_level} Risk**</span>
""" details = f"""
πŸ“Š Score Breakdown TTR Component (0.3 Γ— (1 - TTR)): {(1 - ttr_output) * 0.3:.2f} Similarity Component (0.7 Γ— Cosine): {similarity * 0.7:.2f} MAD Score = 0.3 Γ— (1 - TTR) + 0.7 Γ— Semantic Similarity
""" explanation = f"""
πŸ” What does this score mean? {risk_explanations[risk_level]}
""" return summary, details, explanation