Spaces:

akera
/

leaderboard

Sleeping

App Files Files Community

akera committed on Jun 16

Commit

f0df659

verified ·

1 Parent(s): d5b83bc

Update src/evaluation.py

Browse files

Files changed (1) hide show

src/evaluation.py +543 -336

src/evaluation.py CHANGED Viewed

@@ -6,414 +6,621 @@ from rouge_score import rouge_scorer
 import Levenshtein
 from collections import defaultdict
 from transformers.models.whisper.english_normalizer import BasicTextNormalizer
-from typing import Dict, List, Tuple
-from config import ALL_UG40_LANGUAGES, GOOGLE_SUPPORTED_LANGUAGES, METRICS_CONFIG
 from src.utils import get_all_language_pairs, get_google_comparable_pairs
 def calculate_sentence_metrics(reference: str, prediction: str) -> Dict[str, float]:
     """Compute every sentence-level metric for one reference/prediction pair.

     Falsy or non-string inputs are replaced by the empty string, then both
     texts are passed through ``BasicTextNormalizer`` before scoring. If an
     individual metric raises, it is replaced by its worst-case default so
     that one bad sample cannot abort a whole evaluation run.

     Args:
         reference: Gold translation.
         prediction: Model output to score.

     Returns:
         Dict with ``bleu`` (0-100 scale), ``chrf`` (0-1), ``cer``, ``wer``,
         ``len_ratio`` (``inf`` when the reference is empty but the
         prediction is not), ``rouge1``, ``rouge2``, ``rougeL`` and the
         composite ``quality_score``.
     """
     if not prediction or not isinstance(prediction, str):
         prediction = ""
     if not reference or not isinstance(reference, str):
         reference = ""

     normalizer = BasicTextNormalizer()
     pred_norm = normalizer(prediction)
     ref_norm = normalizer(reference)
     metrics = {}

     # BLEU stays on the 0-100 scale here; it is rescaled for the composite.
     # ``except Exception`` (not a bare ``except``) keeps Ctrl-C and
     # SystemExit working while still tolerating scorer failures.
     try:
         bleu = BLEU(effective_order=True)
         metrics['bleu'] = bleu.sentence_score(pred_norm, [ref_norm]).score
     except Exception:
         metrics['bleu'] = 0.0

     # ChrF, normalised to 0-1.
     try:
         chrf = CHRF()
         metrics['chrf'] = chrf.sentence_score(pred_norm, [ref_norm]).score / 100.0
     except Exception:
         metrics['chrf'] = 0.0

     # Character error rate: edit distance relative to the reference length.
     try:
         if len(ref_norm) > 0:
             metrics['cer'] = Levenshtein.distance(ref_norm, pred_norm) / len(ref_norm)
         else:
             metrics['cer'] = 1.0 if len(pred_norm) > 0 else 0.0
     except Exception:
         metrics['cer'] = 1.0

     # Word error rate: same idea on whitespace-separated tokens.
     try:
         ref_words = ref_norm.split()
         pred_words = pred_norm.split()
         if len(ref_words) > 0:
             metrics['wer'] = Levenshtein.distance(ref_words, pred_words) / len(ref_words)
         else:
             metrics['wer'] = 1.0 if len(pred_words) > 0 else 0.0
     except Exception:
         metrics['wer'] = 1.0

     # Length ratio in characters (prediction / reference).
     try:
         if len(ref_norm) > 0:
             metrics['len_ratio'] = len(pred_norm) / len(ref_norm)
         else:
             metrics['len_ratio'] = 1.0 if len(pred_norm) == 0 else float('inf')
     except Exception:
         metrics['len_ratio'] = 1.0

     # ROUGE F-measures.
     try:
         scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
         rouge_scores = scorer.score(ref_norm, pred_norm)
         metrics['rouge1'] = rouge_scores['rouge1'].fmeasure
         metrics['rouge2'] = rouge_scores['rouge2'].fmeasure
         metrics['rougeL'] = rouge_scores['rougeL'].fmeasure
     except Exception:
         metrics['rouge1'] = 0.0
         metrics['rouge2'] = 0.0
         metrics['rougeL'] = 0.0

     # Composite quality score: mean of six components, each mapped to 0-1
     # with "higher is better" (error rates are clipped at 1 and inverted).
     try:
         quality_components = [
             metrics['bleu'] / 100.0,
             metrics['chrf'],
             1.0 - min(metrics['cer'], 1.0),
             1.0 - min(metrics['wer'], 1.0),
             metrics['rouge1'],
             metrics['rougeL'],
         ]
         metrics['quality_score'] = float(np.mean(quality_components))
     except Exception as e:
         # Fallback without ROUGE
         print(f"Error calculating quality score: {e}")
         try:
             fallback_components = [
                 metrics['bleu'] / 100.0,
                 metrics['chrf'],
                 1.0 - min(metrics['cer'], 1.0),
                 1.0 - min(metrics['wer'], 1.0),
             ]
             metrics['quality_score'] = float(np.mean(fallback_components))
         except Exception:
             metrics['quality_score'] = 0.0

     return metrics
def evaluate_predictions(predictions: pd.DataFrame, test_set: pd.DataFrame) -> Dict:
    """Score a submission against the test set and aggregate by language pair.

    Predictions are inner-joined to the test set on ``sample_id``; every
    matched row is scored with ``calculate_sentence_metrics``. Per-pair
    averages ignore NaN and infinite values. The overall and
    Google-comparable averages are means of the per-pair averages, so every
    covered pair carries the same weight whatever its sample count.

    Args:
        predictions: Frame with at least ``sample_id`` and ``prediction``.
        test_set: Frame with at least ``sample_id``, ``target_text``,
            ``source_language`` and ``target_language``.

    Returns:
        On success a dict with ``sample_metrics`` (per-sample frame),
        ``pair_metrics``, ``averages``, ``google_comparable_averages``,
        ``summary``, ``evaluated_samples`` and ``error`` set to ``None``.
        When nothing matches, a dict holding only ``error`` and
        ``evaluated_samples`` (0).
    """
    print("Starting evaluation...")

    # Merge predictions with test set (which contains targets)
    merged = test_set.merge(
        predictions,
        on='sample_id',
        how='inner',
        suffixes=('', '_pred')
    )
    if len(merged) == 0:
        return {
            'error': 'No matching samples found between predictions and test set',
            'evaluated_samples': 0
        }

    print(f"Evaluating {len(merged)} samples...")

    # Calculate metrics for each sample
    sample_metrics = []
    for _, row in merged.iterrows():
        metrics = calculate_sentence_metrics(row['target_text'], row['prediction'])
        metrics['sample_id'] = row['sample_id']
        metrics['source_language'] = row['source_language']
        metrics['target_language'] = row['target_language']
        metrics['google_comparable'] = row.get('google_comparable', False)
        sample_metrics.append(metrics)
    sample_df = pd.DataFrame(sample_metrics)

    metric_names = [
        metric
        for metric in METRICS_CONFIG['primary_metrics'] + METRICS_CONFIG['secondary_metrics']
        if metric in sample_df.columns
    ]

    # Group once instead of re-masking the whole frame for each of the
    # len(languages)^2 ordered pairs; pairs are still visited in the order
    # of ALL_UG40_LANGUAGES so the result dict keeps its original ordering.
    rows_by_pair = {
        pair: rows
        for pair, rows in sample_df.groupby(['source_language', 'target_language'])
    }

    pair_metrics = {}
    overall_metrics = defaultdict(list)
    google_comparable_metrics = defaultdict(list)
    google_pair_keys = set()

    for src_lang in ALL_UG40_LANGUAGES:
        for tgt_lang in ALL_UG40_LANGUAGES:
            if src_lang == tgt_lang:
                continue
            pair_data = rows_by_pair.get((src_lang, tgt_lang))
            if pair_data is None or len(pair_data) == 0:
                continue

            pair_key = f"{src_lang}_to_{tgt_lang}"
            is_google_pair = (
                src_lang in GOOGLE_SUPPORTED_LANGUAGES
                and tgt_lang in GOOGLE_SUPPORTED_LANGUAGES
            )
            if is_google_pair:
                google_pair_keys.add(pair_key)

            pair_result = {}
            for metric in metric_names:
                # inf (e.g. len_ratio against an empty reference) and NaN
                # would poison the mean, so they are dropped first.
                valid_values = pair_data[metric].replace([np.inf, -np.inf], np.nan).dropna()
                if len(valid_values) == 0:
                    continue
                avg_value = float(valid_values.mean())
                pair_result[metric] = avg_value
                overall_metrics[metric].append(avg_value)
                if is_google_pair:
                    google_comparable_metrics[metric].append(avg_value)

            pair_result['sample_count'] = len(pair_data)
            pair_metrics[pair_key] = pair_result

    # Macro-averages over language pairs.
    averages = {
        metric: float(np.mean(values)) if values else 0.0
        for metric, values in overall_metrics.items()
    }
    google_averages = {
        metric: float(np.mean(values)) if values else 0.0
        for metric, values in google_comparable_metrics.items()
    }

    # Generate evaluation summary
    summary = {
        'total_samples': len(sample_df),
        'language_pairs_covered': len(
            [k for k in pair_metrics if pair_metrics[k].get('sample_count', 0) > 0]
        ),
        'google_comparable_pairs': len(google_pair_keys),
        'primary_metrics': {metric: averages.get(metric, 0.0)
                            for metric in METRICS_CONFIG['primary_metrics']},
        'secondary_metrics': {metric: averages.get(metric, 0.0)
                              for metric in METRICS_CONFIG['secondary_metrics']}
    }

    return {
        'sample_metrics': sample_df,
        'pair_metrics': pair_metrics,
        'averages': averages,
        'google_comparable_averages': google_averages,
        'summary': summary,
        'evaluated_samples': len(sample_df),
        'error': None
    }
-# Keep the rest of the functions unchanged...
def compare_with_baseline(results: Dict, baseline_results: Dict = None) -> Dict:
    """Compare a submission's results with a baseline such as Google Translate.

    Args:
        results: Output of the evaluation; must contain ``averages`` and
            ``pair_metrics``.
        baseline_results: Results in the same layout for the baseline.
            ``None`` or an empty dict both mean "no baseline available".

    Returns:
        When no baseline is available, a dict with ``comparison_available``
        set to ``False`` and a ``message``. Otherwise a dict with
        ``comparison_available`` (``True``), ``overall_comparison`` (per
        primary metric: user score, baseline score, improvement and
        improvement percent), ``pair_comparisons`` (Google-comparable pairs
        only), ``better_pairs`` and ``worse_pairs``. ``improvement`` is
        always signed so that a positive number means the submission is
        better, including for the error metrics ``cer`` and ``wer``.
    """
    # An empty dict is what the baseline loader returns when nothing has been
    # pre-computed, so it must be treated like None rather than indexed.
    if not baseline_results:
        return {
            'comparison_available': False,
            'message': 'No baseline available for comparison'
        }

    def improvement_over_baseline(metric, user_score, baseline_score):
        # For error metrics (cer, wer), lower is better.
        if metric in ('cer', 'wer'):
            return baseline_score - user_score
        return user_score - baseline_score

    user_averages = results['averages']
    user_pairs = results['pair_metrics']
    baseline_averages = baseline_results.get('averages', {})
    baseline_pairs = baseline_results.get('pair_metrics', {})

    comparison = {
        'comparison_available': True,
        'overall_comparison': {},
        'pair_comparisons': {},
        'better_pairs': [],
        'worse_pairs': []
    }

    # Compare overall metrics
    for metric in METRICS_CONFIG['primary_metrics']:
        if metric not in user_averages or metric not in baseline_averages:
            continue
        user_score = user_averages[metric]
        baseline_score = baseline_averages[metric]
        improvement = improvement_over_baseline(metric, user_score, baseline_score)
        comparison['overall_comparison'][metric] = {
            'user_score': user_score,
            'baseline_score': baseline_score,
            'improvement': improvement,
            # The floor on the denominator avoids division by zero.
            'improvement_percent': (improvement / max(baseline_score, 0.001)) * 100
        }

    # Compare by language pairs (only Google comparable ones)
    google_pairs = [k for k in user_pairs
                    if '_to_' in k and
                    k.split('_to_')[0] in GOOGLE_SUPPORTED_LANGUAGES and
                    k.split('_to_')[1] in GOOGLE_SUPPORTED_LANGUAGES]

    for pair in google_pairs:
        if pair not in baseline_pairs:
            continue

        pair_comparison = {}
        for metric in METRICS_CONFIG['primary_metrics']:
            if metric not in user_pairs[pair] or metric not in baseline_pairs[pair]:
                continue
            user_score = user_pairs[pair][metric]
            baseline_score = baseline_pairs[pair][metric]
            pair_comparison[metric] = {
                'user_score': user_score,
                'baseline_score': baseline_score,
                'improvement': improvement_over_baseline(metric, user_score, baseline_score)
            }
        comparison['pair_comparisons'][pair] = pair_comparison

        # A pair counts as better/worse only beyond a 0.01 quality margin.
        quality_improvement = pair_comparison.get('quality_score', {}).get('improvement', 0)
        if quality_improvement > 0.01:
            comparison['better_pairs'].append(pair)
        elif quality_improvement < -0.01:
            comparison['worse_pairs'].append(pair)

    return comparison
def generate_evaluation_report(results: Dict, model_name: str = "", comparison: Dict = None) -> str:
    """Render evaluation results as a Markdown report.

    Args:
        results: Evaluation output. If it carries a truthy ``error`` only a
            one-line error message is returned; otherwise ``summary`` and
            ``pair_metrics`` are required.
        model_name: Name shown in the header; ``'Submission'`` when empty.
        comparison: Optional baseline comparison. It is used only when its
            ``comparison_available`` flag is truthy.

    Returns:
        The report as a single newline-joined string.
    """
    if results.get('error'):
        return f"❌ **Evaluation Error**: {results['error']}"

    def pair_label(pair_key):
        # Split on the first separator only, so an unusual key (no "_to_",
        # or more than one) is still printed instead of raising ValueError.
        src, sep, tgt = pair_key.partition('_to_')
        return f"{src} → {tgt}" if sep else pair_key

    has_comparison = bool(comparison and comparison.get('comparison_available'))
    report = []

    # Header
    report.append(f"## Evaluation Report: {model_name or 'Submission'}")
    report.append("")

    # Summary
    summary = results['summary']
    report.append("### 📊 Summary")
    report.append(f"- **Total Samples Evaluated**: {summary['total_samples']:,}")
    report.append(f"- **Language Pairs Covered**: {summary['language_pairs_covered']}")
    report.append(f"- **Google Comparable Pairs**: {summary['google_comparable_pairs']}")
    report.append("")

    # Primary metrics (BLEU is on a 0-100 scale, so two decimals suffice)
    report.append("### 🎯 Primary Metrics")
    for metric, value in summary['primary_metrics'].items():
        formatted_value = f"{value:.4f}" if metric != 'bleu' else f"{value:.2f}"
        report.append(f"- **{metric.upper()}**: {formatted_value}")

    # Quality ranking (if comparison available)
    if has_comparison:
        quality_comp = comparison.get('overall_comparison', {}).get('quality_score', {})
        if quality_comp:
            improvement = quality_comp.get('improvement', 0)
            if improvement > 0.01:
                report.append(f"  - 🟢 **{improvement:.3f}** better than baseline")
            elif improvement < -0.01:
                report.append(f"  - 🔴 **{abs(improvement):.3f}** worse than baseline")
            else:
                report.append("  - 🟡 Similar to baseline")
    report.append("")

    # Secondary metrics
    report.append("### 📈 Secondary Metrics")
    for metric, value in summary['secondary_metrics'].items():
        report.append(f"- **{metric.upper()}**: {value:.4f}")
    report.append("")

    # Language pair performance: top five, plus bottom three when there are
    # more than five scored pairs.
    pair_metrics = results['pair_metrics']
    if pair_metrics:
        sorted_pairs = sorted(
            [(k, v.get('quality_score', 0)) for k, v in pair_metrics.items()
             if v.get('sample_count', 0) > 0],
            key=lambda item: item[1],
            reverse=True
        )
        if sorted_pairs:
            report.append("### 🏆 Best Performing Language Pairs")
            for pair, score in sorted_pairs[:5]:
                report.append(f"- **{pair_label(pair)}**: {score:.3f}")
            if len(sorted_pairs) > 5:
                report.append("")
                report.append("### 📉 Challenging Language Pairs")
                for pair, score in sorted_pairs[-3:]:
                    report.append(f"- **{pair_label(pair)}**: {score:.3f}")

    # Comparison with baseline
    if has_comparison:
        report.append("")
        report.append("### 🔍 Comparison with Baseline")
        # Read every list with .get so a partial comparison dict cannot raise.
        better_pairs = comparison.get('better_pairs', [])
        worse_pairs = comparison.get('worse_pairs', [])
        total_comparable = len(comparison.get('pair_comparisons', {}))
        if total_comparable > 0:
            report.append(f"- **Better than baseline**: {len(better_pairs)}/{total_comparable} pairs")
            report.append(f"- **Worse than baseline**: {len(worse_pairs)}/{total_comparable} pairs")
            if better_pairs:
                report.append("  - Strong pairs: " + ", ".join(better_pairs[:3]))
            if worse_pairs:
                report.append("  - Weak pairs: " + ", ".join(worse_pairs[:3]))

    return "\n".join(report)
def create_sample_analysis(results: Dict, n_samples: int = 10) -> pd.DataFrame:
    """Pick the best and worst scored samples for manual inspection.

    Up to ``n_samples // 2`` rows with the highest ``quality_score`` are
    labelled ``'Best'``. Up to the same number with the lowest score are
    then taken from the *remaining* rows and labelled ``'Worst'``, so a
    small table never lists one sample under both labels.

    Args:
        results: Evaluation output; its ``sample_metrics`` entry is the
            per-sample frame.
        n_samples: Total number of rows wanted (half best, half worst).

    Returns:
        Frame with ``sample_id``, ``source_language``, ``target_language``,
        ``quality_score``, ``bleu``, ``chrf`` and ``category``; an empty
        frame when there is nothing to analyse.
    """
    if 'sample_metrics' not in results:
        return pd.DataFrame()

    sample_df = results['sample_metrics']
    # Nothing to rank: an empty table, or one that was never scored.
    if sample_df is None or len(sample_df) == 0 or 'quality_score' not in sample_df.columns:
        return pd.DataFrame()

    per_side = n_samples // 2
    best_samples = sample_df.nlargest(per_side, 'quality_score')
    # Exclude rows already chosen as "best" (matched by index label).
    remaining = sample_df.drop(index=best_samples.index)
    worst_samples = remaining.nsmallest(per_side, 'quality_score')

    labelled = [
        frame.assign(category=label)
        for frame, label in ((best_samples, 'Best'), (worst_samples, 'Worst'))
        if len(frame) > 0
    ]
    if not labelled:
        return pd.DataFrame()

    analysis_samples = pd.concat(labelled)
    return analysis_samples[['sample_id', 'source_language', 'target_language',
                             'quality_score', 'bleu', 'chrf', 'category']]
def get_google_translate_baseline() -> Dict:
    """Return the Google Translate baseline results, if any are available.

    Placeholder: no pre-computed baseline is loaded yet, so this always
    returns an empty dict. The previous ``try``/``except`` wrapped a constant
    return that cannot raise and has been removed.
    """
    return {}

 import Levenshtein
 from collections import defaultdict
 from transformers.models.whisper.english_normalizer import BasicTextNormalizer
+from typing import Dict, List, Tuple, Optional
+from scipy import stats
+from scipy.stats import bootstrap
+import warnings
+from config import (
+    ALL_UG40_LANGUAGES,
+    GOOGLE_SUPPORTED_LANGUAGES,
+    METRICS_CONFIG,
+    STATISTICAL_CONFIG,
+    EVALUATION_TRACKS,
+    MODEL_CATEGORIES,
+)
 from src.utils import get_all_language_pairs, get_google_comparable_pairs
+warnings.filterwarnings("ignore", category=RuntimeWarning)
 def calculate_sentence_metrics(reference: str, prediction: str) -> Dict[str, float]:
     """Calculate all metrics for a single sentence pair with robust error handling.

     Falsy or non-string inputs are replaced by the empty string, then both
     texts are passed through ``BasicTextNormalizer``. Each metric is computed
     in isolation: if one raises, it falls back to its worst-case default and
     the remaining metrics are still produced.

     Args:
         reference: Gold translation.
         prediction: Model output to score.

     Returns:
         Dict with ``bleu`` (0-100 scale), ``chrf`` (0-1), ``cer``, ``wer``,
         ``len_ratio`` (``inf`` when the reference is empty but the
         prediction is not), ``rouge1``, ``rouge2``, ``rougeL`` and the
         composite ``quality_score``.
     """
     # Handle empty predictions
     if not prediction or not isinstance(prediction, str):
         prediction = ""
     if not reference or not isinstance(reference, str):
         reference = ""

     # Normalize texts
     normalizer = BasicTextNormalizer()
     pred_norm = normalizer(prediction)
     ref_norm = normalizer(reference)
     metrics = {}

     # BLEU score (0-100 scale). ``except Exception`` rather than a bare
     # ``except`` so KeyboardInterrupt/SystemExit are never swallowed.
     try:
         bleu = BLEU(effective_order=True)
         metrics["bleu"] = bleu.sentence_score(pred_norm, [ref_norm]).score
     except Exception:
         metrics["bleu"] = 0.0

     # ChrF score (normalize to 0-1)
     try:
         chrf = CHRF()
         metrics["chrf"] = chrf.sentence_score(pred_norm, [ref_norm]).score / 100.0
     except Exception:
         metrics["chrf"] = 0.0

     # Character Error Rate (CER)
     try:
         if len(ref_norm) > 0:
             metrics["cer"] = Levenshtein.distance(ref_norm, pred_norm) / len(ref_norm)
         else:
             metrics["cer"] = 1.0 if len(pred_norm) > 0 else 0.0
     except Exception:
         metrics["cer"] = 1.0

     # Word Error Rate (WER)
     try:
         ref_words = ref_norm.split()
         pred_words = pred_norm.split()
         if len(ref_words) > 0:
             metrics["wer"] = Levenshtein.distance(ref_words, pred_words) / len(
                 ref_words
             )
         else:
             metrics["wer"] = 1.0 if len(pred_words) > 0 else 0.0
     except Exception:
         metrics["wer"] = 1.0

     # Length ratio (characters, prediction / reference)
     try:
         if len(ref_norm) > 0:
             metrics["len_ratio"] = len(pred_norm) / len(ref_norm)
         else:
             metrics["len_ratio"] = 1.0 if len(pred_norm) == 0 else float("inf")
     except Exception:
         metrics["len_ratio"] = 1.0

     # ROUGE scores
     try:
         scorer = rouge_scorer.RougeScorer(
             ["rouge1", "rouge2", "rougeL"], use_stemmer=True
         )
         rouge_scores = scorer.score(ref_norm, pred_norm)
         metrics["rouge1"] = rouge_scores["rouge1"].fmeasure
         metrics["rouge2"] = rouge_scores["rouge2"].fmeasure
         metrics["rougeL"] = rouge_scores["rougeL"].fmeasure
     except Exception:
         metrics["rouge1"] = 0.0
         metrics["rouge2"] = 0.0
         metrics["rougeL"] = 0.0

     # Quality score: mean of six 0-1 "higher is better" components.
     try:
         quality_components = [
             metrics["bleu"] / 100.0,  # Normalize BLEU to 0-1
             metrics["chrf"],  # Already 0-1
             1.0 - min(metrics["cer"], 1.0),  # Clip and invert error rates
             1.0 - min(metrics["wer"], 1.0),
             metrics["rouge1"],
             metrics["rougeL"],
         ]
         metrics["quality_score"] = float(np.mean(quality_components))
     except Exception:
         metrics["quality_score"] = 0.0

     return metrics
def calculate_statistical_metrics(values: List[float]) -> Dict[str, float]:
    """Summarise metric values with mean, spread and a confidence interval.

    Non-finite entries (NaN and +/-inf) are discarded first. The interval
    for the mean comes from a seeded bootstrap; if that fails or yields a
    non-finite bound, a Student-t interval is used; if that fails too, the
    interval collapses to the mean. No resampling is attempted when every
    remaining value is identical or when there are fewer values than
    ``STATISTICAL_CONFIG["min_samples_for_ci"]``; the interval is then the
    mean itself.

    Args:
        values: Sample of metric values (list-like of numbers).

    Returns:
        Dict with ``mean``, ``std`` (sample standard deviation, 0.0 for a
        single value), ``median``, ``ci_lower``, ``ci_upper`` and
        ``n_samples`` (count after filtering). All zeros when no finite
        value is left.
    """
    data = np.asarray([] if values is None else values, dtype=float)
    data = data[np.isfinite(data)]  # Remove NaN and infinite values
    n_samples = int(data.size)

    if n_samples == 0:
        return {
            "mean": 0.0,
            "std": 0.0,
            "median": 0.0,
            "ci_lower": 0.0,
            "ci_upper": 0.0,
            "n_samples": 0,
        }

    mean = float(np.mean(data))
    std = float(np.std(data, ddof=1)) if n_samples > 1 else 0.0
    stats_dict = {
        "mean": mean,
        "std": std,
        "median": float(np.median(data)),
        "n_samples": n_samples,
        # Degenerate interval by default; widened below when possible.
        "ci_lower": mean,
        "ci_upper": mean,
    }

    # With zero spread the interval is the mean; bootstrapping such data can
    # produce NaN bounds, so it is skipped entirely.
    if std == 0.0:
        return stats_dict
    if n_samples < STATISTICAL_CONFIG["min_samples_for_ci"]:
        return stats_dict

    def sample_mean(sample, axis=-1):
        # Accepting ``axis`` lets scipy evaluate all resamples in one call.
        return np.mean(sample, axis=axis)

    try:
        res = bootstrap(
            (data,),
            sample_mean,
            n_resamples=STATISTICAL_CONFIG["bootstrap_samples"],
            confidence_level=STATISTICAL_CONFIG["confidence_level"],
            random_state=42,  # fixed seed keeps results reproducible
        )
        ci_lower = float(res.confidence_interval.low)
        ci_upper = float(res.confidence_interval.high)
        if not (np.isfinite(ci_lower) and np.isfinite(ci_upper)):
            raise ValueError("bootstrap returned a non-finite interval")
    except Exception:
        # Fallback to t-distribution CI
        try:
            alpha = 1 - STATISTICAL_CONFIG["confidence_level"]
            t_val = stats.t.ppf(1 - alpha / 2, n_samples - 1)
            margin = float(t_val * std / np.sqrt(n_samples))
            if not np.isfinite(margin):
                raise ValueError("t-interval margin is not finite")
            ci_lower, ci_upper = mean - margin, mean + margin
        except Exception:
            ci_lower, ci_upper = mean, mean

    stats_dict["ci_lower"] = ci_lower
    stats_dict["ci_upper"] = ci_upper
    return stats_dict
def perform_significance_test(
    values1: List[float], values2: List[float], metric_name: str
) -> Dict[str, float]:
    """Test whether two groups of metric values differ significantly.

    Runs Welch's t-test (unequal variances) and reports Cohen's d as the
    effect size. NaN entries are removed from both groups first.

    Args:
        values1: Metric values of the first group.
        values2: Metric values of the second group.
        metric_name: Name of the metric; accepted for the callers' benefit
            and not used in the computation.

    Returns:
        Dict with ``p_value``, ``effect_size`` (absolute Cohen's d),
        ``significant`` (plain ``bool``) and ``t_statistic``. The neutral
        result (p = 1.0, effect 0.0, not significant, t = 0.0) is returned
        when either group has fewer than two usable values, when the test
        is undefined (e.g. no variance in either group) or when the
        computation fails.
    """

    def neutral_result():
        # Same keys as the success path so callers can index unconditionally.
        return {
            "p_value": 1.0,
            "effect_size": 0.0,
            "significant": False,
            "t_statistic": 0.0,
        }

    group1 = np.asarray(values1, dtype=float)
    group2 = np.asarray(values2, dtype=float)
    # Remove NaN values
    group1 = group1[~np.isnan(group1)]
    group2 = group2[~np.isnan(group2)]

    if group1.size < 2 or group2.size < 2:
        return neutral_result()

    try:
        # Perform t-test
        t_stat, p_value = stats.ttest_ind(group1, group2, equal_var=False)
        # The test is undefined when both groups have zero variance.
        if np.isnan(p_value) or np.isnan(t_stat):
            return neutral_result()

        # Calculate effect size (Cohen's d) with the pooled standard deviation
        pooled_std = np.sqrt(
            (
                (group1.size - 1) * np.var(group1, ddof=1)
                + (group2.size - 1) * np.var(group2, ddof=1)
            )
            / (group1.size + group2.size - 2)
        )
        if pooled_std > 0:
            effect_size = abs(np.mean(group1) - np.mean(group2)) / pooled_std
        else:
            effect_size = 0.0

        # The threshold is taken from the google_comparable track config.
        significance_level = EVALUATION_TRACKS["google_comparable"][
            "significance_level"
        ]
        return {
            "p_value": float(p_value),
            "effect_size": float(effect_size),
            # bool() turns numpy.bool_ into a JSON-serialisable bool.
            "significant": bool(p_value < significance_level),
            "t_statistic": float(t_stat),
        }
    except Exception:
        return neutral_result()
def evaluate_predictions_by_track(
    predictions: pd.DataFrame, test_set: pd.DataFrame, track: str
) -> Dict:
    """Evaluate predictions for a specific track with statistical analysis.

    The test set is restricted to rows whose source and target language both
    belong to the track, inner-joined with the predictions on ``sample_id``
    and scored per sample. Language pairs with at least
    ``min_samples_per_pair`` scored samples get per-metric statistics from
    ``calculate_statistical_metrics``; track-level statistics are computed
    over the per-pair means, so every pair weighs the same.

    Args:
        predictions: Frame with at least ``sample_id`` and ``prediction``.
        test_set: Frame with at least ``sample_id``, ``target_text``,
            ``source_language`` and ``target_language``.
        track: Key into ``EVALUATION_TRACKS``.

    Returns:
        On success a dict with ``sample_metrics``, ``pair_metrics``,
        ``track_averages``, ``track_statistics``, ``summary``,
        ``evaluated_samples``, ``track`` and ``error`` set to ``None``.
        When no sample matches, a dict with only ``error``,
        ``evaluated_samples`` (0) and ``track``.

    Raises:
        KeyError: If ``track`` is not a configured track.
    """
    print(f"🔄 Evaluating for {track} track...")

    track_config = EVALUATION_TRACKS[track]
    track_languages = track_config["languages"]

    # Filter test set to track languages
    in_track = test_set["source_language"].isin(track_languages) & test_set[
        "target_language"
    ].isin(track_languages)
    track_test_set = test_set[in_track].copy()

    # Merge predictions with test set
    merged = track_test_set.merge(
        predictions, on="sample_id", how="inner", suffixes=("", "_pred")
    )
    if len(merged) == 0:
        return {
            "error": f"No matching samples found for {track} track",
            "evaluated_samples": 0,
            "track": track,
        }

    print(f"📊 Evaluating {len(merged)} samples for {track} track...")

    # Calculate metrics for each sample
    sample_metrics = []
    for _, row in merged.iterrows():
        metrics = calculate_sentence_metrics(row["target_text"], row["prediction"])
        metrics["sample_id"] = row["sample_id"]
        metrics["source_language"] = row["source_language"]
        metrics["target_language"] = row["target_language"]
        sample_metrics.append(metrics)
    sample_df = pd.DataFrame(sample_metrics)

    min_samples = track_config["min_samples_per_pair"]
    metric_names = [
        metric
        for metric in (
            METRICS_CONFIG["primary_metrics"] + METRICS_CONFIG["secondary_metrics"]
        )
        if metric in sample_df.columns
    ]

    # Group once instead of masking the whole frame for every ordered pair;
    # pairs are still visited in track-language order to keep dict ordering.
    rows_by_pair = {
        pair: rows
        for pair, rows in sample_df.groupby(["source_language", "target_language"])
    }
    no_rows = sample_df.iloc[0:0]

    # Aggregate by language pairs with statistical analysis
    pair_metrics = {}
    overall_metrics = defaultdict(list)
    for src_lang in track_languages:
        for tgt_lang in track_languages:
            if src_lang == tgt_lang:
                continue
            pair_data = rows_by_pair.get((src_lang, tgt_lang), no_rows)
            if len(pair_data) < min_samples:
                continue

            pair_result = {}
            for metric in metric_names:
                # inf (e.g. len_ratio on an empty reference) and NaN are
                # excluded before the statistics are computed.
                values = pair_data[metric].replace([np.inf, -np.inf], np.nan).dropna()
                if len(values) == 0:
                    continue
                stats_metrics = calculate_statistical_metrics(values.tolist())
                pair_result[metric] = stats_metrics
                # Add to overall metrics for track-level statistics
                overall_metrics[metric].append(stats_metrics["mean"])

            pair_result["sample_count"] = len(pair_data)
            pair_result["languages"] = f"{src_lang}-{tgt_lang}"
            pair_metrics[f"{src_lang}_to_{tgt_lang}"] = pair_result

    # Calculate track-level aggregated statistics
    track_averages = {}
    track_statistics = {}
    for metric, pair_means in overall_metrics.items():
        if pair_means:
            track_stats = calculate_statistical_metrics(pair_means)
            track_averages[metric] = track_stats["mean"]
            track_statistics[metric] = track_stats

    # Number of ordered (source, target) pairs the track could cover; the
    # cross-track coverage analysis reads this key from the summary.
    n_languages = len(set(track_languages))

    # Generate evaluation summary
    summary = {
        "track": track,
        "track_name": track_config["name"],
        "total_samples": len(sample_df),
        "language_pairs_evaluated": len(
            [k for k in pair_metrics if pair_metrics[k].get("sample_count", 0) > 0]
        ),
        "total_possible_pairs": n_languages * (n_languages - 1),
        "languages_covered": len(
            set(sample_df["source_language"]) | set(sample_df["target_language"])
        ),
        "min_samples_per_pair": min_samples,
        "statistical_power": track_config["statistical_power"],
        "significance_level": track_config["significance_level"],
    }

    return {
        "sample_metrics": sample_df,
        "pair_metrics": pair_metrics,
        "track_averages": track_averages,
        "track_statistics": track_statistics,
        "summary": summary,
        "evaluated_samples": len(sample_df),
        "track": track,
        "error": None,
    }
def evaluate_predictions_scientific(
    predictions: pd.DataFrame, test_set: pd.DataFrame, model_category: str = "community"
) -> Dict:
    """Run the evaluation on every configured track and bundle the results.

    Args:
        predictions: Submitted predictions.
        test_set: Test set the predictions are scored against.
        model_category: Key into ``MODEL_CATEGORIES``; an unknown value is
            replaced by ``"community"``.

    Returns:
        Dict with ``model_category``, ``category_info``, ``tracks`` (one
        result per key of ``EVALUATION_TRACKS``), ``cross_track_analysis``
        and ``scientific_metadata`` (timestamp and submitted/available
        sample counts).
    """
    print("🔬 Starting scientific evaluation...")

    # Unknown categories fall back to the community category.
    category = model_category if model_category in MODEL_CATEGORIES else "community"
    category_info = MODEL_CATEGORIES[category]

    metadata = {
        "evaluation_timestamp": pd.Timestamp.now().isoformat(),
        "total_samples_submitted": len(predictions),
        "total_samples_available": len(test_set),
    }

    # One evaluation per track, in configuration order.
    per_track = {}
    for track_name in EVALUATION_TRACKS:
        per_track[track_name] = evaluate_predictions_by_track(
            predictions, test_set, track_name
        )

    # Cross-track consistency is derived from the finished track results.
    cross_track = analyze_cross_track_consistency(per_track)

    return {
        "model_category": category,
        "category_info": category_info,
        "tracks": per_track,
        "cross_track_analysis": cross_track,
        "scientific_metadata": metadata,
    }
def analyze_cross_track_consistency(track_results: Dict) -> Dict:
    """Analyze consistency of model performance across different tracks.

    Args:
        track_results: Mapping of track name to that track's evaluation
            result. Entries without ``track_averages`` or ``summary`` (for
            example failed tracks) are skipped by the respective section.

    Returns:
        Dict with three sections:

        * ``track_correlations``: for every unordered pair of tracks that
          have a ``quality_score`` average, the absolute score difference
          and the ratio of the two scores.
        * ``performance_stability``: currently always empty.
        * ``language_coverage_analysis``: per track, ``coverage_rate``
          (evaluated pairs / possible ordered pairs), ``samples_per_pair``
          and ``statistical_adequacy``.
    """
    consistency_analysis = {
        "track_correlations": {},
        "performance_stability": {},
        "language_coverage_analysis": {},
    }

    # Extract the quality score of every track that reports one.
    track_scores = {}
    for track_name, track_data in track_results.items():
        averages = track_data.get("track_averages")
        if averages and "quality_score" in averages:
            track_scores[track_name] = averages["quality_score"]

    # Pairwise comparison of tracks. With a single model these are simple
    # differences and ratios, not correlations in the statistical sense.
    if len(track_scores) >= 2:
        track_names = list(track_scores.keys())
        for i, track1 in enumerate(track_names):
            for track2 in track_names[i + 1 :]:
                consistency_analysis["track_correlations"][f"{track1}_vs_{track2}"] = {
                    "score_difference": abs(
                        track_scores[track1] - track_scores[track2]
                    ),
                    # The floor on the denominator avoids division by zero.
                    "relative_performance": track_scores[track1]
                    / max(track_scores[track2], 0.001),
                }

    # Language coverage analysis
    for track_name, track_data in track_results.items():
        summary = track_data.get("summary")
        if not summary:
            continue

        track_config = EVALUATION_TRACKS[track_name]
        pairs_evaluated = summary["language_pairs_evaluated"]

        # Use the summary's count when present; otherwise derive the number
        # of ordered (source, target) pairs from the track's language list.
        # Defaulting the denominator to 1 would make the "rate" equal the
        # raw number of evaluated pairs.
        total_possible = summary.get("total_possible_pairs")
        if not total_possible:
            n_languages = len(set(track_config.get("languages", [])))
            total_possible = n_languages * (n_languages - 1)

        consistency_analysis["language_coverage_analysis"][track_name] = {
            "coverage_rate": pairs_evaluated / max(total_possible, 1),
            "samples_per_pair": summary["total_samples"] / max(pairs_evaluated, 1),
            "statistical_adequacy": summary["total_samples"]
            >= track_config["min_samples_per_pair"] * pairs_evaluated,
        }

    return consistency_analysis
def compare_models_statistically(
    model1_results: Dict, model2_results: Dict, track: str = "google_comparable"
) -> Dict:
    """Compare two models' aggregated metric statistics on a single track.

    No significance test is run here: the comparison works only from each
    model's per-metric mean and confidence-interval bounds stored under
    ``"track_statistics"``.

    Args:
        model1_results: First model's results; reads ``"tracks"`` and
            ``"model_category"``.
        model2_results: Second model's results; same keys as above.
        track: Name of the track to compare on.

    Returns:
        ``{"error": ...}`` when the track is missing for either model or
        either model's track data has a truthy ``"error"``. Otherwise a dict
        holding the track name, both model categories (``"unknown"`` when
        absent), and ``"metric_comparisons"`` with, per metric present for
        both models, the two means, the two CIs, the mean difference
        (model 1 minus model 2) and whether the CIs overlap.
        ``"language_pair_comparisons"`` and ``"overall_significance"`` are
        returned empty.
    """
    # Both models must have been evaluated on the requested track.
    if track not in model1_results.get("tracks", {}):
        return {"error": f"Track {track} not available for both models"}
    if track not in model2_results.get("tracks", {}):
        return {"error": f"Track {track} not available for both models"}

    first_track = model1_results["tracks"][track]
    second_track = model2_results["tracks"][track]
    if first_track.get("error") or second_track.get("error"):
        return {"error": "One or both models have evaluation errors"}

    metric_comparisons = {}
    comparison_results = {
        "track": track,
        "model1_category": model1_results.get("model_category", "unknown"),
        "model2_category": model2_results.get("model_category", "unknown"),
        "metric_comparisons": metric_comparisons,
        "language_pair_comparisons": {},
        "overall_significance": {},
    }

    metric_names = (
        METRICS_CONFIG["primary_metrics"] + METRICS_CONFIG["secondary_metrics"]
    )
    for metric in metric_names:
        # Only metrics that both models report statistics for are comparable.
        if metric not in first_track.get("track_statistics", {}):
            continue
        if metric not in second_track.get("track_statistics", {}):
            continue

        # Sample-level scores are not available at this point, so the
        # comparison is built from the aggregated statistics alone.
        first_stats = first_track["track_statistics"][metric]
        second_stats = second_track["track_statistics"][metric]

        first_mean = first_stats["mean"]
        first_ci = [first_stats["ci_lower"], first_stats["ci_upper"]]
        second_mean = second_stats["mean"]
        second_ci = [second_stats["ci_lower"], second_stats["ci_upper"]]

        # Two intervals are disjoint when one ends before the other begins.
        intervals_disjoint = first_ci[1] < second_ci[0] or second_ci[1] < first_ci[0]

        metric_comparisons[metric] = {
            "model1_mean": first_mean,
            "model1_ci": first_ci,
            "model2_mean": second_mean,
            "model2_ci": second_ci,
            "difference": first_mean - second_mean,
            "ci_overlap": not intervals_disjoint,
        }

    return comparison_results
def generate_scientific_report(
    results: Dict, model_name: str = "", baseline_results: Dict = None
) -> str:
    """Render evaluation results as a Markdown report.

    Args:
        results: Evaluation output. The keys read are ``"tracks"``,
            ``"category_info"`` and ``"cross_track_analysis"``; each is
            treated as empty when absent.
        model_name: Shown in the report title; ``"Model"`` is used when empty.
        baseline_results: When truthy, a placeholder "Baseline Comparison"
            section is added. Its contents are not read.

    Returns:
        The report as newline-joined Markdown, or a one-line error message
        when any track carries a truthy ``"error"``.

    Raises:
        KeyError: If a track name in ``results`` is not in
            ``EVALUATION_TRACKS``.
    """
    tracks = results.get("tracks", {})

    # NOTE(review): one failed track aborts the whole report. If partial
    # reports are wanted instead, relax this guard and skip errored tracks in
    # the sections below, which currently assume every track is error-free.
    if any(track_data.get("error") for track_data in tracks.values()):
        return "❌ **Evaluation Error**: Unable to complete scientific evaluation"

    report = []

    # Header
    report.append(f"# 🔬 Scientific Evaluation Report: {model_name or 'Model'}")
    report.append("")

    # Model categorization
    category_info = results.get("category_info", {})
    report.append(f"**Model Category**: {category_info.get('name', 'Unknown')}")
    report.append(
        f"**Category Description**: {category_info.get('description', 'N/A')}"
    )
    report.append("")

    # Track-by-track analysis
    for track_name, track_data in tracks.items():
        track_config = EVALUATION_TRACKS[track_name]
        summary = track_data.get("summary", {})
        track_stats = track_data.get("track_statistics", {})
        total_samples = summary.get("total_samples", 0)
        pairs_evaluated = summary.get("language_pairs_evaluated", 0)

        report.append(f"## {track_config['name']}")
        report.append(f"*{track_config['description']}*")
        report.append("")

        # Summary statistics
        report.append("### 📊 Summary Statistics")
        report.append(f"- **Samples Evaluated**: {total_samples:,}")
        report.append(f"- **Language Pairs**: {pairs_evaluated}")
        report.append(f"- **Languages Covered**: {summary.get('languages_covered', 0)}")
        report.append(f"- **Statistical Power**: {track_config['statistical_power']}")
        report.append("")

        # Primary metrics with confidence intervals
        report.append("### 🎯 Primary Metrics (95% Confidence Intervals)")
        for metric in METRICS_CONFIG["primary_metrics"]:
            if metric not in track_stats:
                continue
            stats = track_stats[metric]
            report.append(
                f"- **{metric.upper()}**: {stats['mean']:.4f} "
                f"[{stats['ci_lower']:.4f}, {stats['ci_upper']:.4f}]"
            )
        report.append("")

        # Statistical adequacy assessment. With zero evaluated pairs the
        # required minimum is 0 and would be met trivially, so a track with
        # nothing evaluated is explicitly reported as limited.
        min_required = track_config["min_samples_per_pair"] * pairs_evaluated
        is_adequate = pairs_evaluated > 0 and total_samples >= min_required
        adequacy = "✅ Adequate" if is_adequate else "⚠️ Limited"
        report.append(f"**Statistical Adequacy**: {adequacy}")
        report.append("")

    # Cross-track analysis
    cross_track = results.get("cross_track_analysis", {})
    if cross_track:
        report.append("## 🔄 Cross-Track Consistency Analysis")
        coverage_analysis = cross_track.get("language_coverage_analysis", {})
        for track_name, coverage_info in coverage_analysis.items():
            adequacy = (
                "✅ Statistically adequate"
                if coverage_info.get("statistical_adequacy")
                else "⚠️ Limited statistical power"
            )
            report.append(f"- **{track_name}**: {adequacy}")
        report.append("")

    # Baseline comparison if available (placeholder: no statistics are
    # computed from baseline_results yet)
    if baseline_results:
        report.append("## 📈 Baseline Comparison")
        report.append("*Statistical comparison with baseline models*")
        report.append("")

    # Scientific recommendations
    report.append("## 💡 Scientific Recommendations")
    total_samples = sum(
        track_data.get("summary", {}).get("total_samples", 0)
        for track_data in tracks.values()
    )
    if total_samples < SAMPLE_SIZE_RECOMMENDATIONS["publication_quality"]:
        report.append(
            "- ⚠️ Consider collecting more evaluation samples for publication-quality results"
        )

    google_samples = (
        tracks.get("google_comparable", {}).get("summary", {}).get("total_samples", 0)
    )
    if google_samples > 100:
        report.append("- ✅ Sufficient data for comparison with commercial systems")
    report.append("")

    return "\n".join(report)