akera committed on
Commit
f54baf8
·
verified ·
1 Parent(s): 71c46bd

Update src/evaluation.py

Browse files
Files changed (1) hide show
  1. src/evaluation.py +73 -296
src/evaluation.py CHANGED
@@ -8,24 +8,21 @@ from collections import defaultdict
8
  from transformers.models.whisper.english_normalizer import BasicTextNormalizer
9
  from typing import Dict, List, Tuple, Optional
10
  from scipy import stats
11
- from scipy.stats import bootstrap
12
  import warnings
13
  from config import (
14
  ALL_UG40_LANGUAGES,
15
  GOOGLE_SUPPORTED_LANGUAGES,
16
  METRICS_CONFIG,
17
- STATISTICAL_CONFIG,
18
  EVALUATION_TRACKS,
19
  MODEL_CATEGORIES,
20
- SAMPLE_SIZE_RECOMMENDATIONS,
21
  )
22
- from src.utils import get_all_language_pairs, get_google_comparable_pairs
23
 
24
  warnings.filterwarnings("ignore", category=RuntimeWarning)
25
 
26
 
27
  def calculate_sentence_metrics(reference: str, prediction: str) -> Dict[str, float]:
28
- """Calculate all metrics for a single sentence pair with robust error handling."""
29
 
30
  # Handle empty predictions
31
  if not prediction or not isinstance(prediction, str):
@@ -75,28 +72,17 @@ def calculate_sentence_metrics(reference: str, prediction: str) -> Dict[str, flo
75
  except:
76
  metrics["wer"] = 1.0
77
 
78
- # Length ratio
79
- try:
80
- if len(ref_norm) > 0:
81
- metrics["len_ratio"] = len(pred_norm) / len(ref_norm)
82
- else:
83
- metrics["len_ratio"] = 1.0 if len(pred_norm) == 0 else float("inf")
84
- except:
85
- metrics["len_ratio"] = 1.0
86
-
87
  # ROUGE scores
88
  try:
89
  scorer = rouge_scorer.RougeScorer(
90
- ["rouge1", "rouge2", "rougeL"], use_stemmer=True
91
  )
92
  rouge_scores = scorer.score(ref_norm, pred_norm)
93
 
94
  metrics["rouge1"] = rouge_scores["rouge1"].fmeasure
95
- metrics["rouge2"] = rouge_scores["rouge2"].fmeasure
96
  metrics["rougeL"] = rouge_scores["rougeL"].fmeasure
97
  except:
98
  metrics["rouge1"] = 0.0
99
- metrics["rouge2"] = 0.0
100
  metrics["rougeL"] = 0.0
101
 
102
  # Quality score (composite metric)
@@ -116,130 +102,53 @@ def calculate_sentence_metrics(reference: str, prediction: str) -> Dict[str, flo
116
  return metrics
117
 
118
 
119
- def calculate_statistical_metrics(values: List[float]) -> Dict[str, float]:
120
- """Calculate statistical measures including confidence intervals."""
121
 
122
  if not values or len(values) == 0:
123
- return {
124
- "mean": 0.0,
125
- "std": 0.0,
126
- "median": 0.0,
127
- "ci_lower": 0.0,
128
- "ci_upper": 0.0,
129
- "n_samples": 0,
130
- }
131
 
132
  values = np.array(values)
133
  values = values[~np.isnan(values)] # Remove NaN values
134
 
135
  if len(values) == 0:
136
- return {
137
- "mean": 0.0,
138
- "std": 0.0,
139
- "median": 0.0,
140
- "ci_lower": 0.0,
141
- "ci_upper": 0.0,
142
- "n_samples": 0,
143
- }
144
 
145
- stats_dict = {
146
- "mean": float(np.mean(values)),
147
- "std": float(np.std(values, ddof=1)) if len(values) > 1 else 0.0,
148
- "median": float(np.median(values)),
149
- "n_samples": len(values),
150
- }
151
-
152
- # Calculate confidence intervals using bootstrap if enough samples
153
- if len(values) >= STATISTICAL_CONFIG["min_samples_for_ci"]:
154
- try:
155
- confidence_level = STATISTICAL_CONFIG["confidence_level"]
156
-
157
- # Bootstrap confidence interval
158
- def mean_func(x):
159
- return np.mean(x)
160
-
161
- res = bootstrap(
162
- (values,),
163
- mean_func,
164
- n_resamples=STATISTICAL_CONFIG["bootstrap_samples"],
165
- confidence_level=confidence_level,
166
- random_state=42,
167
- )
168
-
169
- stats_dict["ci_lower"] = float(res.confidence_interval.low)
170
- stats_dict["ci_upper"] = float(res.confidence_interval.high)
171
-
172
- except Exception as e:
173
- # Fallback to t-distribution CI
174
- try:
175
- alpha = 1 - confidence_level
176
- t_val = stats.t.ppf(1 - alpha / 2, len(values) - 1)
177
- margin = t_val * stats_dict["std"] / np.sqrt(len(values))
178
- stats_dict["ci_lower"] = stats_dict["mean"] - margin
179
- stats_dict["ci_upper"] = stats_dict["mean"] + margin
180
- except:
181
- stats_dict["ci_lower"] = stats_dict["mean"]
182
- stats_dict["ci_upper"] = stats_dict["mean"]
183
- else:
184
- stats_dict["ci_lower"] = stats_dict["mean"]
185
- stats_dict["ci_upper"] = stats_dict["mean"]
186
-
187
- return stats_dict
188
-
189
-
190
- def perform_significance_test(
191
- values1: List[float], values2: List[float], metric_name: str
192
- ) -> Dict[str, float]:
193
- """Perform statistical significance test between two groups."""
194
 
195
- if len(values1) < 2 or len(values2) < 2:
196
- return {"p_value": 1.0, "effect_size": 0.0, "significant": False}
197
-
198
- values1 = np.array(values1)
199
- values2 = np.array(values2)
200
-
201
- # Remove NaN values
202
- values1 = values1[~np.isnan(values1)]
203
- values2 = values2[~np.isnan(values2)]
204
-
205
- if len(values1) < 2 or len(values2) < 2:
206
- return {"p_value": 1.0, "effect_size": 0.0, "significant": False}
207
 
208
  try:
209
- # Perform t-test
210
- t_stat, p_value = stats.ttest_ind(values1, values2, equal_var=False)
 
211
 
212
- # Calculate effect size (Cohen's d)
213
- pooled_std = np.sqrt(
214
- ((len(values1) - 1) * np.var(values1, ddof=1) +
215
- (len(values2) - 1) * np.var(values2, ddof=1)) /
216
- (len(values1) + len(values2) - 2)
217
- )
218
 
219
- if pooled_std > 0:
220
- effect_size = abs(np.mean(values1) - np.mean(values2)) / pooled_std
221
- else:
222
- effect_size = 0.0
223
 
224
- # Determine significance
225
- significance_level = EVALUATION_TRACKS["google_comparable"]["significance_level"]
226
- significant = p_value < significance_level
227
 
228
- return {
229
- "p_value": float(p_value),
230
- "effect_size": float(effect_size),
231
- "significant": significant,
232
- "t_statistic": float(t_stat),
233
- }
234
-
235
- except Exception as e:
236
- return {"p_value": 1.0, "effect_size": 0.0, "significant": False}
237
 
238
 
239
  def evaluate_predictions_by_track(
240
  predictions: pd.DataFrame, test_set: pd.DataFrame, track: str
241
  ) -> Dict:
242
- """Evaluate predictions for a specific track with statistical analysis."""
243
 
244
  print(f"🔄 Evaluating for {track} track...")
245
 
@@ -277,7 +186,7 @@ def evaluate_predictions_by_track(
277
 
278
  sample_df = pd.DataFrame(sample_metrics)
279
 
280
- # Aggregate by language pairs with statistical analysis
281
  pair_metrics = {}
282
  overall_metrics = defaultdict(list)
283
 
@@ -292,36 +201,44 @@ def evaluate_predictions_by_track(
292
  (sample_df["target_language"] == tgt_lang)
293
  ]
294
 
295
- if len(pair_data) >= track_config["min_samples_per_pair"]:
296
  pair_key = f"{src_lang}_to_{tgt_lang}"
297
  pair_metrics[pair_key] = {}
298
 
299
- # Calculate statistical metrics for each measure
300
- for metric in (
301
- METRICS_CONFIG["primary_metrics"] + METRICS_CONFIG["secondary_metrics"]
302
- ):
303
  if metric in pair_data.columns:
304
  values = pair_data[metric].replace([np.inf, -np.inf], np.nan).dropna()
305
 
306
  if len(values) > 0:
307
- stats_metrics = calculate_statistical_metrics(values.tolist())
308
- pair_metrics[pair_key][metric] = stats_metrics
 
 
 
 
 
 
309
 
310
  # Add to overall metrics for track-level statistics
311
- overall_metrics[metric].append(stats_metrics["mean"])
312
 
313
  pair_metrics[pair_key]["sample_count"] = len(pair_data)
314
- pair_metrics[pair_key]["languages"] = f"{src_lang}-{tgt_lang}"
315
 
316
  # Calculate track-level aggregated statistics
317
  track_averages = {}
318
- track_statistics = {}
319
 
320
  for metric in overall_metrics:
321
  if overall_metrics[metric]:
322
- track_stats = calculate_statistical_metrics(overall_metrics[metric])
323
- track_averages[metric] = track_stats["mean"]
324
- track_statistics[metric] = track_stats
 
 
 
 
 
325
 
326
  # Generate evaluation summary
327
  summary = {
@@ -331,15 +248,12 @@ def evaluate_predictions_by_track(
331
  "language_pairs_evaluated": len([k for k in pair_metrics if pair_metrics[k].get("sample_count", 0) > 0]),
332
  "languages_covered": len(set(sample_df["source_language"]) | set(sample_df["target_language"])),
333
  "min_samples_per_pair": track_config["min_samples_per_pair"],
334
- "statistical_power": track_config["statistical_power"],
335
- "significance_level": track_config["significance_level"],
336
  }
337
 
338
  return {
339
- "sample_metrics": sample_df,
340
  "pair_metrics": pair_metrics,
341
  "track_averages": track_averages,
342
- "track_statistics": track_statistics,
343
  "summary": summary,
344
  "evaluated_samples": len(sample_df),
345
  "track": track,
@@ -347,12 +261,12 @@ def evaluate_predictions_by_track(
347
  }
348
 
349
 
350
- def evaluate_predictions_scientific(
351
  predictions: pd.DataFrame, test_set: pd.DataFrame, model_category: str = "community"
352
  ) -> Dict:
353
- """Comprehensive evaluation across all tracks with scientific rigor."""
354
 
355
- print("🔬 Starting scientific evaluation...")
356
 
357
  # Validate model category
358
  if model_category not in MODEL_CATEGORIES:
@@ -362,8 +276,7 @@ def evaluate_predictions_scientific(
362
  "model_category": model_category,
363
  "category_info": MODEL_CATEGORIES[model_category],
364
  "tracks": {},
365
- "cross_track_analysis": {},
366
- "scientific_metadata": {
367
  "evaluation_timestamp": pd.Timestamp.now().isoformat(),
368
  "total_samples_submitted": len(predictions),
369
  "total_samples_available": len(test_set),
@@ -375,120 +288,24 @@ def evaluate_predictions_scientific(
375
  track_result = evaluate_predictions_by_track(predictions, test_set, track_name)
376
  evaluation_results["tracks"][track_name] = track_result
377
 
378
- # Cross-track consistency analysis
379
- evaluation_results["cross_track_analysis"] = analyze_cross_track_consistency(
380
- evaluation_results["tracks"]
381
- )
382
-
383
  return evaluation_results
384
 
385
 
386
- def analyze_cross_track_consistency(track_results: Dict) -> Dict:
387
- """Analyze consistency of model performance across different tracks."""
388
-
389
- consistency_analysis = {
390
- "track_correlations": {},
391
- "performance_stability": {},
392
- "language_coverage_analysis": {},
393
- }
394
-
395
- # Extract quality scores from each track for correlation analysis
396
- track_scores = {}
397
- for track_name, track_data in track_results.items():
398
- if track_data.get("track_averages") and "quality_score" in track_data["track_averages"]:
399
- track_scores[track_name] = track_data["track_averages"]["quality_score"]
400
-
401
- # Calculate pairwise correlations (would need more data points for meaningful correlation)
402
- if len(track_scores) >= 2:
403
- track_names = list(track_scores.keys())
404
- for i, track1 in enumerate(track_names):
405
- for track2 in track_names[i + 1:]:
406
- # This would be more meaningful with multiple models
407
- consistency_analysis["track_correlations"][f"{track1}_vs_{track2}"] = {
408
- "score_difference": abs(track_scores[track1] - track_scores[track2]),
409
- "relative_performance": track_scores[track1] / max(track_scores[track2], 0.001),
410
- }
411
-
412
- # Language coverage analysis
413
- for track_name, track_data in track_results.items():
414
- if track_data.get("summary"):
415
- summary = track_data["summary"]
416
- consistency_analysis["language_coverage_analysis"][track_name] = {
417
- "coverage_rate": summary["language_pairs_evaluated"] / max(summary.get("total_possible_pairs", 1), 1),
418
- "samples_per_pair": summary["total_samples"] / max(summary["language_pairs_evaluated"], 1),
419
- "statistical_adequacy": summary["total_samples"] >= EVALUATION_TRACKS[track_name]["min_samples_per_pair"] * summary["language_pairs_evaluated"],
420
- }
421
-
422
- return consistency_analysis
423
-
424
-
425
- def compare_models_statistically(
426
- model1_results: Dict, model2_results: Dict, track: str = "google_comparable"
427
- ) -> Dict:
428
- """Perform statistical comparison between two models on a specific track."""
429
-
430
- if track not in model1_results.get("tracks", {}) or track not in model2_results.get("tracks", {}):
431
- return {"error": f"Track {track} not available for both models"}
432
-
433
- track1_data = model1_results["tracks"][track]
434
- track2_data = model2_results["tracks"][track]
435
-
436
- if track1_data.get("error") or track2_data.get("error"):
437
- return {"error": "One or both models have evaluation errors"}
438
-
439
- comparison_results = {
440
- "track": track,
441
- "model1_category": model1_results.get("model_category", "unknown"),
442
- "model2_category": model2_results.get("model_category", "unknown"),
443
- "metric_comparisons": {},
444
- "language_pair_comparisons": {},
445
- "overall_significance": {},
446
- }
447
-
448
- # Compare each metric
449
- for metric in METRICS_CONFIG["primary_metrics"] + METRICS_CONFIG["secondary_metrics"]:
450
- if (metric in track1_data.get("track_statistics", {}) and
451
- metric in track2_data.get("track_statistics", {})):
452
-
453
- # Extract sample-level data for this metric from both models
454
- # This would require access to the original sample metrics
455
- # For now, we'll use the aggregated statistics
456
-
457
- stats1 = track1_data["track_statistics"][metric]
458
- stats2 = track2_data["track_statistics"][metric]
459
-
460
- # Create comparison summary
461
- comparison_results["metric_comparisons"][metric] = {
462
- "model1_mean": stats1["mean"],
463
- "model1_ci": [stats1["ci_lower"], stats1["ci_upper"]],
464
- "model2_mean": stats2["mean"],
465
- "model2_ci": [stats2["ci_lower"], stats2["ci_upper"]],
466
- "difference": stats1["mean"] - stats2["mean"],
467
- "ci_overlap": not (stats1["ci_upper"] < stats2["ci_lower"] or
468
- stats2["ci_upper"] < stats1["ci_lower"]),
469
- }
470
-
471
- return comparison_results
472
-
473
-
474
- def generate_scientific_report(
475
- results: Dict, model_name: str = "", baseline_results: Dict = None
476
- ) -> str:
477
- """Generate a comprehensive scientific evaluation report."""
478
 
479
  if any(track_data.get("error") for track_data in results.get("tracks", {}).values()):
480
- return f"❌ **Evaluation Error**: Unable to complete scientific evaluation"
481
 
482
  report = []
483
 
484
  # Header
485
- report.append(f"# 🔬 Scientific Evaluation Report: {model_name or 'Model'}")
486
  report.append("")
487
 
488
  # Model categorization
489
  category_info = results.get("category_info", {})
490
  report.append(f"**Model Category**: {category_info.get('name', 'Unknown')}")
491
- report.append(f"**Category Description**: {category_info.get('description', 'N/A')}")
492
  report.append("")
493
 
494
  # Track-by-track analysis
@@ -498,73 +315,33 @@ def generate_scientific_report(
498
 
499
  track_config = EVALUATION_TRACKS[track_name]
500
  summary = track_data.get("summary", {})
501
- track_stats = track_data.get("track_statistics", {})
 
502
 
503
- report.append(f"## {track_config['name']}")
504
- report.append(f"*{track_config['description']}*")
505
  report.append("")
506
 
507
  # Summary statistics
508
- report.append("### 📊 Summary Statistics")
509
  report.append(f"- **Samples Evaluated**: {summary.get('total_samples', 0):,}")
510
  report.append(f"- **Language Pairs**: {summary.get('language_pairs_evaluated', 0)}")
511
  report.append(f"- **Languages Covered**: {summary.get('languages_covered', 0)}")
512
- report.append(f"- **Statistical Power**: {track_config['statistical_power']}")
513
  report.append("")
514
 
515
  # Primary metrics with confidence intervals
516
- report.append("### 🎯 Primary Metrics (95% Confidence Intervals)")
517
  for metric in METRICS_CONFIG["primary_metrics"]:
518
- if metric in track_stats:
519
- stats = track_stats[metric]
520
  mean_val = stats["mean"]
521
  ci_lower = stats["ci_lower"]
522
  ci_upper = stats["ci_upper"]
523
 
524
  report.append(f"- **{metric.upper()}**: {mean_val:.4f} [{ci_lower:.4f}, {ci_upper:.4f}]")
525
  report.append("")
526
-
527
- # Statistical adequacy assessment
528
- min_required = track_config["min_samples_per_pair"] * summary.get("language_pairs_evaluated", 0)
529
- adequacy = "✅ Adequate" if summary.get("total_samples", 0) >= min_required else "⚠️ Limited"
530
- report.append(f"**Statistical Adequacy**: {adequacy}")
531
- report.append("")
532
-
533
- # Cross-track analysis
534
- cross_track = results.get("cross_track_analysis", {})
535
- if cross_track:
536
- report.append("## 🔄 Cross-Track Consistency Analysis")
537
-
538
- coverage_analysis = cross_track.get("language_coverage_analysis", {})
539
- for track_name, coverage_info in coverage_analysis.items():
540
- adequacy = "✅ Statistically adequate" if coverage_info.get("statistical_adequacy") else "⚠️ Limited statistical power"
541
- report.append(f"- **{track_name}**: {adequacy}")
542
-
543
- report.append("")
544
-
545
- # Baseline comparison if available
546
- if baseline_results:
547
- report.append("## 📈 Baseline Comparison")
548
- # This would include detailed statistical comparisons
549
- report.append("*Statistical comparison with baseline models*")
550
- report.append("")
551
-
552
- # Scientific recommendations
553
- report.append("## 💡 Scientific Recommendations")
554
 
555
- total_samples = sum(
556
- track_data.get("summary", {}).get("total_samples", 0)
557
- for track_data in results.get("tracks", {}).values()
558
- if not track_data.get("error")
559
- )
560
-
561
- if total_samples < SAMPLE_SIZE_RECOMMENDATIONS["publication_quality"]:
562
- report.append("- ⚠️ Consider collecting more evaluation samples for publication-quality results")
563
-
564
- google_track = results.get("tracks", {}).get("google_comparable", {})
565
- if not google_track.get("error") and google_track.get("summary", {}).get("total_samples", 0) > 100:
566
- report.append("- ✅ Sufficient data for comparison with commercial systems")
567
-
568
- report.append("")
569
-
570
- return "\n".join(report)
 
8
  from transformers.models.whisper.english_normalizer import BasicTextNormalizer
9
  from typing import Dict, List, Tuple, Optional
10
  from scipy import stats
 
11
  import warnings
12
  from config import (
13
  ALL_UG40_LANGUAGES,
14
  GOOGLE_SUPPORTED_LANGUAGES,
15
  METRICS_CONFIG,
 
16
  EVALUATION_TRACKS,
17
  MODEL_CATEGORIES,
 
18
  )
19
+ from src.utils import get_all_language_pairs
20
 
21
  warnings.filterwarnings("ignore", category=RuntimeWarning)
22
 
23
 
24
  def calculate_sentence_metrics(reference: str, prediction: str) -> Dict[str, float]:
25
+ """Calculate all metrics for a single sentence pair."""
26
 
27
  # Handle empty predictions
28
  if not prediction or not isinstance(prediction, str):
 
72
  except:
73
  metrics["wer"] = 1.0
74
 
 
 
 
 
 
 
 
 
 
75
  # ROUGE scores
76
  try:
77
  scorer = rouge_scorer.RougeScorer(
78
+ ["rouge1", "rougeL"], use_stemmer=True
79
  )
80
  rouge_scores = scorer.score(ref_norm, pred_norm)
81
 
82
  metrics["rouge1"] = rouge_scores["rouge1"].fmeasure
 
83
  metrics["rougeL"] = rouge_scores["rougeL"].fmeasure
84
  except:
85
  metrics["rouge1"] = 0.0
 
86
  metrics["rougeL"] = 0.0
87
 
88
  # Quality score (composite metric)
 
102
  return metrics
103
 
104
 
105
def calculate_confidence_interval(
    values: List[float], confidence_level: float = 0.95
) -> Tuple[float, float, float]:
    """Return the mean of *values* and a confidence interval around it.

    NaN entries (and ``None`` entries, which are coerced to NaN) are dropped
    before anything is computed. The interval is a percentile bootstrap of
    the mean; if the bootstrap raises, a Student-t interval built from the
    standard error is used instead.

    Args:
        values: Sample values. Any sequence NumPy can coerce to a float
            array is accepted, including a NumPy array.
        confidence_level: Two-sided coverage of the interval (0.95 -> 95%).

    Returns:
        ``(mean, ci_lower, ci_upper)`` as Python floats. All three are 0.0
        when no usable value remains. All three equal the mean when fewer
        than ``METRICS_CONFIG["min_samples_for_ci"]`` values remain, or when
        both interval methods fail.
    """
    if values is None:
        return 0.0, 0.0, 0.0

    # Coerce first: ``not values`` raises on a multi-element NumPy array, and
    # dtype=float maps None to NaN so the NaN filter below can remove it.
    samples = np.asarray(values, dtype=float).ravel()
    samples = samples[~np.isnan(samples)]

    if samples.size == 0:
        return 0.0, 0.0, 0.0

    mean_val = float(np.mean(samples))

    if samples.size < METRICS_CONFIG["min_samples_for_ci"]:
        # Not enough samples for a meaningful CI: collapse it onto the mean.
        return mean_val, mean_val, mean_val

    try:
        # Cap the resample count so a large configured value cannot make
        # evaluation arbitrarily slow.
        n_bootstrap = min(METRICS_CONFIG["bootstrap_samples"], 1000)

        # A locally seeded generator keeps the reported interval identical
        # across runs on the same data and leaves the global NumPy random
        # state untouched.
        rng = np.random.default_rng(42)
        bootstrap_means = [
            np.mean(rng.choice(samples, size=samples.size, replace=True))
            for _ in range(n_bootstrap)
        ]

        alpha = 1 - confidence_level
        ci_lower = np.percentile(bootstrap_means, 100 * alpha / 2)
        ci_upper = np.percentile(bootstrap_means, 100 * (1 - alpha / 2))

        return mean_val, float(ci_lower), float(ci_upper)

    except Exception:
        # Fallback to a t-distribution CI around the mean.
        try:
            std_err = stats.sem(samples)
            h = std_err * stats.t.ppf((1 + confidence_level) / 2, samples.size - 1)
            return mean_val, float(mean_val - h), float(mean_val + h)
        except Exception:
            return mean_val, mean_val, mean_val
 
146
 
147
 
148
  def evaluate_predictions_by_track(
149
  predictions: pd.DataFrame, test_set: pd.DataFrame, track: str
150
  ) -> Dict:
151
+ """Evaluate predictions for a specific track."""
152
 
153
  print(f"🔄 Evaluating for {track} track...")
154
 
 
186
 
187
  sample_df = pd.DataFrame(sample_metrics)
188
 
189
+ # Aggregate by language pairs
190
  pair_metrics = {}
191
  overall_metrics = defaultdict(list)
192
 
 
201
  (sample_df["target_language"] == tgt_lang)
202
  ]
203
 
204
+ if len(pair_data) >= MIN_SAMPLES_PER_PAIR:
205
  pair_key = f"{src_lang}_to_{tgt_lang}"
206
  pair_metrics[pair_key] = {}
207
 
208
+ # Calculate statistics for each metric
209
+ for metric in METRICS_CONFIG["primary_metrics"] + METRICS_CONFIG["secondary_metrics"]:
 
 
210
  if metric in pair_data.columns:
211
  values = pair_data[metric].replace([np.inf, -np.inf], np.nan).dropna()
212
 
213
  if len(values) > 0:
214
+ mean_val, ci_lower, ci_upper = calculate_confidence_interval(values.tolist())
215
+ pair_metrics[pair_key][metric] = {
216
+ "mean": mean_val,
217
+ "ci_lower": ci_lower,
218
+ "ci_upper": ci_upper,
219
+ "std": float(np.std(values)) if len(values) > 1 else 0.0,
220
+ "count": len(values)
221
+ }
222
 
223
  # Add to overall metrics for track-level statistics
224
+ overall_metrics[metric].append(mean_val)
225
 
226
  pair_metrics[pair_key]["sample_count"] = len(pair_data)
 
227
 
228
  # Calculate track-level aggregated statistics
229
  track_averages = {}
230
+ track_confidence = {}
231
 
232
  for metric in overall_metrics:
233
  if overall_metrics[metric]:
234
+ mean_val, ci_lower, ci_upper = calculate_confidence_interval(overall_metrics[metric])
235
+ track_averages[metric] = mean_val
236
+ track_confidence[metric] = {
237
+ "mean": mean_val,
238
+ "ci_lower": ci_lower,
239
+ "ci_upper": ci_upper,
240
+ "std": float(np.std(overall_metrics[metric])) if len(overall_metrics[metric]) > 1 else 0.0
241
+ }
242
 
243
  # Generate evaluation summary
244
  summary = {
 
248
  "language_pairs_evaluated": len([k for k in pair_metrics if pair_metrics[k].get("sample_count", 0) > 0]),
249
  "languages_covered": len(set(sample_df["source_language"]) | set(sample_df["target_language"])),
250
  "min_samples_per_pair": track_config["min_samples_per_pair"],
 
 
251
  }
252
 
253
  return {
 
254
  "pair_metrics": pair_metrics,
255
  "track_averages": track_averages,
256
+ "track_confidence": track_confidence,
257
  "summary": summary,
258
  "evaluated_samples": len(sample_df),
259
  "track": track,
 
261
  }
262
 
263
 
264
+ def evaluate_predictions(
265
  predictions: pd.DataFrame, test_set: pd.DataFrame, model_category: str = "community"
266
  ) -> Dict:
267
+ """Comprehensive evaluation across all tracks."""
268
 
269
+ print("🔬 Starting evaluation...")
270
 
271
  # Validate model category
272
  if model_category not in MODEL_CATEGORIES:
 
276
  "model_category": model_category,
277
  "category_info": MODEL_CATEGORIES[model_category],
278
  "tracks": {},
279
+ "metadata": {
 
280
  "evaluation_timestamp": pd.Timestamp.now().isoformat(),
281
  "total_samples_submitted": len(predictions),
282
  "total_samples_available": len(test_set),
 
288
  track_result = evaluate_predictions_by_track(predictions, test_set, track_name)
289
  evaluation_results["tracks"][track_name] = track_result
290
 
 
 
 
 
 
291
  return evaluation_results
292
 
293
 
294
+ def generate_evaluation_report(results: Dict, model_name: str = "") -> str:
295
+ """Generate a comprehensive evaluation report."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
 
297
  if any(track_data.get("error") for track_data in results.get("tracks", {}).values()):
298
+ return f"❌ **Evaluation Error**: Unable to complete evaluation"
299
 
300
  report = []
301
 
302
  # Header
303
+ report.append(f"### 🔬 Evaluation Report: {model_name or 'Model'}")
304
  report.append("")
305
 
306
  # Model categorization
307
  category_info = results.get("category_info", {})
308
  report.append(f"**Model Category**: {category_info.get('name', 'Unknown')}")
 
309
  report.append("")
310
 
311
  # Track-by-track analysis
 
315
 
316
  track_config = EVALUATION_TRACKS[track_name]
317
  summary = track_data.get("summary", {})
318
+ track_averages = track_data.get("track_averages", {})
319
+ track_confidence = track_data.get("track_confidence", {})
320
 
321
+ report.append(f"#### {track_config['name']}")
 
322
  report.append("")
323
 
324
  # Summary statistics
325
+ report.append("**Summary Statistics:**")
326
  report.append(f"- **Samples Evaluated**: {summary.get('total_samples', 0):,}")
327
  report.append(f"- **Language Pairs**: {summary.get('language_pairs_evaluated', 0)}")
328
  report.append(f"- **Languages Covered**: {summary.get('languages_covered', 0)}")
 
329
  report.append("")
330
 
331
  # Primary metrics with confidence intervals
332
+ report.append("**Primary Metrics (95% Confidence Intervals):**")
333
  for metric in METRICS_CONFIG["primary_metrics"]:
334
+ if metric in track_confidence:
335
+ stats = track_confidence[metric]
336
  mean_val = stats["mean"]
337
  ci_lower = stats["ci_lower"]
338
  ci_upper = stats["ci_upper"]
339
 
340
  report.append(f"- **{metric.upper()}**: {mean_val:.4f} [{ci_lower:.4f}, {ci_upper:.4f}]")
341
  report.append("")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
342
 
343
+ return "\n".join(report)
344
+
345
+
346
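+ # Minimum samples a language pair needs before evaluate_predictions_by_track aggregates it; kept at module level for backwards compatibility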
347
+ MIN_SAMPLES_PER_PAIR = 10