akera committed
Commit 097598e · verified · 1 Parent(s): fb1cc27

Update src/evaluation.py

Files changed (1)
  1. src/evaluation.py +132 -188
src/evaluation.py CHANGED
@@ -17,6 +17,7 @@ from config import (
17
  STATISTICAL_CONFIG,
18
  EVALUATION_TRACKS,
19
  MODEL_CATEGORIES,
 
20
  )
21
  from src.utils import get_all_language_pairs, get_google_comparable_pairs
22
 
@@ -25,35 +26,35 @@ warnings.filterwarnings("ignore", category=RuntimeWarning)
25
 
26
  def calculate_sentence_metrics(reference: str, prediction: str) -> Dict[str, float]:
27
  """Calculate all metrics for a single sentence pair with robust error handling."""
28
-
29
  # Handle empty predictions
30
  if not prediction or not isinstance(prediction, str):
31
  prediction = ""
32
-
33
  if not reference or not isinstance(reference, str):
34
  reference = ""
35
-
36
  # Normalize texts
37
  normalizer = BasicTextNormalizer()
38
  pred_norm = normalizer(prediction)
39
  ref_norm = normalizer(reference)
40
-
41
  metrics = {}
42
-
43
  # BLEU score (0-100 scale)
44
  try:
45
  bleu = BLEU(effective_order=True)
46
  metrics["bleu"] = bleu.sentence_score(pred_norm, [ref_norm]).score
47
  except:
48
  metrics["bleu"] = 0.0
49
-
50
  # ChrF score (normalize to 0-1)
51
  try:
52
  chrf = CHRF()
53
  metrics["chrf"] = chrf.sentence_score(pred_norm, [ref_norm]).score / 100.0
54
  except:
55
  metrics["chrf"] = 0.0
56
-
57
  # Character Error Rate (CER)
58
  try:
59
  if len(ref_norm) > 0:
@@ -62,20 +63,18 @@ def calculate_sentence_metrics(reference: str, prediction: str) -> Dict[str, flo
62
  metrics["cer"] = 1.0 if len(pred_norm) > 0 else 0.0
63
  except:
64
  metrics["cer"] = 1.0
65
-
66
  # Word Error Rate (WER)
67
  try:
68
  ref_words = ref_norm.split()
69
  pred_words = pred_norm.split()
70
  if len(ref_words) > 0:
71
- metrics["wer"] = Levenshtein.distance(ref_words, pred_words) / len(
72
- ref_words
73
- )
74
  else:
75
  metrics["wer"] = 1.0 if len(pred_words) > 0 else 0.0
76
  except:
77
  metrics["wer"] = 1.0
78
-
79
  # Length ratio
80
  try:
81
  if len(ref_norm) > 0:
@@ -84,14 +83,14 @@ def calculate_sentence_metrics(reference: str, prediction: str) -> Dict[str, flo
84
  metrics["len_ratio"] = 1.0 if len(pred_norm) == 0 else float("inf")
85
  except:
86
  metrics["len_ratio"] = 1.0
87
-
88
  # ROUGE scores
89
  try:
90
  scorer = rouge_scorer.RougeScorer(
91
  ["rouge1", "rouge2", "rougeL"], use_stemmer=True
92
  )
93
  rouge_scores = scorer.score(ref_norm, pred_norm)
94
-
95
  metrics["rouge1"] = rouge_scores["rouge1"].fmeasure
96
  metrics["rouge2"] = rouge_scores["rouge2"].fmeasure
97
  metrics["rougeL"] = rouge_scores["rougeL"].fmeasure
@@ -99,7 +98,7 @@ def calculate_sentence_metrics(reference: str, prediction: str) -> Dict[str, flo
99
  metrics["rouge1"] = 0.0
100
  metrics["rouge2"] = 0.0
101
  metrics["rougeL"] = 0.0
102
-
103
  # Quality score (composite metric)
104
  try:
105
  quality_components = [
@@ -113,13 +112,13 @@ def calculate_sentence_metrics(reference: str, prediction: str) -> Dict[str, flo
113
  metrics["quality_score"] = np.mean(quality_components)
114
  except:
115
  metrics["quality_score"] = 0.0
116
-
117
  return metrics
118
 
119
 
120
  def calculate_statistical_metrics(values: List[float]) -> Dict[str, float]:
121
  """Calculate statistical measures including confidence intervals."""
122
-
123
  if not values or len(values) == 0:
124
  return {
125
  "mean": 0.0,
@@ -129,10 +128,10 @@ def calculate_statistical_metrics(values: List[float]) -> Dict[str, float]:
129
  "ci_upper": 0.0,
130
  "n_samples": 0,
131
  }
132
-
133
  values = np.array(values)
134
  values = values[~np.isnan(values)] # Remove NaN values
135
-
136
  if len(values) == 0:
137
  return {
138
  "mean": 0.0,
@@ -142,23 +141,23 @@ def calculate_statistical_metrics(values: List[float]) -> Dict[str, float]:
142
  "ci_upper": 0.0,
143
  "n_samples": 0,
144
  }
145
-
146
  stats_dict = {
147
  "mean": float(np.mean(values)),
148
  "std": float(np.std(values, ddof=1)) if len(values) > 1 else 0.0,
149
  "median": float(np.median(values)),
150
  "n_samples": len(values),
151
  }
152
-
153
  # Calculate confidence intervals using bootstrap if enough samples
154
  if len(values) >= STATISTICAL_CONFIG["min_samples_for_ci"]:
155
  try:
156
  confidence_level = STATISTICAL_CONFIG["confidence_level"]
157
-
158
  # Bootstrap confidence interval
159
  def mean_func(x):
160
  return np.mean(x)
161
-
162
  res = bootstrap(
163
  (values,),
164
  mean_func,
@@ -166,10 +165,10 @@ def calculate_statistical_metrics(values: List[float]) -> Dict[str, float]:
166
  confidence_level=confidence_level,
167
  random_state=42,
168
  )
169
-
170
  stats_dict["ci_lower"] = float(res.confidence_interval.low)
171
  stats_dict["ci_upper"] = float(res.confidence_interval.high)
172
-
173
  except Exception as e:
174
  # Fallback to t-distribution CI
175
  try:
@@ -184,7 +183,7 @@ def calculate_statistical_metrics(values: List[float]) -> Dict[str, float]:
184
  else:
185
  stats_dict["ci_lower"] = stats_dict["mean"]
186
  stats_dict["ci_upper"] = stats_dict["mean"]
187
-
188
  return stats_dict
189
 
190
 
@@ -192,51 +191,47 @@ def perform_significance_test(
192
  values1: List[float], values2: List[float], metric_name: str
193
  ) -> Dict[str, float]:
194
  """Perform statistical significance test between two groups."""
195
-
196
  if len(values1) < 2 or len(values2) < 2:
197
  return {"p_value": 1.0, "effect_size": 0.0, "significant": False}
198
-
199
  values1 = np.array(values1)
200
  values2 = np.array(values2)
201
-
202
  # Remove NaN values
203
  values1 = values1[~np.isnan(values1)]
204
  values2 = values2[~np.isnan(values2)]
205
-
206
  if len(values1) < 2 or len(values2) < 2:
207
  return {"p_value": 1.0, "effect_size": 0.0, "significant": False}
208
-
209
  try:
210
  # Perform t-test
211
  t_stat, p_value = stats.ttest_ind(values1, values2, equal_var=False)
212
-
213
  # Calculate effect size (Cohen's d)
214
  pooled_std = np.sqrt(
215
- (
216
- (len(values1) - 1) * np.var(values1, ddof=1)
217
- + (len(values2) - 1) * np.var(values2, ddof=1)
218
- )
219
- / (len(values1) + len(values2) - 2)
220
  )
221
-
222
  if pooled_std > 0:
223
  effect_size = abs(np.mean(values1) - np.mean(values2)) / pooled_std
224
  else:
225
  effect_size = 0.0
226
-
227
  # Determine significance
228
- significance_level = EVALUATION_TRACKS["google_comparable"][
229
- "significance_level"
230
- ]
231
  significant = p_value < significance_level
232
-
233
  return {
234
  "p_value": float(p_value),
235
  "effect_size": float(effect_size),
236
  "significant": significant,
237
  "t_statistic": float(t_stat),
238
  }
239
-
240
  except Exception as e:
241
  return {"p_value": 1.0, "effect_size": 0.0, "significant": False}
242
 
@@ -245,32 +240,32 @@ def evaluate_predictions_by_track(
245
  predictions: pd.DataFrame, test_set: pd.DataFrame, track: str
246
  ) -> Dict:
247
  """Evaluate predictions for a specific track with statistical analysis."""
248
-
249
  print(f"πŸ”„ Evaluating for {track} track...")
250
-
251
  track_config = EVALUATION_TRACKS[track]
252
  track_languages = track_config["languages"]
253
-
254
  # Filter test set and predictions to track languages
255
  track_test_set = test_set[
256
- (test_set["source_language"].isin(track_languages))
257
- & (test_set["target_language"].isin(track_languages))
258
  ].copy()
259
-
260
  # Merge predictions with test set
261
  merged = track_test_set.merge(
262
  predictions, on="sample_id", how="inner", suffixes=("", "_pred")
263
  )
264
-
265
  if len(merged) == 0:
266
  return {
267
  "error": f"No matching samples found for {track} track",
268
  "evaluated_samples": 0,
269
  "track": track,
270
  }
271
-
272
  print(f"πŸ“Š Evaluating {len(merged)} samples for {track} track...")
273
-
274
  # Calculate metrics for each sample
275
  sample_metrics = []
276
  for idx, row in merged.iterrows():
@@ -279,78 +274,67 @@ def evaluate_predictions_by_track(
279
  metrics["source_language"] = row["source_language"]
280
  metrics["target_language"] = row["target_language"]
281
  sample_metrics.append(metrics)
282
-
283
  sample_df = pd.DataFrame(sample_metrics)
284
-
285
  # Aggregate by language pairs with statistical analysis
286
  pair_metrics = {}
287
  overall_metrics = defaultdict(list)
288
-
289
  # Calculate metrics for each language pair
290
  for src_lang in track_languages:
291
  for tgt_lang in track_languages:
292
  if src_lang == tgt_lang:
293
  continue
294
-
295
  pair_data = sample_df[
296
- (sample_df["source_language"] == src_lang)
297
- & (sample_df["target_language"] == tgt_lang)
298
  ]
299
-
300
  if len(pair_data) >= track_config["min_samples_per_pair"]:
301
  pair_key = f"{src_lang}_to_{tgt_lang}"
302
  pair_metrics[pair_key] = {}
303
-
304
  # Calculate statistical metrics for each measure
305
  for metric in (
306
- METRICS_CONFIG["primary_metrics"]
307
- + METRICS_CONFIG["secondary_metrics"]
308
  ):
309
  if metric in pair_data.columns:
310
- values = (
311
- pair_data[metric]
312
- .replace([np.inf, -np.inf], np.nan)
313
- .dropna()
314
- )
315
-
316
  if len(values) > 0:
317
- stats_metrics = calculate_statistical_metrics(
318
- values.tolist()
319
- )
320
  pair_metrics[pair_key][metric] = stats_metrics
321
-
322
  # Add to overall metrics for track-level statistics
323
  overall_metrics[metric].append(stats_metrics["mean"])
324
-
325
  pair_metrics[pair_key]["sample_count"] = len(pair_data)
326
  pair_metrics[pair_key]["languages"] = f"{src_lang}-{tgt_lang}"
327
-
328
  # Calculate track-level aggregated statistics
329
  track_averages = {}
330
  track_statistics = {}
331
-
332
  for metric in overall_metrics:
333
  if overall_metrics[metric]:
334
  track_stats = calculate_statistical_metrics(overall_metrics[metric])
335
  track_averages[metric] = track_stats["mean"]
336
  track_statistics[metric] = track_stats
337
-
338
  # Generate evaluation summary
339
  summary = {
340
  "track": track,
341
  "track_name": track_config["name"],
342
  "total_samples": len(sample_df),
343
- "language_pairs_evaluated": len(
344
- [k for k in pair_metrics if pair_metrics[k].get("sample_count", 0) > 0]
345
- ),
346
- "languages_covered": len(
347
- set(sample_df["source_language"]) | set(sample_df["target_language"])
348
- ),
349
  "min_samples_per_pair": track_config["min_samples_per_pair"],
350
  "statistical_power": track_config["statistical_power"],
351
  "significance_level": track_config["significance_level"],
352
  }
353
-
354
  return {
355
  "sample_metrics": sample_df,
356
  "pair_metrics": pair_metrics,
@@ -367,13 +351,13 @@ def evaluate_predictions_scientific(
367
  predictions: pd.DataFrame, test_set: pd.DataFrame, model_category: str = "community"
368
  ) -> Dict:
369
  """Comprehensive evaluation across all tracks with scientific rigor."""
370
-
371
  print("πŸ”¬ Starting scientific evaluation...")
372
-
373
  # Validate model category
374
  if model_category not in MODEL_CATEGORIES:
375
  model_category = "community"
376
-
377
  evaluation_results = {
378
  "model_category": model_category,
379
  "category_info": MODEL_CATEGORIES[model_category],
@@ -385,66 +369,56 @@ def evaluate_predictions_scientific(
385
  "total_samples_available": len(test_set),
386
  },
387
  }
388
-
389
  # Evaluate each track
390
  for track_name in EVALUATION_TRACKS.keys():
391
  track_result = evaluate_predictions_by_track(predictions, test_set, track_name)
392
  evaluation_results["tracks"][track_name] = track_result
393
-
394
  # Cross-track consistency analysis
395
  evaluation_results["cross_track_analysis"] = analyze_cross_track_consistency(
396
  evaluation_results["tracks"]
397
  )
398
-
399
  return evaluation_results
400
 
401
 
402
  def analyze_cross_track_consistency(track_results: Dict) -> Dict:
403
  """Analyze consistency of model performance across different tracks."""
404
-
405
  consistency_analysis = {
406
  "track_correlations": {},
407
  "performance_stability": {},
408
  "language_coverage_analysis": {},
409
  }
410
-
411
  # Extract quality scores from each track for correlation analysis
412
  track_scores = {}
413
  for track_name, track_data in track_results.items():
414
- if (
415
- track_data.get("track_averages")
416
- and "quality_score" in track_data["track_averages"]
417
- ):
418
  track_scores[track_name] = track_data["track_averages"]["quality_score"]
419
-
420
  # Calculate pairwise correlations (would need more data points for meaningful correlation)
421
  if len(track_scores) >= 2:
422
  track_names = list(track_scores.keys())
423
  for i, track1 in enumerate(track_names):
424
- for track2 in track_names[i + 1 :]:
425
  # This would be more meaningful with multiple models
426
  consistency_analysis["track_correlations"][f"{track1}_vs_{track2}"] = {
427
- "score_difference": abs(
428
- track_scores[track1] - track_scores[track2]
429
- ),
430
- "relative_performance": track_scores[track1]
431
- / max(track_scores[track2], 0.001),
432
  }
433
-
434
  # Language coverage analysis
435
  for track_name, track_data in track_results.items():
436
  if track_data.get("summary"):
437
  summary = track_data["summary"]
438
  consistency_analysis["language_coverage_analysis"][track_name] = {
439
- "coverage_rate": summary["language_pairs_evaluated"]
440
- / max(summary.get("total_possible_pairs", 1), 1),
441
- "samples_per_pair": summary["total_samples"]
442
- / max(summary["language_pairs_evaluated"], 1),
443
- "statistical_adequacy": summary["total_samples"]
444
- >= EVALUATION_TRACKS[track_name]["min_samples_per_pair"]
445
- * summary["language_pairs_evaluated"],
446
  }
447
-
448
  return consistency_analysis
449
 
450
 
@@ -452,18 +426,16 @@ def compare_models_statistically(
452
  model1_results: Dict, model2_results: Dict, track: str = "google_comparable"
453
  ) -> Dict:
454
  """Perform statistical comparison between two models on a specific track."""
455
-
456
- if track not in model1_results.get("tracks", {}) or track not in model2_results.get(
457
- "tracks", {}
458
- ):
459
  return {"error": f"Track {track} not available for both models"}
460
-
461
  track1_data = model1_results["tracks"][track]
462
  track2_data = model2_results["tracks"][track]
463
-
464
  if track1_data.get("error") or track2_data.get("error"):
465
  return {"error": "One or both models have evaluation errors"}
466
-
467
  comparison_results = {
468
  "track": track,
469
  "model1_category": model1_results.get("model_category", "unknown"),
@@ -472,22 +444,19 @@ def compare_models_statistically(
472
  "language_pair_comparisons": {},
473
  "overall_significance": {},
474
  }
475
-
476
  # Compare each metric
477
- for metric in (
478
- METRICS_CONFIG["primary_metrics"] + METRICS_CONFIG["secondary_metrics"]
479
- ):
480
- if metric in track1_data.get(
481
- "track_statistics", {}
482
- ) and metric in track2_data.get("track_statistics", {}):
483
-
484
  # Extract sample-level data for this metric from both models
485
  # This would require access to the original sample metrics
486
  # For now, we'll use the aggregated statistics
487
-
488
  stats1 = track1_data["track_statistics"][metric]
489
  stats2 = track2_data["track_statistics"][metric]
490
-
491
  # Create comparison summary
492
  comparison_results["metric_comparisons"][metric] = {
493
  "model1_mean": stats1["mean"],
@@ -495,12 +464,10 @@ def compare_models_statistically(
495
  "model2_mean": stats2["mean"],
496
  "model2_ci": [stats2["ci_lower"], stats2["ci_upper"]],
497
  "difference": stats1["mean"] - stats2["mean"],
498
- "ci_overlap": not (
499
- stats1["ci_upper"] < stats2["ci_lower"]
500
- or stats2["ci_upper"] < stats1["ci_lower"]
501
- ),
502
  }
503
-
504
  return comparison_results
505
 
506
 
@@ -508,49 +475,43 @@ def generate_scientific_report(
508
  results: Dict, model_name: str = "", baseline_results: Dict = None
509
  ) -> str:
510
  """Generate a comprehensive scientific evaluation report."""
511
-
512
- if any(
513
- track_data.get("error") for track_data in results.get("tracks", {}).values()
514
- ):
515
  return f"❌ **Evaluation Error**: Unable to complete scientific evaluation"
516
-
517
  report = []
518
-
519
  # Header
520
  report.append(f"# πŸ”¬ Scientific Evaluation Report: {model_name or 'Model'}")
521
  report.append("")
522
-
523
  # Model categorization
524
  category_info = results.get("category_info", {})
525
  report.append(f"**Model Category**: {category_info.get('name', 'Unknown')}")
526
- report.append(
527
- f"**Category Description**: {category_info.get('description', 'N/A')}"
528
- )
529
  report.append("")
530
-
531
  # Track-by-track analysis
532
  for track_name, track_data in results.get("tracks", {}).items():
533
  if track_data.get("error"):
534
  continue
535
-
536
  track_config = EVALUATION_TRACKS[track_name]
537
  summary = track_data.get("summary", {})
538
  track_stats = track_data.get("track_statistics", {})
539
-
540
  report.append(f"## {track_config['name']}")
541
  report.append(f"*{track_config['description']}*")
542
  report.append("")
543
-
544
  # Summary statistics
545
  report.append("### πŸ“Š Summary Statistics")
546
  report.append(f"- **Samples Evaluated**: {summary.get('total_samples', 0):,}")
547
- report.append(
548
- f"- **Language Pairs**: {summary.get('language_pairs_evaluated', 0)}"
549
- )
550
  report.append(f"- **Languages Covered**: {summary.get('languages_covered', 0)}")
551
  report.append(f"- **Statistical Power**: {track_config['statistical_power']}")
552
  report.append("")
553
-
554
  # Primary metrics with confidence intervals
555
  report.append("### 🎯 Primary Metrics (95% Confidence Intervals)")
556
  for metric in METRICS_CONFIG["primary_metrics"]:
@@ -559,68 +520,51 @@ def generate_scientific_report(
559
  mean_val = stats["mean"]
560
  ci_lower = stats["ci_lower"]
561
  ci_upper = stats["ci_upper"]
562
-
563
- report.append(
564
- f"- **{metric.upper()}**: {mean_val:.4f} [{ci_lower:.4f}, {ci_upper:.4f}]"
565
- )
566
  report.append("")
567
-
568
  # Statistical adequacy assessment
569
- min_required = track_config["min_samples_per_pair"] * summary.get(
570
- "language_pairs_evaluated", 0
571
- )
572
- adequacy = (
573
- "βœ… Adequate"
574
- if summary.get("total_samples", 0) >= min_required
575
- else "⚠️ Limited"
576
- )
577
  report.append(f"**Statistical Adequacy**: {adequacy}")
578
  report.append("")
579
-
580
  # Cross-track analysis
581
  cross_track = results.get("cross_track_analysis", {})
582
  if cross_track:
583
  report.append("## πŸ”„ Cross-Track Consistency Analysis")
584
-
585
  coverage_analysis = cross_track.get("language_coverage_analysis", {})
586
  for track_name, coverage_info in coverage_analysis.items():
587
- adequacy = (
588
- "βœ… Statistically adequate"
589
- if coverage_info.get("statistical_adequacy")
590
- else "⚠️ Limited statistical power"
591
- )
592
  report.append(f"- **{track_name}**: {adequacy}")
593
-
594
  report.append("")
595
-
596
  # Baseline comparison if available
597
  if baseline_results:
598
  report.append("## πŸ“ˆ Baseline Comparison")
599
  # This would include detailed statistical comparisons
600
  report.append("*Statistical comparison with baseline models*")
601
  report.append("")
602
-
603
  # Scientific recommendations
604
  report.append("## πŸ’‘ Scientific Recommendations")
605
-
606
  total_samples = sum(
607
  track_data.get("summary", {}).get("total_samples", 0)
608
  for track_data in results.get("tracks", {}).values()
609
  if not track_data.get("error")
610
  )
611
-
612
  if total_samples < SAMPLE_SIZE_RECOMMENDATIONS["publication_quality"]:
613
- report.append(
614
- "- ⚠️ Consider collecting more evaluation samples for publication-quality results"
615
- )
616
-
617
  google_track = results.get("tracks", {}).get("google_comparable", {})
618
- if (
619
- not google_track.get("error")
620
- and google_track.get("summary", {}).get("total_samples", 0) > 100
621
- ):
622
  report.append("- βœ… Sufficient data for comparison with commercial systems")
623
-
624
  report.append("")
625
-
626
- return "\n".join(report)
 
17
  STATISTICAL_CONFIG,
18
  EVALUATION_TRACKS,
19
  MODEL_CATEGORIES,
20
+ SAMPLE_SIZE_RECOMMENDATIONS,
21
  )
22
  from src.utils import get_all_language_pairs, get_google_comparable_pairs
23
 
 
26
 
27
  def calculate_sentence_metrics(reference: str, prediction: str) -> Dict[str, float]:
28
  """Calculate all metrics for a single sentence pair with robust error handling."""
29
+
30
  # Handle empty predictions
31
  if not prediction or not isinstance(prediction, str):
32
  prediction = ""
33
+
34
  if not reference or not isinstance(reference, str):
35
  reference = ""
36
+
37
  # Normalize texts
38
  normalizer = BasicTextNormalizer()
39
  pred_norm = normalizer(prediction)
40
  ref_norm = normalizer(reference)
41
+
42
  metrics = {}
43
+
44
  # BLEU score (0-100 scale)
45
  try:
46
  bleu = BLEU(effective_order=True)
47
  metrics["bleu"] = bleu.sentence_score(pred_norm, [ref_norm]).score
48
  except:
49
  metrics["bleu"] = 0.0
50
+
51
  # ChrF score (normalize to 0-1)
52
  try:
53
  chrf = CHRF()
54
  metrics["chrf"] = chrf.sentence_score(pred_norm, [ref_norm]).score / 100.0
55
  except:
56
  metrics["chrf"] = 0.0
57
+
58
  # Character Error Rate (CER)
59
  try:
60
  if len(ref_norm) > 0:
 
63
  metrics["cer"] = 1.0 if len(pred_norm) > 0 else 0.0
64
  except:
65
  metrics["cer"] = 1.0
66
+
67
  # Word Error Rate (WER)
68
  try:
69
  ref_words = ref_norm.split()
70
  pred_words = pred_norm.split()
71
  if len(ref_words) > 0:
72
+ metrics["wer"] = Levenshtein.distance(ref_words, pred_words) / len(ref_words)
 
 
73
  else:
74
  metrics["wer"] = 1.0 if len(pred_words) > 0 else 0.0
75
  except:
76
  metrics["wer"] = 1.0
77
+
78
  # Length ratio
79
  try:
80
  if len(ref_norm) > 0:
 
83
  metrics["len_ratio"] = 1.0 if len(pred_norm) == 0 else float("inf")
84
  except:
85
  metrics["len_ratio"] = 1.0
86
+
87
  # ROUGE scores
88
  try:
89
  scorer = rouge_scorer.RougeScorer(
90
  ["rouge1", "rouge2", "rougeL"], use_stemmer=True
91
  )
92
  rouge_scores = scorer.score(ref_norm, pred_norm)
93
+
94
  metrics["rouge1"] = rouge_scores["rouge1"].fmeasure
95
  metrics["rouge2"] = rouge_scores["rouge2"].fmeasure
96
  metrics["rougeL"] = rouge_scores["rougeL"].fmeasure
 
98
  metrics["rouge1"] = 0.0
99
  metrics["rouge2"] = 0.0
100
  metrics["rougeL"] = 0.0
101
+
102
  # Quality score (composite metric)
103
  try:
104
  quality_components = [
 
112
  metrics["quality_score"] = np.mean(quality_components)
113
  except:
114
  metrics["quality_score"] = 0.0
115
+
116
  return metrics
117
 
118
 
119
  def calculate_statistical_metrics(values: List[float]) -> Dict[str, float]:
120
  """Calculate statistical measures including confidence intervals."""
121
+
122
  if not values or len(values) == 0:
123
  return {
124
  "mean": 0.0,
 
128
  "ci_upper": 0.0,
129
  "n_samples": 0,
130
  }
131
+
132
  values = np.array(values)
133
  values = values[~np.isnan(values)] # Remove NaN values
134
+
135
  if len(values) == 0:
136
  return {
137
  "mean": 0.0,
 
141
  "ci_upper": 0.0,
142
  "n_samples": 0,
143
  }
144
+
145
  stats_dict = {
146
  "mean": float(np.mean(values)),
147
  "std": float(np.std(values, ddof=1)) if len(values) > 1 else 0.0,
148
  "median": float(np.median(values)),
149
  "n_samples": len(values),
150
  }
151
+
152
  # Calculate confidence intervals using bootstrap if enough samples
153
  if len(values) >= STATISTICAL_CONFIG["min_samples_for_ci"]:
154
  try:
155
  confidence_level = STATISTICAL_CONFIG["confidence_level"]
156
+
157
  # Bootstrap confidence interval
158
  def mean_func(x):
159
  return np.mean(x)
160
+
161
  res = bootstrap(
162
  (values,),
163
  mean_func,
 
165
  confidence_level=confidence_level,
166
  random_state=42,
167
  )
168
+
169
  stats_dict["ci_lower"] = float(res.confidence_interval.low)
170
  stats_dict["ci_upper"] = float(res.confidence_interval.high)
171
+
172
  except Exception as e:
173
  # Fallback to t-distribution CI
174
  try:
 
183
  else:
184
  stats_dict["ci_lower"] = stats_dict["mean"]
185
  stats_dict["ci_upper"] = stats_dict["mean"]
186
+
187
  return stats_dict
188
 
189
 
 
191
  values1: List[float], values2: List[float], metric_name: str
192
  ) -> Dict[str, float]:
193
  """Perform statistical significance test between two groups."""
194
+
195
  if len(values1) < 2 or len(values2) < 2:
196
  return {"p_value": 1.0, "effect_size": 0.0, "significant": False}
197
+
198
  values1 = np.array(values1)
199
  values2 = np.array(values2)
200
+
201
  # Remove NaN values
202
  values1 = values1[~np.isnan(values1)]
203
  values2 = values2[~np.isnan(values2)]
204
+
205
  if len(values1) < 2 or len(values2) < 2:
206
  return {"p_value": 1.0, "effect_size": 0.0, "significant": False}
207
+
208
  try:
209
  # Perform t-test
210
  t_stat, p_value = stats.ttest_ind(values1, values2, equal_var=False)
211
+
212
  # Calculate effect size (Cohen's d)
213
  pooled_std = np.sqrt(
214
+ ((len(values1) - 1) * np.var(values1, ddof=1) +
215
+ (len(values2) - 1) * np.var(values2, ddof=1)) /
216
+ (len(values1) + len(values2) - 2)
 
 
217
  )
218
+
219
  if pooled_std > 0:
220
  effect_size = abs(np.mean(values1) - np.mean(values2)) / pooled_std
221
  else:
222
  effect_size = 0.0
223
+
224
  # Determine significance
225
+ significance_level = EVALUATION_TRACKS["google_comparable"]["significance_level"]
 
 
226
  significant = p_value < significance_level
227
+
228
  return {
229
  "p_value": float(p_value),
230
  "effect_size": float(effect_size),
231
  "significant": significant,
232
  "t_statistic": float(t_stat),
233
  }
234
+
235
  except Exception as e:
236
  return {"p_value": 1.0, "effect_size": 0.0, "significant": False}
237
 
 
240
  predictions: pd.DataFrame, test_set: pd.DataFrame, track: str
241
  ) -> Dict:
242
  """Evaluate predictions for a specific track with statistical analysis."""
243
+
244
  print(f"πŸ”„ Evaluating for {track} track...")
245
+
246
  track_config = EVALUATION_TRACKS[track]
247
  track_languages = track_config["languages"]
248
+
249
  # Filter test set and predictions to track languages
250
  track_test_set = test_set[
251
+ (test_set["source_language"].isin(track_languages)) &
252
+ (test_set["target_language"].isin(track_languages))
253
  ].copy()
254
+
255
  # Merge predictions with test set
256
  merged = track_test_set.merge(
257
  predictions, on="sample_id", how="inner", suffixes=("", "_pred")
258
  )
259
+
260
  if len(merged) == 0:
261
  return {
262
  "error": f"No matching samples found for {track} track",
263
  "evaluated_samples": 0,
264
  "track": track,
265
  }
266
+
267
  print(f"πŸ“Š Evaluating {len(merged)} samples for {track} track...")
268
+
269
  # Calculate metrics for each sample
270
  sample_metrics = []
271
  for idx, row in merged.iterrows():
 
274
  metrics["source_language"] = row["source_language"]
275
  metrics["target_language"] = row["target_language"]
276
  sample_metrics.append(metrics)
277
+
278
  sample_df = pd.DataFrame(sample_metrics)
279
+
280
  # Aggregate by language pairs with statistical analysis
281
  pair_metrics = {}
282
  overall_metrics = defaultdict(list)
283
+
284
  # Calculate metrics for each language pair
285
  for src_lang in track_languages:
286
  for tgt_lang in track_languages:
287
  if src_lang == tgt_lang:
288
  continue
289
+
290
  pair_data = sample_df[
291
+ (sample_df["source_language"] == src_lang) &
292
+ (sample_df["target_language"] == tgt_lang)
293
  ]
294
+
295
  if len(pair_data) >= track_config["min_samples_per_pair"]:
296
  pair_key = f"{src_lang}_to_{tgt_lang}"
297
  pair_metrics[pair_key] = {}
298
+
299
  # Calculate statistical metrics for each measure
300
  for metric in (
301
+ METRICS_CONFIG["primary_metrics"] + METRICS_CONFIG["secondary_metrics"]
 
302
  ):
303
  if metric in pair_data.columns:
304
+ values = pair_data[metric].replace([np.inf, -np.inf], np.nan).dropna()
305
+
 
 
 
 
306
  if len(values) > 0:
307
+ stats_metrics = calculate_statistical_metrics(values.tolist())
 
 
308
  pair_metrics[pair_key][metric] = stats_metrics
309
+
310
  # Add to overall metrics for track-level statistics
311
  overall_metrics[metric].append(stats_metrics["mean"])
312
+
313
  pair_metrics[pair_key]["sample_count"] = len(pair_data)
314
  pair_metrics[pair_key]["languages"] = f"{src_lang}-{tgt_lang}"
315
+
316
  # Calculate track-level aggregated statistics
317
  track_averages = {}
318
  track_statistics = {}
319
+
320
  for metric in overall_metrics:
321
  if overall_metrics[metric]:
322
  track_stats = calculate_statistical_metrics(overall_metrics[metric])
323
  track_averages[metric] = track_stats["mean"]
324
  track_statistics[metric] = track_stats
325
+
326
  # Generate evaluation summary
327
  summary = {
328
  "track": track,
329
  "track_name": track_config["name"],
330
  "total_samples": len(sample_df),
331
+ "language_pairs_evaluated": len([k for k in pair_metrics if pair_metrics[k].get("sample_count", 0) > 0]),
332
+ "languages_covered": len(set(sample_df["source_language"]) | set(sample_df["target_language"])),
 
 
 
 
333
  "min_samples_per_pair": track_config["min_samples_per_pair"],
334
  "statistical_power": track_config["statistical_power"],
335
  "significance_level": track_config["significance_level"],
336
  }
337
+
338
  return {
339
  "sample_metrics": sample_df,
340
  "pair_metrics": pair_metrics,
 
351
  predictions: pd.DataFrame, test_set: pd.DataFrame, model_category: str = "community"
352
  ) -> Dict:
353
  """Comprehensive evaluation across all tracks with scientific rigor."""
354
+
355
  print("πŸ”¬ Starting scientific evaluation...")
356
+
357
  # Validate model category
358
  if model_category not in MODEL_CATEGORIES:
359
  model_category = "community"
360
+
361
  evaluation_results = {
362
  "model_category": model_category,
363
  "category_info": MODEL_CATEGORIES[model_category],
 
369
  "total_samples_available": len(test_set),
370
  },
371
  }
372
+
373
  # Evaluate each track
374
  for track_name in EVALUATION_TRACKS.keys():
375
  track_result = evaluate_predictions_by_track(predictions, test_set, track_name)
376
  evaluation_results["tracks"][track_name] = track_result
377
+
378
  # Cross-track consistency analysis
379
  evaluation_results["cross_track_analysis"] = analyze_cross_track_consistency(
380
  evaluation_results["tracks"]
381
  )
382
+
383
  return evaluation_results
384
 
385
 
386
  def analyze_cross_track_consistency(track_results: Dict) -> Dict:
387
  """Analyze consistency of model performance across different tracks."""
388
+
389
  consistency_analysis = {
390
  "track_correlations": {},
391
  "performance_stability": {},
392
  "language_coverage_analysis": {},
393
  }
394
+
395
  # Extract quality scores from each track for correlation analysis
396
  track_scores = {}
397
  for track_name, track_data in track_results.items():
398
+ if track_data.get("track_averages") and "quality_score" in track_data["track_averages"]:
 
 
 
399
  track_scores[track_name] = track_data["track_averages"]["quality_score"]
400
+
401
  # Calculate pairwise correlations (would need more data points for meaningful correlation)
402
  if len(track_scores) >= 2:
403
  track_names = list(track_scores.keys())
404
  for i, track1 in enumerate(track_names):
405
+ for track2 in track_names[i + 1:]:
406
  # This would be more meaningful with multiple models
407
  consistency_analysis["track_correlations"][f"{track1}_vs_{track2}"] = {
408
+ "score_difference": abs(track_scores[track1] - track_scores[track2]),
409
+ "relative_performance": track_scores[track1] / max(track_scores[track2], 0.001),
 
 
 
410
  }
411
+
412
  # Language coverage analysis
413
  for track_name, track_data in track_results.items():
414
  if track_data.get("summary"):
415
  summary = track_data["summary"]
416
  consistency_analysis["language_coverage_analysis"][track_name] = {
417
+ "coverage_rate": summary["language_pairs_evaluated"] / max(summary.get("total_possible_pairs", 1), 1),
418
+ "samples_per_pair": summary["total_samples"] / max(summary["language_pairs_evaluated"], 1),
419
+ "statistical_adequacy": summary["total_samples"] >= EVALUATION_TRACKS[track_name]["min_samples_per_pair"] * summary["language_pairs_evaluated"],
 
 
 
 
420
  }
421
+
422
  return consistency_analysis
423
 
424
 
 
426
  model1_results: Dict, model2_results: Dict, track: str = "google_comparable"
427
  ) -> Dict:
428
  """Perform statistical comparison between two models on a specific track."""
429
+
430
+ if track not in model1_results.get("tracks", {}) or track not in model2_results.get("tracks", {}):
 
 
431
  return {"error": f"Track {track} not available for both models"}
432
+
433
  track1_data = model1_results["tracks"][track]
434
  track2_data = model2_results["tracks"][track]
435
+
436
  if track1_data.get("error") or track2_data.get("error"):
437
  return {"error": "One or both models have evaluation errors"}
438
+
439
  comparison_results = {
440
  "track": track,
441
  "model1_category": model1_results.get("model_category", "unknown"),
 
444
  "language_pair_comparisons": {},
445
  "overall_significance": {},
446
  }
447
+
448
  # Compare each metric
449
+ for metric in METRICS_CONFIG["primary_metrics"] + METRICS_CONFIG["secondary_metrics"]:
450
+ if (metric in track1_data.get("track_statistics", {}) and
451
+ metric in track2_data.get("track_statistics", {})):
452
+
 
 
 
453
  # Extract sample-level data for this metric from both models
454
  # This would require access to the original sample metrics
455
  # For now, we'll use the aggregated statistics
456
+
457
  stats1 = track1_data["track_statistics"][metric]
458
  stats2 = track2_data["track_statistics"][metric]
459
+
460
  # Create comparison summary
461
  comparison_results["metric_comparisons"][metric] = {
462
  "model1_mean": stats1["mean"],
 
464
  "model2_mean": stats2["mean"],
465
  "model2_ci": [stats2["ci_lower"], stats2["ci_upper"]],
466
  "difference": stats1["mean"] - stats2["mean"],
467
+ "ci_overlap": not (stats1["ci_upper"] < stats2["ci_lower"] or
468
+ stats2["ci_upper"] < stats1["ci_lower"]),
 
 
469
  }
470
+
471
  return comparison_results
472
 
473
 
 
475
  results: Dict, model_name: str = "", baseline_results: Dict = None
476
  ) -> str:
477
  """Generate a comprehensive scientific evaluation report."""
478
+
479
+ if any(track_data.get("error") for track_data in results.get("tracks", {}).values()):
 
 
480
  return f"❌ **Evaluation Error**: Unable to complete scientific evaluation"
481
+
482
  report = []
483
+
484
  # Header
485
  report.append(f"# πŸ”¬ Scientific Evaluation Report: {model_name or 'Model'}")
486
  report.append("")
487
+
488
  # Model categorization
489
  category_info = results.get("category_info", {})
490
  report.append(f"**Model Category**: {category_info.get('name', 'Unknown')}")
491
+ report.append(f"**Category Description**: {category_info.get('description', 'N/A')}")
 
 
492
  report.append("")
493
+
494
  # Track-by-track analysis
495
  for track_name, track_data in results.get("tracks", {}).items():
496
  if track_data.get("error"):
497
  continue
498
+
499
  track_config = EVALUATION_TRACKS[track_name]
500
  summary = track_data.get("summary", {})
501
  track_stats = track_data.get("track_statistics", {})
502
+
503
  report.append(f"## {track_config['name']}")
504
  report.append(f"*{track_config['description']}*")
505
  report.append("")
506
+
507
  # Summary statistics
508
  report.append("### πŸ“Š Summary Statistics")
509
  report.append(f"- **Samples Evaluated**: {summary.get('total_samples', 0):,}")
510
+ report.append(f"- **Language Pairs**: {summary.get('language_pairs_evaluated', 0)}")
 
 
511
  report.append(f"- **Languages Covered**: {summary.get('languages_covered', 0)}")
512
  report.append(f"- **Statistical Power**: {track_config['statistical_power']}")
513
  report.append("")
514
+
515
  # Primary metrics with confidence intervals
516
  report.append("### 🎯 Primary Metrics (95% Confidence Intervals)")
517
  for metric in METRICS_CONFIG["primary_metrics"]:
 
520
  mean_val = stats["mean"]
521
  ci_lower = stats["ci_lower"]
522
  ci_upper = stats["ci_upper"]
523
+
524
+ report.append(f"- **{metric.upper()}**: {mean_val:.4f} [{ci_lower:.4f}, {ci_upper:.4f}]")
 
 
525
  report.append("")
526
+
527
  # Statistical adequacy assessment
528
+ min_required = track_config["min_samples_per_pair"] * summary.get("language_pairs_evaluated", 0)
529
+ adequacy = "βœ… Adequate" if summary.get("total_samples", 0) >= min_required else "⚠️ Limited"
 
 
 
 
 
 
530
  report.append(f"**Statistical Adequacy**: {adequacy}")
531
  report.append("")
532
+
533
  # Cross-track analysis
534
  cross_track = results.get("cross_track_analysis", {})
535
  if cross_track:
536
  report.append("## πŸ”„ Cross-Track Consistency Analysis")
537
+
538
  coverage_analysis = cross_track.get("language_coverage_analysis", {})
539
  for track_name, coverage_info in coverage_analysis.items():
540
+ adequacy = "βœ… Statistically adequate" if coverage_info.get("statistical_adequacy") else "⚠️ Limited statistical power"
 
 
 
 
541
  report.append(f"- **{track_name}**: {adequacy}")
542
+
543
  report.append("")
544
+
545
  # Baseline comparison if available
546
  if baseline_results:
547
  report.append("## πŸ“ˆ Baseline Comparison")
548
  # This would include detailed statistical comparisons
549
  report.append("*Statistical comparison with baseline models*")
550
  report.append("")
551
+
552
  # Scientific recommendations
553
  report.append("## πŸ’‘ Scientific Recommendations")
554
+
555
  total_samples = sum(
556
  track_data.get("summary", {}).get("total_samples", 0)
557
  for track_data in results.get("tracks", {}).values()
558
  if not track_data.get("error")
559
  )
560
+
561
  if total_samples < SAMPLE_SIZE_RECOMMENDATIONS["publication_quality"]:
562
+ report.append("- ⚠️ Consider collecting more evaluation samples for publication-quality results")
563
+
 
 
564
  google_track = results.get("tracks", {}).get("google_comparable", {})
565
+ if not google_track.get("error") and google_track.get("summary", {}).get("total_samples", 0) > 100:
 
 
 
566
  report.append("- βœ… Sufficient data for comparison with commercial systems")
567
+
568
  report.append("")
569
+
570
+ return "\n".join(report)