akera committed on
Commit
f0df659
·
verified ·
1 Parent(s): d5b83bc

Update src/evaluation.py

Browse files
Files changed (1) hide show
  1. src/evaluation.py +543 -336
src/evaluation.py CHANGED
@@ -6,414 +6,621 @@ from rouge_score import rouge_scorer
6
  import Levenshtein
7
  from collections import defaultdict
8
  from transformers.models.whisper.english_normalizer import BasicTextNormalizer
9
- from typing import Dict, List, Tuple
10
- from config import ALL_UG40_LANGUAGES, GOOGLE_SUPPORTED_LANGUAGES, METRICS_CONFIG
 
 
 
 
 
 
 
 
 
 
11
  from src.utils import get_all_language_pairs, get_google_comparable_pairs
12
 
 
 
 
13
  def calculate_sentence_metrics(reference: str, prediction: str) -> Dict[str, float]:
14
- """Calculate all metrics for a single sentence pair - Fixed to match reference implementation."""
15
-
16
  # Handle empty predictions
17
  if not prediction or not isinstance(prediction, str):
18
  prediction = ""
19
-
20
  if not reference or not isinstance(reference, str):
21
  reference = ""
22
-
23
  # Normalize texts
24
  normalizer = BasicTextNormalizer()
25
  pred_norm = normalizer(prediction)
26
  ref_norm = normalizer(reference)
27
-
28
  metrics = {}
29
-
30
- # BLEU score (keep as 0-100 scale initially)
31
  try:
32
  bleu = BLEU(effective_order=True)
33
- metrics['bleu'] = bleu.sentence_score(pred_norm, [ref_norm]).score
34
  except:
35
- metrics['bleu'] = 0.0
36
-
37
  # ChrF score (normalize to 0-1)
38
  try:
39
  chrf = CHRF()
40
- metrics['chrf'] = chrf.sentence_score(pred_norm, [ref_norm]).score / 100.0
41
  except:
42
- metrics['chrf'] = 0.0
43
-
44
  # Character Error Rate (CER)
45
  try:
46
  if len(ref_norm) > 0:
47
- metrics['cer'] = Levenshtein.distance(ref_norm, pred_norm) / len(ref_norm)
48
  else:
49
- metrics['cer'] = 1.0 if len(pred_norm) > 0 else 0.0
50
  except:
51
- metrics['cer'] = 1.0
52
-
53
  # Word Error Rate (WER)
54
  try:
55
  ref_words = ref_norm.split()
56
  pred_words = pred_norm.split()
57
  if len(ref_words) > 0:
58
- metrics['wer'] = Levenshtein.distance(ref_words, pred_words) / len(ref_words)
 
 
59
  else:
60
- metrics['wer'] = 1.0 if len(pred_words) > 0 else 0.0
61
  except:
62
- metrics['wer'] = 1.0
63
-
64
  # Length ratio
65
  try:
66
  if len(ref_norm) > 0:
67
- metrics['len_ratio'] = len(pred_norm) / len(ref_norm)
68
  else:
69
- metrics['len_ratio'] = 1.0 if len(pred_norm) == 0 else float('inf')
70
  except:
71
- metrics['len_ratio'] = 1.0
72
-
73
  # ROUGE scores
74
  try:
75
- scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
 
 
76
  rouge_scores = scorer.score(ref_norm, pred_norm)
77
-
78
- metrics['rouge1'] = rouge_scores['rouge1'].fmeasure
79
- metrics['rouge2'] = rouge_scores['rouge2'].fmeasure
80
- metrics['rougeL'] = rouge_scores['rougeL'].fmeasure
81
  except:
82
- metrics['rouge1'] = 0.0
83
- metrics['rouge2'] = 0.0
84
- metrics['rougeL'] = 0.0
85
-
86
- # Quality score (composite metric) - Fixed to match reference
87
  try:
88
  quality_components = [
89
- metrics['bleu'] / 100.0, # Normalize BLEU to 0-1
90
- metrics['chrf'], # Already 0-1
91
- 1.0 - min(metrics['cer'], 1.0), # Invert error rates
92
- 1.0 - min(metrics['wer'], 1.0),
93
- metrics['rouge1'],
94
- metrics['rougeL']
95
  ]
96
- metrics['quality_score'] = np.mean(quality_components)
97
- except Exception as e:
98
- # Fallback without ROUGE
99
- print(f"Error calculating quality score: {e}")
100
- try:
101
- fallback_components = [
102
- metrics['bleu'] / 100.0,
103
- metrics['chrf'],
104
- 1.0 - min(metrics['cer'], 1.0),
105
- 1.0 - min(metrics['wer'], 1.0)
106
- ]
107
- metrics['quality_score'] = np.mean(fallback_components)
108
- except:
109
- metrics['quality_score'] = 0.0
110
-
111
  return metrics
112
 
113
- def evaluate_predictions(predictions: pd.DataFrame, test_set: pd.DataFrame) -> Dict:
114
- """Evaluate predictions against test set targets."""
115
-
116
- print("Starting evaluation...")
117
-
118
- # Merge predictions with test set (which contains targets)
119
- merged = test_set.merge(
120
- predictions,
121
- on='sample_id',
122
- how='inner',
123
- suffixes=('', '_pred')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  )
125
-
126
  if len(merged) == 0:
127
  return {
128
- 'error': 'No matching samples found between predictions and test set',
129
- 'evaluated_samples': 0
 
130
  }
131
-
132
- print(f"Evaluating {len(merged)} samples...")
133
-
134
  # Calculate metrics for each sample
135
  sample_metrics = []
136
  for idx, row in merged.iterrows():
137
- metrics = calculate_sentence_metrics(row['target_text'], row['prediction'])
138
- metrics['sample_id'] = row['sample_id']
139
- metrics['source_language'] = row['source_language']
140
- metrics['target_language'] = row['target_language']
141
- metrics['google_comparable'] = row.get('google_comparable', False)
142
  sample_metrics.append(metrics)
143
-
144
  sample_df = pd.DataFrame(sample_metrics)
145
-
146
- # Aggregate by language pairs - Fixed aggregation
147
  pair_metrics = {}
148
  overall_metrics = defaultdict(list)
149
- google_comparable_metrics = defaultdict(list)
150
-
151
  # Calculate metrics for each language pair
152
- for src_lang in ALL_UG40_LANGUAGES:
153
- for tgt_lang in ALL_UG40_LANGUAGES:
154
- if src_lang != tgt_lang:
155
- pair_data = sample_df[
156
- (sample_df['source_language'] == src_lang) &
157
- (sample_df['target_language'] == tgt_lang)
158
- ]
159
-
160
- if len(pair_data) > 0:
161
- pair_key = f"{src_lang}_to_{tgt_lang}"
162
- pair_metrics[pair_key] = {}
163
-
164
- # Calculate averages for this pair
165
- for metric in METRICS_CONFIG['primary_metrics'] + METRICS_CONFIG['secondary_metrics']:
166
- if metric in pair_data.columns:
167
- # Filter out invalid values
168
- valid_values = pair_data[metric].replace([np.inf, -np.inf], np.nan).dropna()
169
- if len(valid_values) > 0:
170
- avg_value = float(valid_values.mean())
171
- pair_metrics[pair_key][metric] = avg_value
172
-
173
- # Add to overall averages
174
- overall_metrics[metric].append(avg_value)
175
-
176
- # Add to Google comparable if applicable
177
- if (src_lang in GOOGLE_SUPPORTED_LANGUAGES and
178
- tgt_lang in GOOGLE_SUPPORTED_LANGUAGES):
179
- google_comparable_metrics[metric].append(avg_value)
180
-
181
- pair_metrics[pair_key]['sample_count'] = len(pair_data)
182
-
183
- # Calculate overall averages
184
- averages = {}
 
 
 
 
 
 
 
 
 
185
  for metric in overall_metrics:
186
  if overall_metrics[metric]:
187
- averages[metric] = float(np.mean(overall_metrics[metric]))
188
- else:
189
- averages[metric] = 0.0
190
-
191
- # Calculate Google comparable averages
192
- google_averages = {}
193
- for metric in google_comparable_metrics:
194
- if google_comparable_metrics[metric]:
195
- google_averages[metric] = float(np.mean(google_comparable_metrics[metric]))
196
- else:
197
- google_averages[metric] = 0.0
198
-
199
  # Generate evaluation summary
200
  summary = {
201
- 'total_samples': len(sample_df),
202
- 'language_pairs_covered': len([k for k in pair_metrics if pair_metrics[k].get('sample_count', 0) > 0]),
203
- 'google_comparable_pairs': len([k for k in pair_metrics
204
- if '_to_' in k and
205
- k.split('_to_')[0] in GOOGLE_SUPPORTED_LANGUAGES and
206
- k.split('_to_')[1] in GOOGLE_SUPPORTED_LANGUAGES and
207
- pair_metrics[k].get('sample_count', 0) > 0]),
208
- 'primary_metrics': {metric: averages.get(metric, 0.0)
209
- for metric in METRICS_CONFIG['primary_metrics']},
210
- 'secondary_metrics': {metric: averages.get(metric, 0.0)
211
- for metric in METRICS_CONFIG['secondary_metrics']}
 
212
  }
213
-
214
  return {
215
- 'sample_metrics': sample_df,
216
- 'pair_metrics': pair_metrics,
217
- 'averages': averages,
218
- 'google_comparable_averages': google_averages,
219
- 'summary': summary,
220
- 'evaluated_samples': len(sample_df),
221
- 'error': None
 
222
  }
223
 
224
- # Keep the rest of the functions unchanged...
225
- def compare_with_baseline(results: Dict, baseline_results: Dict = None) -> Dict:
226
- """Compare results with baseline (e.g., Google Translate)."""
227
-
228
- if baseline_results is None:
229
- return {
230
- 'comparison_available': False,
231
- 'message': 'No baseline available for comparison'
232
- }
233
-
234
- comparison = {
235
- 'comparison_available': True,
236
- 'overall_comparison': {},
237
- 'pair_comparisons': {},
238
- 'better_pairs': [],
239
- 'worse_pairs': []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
  }
241
-
242
- # Compare overall metrics
243
- for metric in METRICS_CONFIG['primary_metrics']:
244
- if metric in results['averages'] and metric in baseline_results['averages']:
245
- user_score = results['averages'][metric]
246
- baseline_score = baseline_results['averages'][metric]
247
-
248
- # For error metrics (cer, wer), lower is better
249
- if metric in ['cer', 'wer']:
250
- improvement = baseline_score - user_score # Positive = improvement
251
- else:
252
- improvement = user_score - baseline_score # Positive = improvement
253
-
254
- comparison['overall_comparison'][metric] = {
255
- 'user_score': user_score,
256
- 'baseline_score': baseline_score,
257
- 'improvement': improvement,
258
- 'improvement_percent': (improvement / max(baseline_score, 0.001)) * 100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
  }
260
-
261
- # Compare by language pairs (only Google comparable ones)
262
- google_pairs = [k for k in results['pair_metrics']
263
- if '_to_' in k and
264
- k.split('_to_')[0] in GOOGLE_SUPPORTED_LANGUAGES and
265
- k.split('_to_')[1] in GOOGLE_SUPPORTED_LANGUAGES]
266
-
267
- for pair in google_pairs:
268
- if pair in baseline_results['pair_metrics']:
269
- pair_comparison = {}
270
-
271
- for metric in METRICS_CONFIG['primary_metrics']:
272
- if (metric in results['pair_metrics'][pair] and
273
- metric in baseline_results['pair_metrics'][pair]):
274
-
275
- user_score = results['pair_metrics'][pair][metric]
276
- baseline_score = baseline_results['pair_metrics'][pair][metric]
277
-
278
- if metric in ['cer', 'wer']:
279
- improvement = baseline_score - user_score
280
- else:
281
- improvement = user_score - baseline_score
282
-
283
- pair_comparison[metric] = {
284
- 'user_score': user_score,
285
- 'baseline_score': baseline_score,
286
- 'improvement': improvement
287
- }
288
-
289
- comparison['pair_comparisons'][pair] = pair_comparison
290
-
291
- # Determine if this pair is better or worse overall
292
- quality_improvement = pair_comparison.get('quality_score', {}).get('improvement', 0)
293
- if quality_improvement > 0.01: # Threshold for significance
294
- comparison['better_pairs'].append(pair)
295
- elif quality_improvement < -0.01:
296
- comparison['worse_pairs'].append(pair)
297
-
298
- return comparison
299
-
300
- def generate_evaluation_report(results: Dict, model_name: str = "", comparison: Dict = None) -> str:
301
- """Generate human-readable evaluation report."""
302
-
303
- if results.get('error'):
304
- return f"❌ **Evaluation Error**: {results['error']}"
305
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
306
  report = []
307
-
308
  # Header
309
- report.append(f"## Evaluation Report: {model_name or 'Submission'}")
310
- report.append("")
311
-
312
- # Summary
313
- summary = results['summary']
314
- report.append("### πŸ“Š Summary")
315
- report.append(f"- **Total Samples Evaluated**: {summary['total_samples']:,}")
316
- report.append(f"- **Language Pairs Covered**: {summary['language_pairs_covered']}")
317
- report.append(f"- **Google Comparable Pairs**: {summary['google_comparable_pairs']}")
318
- report.append("")
319
-
320
- # Primary metrics
321
- report.append("### 🎯 Primary Metrics")
322
- for metric, value in summary['primary_metrics'].items():
323
- formatted_value = f"{value:.4f}" if metric != 'bleu' else f"{value:.2f}"
324
- report.append(f"- **{metric.upper()}**: {formatted_value}")
325
-
326
- # Quality ranking (if comparison available)
327
- if comparison and comparison.get('comparison_available'):
328
- quality_comp = comparison['overall_comparison'].get('quality_score', {})
329
- if quality_comp:
330
- improvement = quality_comp.get('improvement', 0)
331
- if improvement > 0.01:
332
- report.append(f" - 🟒 **{improvement:.3f}** better than baseline")
333
- elif improvement < -0.01:
334
- report.append(f" - πŸ”΄ **{abs(improvement):.3f}** worse than baseline")
335
- else:
336
- report.append(f" - 🟑 Similar to baseline")
337
-
338
  report.append("")
339
-
340
- # Secondary metrics
341
- report.append("### πŸ“ˆ Secondary Metrics")
342
- for metric, value in summary['secondary_metrics'].items():
343
- formatted_value = f"{value:.4f}"
344
- report.append(f"- **{metric.upper()}**: {formatted_value}")
 
345
  report.append("")
346
-
347
- # Language pair performance (top and bottom 5)
348
- pair_metrics = results['pair_metrics']
349
- if pair_metrics:
350
- # Sort pairs by quality score
351
- sorted_pairs = sorted(
352
- [(k, v.get('quality_score', 0)) for k, v in pair_metrics.items() if v.get('sample_count', 0) > 0],
353
- key=lambda x: x[1],
354
- reverse=True
 
 
 
 
 
 
 
 
 
 
355
  )
356
-
357
- if sorted_pairs:
358
- report.append("### πŸ† Best Performing Language Pairs")
359
- for pair, score in sorted_pairs[:5]:
360
- src, tgt = pair.replace('_to_', ' β†’ ').split(' β†’ ')
361
- report.append(f"- **{src} β†’ {tgt}**: {score:.3f}")
362
-
363
- if len(sorted_pairs) > 5:
364
- report.append("")
365
- report.append("### πŸ“‰ Challenging Language Pairs")
366
- for pair, score in sorted_pairs[-3:]:
367
- src, tgt = pair.replace('_to_', ' β†’ ').split(' β†’ ')
368
- report.append(f"- **{src} β†’ {tgt}**: {score:.3f}")
369
-
370
- # Comparison with baseline
371
- if comparison and comparison.get('comparison_available'):
372
  report.append("")
373
- report.append("### πŸ” Comparison with Baseline")
374
-
375
- better_count = len(comparison.get('better_pairs', []))
376
- worse_count = len(comparison.get('worse_pairs', []))
377
- total_comparable = len(comparison.get('pair_comparisons', {}))
378
-
379
- if total_comparable > 0:
380
- report.append(f"- **Better than baseline**: {better_count}/{total_comparable} pairs")
381
- report.append(f"- **Worse than baseline**: {worse_count}/{total_comparable} pairs")
382
-
383
- if comparison['better_pairs']:
384
- report.append(" - Strong pairs: " + ", ".join(comparison['better_pairs'][:3]))
385
-
386
- if comparison['worse_pairs']:
387
- report.append(" - Weak pairs: " + ", ".join(comparison['worse_pairs'][:3]))
388
-
389
- return "\n".join(report)
390
 
391
- def create_sample_analysis(results: Dict, n_samples: int = 10) -> pd.DataFrame:
392
- """Create sample analysis showing best and worst translations."""
393
-
394
- if 'sample_metrics' not in results:
395
- return pd.DataFrame()
396
-
397
- sample_df = results['sample_metrics']
398
-
399
- # Get best and worst samples by quality score
400
- best_samples = sample_df.nlargest(n_samples // 2, 'quality_score')
401
- worst_samples = sample_df.nsmallest(n_samples // 2, 'quality_score')
402
-
403
- analysis_samples = pd.concat([best_samples, worst_samples])
404
-
405
- # Add category
406
- analysis_samples['category'] = ['Best'] * len(best_samples) + ['Worst'] * len(worst_samples)
407
-
408
- return analysis_samples[['sample_id', 'source_language', 'target_language',
409
- 'quality_score', 'bleu', 'chrf', 'category']]
410
-
411
- def get_google_translate_baseline() -> Dict:
412
- """Get Google Translate baseline results (if available)."""
413
-
414
- try:
415
- # This would load pre-computed Google Translate results
416
- # For now, return empty dict - implement when Google Translate baseline is available
417
- return {}
418
- except:
419
- return {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  import Levenshtein
7
  from collections import defaultdict
8
  from transformers.models.whisper.english_normalizer import BasicTextNormalizer
9
+ from typing import Dict, List, Tuple, Optional
10
+ from scipy import stats
11
+ from scipy.stats import bootstrap
12
+ import warnings
13
+ from config import (
14
+ ALL_UG40_LANGUAGES,
15
+ GOOGLE_SUPPORTED_LANGUAGES,
16
+ METRICS_CONFIG,
17
+ STATISTICAL_CONFIG,
18
+ EVALUATION_TRACKS,
19
+ MODEL_CATEGORIES,
20
+ )
21
  from src.utils import get_all_language_pairs, get_google_comparable_pairs
22
 
23
+ warnings.filterwarnings("ignore", category=RuntimeWarning)
24
+
25
+
def calculate_sentence_metrics(reference: str, prediction: str) -> Dict[str, float]:
    """Calculate all metrics for a single sentence pair with robust error handling.

    Both texts are normalized with ``BasicTextNormalizer`` before scoring.
    A missing, empty or non-string ``reference``/``prediction`` is treated as
    the empty string.

    Args:
        reference: The reference (target) text.
        prediction: The text to score against the reference.

    Returns:
        A dict with the keys ``bleu`` (0-100 scale), ``chrf`` (0-1 scale),
        ``cer``, ``wer``, ``len_ratio``, ``rouge1``, ``rouge2``, ``rougeL``
        and ``quality_score``. ``len_ratio`` is ``inf`` when the normalized
        reference is empty but the normalized prediction is not. If a metric
        raises, its documented fallback value is used instead of propagating.
    """
    if not prediction or not isinstance(prediction, str):
        prediction = ""
    if not reference or not isinstance(reference, str):
        reference = ""

    # Normalize texts
    normalizer = BasicTextNormalizer()
    pred_norm = normalizer(prediction)
    ref_norm = normalizer(reference)

    metrics: Dict[str, float] = {}

    # NOTE: every handler below catches ``Exception`` rather than using a bare
    # ``except`` so that KeyboardInterrupt / SystemExit still stop a long run.

    # BLEU score (0-100 scale)
    try:
        bleu = BLEU(effective_order=True)
        metrics["bleu"] = bleu.sentence_score(pred_norm, [ref_norm]).score
    except Exception:
        metrics["bleu"] = 0.0

    # ChrF score (normalize to 0-1)
    try:
        chrf = CHRF()
        metrics["chrf"] = chrf.sentence_score(pred_norm, [ref_norm]).score / 100.0
    except Exception:
        metrics["chrf"] = 0.0

    # Character Error Rate (CER): edit distance over reference length
    try:
        if len(ref_norm) > 0:
            metrics["cer"] = Levenshtein.distance(ref_norm, pred_norm) / len(ref_norm)
        else:
            metrics["cer"] = 1.0 if len(pred_norm) > 0 else 0.0
    except Exception:
        metrics["cer"] = 1.0

    # Word Error Rate (WER): same as CER but over whitespace-split tokens
    try:
        ref_words = ref_norm.split()
        pred_words = pred_norm.split()
        if len(ref_words) > 0:
            metrics["wer"] = Levenshtein.distance(ref_words, pred_words) / len(
                ref_words
            )
        else:
            metrics["wer"] = 1.0 if len(pred_words) > 0 else 0.0
    except Exception:
        metrics["wer"] = 1.0

    # Length ratio (characters of prediction per character of reference)
    try:
        if len(ref_norm) > 0:
            metrics["len_ratio"] = len(pred_norm) / len(ref_norm)
        else:
            metrics["len_ratio"] = 1.0 if len(pred_norm) == 0 else float("inf")
    except Exception:
        metrics["len_ratio"] = 1.0

    # ROUGE scores (F-measure)
    try:
        scorer = rouge_scorer.RougeScorer(
            ["rouge1", "rouge2", "rougeL"], use_stemmer=True
        )
        rouge_scores = scorer.score(ref_norm, pred_norm)

        metrics["rouge1"] = rouge_scores["rouge1"].fmeasure
        metrics["rouge2"] = rouge_scores["rouge2"].fmeasure
        metrics["rougeL"] = rouge_scores["rougeL"].fmeasure
    except Exception:
        metrics["rouge1"] = 0.0
        metrics["rouge2"] = 0.0
        metrics["rougeL"] = 0.0

    # Quality score: unweighted mean of six components, each on a 0-1 scale
    try:
        quality_components = [
            metrics["bleu"] / 100.0,  # Normalize BLEU to 0-1
            metrics["chrf"],  # Already 0-1
            1.0 - min(metrics["cer"], 1.0),  # Invert error rates, capped at 1
            1.0 - min(metrics["wer"], 1.0),
            metrics["rouge1"],
            metrics["rougeL"],
        ]
        # Cast so the value is a plain Python float like the other metrics.
        metrics["quality_score"] = float(np.mean(quality_components))
    except Exception:
        metrics["quality_score"] = 0.0

    return metrics
118
 
119
+
120
def calculate_statistical_metrics(values: List[float]) -> Dict[str, float]:
    """Calculate summary statistics and a confidence interval for the mean.

    NaN entries are dropped before any statistic is computed.

    Args:
        values: Sequence of numeric observations (list or array-like).

    Returns:
        A dict with ``mean``, ``std`` (sample standard deviation, ``ddof=1``;
        0.0 for a single value), ``median``, ``ci_lower``, ``ci_upper`` and
        ``n_samples``. All fields are 0 when no non-NaN value is available.
        The interval collapses to the mean when there are fewer than
        ``STATISTICAL_CONFIG["min_samples_for_ci"]`` values, when the data
        has zero variance, or when no finite interval can be computed.
    """
    empty_stats = {
        "mean": 0.0,
        "std": 0.0,
        "median": 0.0,
        "ci_lower": 0.0,
        "ci_upper": 0.0,
        "n_samples": 0,
    }

    if values is None:
        return empty_stats

    # ``asarray`` accepts lists and arrays alike (``not values`` would raise
    # on a multi-element array); dtype=float turns ``None`` entries into NaN.
    data = np.asarray(values, dtype=float).ravel()
    data = data[~np.isnan(data)]  # Remove NaN values

    n_samples = int(data.size)
    if n_samples == 0:
        return empty_stats

    mean = float(np.mean(data))
    std = float(np.std(data, ddof=1)) if n_samples > 1 else 0.0

    stats_dict = {
        "mean": mean,
        "std": std,
        "median": float(np.median(data)),
        "n_samples": n_samples,
        # Default: degenerate interval at the mean, replaced below if possible.
        "ci_lower": mean,
        "ci_upper": mean,
    }

    # Too few samples, or zero variance: the interval is just the mean.
    # Resampling constant data cannot yield a usable interval (the result
    # would be NaN), so it is skipped explicitly.
    if n_samples < STATISTICAL_CONFIG["min_samples_for_ci"] or std == 0.0:
        return stats_dict

    confidence_level = STATISTICAL_CONFIG.get("confidence_level", 0.95)

    try:
        # Bootstrap confidence interval; the fixed seed keeps it reproducible.
        # ``np.mean`` accepts ``axis``, so it works whether or not ``bootstrap``
        # calls the statistic in vectorized form.
        res = bootstrap(
            (data,),
            np.mean,
            n_resamples=STATISTICAL_CONFIG["bootstrap_samples"],
            confidence_level=confidence_level,
            random_state=42,
        )
        ci_lower = float(res.confidence_interval.low)
        ci_upper = float(res.confidence_interval.high)
        if not (np.isfinite(ci_lower) and np.isfinite(ci_upper)):
            raise ValueError("bootstrap produced a non-finite interval")
    except Exception:
        # Fallback to a t-distribution interval around the mean
        try:
            alpha = 1 - confidence_level
            t_val = stats.t.ppf(1 - alpha / 2, n_samples - 1)
            margin = float(t_val * std / np.sqrt(n_samples))
            ci_lower = mean - margin
            ci_upper = mean + margin
        except Exception:
            ci_lower = ci_upper = mean

        # Last resort: never hand NaN/inf bounds to callers.
        if not (np.isfinite(ci_lower) and np.isfinite(ci_upper)):
            ci_lower = ci_upper = mean

    stats_dict["ci_lower"] = ci_lower
    stats_dict["ci_upper"] = ci_upper
    return stats_dict
189
+
190
+
191
def perform_significance_test(
    values1: List[float], values2: List[float], metric_name: str
) -> Dict[str, float]:
    """Perform a statistical significance test between two groups.

    Runs Welch's t-test (``equal_var=False``) on the two samples and reports
    the absolute Cohen's d based on the pooled standard deviation. NaN
    entries are removed from both samples first.

    Args:
        values1: Observations for the first group.
        values2: Observations for the second group.
        metric_name: Name of the metric being compared. It is accepted for
            interface compatibility and is not used in the computation.

    Returns:
        A dict with ``p_value``, ``effect_size``, ``significant`` (a plain
        ``bool``) and ``t_statistic``. The neutral result (``p_value`` 1.0,
        everything else zero/False) is returned when either group has fewer
        than two usable values, when the test yields no defined p-value, or
        when the computation raises.
    """

    def neutral_result() -> Dict[str, float]:
        # Fresh dict per call so callers can mutate the result safely.
        return {
            "p_value": 1.0,
            "effect_size": 0.0,
            "significant": False,
            "t_statistic": 0.0,
        }

    if len(values1) < 2 or len(values2) < 2:
        return neutral_result()

    try:
        group1 = np.asarray(values1, dtype=float)
        group2 = np.asarray(values2, dtype=float)

        # Remove NaN values
        group1 = group1[~np.isnan(group1)]
        group2 = group2[~np.isnan(group2)]

        if len(group1) < 2 or len(group2) < 2:
            return neutral_result()

        # Perform t-test
        t_stat, p_value = stats.ttest_ind(group1, group2, equal_var=False)

        # Two identical constant samples give an undefined (NaN) statistic;
        # report that as "no evidence of a difference" instead of NaN.
        if np.isnan(p_value):
            return neutral_result()

        # Calculate effect size (Cohen's d)
        pooled_std = np.sqrt(
            (
                (len(group1) - 1) * np.var(group1, ddof=1)
                + (len(group2) - 1) * np.var(group2, ddof=1)
            )
            / (len(group1) + len(group2) - 2)
        )

        if pooled_std > 0:
            effect_size = abs(np.mean(group1) - np.mean(group2)) / pooled_std
        else:
            effect_size = 0.0

        # Determine significance against the google_comparable track's level
        significance_level = EVALUATION_TRACKS["google_comparable"][
            "significance_level"
        ]

        return {
            "p_value": float(p_value),
            "effect_size": float(effect_size),
            # Cast: the comparison yields a NumPy bool, which is not
            # JSON-serializable and fails ``is True`` checks.
            "significant": bool(p_value < significance_level),
            "t_statistic": float(t_stat),
        }

    except Exception:
        # Best effort: a failed test is reported as "not significant".
        return neutral_result()
242
+
243
+
244
def evaluate_predictions_by_track(
    predictions: pd.DataFrame, test_set: pd.DataFrame, track: str
) -> Dict:
    """Evaluate predictions for a specific track with statistical analysis.

    The test set is restricted to rows whose source and target language both
    belong to the track, inner-joined with ``predictions`` on ``sample_id``,
    scored sentence by sentence, and aggregated per language pair and over
    the whole track.

    Args:
        predictions: Frame with at least ``sample_id`` and ``prediction``.
        test_set: Frame with ``sample_id``, ``source_language``,
            ``target_language`` and ``target_text``.
        track: Key into ``EVALUATION_TRACKS``.

    Returns:
        On success a dict with ``sample_metrics`` (per-sample frame),
        ``pair_metrics`` (per-pair statistics keyed ``"<src>_to_<tgt>"``),
        ``track_averages``, ``track_statistics``, ``summary``,
        ``evaluated_samples``, ``track`` and ``error`` (``None``). If no
        sample matches, a dict with only ``error``, ``evaluated_samples``
        (0) and ``track``.

    Raises:
        KeyError: If ``track`` is not a key of ``EVALUATION_TRACKS``.
    """
    print(f"🔄 Evaluating for {track} track...")

    track_config = EVALUATION_TRACKS[track]
    track_languages = track_config["languages"]
    min_samples_per_pair = track_config["min_samples_per_pair"]

    # Filter test set to rows whose both languages belong to the track
    track_test_set = test_set[
        (test_set["source_language"].isin(track_languages))
        & (test_set["target_language"].isin(track_languages))
    ].copy()

    # Merge predictions with test set; colliding prediction columns get "_pred"
    merged = track_test_set.merge(
        predictions, on="sample_id", how="inner", suffixes=("", "_pred")
    )

    if len(merged) == 0:
        return {
            "error": f"No matching samples found for {track} track",
            "evaluated_samples": 0,
            "track": track,
        }

    print(f"📊 Evaluating {len(merged)} samples for {track} track...")

    # Calculate metrics for each sample
    sample_metrics = []
    for _, row in merged.iterrows():
        metrics = calculate_sentence_metrics(row["target_text"], row["prediction"])
        metrics["sample_id"] = row["sample_id"]
        metrics["source_language"] = row["source_language"]
        metrics["target_language"] = row["target_language"]
        sample_metrics.append(metrics)

    sample_df = pd.DataFrame(sample_metrics)

    metric_names = [
        metric
        for metric in (
            METRICS_CONFIG["primary_metrics"] + METRICS_CONFIG["secondary_metrics"]
        )
        if metric in sample_df.columns
    ]

    # Split the frame by (source, target) once instead of re-filtering it for
    # every language pair. Row order inside each group is preserved.
    pair_frames = {
        pair: frame
        for pair, frame in sample_df.groupby(
            ["source_language", "target_language"], sort=False
        )
    }
    no_rows = sample_df.iloc[0:0]

    # Aggregate by language pairs with statistical analysis
    pair_metrics = {}
    overall_metrics = defaultdict(list)

    for src_lang in track_languages:
        for tgt_lang in track_languages:
            if src_lang == tgt_lang:
                continue

            pair_data = pair_frames.get((src_lang, tgt_lang), no_rows)
            if len(pair_data) < min_samples_per_pair:
                continue

            pair_key = f"{src_lang}_to_{tgt_lang}"
            pair_metrics[pair_key] = {}

            # Calculate statistical metrics for each measure
            for metric in metric_names:
                # Infinite values (e.g. len_ratio for an empty reference)
                # would poison the mean, so they are dropped along with NaN.
                values = (
                    pair_data[metric].replace([np.inf, -np.inf], np.nan).dropna()
                )
                if len(values) > 0:
                    stats_metrics = calculate_statistical_metrics(values.tolist())
                    pair_metrics[pair_key][metric] = stats_metrics

                    # Track-level statistics are computed over per-pair means
                    overall_metrics[metric].append(stats_metrics["mean"])

            pair_metrics[pair_key]["sample_count"] = len(pair_data)
            pair_metrics[pair_key]["languages"] = f"{src_lang}-{tgt_lang}"

    # Calculate track-level aggregated statistics
    track_averages = {}
    track_statistics = {}

    for metric, pair_means in overall_metrics.items():
        if pair_means:
            track_stats = calculate_statistical_metrics(pair_means)
            track_averages[metric] = track_stats["mean"]
            track_statistics[metric] = track_stats

    # Ordered pairs of distinct track languages; exposed so that coverage
    # can be reported as a rate rather than a raw count.
    n_languages = len(set(track_languages))

    # Generate evaluation summary
    summary = {
        "track": track,
        "track_name": track_config["name"],
        "total_samples": len(sample_df),
        "language_pairs_evaluated": len(
            [k for k in pair_metrics if pair_metrics[k].get("sample_count", 0) > 0]
        ),
        "total_possible_pairs": n_languages * (n_languages - 1),
        "languages_covered": len(
            set(sample_df["source_language"]) | set(sample_df["target_language"])
        ),
        "min_samples_per_pair": min_samples_per_pair,
        "statistical_power": track_config["statistical_power"],
        "significance_level": track_config["significance_level"],
    }

    return {
        "sample_metrics": sample_df,
        "pair_metrics": pair_metrics,
        "track_averages": track_averages,
        "track_statistics": track_statistics,
        "summary": summary,
        "evaluated_samples": len(sample_df),
        "track": track,
        "error": None,
    }
364
 
365
+
366
def evaluate_predictions_scientific(
    predictions: pd.DataFrame, test_set: pd.DataFrame, model_category: str = "community"
) -> Dict:
    """Run the evaluation on every configured track and analyze consistency.

    Args:
        predictions: Submitted predictions, passed through to each track.
        test_set: Test set, passed through to each track.
        model_category: Key into ``MODEL_CATEGORIES``. An unknown value is
            replaced by ``"community"``.

    Returns:
        A dict with ``model_category``, ``category_info``, ``tracks`` (one
        result per key of ``EVALUATION_TRACKS``), ``cross_track_analysis``
        and ``scientific_metadata`` (timestamp and input sizes).
    """
    print("🔬 Starting scientific evaluation...")

    # Validate model category
    if model_category not in MODEL_CATEGORIES:
        model_category = "community"

    evaluation_results = {
        "model_category": model_category,
        "category_info": MODEL_CATEGORIES[model_category],
        "tracks": {},
        "cross_track_analysis": {},
        "scientific_metadata": {
            "evaluation_timestamp": pd.Timestamp.now().isoformat(),
            "total_samples_submitted": len(predictions),
            "total_samples_available": len(test_set),
        },
    }

    # Evaluate each track
    evaluation_results["tracks"] = {
        track_name: evaluate_predictions_by_track(predictions, test_set, track_name)
        for track_name in EVALUATION_TRACKS
    }

    # Cross-track consistency analysis
    evaluation_results["cross_track_analysis"] = analyze_cross_track_consistency(
        evaluation_results["tracks"]
    )

    return evaluation_results
400
+
401
+
402
def analyze_cross_track_consistency(track_results: Dict) -> Dict:
    """Analyze consistency of model performance across different tracks.

    Args:
        track_results: Mapping of track name to that track's result dict.
            Only the ``track_averages`` and ``summary`` entries are read;
            tracks missing either one are skipped for that part.

    Returns:
        A dict with:

        * ``track_correlations``: for each unordered pair of tracks that
          have a ``quality_score`` average, the absolute score difference
          and the ratio of the two scores (denominator floored at 0.001).
        * ``performance_stability``: always empty; nothing populates it.
        * ``language_coverage_analysis``: per track, ``coverage_rate``,
          ``samples_per_pair`` and ``statistical_adequacy``.

    Raises:
        KeyError: If a track with a summary is not in ``EVALUATION_TRACKS``.
    """
    consistency_analysis = {
        "track_correlations": {},
        "performance_stability": {},
        "language_coverage_analysis": {},
    }

    # Extract quality scores from each track
    track_scores = {}
    for track_name, track_data in track_results.items():
        if (
            track_data.get("track_averages")
            and "quality_score" in track_data["track_averages"]
        ):
            track_scores[track_name] = track_data["track_averages"]["quality_score"]

    # Pairwise comparison (a real correlation would need several models)
    if len(track_scores) >= 2:
        track_names = list(track_scores)
        for i, track1 in enumerate(track_names):
            for track2 in track_names[i + 1 :]:
                consistency_analysis["track_correlations"][f"{track1}_vs_{track2}"] = {
                    "score_difference": abs(
                        track_scores[track1] - track_scores[track2]
                    ),
                    # Floor the denominator to avoid dividing by zero
                    "relative_performance": track_scores[track1]
                    / max(track_scores[track2], 0.001),
                }

    # Language coverage analysis
    for track_name, track_data in track_results.items():
        summary = track_data.get("summary")
        if not summary:
            continue

        track_config = EVALUATION_TRACKS[track_name]
        pairs_evaluated = summary["language_pairs_evaluated"]

        # Summaries may omit "total_possible_pairs". Dividing by a default of
        # 1 would report the raw pair count as the "rate", so derive the
        # number of ordered pairs from the track's language list instead.
        total_possible_pairs = summary.get("total_possible_pairs")
        if not total_possible_pairs:
            n_languages = len(set(track_config.get("languages", [])))
            total_possible_pairs = n_languages * (n_languages - 1)

        consistency_analysis["language_coverage_analysis"][track_name] = {
            "coverage_rate": pairs_evaluated / max(total_possible_pairs, 1),
            "samples_per_pair": summary["total_samples"] / max(pairs_evaluated, 1),
            "statistical_adequacy": summary["total_samples"]
            >= track_config["min_samples_per_pair"] * pairs_evaluated,
        }

    return consistency_analysis
449
+
450
+
451
def compare_models_statistically(
    model1_results: Dict, model2_results: Dict, track: str = "google_comparable"
) -> Dict:
    """Compare two models' aggregated statistics on a single track.

    Args:
        model1_results: Full evaluation result of the first model; its
            ``tracks`` and ``model_category`` entries are read.
        model2_results: Same, for the second model.
        track: Name of the track to compare on.

    Returns:
        ``{"error": ...}`` if the track is missing from either model or if
        either track result carries an error. Otherwise a dict with
        ``track``, both model categories, ``metric_comparisons`` (means,
        confidence intervals, mean difference and whether the intervals
        overlap, per metric present in both models) and the
        ``language_pair_comparisons`` / ``overall_significance`` entries,
        which are returned empty.
    """
    missing_track = {"error": f"Track {track} not available for both models"}
    if track not in model1_results.get("tracks", {}):
        return missing_track
    if track not in model2_results.get("tracks", {}):
        return missing_track

    first_track = model1_results["tracks"][track]
    second_track = model2_results["tracks"][track]

    if first_track.get("error") or second_track.get("error"):
        return {"error": "One or both models have evaluation errors"}

    metric_comparisons = {}

    # Only aggregated track statistics are compared here; sample-level data
    # is not consulted.
    all_metrics = (
        METRICS_CONFIG["primary_metrics"] + METRICS_CONFIG["secondary_metrics"]
    )
    for metric in all_metrics:
        if metric not in first_track.get("track_statistics", {}):
            continue
        if metric not in second_track.get("track_statistics", {}):
            continue

        first = first_track["track_statistics"][metric]
        second = second_track["track_statistics"][metric]

        # Intervals are disjoint when one ends strictly before the other starts
        disjoint = (
            first["ci_upper"] < second["ci_lower"]
            or second["ci_upper"] < first["ci_lower"]
        )

        metric_comparisons[metric] = {
            "model1_mean": first["mean"],
            "model1_ci": [first["ci_lower"], first["ci_upper"]],
            "model2_mean": second["mean"],
            "model2_ci": [second["ci_lower"], second["ci_upper"]],
            "difference": first["mean"] - second["mean"],
            "ci_overlap": not disjoint,
        }

    return {
        "track": track,
        "model1_category": model1_results.get("model_category", "unknown"),
        "model2_category": model2_results.get("model_category", "unknown"),
        "metric_comparisons": metric_comparisons,
        "language_pair_comparisons": {},
        "overall_significance": {},
    }
505
+
506
+
507
def generate_scientific_report(
    results: Dict, model_name: str = "", baseline_results: Dict = None
) -> str:
    """Render the evaluation results as a Markdown report.

    Args:
        results: Evaluation results. The keys read here are ``"tracks"``
            (per-track ``summary`` / ``track_statistics`` / ``error``),
            ``"category_info"`` and ``"cross_track_analysis"``.
        model_name: Name shown in the report title; ``"Model"`` is used when
            it is empty.
        baseline_results: When truthy, a "Baseline Comparison" section is
            added. Its contents are not used: the section is currently a
            placeholder line only.

    Returns:
        The report as a single newline-joined string. If any track carries a
        truthy ``"error"`` value, a one-line error message is returned instead
        of a report.

    Raises:
        KeyError: If a track name is not present in ``EVALUATION_TRACKS`` or a
            primary metric's statistics lack ``mean``/``ci_lower``/``ci_upper``.
    """
    tracks = results.get("tracks", {})

    # A report is only produced when every track evaluated cleanly, so the
    # code below never sees a track with an error and needs no per-track
    # error filtering.
    if any(track_data.get("error") for track_data in tracks.values()):
        return "❌ **Evaluation Error**: Unable to complete scientific evaluation"

    report = []

    # Header
    report.append(f"# 🔬 Scientific Evaluation Report: {model_name or 'Model'}")
    report.append("")

    # Model categorization
    category_info = results.get("category_info", {})
    report.append(f"**Model Category**: {category_info.get('name', 'Unknown')}")
    report.append(
        f"**Category Description**: {category_info.get('description', 'N/A')}"
    )
    report.append("")

    # Track-by-track analysis
    for track_name, track_data in tracks.items():
        track_config = EVALUATION_TRACKS[track_name]
        summary = track_data.get("summary", {})
        track_stats = track_data.get("track_statistics", {})

        report.append(f"## {track_config['name']}")
        report.append(f"*{track_config['description']}*")
        report.append("")

        # Summary statistics
        report.append("### 📊 Summary Statistics")
        report.append(f"- **Samples Evaluated**: {summary.get('total_samples', 0):,}")
        report.append(
            f"- **Language Pairs**: {summary.get('language_pairs_evaluated', 0)}"
        )
        report.append(f"- **Languages Covered**: {summary.get('languages_covered', 0)}")
        report.append(f"- **Statistical Power**: {track_config['statistical_power']}")
        report.append("")

        # Primary metrics with confidence intervals
        report.append("### 🎯 Primary Metrics (95% Confidence Intervals)")
        for metric in METRICS_CONFIG["primary_metrics"]:
            if metric not in track_stats:
                continue
            stats = track_stats[metric]
            mean_val = stats["mean"]
            ci_lower = stats["ci_lower"]
            ci_upper = stats["ci_upper"]
            report.append(
                f"- **{metric.upper()}**: {mean_val:.4f} [{ci_lower:.4f}, {ci_upper:.4f}]"
            )
        report.append("")

        # Statistical adequacy: the track as a whole must reach the per-pair
        # minimum multiplied by the number of pairs actually evaluated.
        min_required = track_config["min_samples_per_pair"] * summary.get(
            "language_pairs_evaluated", 0
        )
        if summary.get("total_samples", 0) >= min_required:
            adequacy = "✅ Adequate"
        else:
            adequacy = "⚠️ Limited"
        report.append(f"**Statistical Adequacy**: {adequacy}")
        report.append("")

    # Cross-track analysis
    cross_track = results.get("cross_track_analysis", {})
    if cross_track:
        report.append("## 🔄 Cross-Track Consistency Analysis")

        coverage_analysis = cross_track.get("language_coverage_analysis", {})
        for track_name, coverage_info in coverage_analysis.items():
            if coverage_info.get("statistical_adequacy"):
                adequacy = "✅ Statistically adequate"
            else:
                adequacy = "⚠️ Limited statistical power"
            report.append(f"- **{track_name}**: {adequacy}")

        report.append("")

    # Baseline comparison if available
    if baseline_results:
        report.append("## 📈 Baseline Comparison")
        # Placeholder: detailed statistical comparisons are not rendered yet.
        report.append("*Statistical comparison with baseline models*")
        report.append("")

    # Scientific recommendations
    report.append("## 💡 Scientific Recommendations")

    total_samples = sum(
        track_data.get("summary", {}).get("total_samples", 0)
        for track_data in tracks.values()
    )
    if total_samples < SAMPLE_SIZE_RECOMMENDATIONS["publication_quality"]:
        report.append(
            "- ⚠️ Consider collecting more evaluation samples for publication-quality results"
        )

    # A missing google_comparable track counts as zero samples.
    google_track = tracks.get("google_comparable", {})
    if google_track.get("summary", {}).get("total_samples", 0) > 100:
        report.append("- ✅ Sufficient data for comparison with commercial systems")

    report.append("")

    return "\n".join(report)