akera committed
Commit fb1cc27 · verified · 1 Parent(s): 2275510

Update src/utils.py

Files changed (1):
  1. src/utils.py +128 -155
src/utils.py CHANGED
@@ -13,6 +13,7 @@ from config import (
    MODEL_CATEGORIES,
    STATISTICAL_CONFIG,
    METRICS_CONFIG,
+    SAMPLE_SIZE_RECOMMENDATIONS,
)


@@ -40,7 +41,7 @@ def get_track_language_pairs(track: str) -> List[Tuple[str, str]]:
    """Get language pairs for a specific evaluation track."""
    if track not in EVALUATION_TRACKS:
        return []
-
+
    track_languages = EVALUATION_TRACKS[track]["languages"]
    pairs = []
    for src in track_languages:
@@ -73,40 +74,35 @@ def sanitize_model_name(name: str) -> str:
    """Sanitize model name for display and storage with enhanced validation."""
    if not name or not isinstance(name, str):
        return "Anonymous_Model"
-
+
    # Remove special characters, limit length
    name = re.sub(r"[^\w\-.]", "_", name.strip())
    # Remove multiple consecutive underscores
    name = re.sub(r"_+", "_", name)
    # Remove leading/trailing underscores
    name = name.strip("_")
-
+
    # Ensure minimum length
    if len(name) < 3:
        name = f"Model_{name}"
-
+
    # Check for reserved names
    reserved_names = ["admin", "test", "baseline", "google", "system"]
    if name.lower() in reserved_names:
        name = f"User_{name}"
-
+
    return name[:50]  # Limit to 50 characters


- def format_metric_value(
-     value: float,
-     metric: str,
-     include_ci: bool = False,
-     ci_lower: float = None,
-     ci_upper: float = None,
- ) -> str:
+ def format_metric_value(value: float, metric: str, include_ci: bool = False,
+                         ci_lower: float = None, ci_upper: float = None) -> str:
    """Format metric value for display with optional confidence intervals."""
    if pd.isna(value) or value is None:
        return "N/A"
-
+
    try:
        precision = METRICS_CONFIG["display_precision"]
-
+
        if metric == "coverage_rate":
            formatted = f"{value:.{precision}%}"
        elif metric in ["bleu"]:
@@ -116,14 +112,14 @@ def format_metric_value(
            formatted = f"{min(value, 1.0):.{precision}f}"
        else:
            formatted = f"{value:.{precision}f}"
-
+
        # Add confidence interval if requested
        if include_ci and ci_lower is not None and ci_upper is not None:
            ci_str = f" [{ci_lower:.{precision}f}, {ci_upper:.{precision}f}]"
            formatted += ci_str
-
+
        return formatted
-
+
    except (ValueError, TypeError):
        return str(value)

@@ -132,32 +128,32 @@ def calculate_effect_size(values1: List[float], values2: List[float]) -> float:
    """Calculate Cohen's d effect size between two groups."""
    if len(values1) < 2 or len(values2) < 2:
        return 0.0
-
+
    try:
        values1 = np.array(values1)
        values2 = np.array(values2)
-
+
        # Remove NaN values
        values1 = values1[~np.isnan(values1)]
        values2 = values2[~np.isnan(values2)]
-
+
        if len(values1) < 2 or len(values2) < 2:
            return 0.0
-
+
        # Calculate pooled standard deviation
        n1, n2 = len(values1), len(values2)
        pooled_std = np.sqrt(
            ((n1 - 1) * np.var(values1, ddof=1) + (n2 - 1) * np.var(values2, ddof=1))
            / (n1 + n2 - 2)
        )
-
+
        if pooled_std == 0:
            return 0.0
-
+
        # Cohen's d
        effect_size = (np.mean(values1) - np.mean(values2)) / pooled_std
        return abs(effect_size)
-
+
    except Exception:
        return 0.0

@@ -165,7 +161,7 @@ def calculate_effect_size(values1: List[float], values2: List[float]) -> float:
def interpret_effect_size(effect_size: float) -> str:
    """Interpret effect size according to Cohen's conventions."""
    thresholds = STATISTICAL_CONFIG["effect_size_thresholds"]
-
+
    if effect_size < thresholds["small"]:
        return "negligible"
    elif effect_size < thresholds["medium"]:
@@ -182,28 +178,24 @@ def calculate_statistical_power(
    """Estimate statistical power for given effect size and sample sizes."""
    if n1 < 2 or n2 < 2:
        return 0.0
-
+
    try:
        # Simplified power calculation using t-test
        # This is an approximation
        df = n1 + n2 - 2
-         pooled_se = np.sqrt((1 / n1) + (1 / n2))
-
+         pooled_se = np.sqrt((1/n1) + (1/n2))
+
        # Critical t-value
-         t_critical = stats.t.ppf(1 - alpha / 2, df)
-
+         t_critical = stats.t.ppf(1 - alpha/2, df)
+
        # Non-centrality parameter
        ncp = effect_size / pooled_se
-
+
        # Power (approximate)
-         power = (
-             1
-             - stats.t.cdf(t_critical, df, loc=ncp)
-             + stats.t.cdf(-t_critical, df, loc=ncp)
-         )
-
+         power = 1 - stats.t.cdf(t_critical, df, loc=ncp) + stats.t.cdf(-t_critical, df, loc=ncp)
+
        return min(1.0, max(0.0, power))
-
+
    except Exception:
        return 0.0

@@ -211,16 +203,16 @@ def calculate_statistical_power(
def get_track_statistics(test_data: pd.DataFrame) -> Dict[str, Dict]:
    """Get comprehensive statistics about test data coverage for each track."""
    track_stats = {}
-
+
    for track_name, track_config in EVALUATION_TRACKS.items():
        track_languages = track_config["languages"]
-
+
        # Filter test data to track languages
        track_data = test_data[
-             (test_data["source_language"].isin(track_languages))
-             & (test_data["target_language"].isin(track_languages))
+             (test_data["source_language"].isin(track_languages)) &
+             (test_data["target_language"].isin(track_languages))
        ]
-
+
        if track_data.empty:
            track_stats[track_name] = {
                "total_samples": 0,
@@ -230,31 +222,29 @@ def get_track_statistics(test_data: pd.DataFrame) -> Dict[str, Dict]:
                "adequacy_assessment": "insufficient",
            }
            continue
-
+
        # Calculate pair-wise statistics
        pair_counts = {}
        for src in track_languages:
            for tgt in track_languages:
                if src == tgt:
                    continue
-
+
                pair_data = track_data[
-                     (track_data["source_language"] == src)
-                     & (track_data["target_language"] == tgt)
+                     (track_data["source_language"] == src) &
+                     (track_data["target_language"] == tgt)
                ]
-
+
                pair_key = f"{src}_to_{tgt}"
                pair_counts[pair_key] = len(pair_data)
-
+
        # Calculate adequacy
        min_required = track_config["min_samples_per_pair"]
-         adequate_pairs = sum(
-             1 for count in pair_counts.values() if count >= min_required
-         )
+         adequate_pairs = sum(1 for count in pair_counts.values() if count >= min_required)
        total_possible_pairs = len(track_languages) * (len(track_languages) - 1)
-
+
        adequacy_rate = adequate_pairs / max(total_possible_pairs, 1)
-
+
        if adequacy_rate >= 0.8:
            adequacy = "excellent"
        elif adequacy_rate >= 0.6:
@@ -263,7 +253,7 @@ def get_track_statistics(test_data: pd.DataFrame) -> Dict[str, Dict]:
            adequacy = "fair"
        else:
            adequacy = "insufficient"
-
+
        track_stats[track_name] = {
            "total_samples": len(track_data),
            "language_pairs": len([k for k, v in pair_counts.items() if v > 0]),
@@ -273,7 +263,7 @@ def get_track_statistics(test_data: pd.DataFrame) -> Dict[str, Dict]:
            "adequacy_rate": adequacy_rate,
            "min_samples_per_pair": min_required,
        }
-
+
    return track_stats


@@ -281,7 +271,7 @@ def validate_submission_completeness_scientific(
    predictions: pd.DataFrame, test_set: pd.DataFrame, track: str = None
) -> Dict:
    """Enhanced validation with track-specific analysis."""
-
+
    if predictions.empty or test_set.empty:
        return {
            "is_complete": False,
@@ -291,23 +281,23 @@ def validate_submission_completeness_scientific(
            "coverage": 0.0,
            "track_analysis": {},
        }
-
+
    # If track specified, filter to track languages
    if track and track in EVALUATION_TRACKS:
        track_languages = EVALUATION_TRACKS[track]["languages"]
        test_set = test_set[
-             (test_set["source_language"].isin(track_languages))
-             & (test_set["target_language"].isin(track_languages))
+             (test_set["source_language"].isin(track_languages)) &
+             (test_set["target_language"].isin(track_languages))
        ]
-
+
    try:
        required_ids = set(test_set["sample_id"].astype(str))
        provided_ids = set(predictions["sample_id"].astype(str))
-
+
        missing_ids = required_ids - provided_ids
        extra_ids = provided_ids - required_ids
        matching_ids = provided_ids & required_ids
-
+
        base_result = {
            "is_complete": len(missing_ids) == 0,
            "missing_count": len(missing_ids),
@@ -315,14 +305,14 @@ def validate_submission_completeness_scientific(
            "missing_ids": list(missing_ids)[:10],
            "coverage": len(matching_ids) / len(required_ids) if required_ids else 0.0,
        }
-
+
        # Add track-specific analysis if requested
        if track:
            track_analysis = analyze_track_coverage(predictions, test_set, track)
            base_result["track_analysis"] = track_analysis
-
+
        return base_result
-
+
    except Exception as e:
        print(f"Error in submission completeness validation: {e}")
        return {
@@ -339,38 +329,37 @@ def analyze_track_coverage(
    predictions: pd.DataFrame, test_set: pd.DataFrame, track: str
) -> Dict:
    """Analyze coverage for a specific track."""
-
+
    if track not in EVALUATION_TRACKS:
        return {"error": f"Unknown track: {track}"}
-
+
    track_config = EVALUATION_TRACKS[track]
    track_languages = track_config["languages"]
-
+
    # Filter test set to track languages
    track_test_set = test_set[
-         (test_set["source_language"].isin(track_languages))
-         & (test_set["target_language"].isin(track_languages))
+         (test_set["source_language"].isin(track_languages)) &
+         (test_set["target_language"].isin(track_languages))
    ]
-
+
    if track_test_set.empty:
        return {"error": f"No test data available for {track} track"}
-
+
    # Merge with predictions
-     merged = track_test_set.merge(
-         predictions, on="sample_id", how="left", suffixes=("", "_pred")
-     )
-
+     merged = track_test_set.merge(predictions, on="sample_id", how="left", suffixes=("", "_pred"))
+
    # Analyze by language pair
    pair_analysis = {}
    for src in track_languages:
        for tgt in track_languages:
            if src == tgt:
                continue
-
+
            pair_data = merged[
-                 (merged["source_language"] == src) & (merged["target_language"] == tgt)
+                 (merged["source_language"] == src) &
+                 (merged["target_language"] == tgt)
            ]
-
+
            if len(pair_data) > 0:
                covered = pair_data["prediction"].notna().sum()
                pair_analysis[f"{src}_to_{tgt}"] = {
@@ -379,19 +368,18 @@ def analyze_track_coverage(
                    "coverage_rate": covered / len(pair_data),
                    "meets_minimum": covered >= track_config["min_samples_per_pair"],
                }
-
+
    # Overall track statistics
    total_pairs = len(pair_analysis)
    adequate_pairs = sum(1 for info in pair_analysis.values() if info["meets_minimum"])
-
+
    return {
        "track_name": track_config["name"],
        "total_language_pairs": total_pairs,
        "adequate_pairs": adequate_pairs,
        "adequacy_rate": adequate_pairs / max(total_pairs, 1),
        "pair_analysis": pair_analysis,
-         "overall_adequate": adequate_pairs
-         >= total_pairs * 0.8,  # 80% of pairs adequate
+         "overall_adequate": adequate_pairs >= total_pairs * 0.8,  # 80% of pairs adequate
    }


@@ -399,40 +387,35 @@ def calculate_language_pair_coverage_scientific(
    predictions: pd.DataFrame, test_set: pd.DataFrame
) -> Dict:
    """Calculate comprehensive language pair coverage with statistical metrics."""
-
+
    if predictions.empty or test_set.empty:
        return {}
-
+
    try:
        # Merge to get language info
-         merged = test_set.merge(
-             predictions, on="sample_id", how="left", suffixes=("", "_pred")
-         )
-
+         merged = test_set.merge(predictions, on="sample_id", how="left", suffixes=("", "_pred"))
+
        coverage = {}
        for src in ALL_UG40_LANGUAGES:
            for tgt in ALL_UG40_LANGUAGES:
                if src == tgt:
                    continue
-
+
                pair_data = merged[
-                     (merged["source_language"] == src)
-                     & (merged["target_language"] == tgt)
+                     (merged["source_language"] == src) &
+                     (merged["target_language"] == tgt)
                ]
-
+
                if len(pair_data) > 0:
                    predicted_count = pair_data["prediction"].notna().sum()
                    coverage_rate = predicted_count / len(pair_data)
-
+
                    # Determine which tracks include this pair
                    tracks_included = []
                    for track_name, track_config in EVALUATION_TRACKS.items():
-                         if (
-                             src in track_config["languages"]
-                             and tgt in track_config["languages"]
-                         ):
+                         if src in track_config["languages"] and tgt in track_config["languages"]:
                            tracks_included.append(track_name)
-
+
                    coverage[f"{src}_{tgt}"] = {
                        "total": len(pair_data),
                        "predicted": predicted_count,
@@ -440,18 +423,17 @@ def calculate_language_pair_coverage_scientific(
                        "display_name": format_language_pair(src, tgt),
                        "tracks_included": tracks_included,
                        "google_comparable": (
-                             src in GOOGLE_SUPPORTED_LANGUAGES
-                             and tgt in GOOGLE_SUPPORTED_LANGUAGES
+                             src in GOOGLE_SUPPORTED_LANGUAGES and
+                             tgt in GOOGLE_SUPPORTED_LANGUAGES
                        ),
                        "statistical_adequacy": {
-                             track: predicted_count
-                             >= EVALUATION_TRACKS[track]["min_samples_per_pair"]
+                             track: predicted_count >= EVALUATION_TRACKS[track]["min_samples_per_pair"]
                            for track in tracks_included
                        },
                    }
-
+
        return coverage
-
+
    except Exception as e:
        print(f"Error calculating language pair coverage: {e}")
        return {}
@@ -474,37 +456,37 @@ def clean_text_for_evaluation(text: str) -> str:
    """Clean text for evaluation, handling common encoding issues."""
    if not isinstance(text, str):
        return str(text) if text is not None else ""
-
+
    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text.strip())
-
+
    # Handle common encoding issues
    text = text.replace("\u00a0", " ")  # Non-breaking space
    text = text.replace("\u2019", "'")  # Right single quotation mark
    text = text.replace("\u201c", '"')  # Left double quotation mark
    text = text.replace("\u201d", '"')  # Right double quotation mark
-
+
    return text


def get_model_summary_stats_scientific(model_results: Dict, track: str = None) -> Dict:
    """Extract comprehensive summary statistics from model evaluation results."""
-
+
    if not model_results or "tracks" not in model_results:
        return {}
-
+
    tracks = model_results["tracks"]
-
+
    # If specific track requested
    if track and track in tracks:
        track_data = tracks[track]
        if track_data.get("error"):
            return {"error": f"No valid data for {track} track"}
-
+
        track_averages = track_data.get("track_averages", {})
        track_statistics = track_data.get("track_statistics", {})
        summary = track_data.get("summary", {})
-
+
        stats = {
            "track": track,
            "track_name": EVALUATION_TRACKS[track]["name"],
@@ -513,10 +495,9 @@ def get_model_summary_stats_scientific(model_results: Dict, track: str = None) -> Dict:
            "chrf": track_averages.get("chrf", 0.0),
            "total_samples": summary.get("total_samples", 0),
            "language_pairs": summary.get("language_pairs_evaluated", 0),
-             "statistical_adequacy": summary.get("total_samples", 0)
-             >= 100,  # Simple threshold
+             "statistical_adequacy": summary.get("total_samples", 0) >= 100,  # Simple threshold
        }
-
+
        # Add confidence intervals if available
        if "quality_score" in track_statistics:
            quality_stats = track_statistics["quality_score"]
@@ -524,27 +505,27 @@ def get_model_summary_stats_scientific(model_results: Dict, track: str = None) -> Dict:
                quality_stats.get("ci_lower", 0.0),
                quality_stats.get("ci_upper", 0.0),
            ]
-
+
        return stats
-
+
    # Otherwise, return summary across all tracks
    all_tracks_summary = {
        "tracks_evaluated": len([t for t in tracks.values() if not t.get("error")]),
        "total_tracks": len(EVALUATION_TRACKS),
        "by_track": {},
    }
-
+
    for track_name, track_data in tracks.items():
        if not track_data.get("error"):
            track_averages = track_data.get("track_averages", {})
            summary = track_data.get("summary", {})
-
+
            all_tracks_summary["by_track"][track_name] = {
                "quality_score": track_averages.get("quality_score", 0.0),
                "samples": summary.get("total_samples", 0),
                "pairs": summary.get("language_pairs_evaluated", 0),
            }
-
+
    return all_tracks_summary


@@ -553,12 +534,10 @@ def generate_model_identifier_scientific(
) -> str:
    """Generate a unique scientific identifier for a model."""
    clean_name = sanitize_model_name(model_name)
-     clean_author = (
-         re.sub(r"[^\w\-]", "_", author.strip())[:20] if author else "Anonymous"
-     )
+     clean_author = re.sub(r"[^\w\-]", "_", author.strip())[:20] if author else "Anonymous"
    clean_category = category[:10] if category in MODEL_CATEGORIES else "community"
    timestamp = datetime.datetime.now().strftime("%m%d_%H%M")
-
+
    return f"{clean_category}_{clean_name}_{clean_author}_{timestamp}"


@@ -566,28 +545,26 @@ def validate_dataframe_structure_enhanced(
    df: pd.DataFrame, required_columns: List[str], track: str = None
) -> Tuple[bool, List[str]]:
    """Enhanced DataFrame structure validation with track-specific checks."""
-
+
    if df.empty:
        return False, ["DataFrame is empty"]
-
+
    issues = []
-
+
    # Check required columns
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        issues.append(f"Missing columns: {', '.join(missing_columns)}")
-
+
    # Check for track-specific requirements
    if track and track in EVALUATION_TRACKS:
        track_config = EVALUATION_TRACKS[track]
        min_samples = track_config.get("min_samples_per_pair", 10)
-
+
        # Check sample size adequacy
        if len(df) < min_samples * 5:  # At least 5 pairs worth of data
-             issues.append(
-                 f"Insufficient samples for {track} track (minimum ~{min_samples * 5})"
-             )
-
+             issues.append(f"Insufficient samples for {track} track (minimum ~{min_samples * 5})")
+
    # Check data types
    if "sample_id" in df.columns:
        if not df["sample_id"].dtype == "object":
@@ -595,7 +572,7 @@ def validate_dataframe_structure_enhanced(
                df["sample_id"] = df["sample_id"].astype(str)
            except Exception:
                issues.append("Cannot convert sample_id to string")
-
+
    return len(issues) == 0, issues


@@ -613,10 +590,10 @@ def truncate_text(text: str, max_length: int = 100, suffix: str = "...") -> str:
    """Truncate text to specified length with suffix."""
    if not isinstance(text, str):
        text = str(text)
-
+
    if len(text) <= max_length:
        return text
-
+
    return text[: max_length - len(suffix)] + suffix


@@ -624,20 +601,20 @@ def calculate_sample_size_recommendation(
    desired_power: float = 0.8, effect_size: float = 0.5, alpha: float = 0.05
) -> int:
    """Calculate recommended sample size for statistical analysis."""
-
+
    try:
        # Simplified sample size calculation for t-test
        # This is an approximation using Cohen's conventions
-
+
        z_alpha = stats.norm.ppf(1 - alpha / 2)
        z_beta = stats.norm.ppf(desired_power)
-
+
        # Sample size per group
        n_per_group = 2 * ((z_alpha + z_beta) / effect_size) ** 2
-
+
        # Round up to nearest integer
        return max(10, int(np.ceil(n_per_group)))
-
+
    except Exception:
        return 50  # Default fallback

@@ -646,33 +623,29 @@ def assess_model_category_appropriateness(
    model_name: str, category: str, performance_data: Dict
) -> Dict:
    """Assess if the detected/assigned model category is appropriate."""
-
+
    assessment = {
        "category": category,
        "appropriate": True,
        "confidence": 1.0,
        "recommendations": [],
    }
-
+
    # Check for category mismatches based on performance
    if category == "baseline" and performance_data:
        # Baselines shouldn't perform too well
        quality_scores = []
        for track_data in performance_data.get("tracks", {}).values():
            if not track_data.get("error"):
-                 quality_scores.append(
-                     track_data.get("track_averages", {}).get("quality_score", 0)
-                 )
-
-         if (
-             quality_scores and max(quality_scores) > 0.7
-         ):  # High performance for baseline
+                 quality_scores.append(track_data.get("track_averages", {}).get("quality_score", 0))
+
+         if quality_scores and max(quality_scores) > 0.7:  # High performance for baseline
            assessment["appropriate"] = False
            assessment["confidence"] = 0.3
            assessment["recommendations"].append(
                "High performance suggests this might not be a baseline model"
            )
-
+
    # Check for commercial model expectations
    if category == "commercial":
        # Commercial models should have good Google-comparable performance
@@ -683,5 +656,5 @@ def assess_model_category_appropriateness(
            assessment["recommendations"].append(
                "Low performance unexpected for commercial systems"
            )
-
-     return assessment
+
+     return assessment