akera committed on
Commit
aa9fced
·
verified ·
1 Parent(s): d82b528

Update src/utils.py

Browse files
Files changed (1) hide show
  1. src/utils.py +151 -448
src/utils.py CHANGED
@@ -4,16 +4,13 @@ import datetime
4
  import pandas as pd
5
  import numpy as np
6
  from typing import Dict, List, Tuple, Set, Optional, Union
7
- from scipy import stats
8
  from config import (
9
  ALL_UG40_LANGUAGES,
10
  GOOGLE_SUPPORTED_LANGUAGES,
11
  LANGUAGE_NAMES,
12
  EVALUATION_TRACKS,
13
  MODEL_CATEGORIES,
14
- STATISTICAL_CONFIG,
15
  METRICS_CONFIG,
16
- SAMPLE_SIZE_RECOMMENDATIONS,
17
  )
18
 
19
 
@@ -71,7 +68,7 @@ def create_submission_id() -> str:
71
 
72
 
73
  def sanitize_model_name(name: str) -> str:
74
- """Sanitize model name for display and storage with enhanced validation."""
75
  if not name or not isinstance(name, str):
76
  return "Anonymous_Model"
77
 
@@ -94,241 +91,87 @@ def sanitize_model_name(name: str) -> str:
94
  return name[:50] # Limit to 50 characters
95
 
96
 
97
- def format_metric_value(value: float, metric: str, include_ci: bool = False,
98
- ci_lower: float = None, ci_upper: float = None) -> str:
99
- """Format metric value for display with optional confidence intervals."""
100
  if pd.isna(value) or value is None:
101
  return "N/A"
102
 
103
  try:
104
- precision = METRICS_CONFIG["display_precision"]
 
105
 
106
  if metric == "coverage_rate":
107
- formatted = f"{value:.{precision}%}"
108
  elif metric in ["bleu"]:
109
- formatted = f"{value:.2f}"
110
  elif metric in ["cer", "wer"] and value > 1:
111
  # Cap error rates at 1.0 for display
112
- formatted = f"{min(value, 1.0):.{precision}f}"
113
  else:
114
- formatted = f"{value:.{precision}f}"
115
-
116
- # Add confidence interval if requested
117
- if include_ci and ci_lower is not None and ci_upper is not None:
118
- ci_str = f" [{ci_lower:.{precision}f}, {ci_upper:.{precision}f}]"
119
- formatted += ci_str
120
-
121
- return formatted
122
 
123
  except (ValueError, TypeError):
124
  return str(value)
125
 
126
 
127
- def calculate_effect_size(values1: List[float], values2: List[float]) -> float:
128
- """Calculate Cohen's d effect size between two groups."""
129
- if len(values1) < 2 or len(values2) < 2:
130
- return 0.0
131
-
132
  try:
133
- values1 = np.array(values1)
134
- values2 = np.array(values2)
135
-
136
- # Remove NaN values
137
- values1 = values1[~np.isnan(values1)]
138
- values2 = values2[~np.isnan(values2)]
139
-
140
- if len(values1) < 2 or len(values2) < 2:
141
- return 0.0
142
-
143
- # Calculate pooled standard deviation
144
- n1, n2 = len(values1), len(values2)
145
- pooled_std = np.sqrt(
146
- ((n1 - 1) * np.var(values1, ddof=1) + (n2 - 1) * np.var(values2, ddof=1))
147
- / (n1 + n2 - 2)
148
- )
149
-
150
- if pooled_std == 0:
151
- return 0.0
152
-
153
- # Cohen's d
154
- effect_size = (np.mean(values1) - np.mean(values2)) / pooled_std
155
- return abs(effect_size)
156
-
157
- except Exception:
158
- return 0.0
159
 
160
 
161
- def interpret_effect_size(effect_size: float) -> str:
162
- """Interpret effect size according to Cohen's conventions."""
163
- thresholds = STATISTICAL_CONFIG["effect_size_thresholds"]
 
164
 
165
- if effect_size < thresholds["small"]:
166
- return "negligible"
167
- elif effect_size < thresholds["medium"]:
168
- return "small"
169
- elif effect_size < thresholds["large"]:
170
- return "medium"
171
- else:
172
- return "large"
173
-
174
-
175
- def calculate_statistical_power(
176
- effect_size: float, n1: int, n2: int, alpha: float = 0.05
177
- ) -> float:
178
- """Estimate statistical power for given effect size and sample sizes."""
179
- if n1 < 2 or n2 < 2:
180
- return 0.0
181
 
182
- try:
183
- # Simplified power calculation using t-test
184
- # This is an approximation
185
- df = n1 + n2 - 2
186
- pooled_se = np.sqrt((1/n1) + (1/n2))
187
-
188
- # Critical t-value
189
- t_critical = stats.t.ppf(1 - alpha/2, df)
190
-
191
- # Non-centrality parameter
192
- ncp = effect_size / pooled_se
193
-
194
- # Power (approximate)
195
- power = 1 - stats.t.cdf(t_critical, df, loc=ncp) + stats.t.cdf(-t_critical, df, loc=ncp)
196
-
197
- return min(1.0, max(0.0, power))
198
-
199
- except Exception:
200
- return 0.0
201
 
202
 
203
- def get_track_statistics(test_data: pd.DataFrame) -> Dict[str, Dict]:
204
- """Get comprehensive statistics about test data coverage for each track."""
205
- track_stats = {}
 
206
 
207
- for track_name, track_config in EVALUATION_TRACKS.items():
208
- track_languages = track_config["languages"]
209
-
210
- # Filter test data to track languages
211
- track_data = test_data[
212
- (test_data["source_language"].isin(track_languages)) &
213
- (test_data["target_language"].isin(track_languages))
214
- ]
215
-
216
- if track_data.empty:
217
- track_stats[track_name] = {
218
- "total_samples": 0,
219
- "language_pairs": 0,
220
- "samples_per_pair": {},
221
- "coverage_matrix": {},
222
- "adequacy_assessment": "insufficient",
223
- }
224
- continue
225
-
226
- # Calculate pair-wise statistics
227
- pair_counts = {}
228
- for src in track_languages:
229
- for tgt in track_languages:
230
- if src == tgt:
231
- continue
232
-
233
- pair_data = track_data[
234
- (track_data["source_language"] == src) &
235
- (track_data["target_language"] == tgt)
236
- ]
237
-
238
- pair_key = f"{src}_to_{tgt}"
239
- pair_counts[pair_key] = len(pair_data)
240
-
241
- # Calculate adequacy
242
- min_required = track_config["min_samples_per_pair"]
243
- adequate_pairs = sum(1 for count in pair_counts.values() if count >= min_required)
244
- total_possible_pairs = len(track_languages) * (len(track_languages) - 1)
245
-
246
- adequacy_rate = adequate_pairs / max(total_possible_pairs, 1)
247
-
248
- if adequacy_rate >= 0.8:
249
- adequacy = "excellent"
250
- elif adequacy_rate >= 0.6:
251
- adequacy = "good"
252
- elif adequacy_rate >= 0.4:
253
- adequacy = "fair"
254
- else:
255
- adequacy = "insufficient"
256
-
257
- track_stats[track_name] = {
258
- "total_samples": len(track_data),
259
- "language_pairs": len([k for k, v in pair_counts.items() if v > 0]),
260
- "samples_per_pair": pair_counts,
261
- "coverage_matrix": pair_counts,
262
- "adequacy_assessment": adequacy,
263
- "adequacy_rate": adequacy_rate,
264
- "min_samples_per_pair": min_required,
265
- }
266
 
267
- return track_stats
268
-
269
-
270
- def validate_submission_completeness_scientific(
271
- predictions: pd.DataFrame, test_set: pd.DataFrame, track: str = None
272
- ) -> Dict:
273
- """Enhanced validation with track-specific analysis."""
274
 
275
- if predictions.empty or test_set.empty:
276
- return {
277
- "is_complete": False,
278
- "missing_count": len(test_set) if not test_set.empty else 0,
279
- "extra_count": len(predictions) if not predictions.empty else 0,
280
- "missing_ids": [],
281
- "coverage": 0.0,
282
- "track_analysis": {},
283
- }
284
 
285
- # If track specified, filter to track languages
286
- if track and track in EVALUATION_TRACKS:
287
- track_languages = EVALUATION_TRACKS[track]["languages"]
288
- test_set = test_set[
289
- (test_set["source_language"].isin(track_languages)) &
290
- (test_set["target_language"].isin(track_languages))
291
- ]
292
 
293
- try:
294
- required_ids = set(test_set["sample_id"].astype(str))
295
- provided_ids = set(predictions["sample_id"].astype(str))
296
-
297
- missing_ids = required_ids - provided_ids
298
- extra_ids = provided_ids - required_ids
299
- matching_ids = provided_ids & required_ids
300
-
301
- base_result = {
302
- "is_complete": len(missing_ids) == 0,
303
- "missing_count": len(missing_ids),
304
- "extra_count": len(extra_ids),
305
- "missing_ids": list(missing_ids)[:10],
306
- "coverage": len(matching_ids) / len(required_ids) if required_ids else 0.0,
307
- }
308
-
309
- # Add track-specific analysis if requested
310
- if track:
311
- track_analysis = analyze_track_coverage(predictions, test_set, track)
312
- base_result["track_analysis"] = track_analysis
313
-
314
- return base_result
315
-
316
- except Exception as e:
317
- print(f"Error in submission completeness validation: {e}")
318
- return {
319
- "is_complete": False,
320
- "missing_count": 0,
321
- "extra_count": 0,
322
- "missing_ids": [],
323
- "coverage": 0.0,
324
- "track_analysis": {},
325
- }
326
 
327
 
328
- def analyze_track_coverage(
329
- predictions: pd.DataFrame, test_set: pd.DataFrame, track: str
330
- ) -> Dict:
331
- """Analyze coverage for a specific track."""
332
 
333
  if track not in EVALUATION_TRACKS:
334
  return {"error": f"Unknown track: {track}"}
@@ -345,8 +188,12 @@ def analyze_track_coverage(
345
  if track_test_set.empty:
346
  return {"error": f"No test data available for {track} track"}
347
 
348
- # Merge with predictions
349
- merged = track_test_set.merge(predictions, on="sample_id", how="left", suffixes=("", "_pred"))
 
 
 
 
350
 
351
  # Analyze by language pair
352
  pair_analysis = {}
@@ -355,122 +202,119 @@ def analyze_track_coverage(
355
  if src == tgt:
356
  continue
357
 
358
- pair_data = merged[
359
- (merged["source_language"] == src) &
360
- (merged["target_language"] == tgt)
361
  ]
362
 
363
- if len(pair_data) > 0:
364
- covered = pair_data["prediction"].notna().sum()
 
 
365
  pair_analysis[f"{src}_to_{tgt}"] = {
366
- "total": len(pair_data),
367
- "covered": covered,
368
- "coverage_rate": covered / len(pair_data),
369
- "meets_minimum": covered >= track_config["min_samples_per_pair"],
370
  }
371
 
372
- # Overall track statistics
373
- total_pairs = len(pair_analysis)
374
- adequate_pairs = sum(1 for info in pair_analysis.values() if info["meets_minimum"])
375
-
376
  return {
377
  "track_name": track_config["name"],
378
- "total_language_pairs": total_pairs,
379
- "adequate_pairs": adequate_pairs,
380
- "adequacy_rate": adequate_pairs / max(total_pairs, 1),
381
  "pair_analysis": pair_analysis,
382
- "overall_adequate": adequate_pairs >= total_pairs * 0.8, # 80% of pairs adequate
383
  }
384
 
385
 
386
- def calculate_language_pair_coverage_scientific(
387
- predictions: pd.DataFrame, test_set: pd.DataFrame
388
- ) -> Dict:
389
- """Calculate comprehensive language pair coverage with statistical metrics."""
390
-
391
- if predictions.empty or test_set.empty:
392
- return {}
393
 
394
- try:
395
- # Merge to get language info
396
- merged = test_set.merge(predictions, on="sample_id", how="left", suffixes=("", "_pred"))
397
-
398
- coverage = {}
399
- for src in ALL_UG40_LANGUAGES:
400
- for tgt in ALL_UG40_LANGUAGES:
401
- if src == tgt:
402
- continue
403
-
404
- pair_data = merged[
405
- (merged["source_language"] == src) &
406
- (merged["target_language"] == tgt)
407
- ]
408
-
409
- if len(pair_data) > 0:
410
- predicted_count = pair_data["prediction"].notna().sum()
411
- coverage_rate = predicted_count / len(pair_data)
412
-
413
- # Determine which tracks include this pair
414
- tracks_included = []
415
- for track_name, track_config in EVALUATION_TRACKS.items():
416
- if src in track_config["languages"] and tgt in track_config["languages"]:
417
- tracks_included.append(track_name)
418
-
419
- coverage[f"{src}_{tgt}"] = {
420
- "total": len(pair_data),
421
- "predicted": predicted_count,
422
- "coverage": coverage_rate,
423
- "display_name": format_language_pair(src, tgt),
424
- "tracks_included": tracks_included,
425
- "google_comparable": (
426
- src in GOOGLE_SUPPORTED_LANGUAGES and
427
- tgt in GOOGLE_SUPPORTED_LANGUAGES
428
- ),
429
- "statistical_adequacy": {
430
- track: predicted_count >= EVALUATION_TRACKS[track]["min_samples_per_pair"]
431
- for track in tracks_included
432
- },
433
- }
434
-
435
- return coverage
436
-
437
- except Exception as e:
438
- print(f"Error calculating language pair coverage: {e}")
439
- return {}
440
 
441
 
442
- def safe_divide(numerator: float, denominator: float, default: float = 0.0) -> float:
443
- """Safely divide two numbers, handling edge cases."""
444
- try:
445
- if denominator == 0 or pd.isna(denominator) or pd.isna(numerator):
446
- return default
447
- result = numerator / denominator
448
- if pd.isna(result) or not np.isfinite(result):
449
- return default
450
- return float(result)
451
- except (TypeError, ValueError, ZeroDivisionError):
452
- return default
453
 
454
 
455
- def clean_text_for_evaluation(text: str) -> str:
456
- """Clean text for evaluation, handling common encoding issues."""
457
  if not isinstance(text, str):
458
- return str(text) if text is not None else ""
459
 
460
- # Remove extra whitespace
461
- text = re.sub(r"\s+", " ", text.strip())
462
 
463
- # Handle common encoding issues
464
- text = text.replace("\u00a0", " ") # Non-breaking space
465
- text = text.replace("\u2019", "'") # Right single quotation mark
466
- text = text.replace("\u201c", '"') # Left double quotation mark
467
- text = text.replace("\u201d", '"') # Right double quotation mark
 
 
 
 
 
 
 
 
 
468
 
469
- return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
470
 
471
 
472
- def get_model_summary_stats_scientific(model_results: Dict, track: str = None) -> Dict:
473
- """Extract comprehensive summary statistics from model evaluation results."""
474
 
475
  if not model_results or "tracks" not in model_results:
476
  return {}
@@ -484,7 +328,6 @@ def get_model_summary_stats_scientific(model_results: Dict, track: str = None) -
484
  return {"error": f"No valid data for {track} track"}
485
 
486
  track_averages = track_data.get("track_averages", {})
487
- track_statistics = track_data.get("track_statistics", {})
488
  summary = track_data.get("summary", {})
489
 
490
  stats = {
@@ -495,17 +338,8 @@ def get_model_summary_stats_scientific(model_results: Dict, track: str = None) -
495
  "chrf": track_averages.get("chrf", 0.0),
496
  "total_samples": summary.get("total_samples", 0),
497
  "language_pairs": summary.get("language_pairs_evaluated", 0),
498
- "statistical_adequacy": summary.get("total_samples", 0) >= 100, # Simple threshold
499
  }
500
 
501
- # Add confidence intervals if available
502
- if "quality_score" in track_statistics:
503
- quality_stats = track_statistics["quality_score"]
504
- stats["confidence_interval"] = [
505
- quality_stats.get("ci_lower", 0.0),
506
- quality_stats.get("ci_upper", 0.0),
507
- ]
508
-
509
  return stats
510
 
511
  # Otherwise, return summary across all tracks
@@ -526,135 +360,4 @@ def get_model_summary_stats_scientific(model_results: Dict, track: str = None) -
526
  "pairs": summary.get("language_pairs_evaluated", 0),
527
  }
528
 
529
- return all_tracks_summary
530
-
531
-
532
- def generate_model_identifier_scientific(
533
- model_name: str, author: str, category: str
534
- ) -> str:
535
- """Generate a unique scientific identifier for a model."""
536
- clean_name = sanitize_model_name(model_name)
537
- clean_author = re.sub(r"[^\w\-]", "_", author.strip())[:20] if author else "Anonymous"
538
- clean_category = category[:10] if category in MODEL_CATEGORIES else "community"
539
- timestamp = datetime.datetime.now().strftime("%m%d_%H%M")
540
-
541
- return f"{clean_category}_{clean_name}_{clean_author}_{timestamp}"
542
-
543
-
544
- def validate_dataframe_structure_enhanced(
545
- df: pd.DataFrame, required_columns: List[str], track: str = None
546
- ) -> Tuple[bool, List[str]]:
547
- """Enhanced DataFrame structure validation with track-specific checks."""
548
-
549
- if df.empty:
550
- return False, ["DataFrame is empty"]
551
-
552
- issues = []
553
-
554
- # Check required columns
555
- missing_columns = [col for col in required_columns if col not in df.columns]
556
- if missing_columns:
557
- issues.append(f"Missing columns: {', '.join(missing_columns)}")
558
-
559
- # Check for track-specific requirements
560
- if track and track in EVALUATION_TRACKS:
561
- track_config = EVALUATION_TRACKS[track]
562
- min_samples = track_config.get("min_samples_per_pair", 10)
563
-
564
- # Check sample size adequacy
565
- if len(df) < min_samples * 5: # At least 5 pairs worth of data
566
- issues.append(f"Insufficient samples for {track} track (minimum ~{min_samples * 5})")
567
-
568
- # Check data types
569
- if "sample_id" in df.columns:
570
- if not df["sample_id"].dtype == "object":
571
- try:
572
- df["sample_id"] = df["sample_id"].astype(str)
573
- except Exception:
574
- issues.append("Cannot convert sample_id to string")
575
-
576
- return len(issues) == 0, issues
577
-
578
-
579
- def format_duration(seconds: float) -> str:
580
- """Format duration in seconds to human-readable format."""
581
- if seconds < 60:
582
- return f"{seconds:.1f}s"
583
- elif seconds < 3600:
584
- return f"{seconds/60:.1f}m"
585
- else:
586
- return f"{seconds/3600:.1f}h"
587
-
588
-
589
- def truncate_text(text: str, max_length: int = 100, suffix: str = "...") -> str:
590
- """Truncate text to specified length with suffix."""
591
- if not isinstance(text, str):
592
- text = str(text)
593
-
594
- if len(text) <= max_length:
595
- return text
596
-
597
- return text[: max_length - len(suffix)] + suffix
598
-
599
-
600
- def calculate_sample_size_recommendation(
601
- desired_power: float = 0.8, effect_size: float = 0.5, alpha: float = 0.05
602
- ) -> int:
603
- """Calculate recommended sample size for statistical analysis."""
604
-
605
- try:
606
- # Simplified sample size calculation for t-test
607
- # This is an approximation using Cohen's conventions
608
-
609
- z_alpha = stats.norm.ppf(1 - alpha / 2)
610
- z_beta = stats.norm.ppf(desired_power)
611
-
612
- # Sample size per group
613
- n_per_group = 2 * ((z_alpha + z_beta) / effect_size) ** 2
614
-
615
- # Round up to nearest integer
616
- return max(10, int(np.ceil(n_per_group)))
617
-
618
- except Exception:
619
- return 50 # Default fallback
620
-
621
-
622
- def assess_model_category_appropriateness(
623
- model_name: str, category: str, performance_data: Dict
624
- ) -> Dict:
625
- """Assess if the detected/assigned model category is appropriate."""
626
-
627
- assessment = {
628
- "category": category,
629
- "appropriate": True,
630
- "confidence": 1.0,
631
- "recommendations": [],
632
- }
633
-
634
- # Check for category mismatches based on performance
635
- if category == "baseline" and performance_data:
636
- # Baselines shouldn't perform too well
637
- quality_scores = []
638
- for track_data in performance_data.get("tracks", {}).values():
639
- if not track_data.get("error"):
640
- quality_scores.append(track_data.get("track_averages", {}).get("quality_score", 0))
641
-
642
- if quality_scores and max(quality_scores) > 0.7: # High performance for baseline
643
- assessment["appropriate"] = False
644
- assessment["confidence"] = 0.3
645
- assessment["recommendations"].append(
646
- "High performance suggests this might not be a baseline model"
647
- )
648
-
649
- # Check for commercial model expectations
650
- if category == "commercial":
651
- # Commercial models should have good Google-comparable performance
652
- google_track = performance_data.get("tracks", {}).get("google_comparable", {})
653
- if not google_track.get("error"):
654
- quality = google_track.get("track_averages", {}).get("quality_score", 0)
655
- if quality < 0.3: # Poor performance for commercial
656
- assessment["recommendations"].append(
657
- "Low performance unexpected for commercial systems"
658
- )
659
-
660
- return assessment
 
4
  import pandas as pd
5
  import numpy as np
6
  from typing import Dict, List, Tuple, Set, Optional, Union
 
7
  from config import (
8
  ALL_UG40_LANGUAGES,
9
  GOOGLE_SUPPORTED_LANGUAGES,
10
  LANGUAGE_NAMES,
11
  EVALUATION_TRACKS,
12
  MODEL_CATEGORIES,
 
13
  METRICS_CONFIG,
 
14
  )
15
 
16
 
 
68
 
69
 
70
  def sanitize_model_name(name: str) -> str:
71
+ """Sanitize model name for display and storage."""
72
  if not name or not isinstance(name, str):
73
  return "Anonymous_Model"
74
 
 
91
  return name[:50] # Limit to 50 characters
92
 
93
 
94
def format_metric_value(value: float, metric: str, precision: int = None) -> str:
    """Format a metric value for display.

    Args:
        value: The metric value. ``None`` and NaN-like values render as "N/A".
        metric: Metric name. "coverage_rate" is rendered as a percentage with
            one decimal, "bleu" with two decimals; "cer"/"wer" values above 1
            are capped at 1.0; everything else uses ``precision`` decimals.
        precision: Number of decimals for the non-fixed formats. When omitted,
            ``METRICS_CONFIG["display_precision"]`` is used.

    Returns:
        The formatted string, or ``str(value)`` when the value cannot be
        formatted numerically (e.g. a string or a list).
    """
    if value is None:
        return "N/A"

    try:
        # pd.isna returns an array for list-like input; evaluating its truth
        # value raises ValueError, which must land in the fallback below
        # instead of escaping to the caller.
        if pd.isna(value):
            return "N/A"

        if precision is None:
            precision = METRICS_CONFIG["display_precision"]

        if metric == "coverage_rate":
            return f"{value:.1%}"
        if metric == "bleu":
            return f"{value:.2f}"
        if metric in ("cer", "wer") and value > 1:
            # Cap error rates at 1.0 for display
            return f"{min(value, 1.0):.{precision}f}"
        return f"{value:.{precision}f}"

    except (ValueError, TypeError):
        return str(value)
115
 
116
 
117
def safe_divide(numerator: float, denominator: float, default: float = 0.0) -> float:
    """Divide ``numerator`` by ``denominator``, returning ``default`` on failure.

    ``default`` is returned when the denominator is zero, when either operand
    is NaN/None, when the quotient is NaN or infinite, or when the operands
    raise TypeError, ValueError or ZeroDivisionError.
    """
    try:
        unusable = denominator == 0 or pd.isna(denominator) or pd.isna(numerator)
        if unusable:
            return default

        quotient = numerator / denominator
        if pd.isna(quotient):
            return default
        if not np.isfinite(quotient):
            return default
        return float(quotient)
    except (TypeError, ValueError, ZeroDivisionError):
        return default
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
 
129
 
130
def clean_text_for_evaluation(text: str) -> str:
    """Clean text for evaluation, handling common encoding issues.

    Typographic quotes (U+2018, U+2019, U+201C, U+201D) are mapped to their
    ASCII equivalents and non-breaking spaces to plain spaces; the text is
    then stripped and every run of whitespace is collapsed to a single space.

    Args:
        text: The text to clean. Non-string input is not cleaned: ``None``
            becomes "" and anything else is returned as ``str(text)``.

    Returns:
        The normalised string.
    """
    if not isinstance(text, str):
        return str(text) if text is not None else ""

    # Single-pass character normalisation. The left single quotation mark is
    # included so both single quotes are handled, like both double quotes.
    substitutions = str.maketrans(
        {
            "\u00a0": " ",  # Non-breaking space
            "\u2018": "'",  # Left single quotation mark
            "\u2019": "'",  # Right single quotation mark
            "\u201c": '"',  # Left double quotation mark
            "\u201d": '"',  # Right double quotation mark
        }
    )
    text = text.translate(substitutions)

    # Remove extra whitespace
    return re.sub(r"\s+", " ", text.strip())
 
 
 
 
 
 
 
 
 
 
 
 
145
 
146
 
147
def validate_dataframe_structure(
    df: pd.DataFrame, required_columns: List[str], track: str = None
) -> Tuple[bool, List[str]]:
    """Check that ``df`` is non-empty and contains the required columns.

    A non-object ``sample_id`` column is converted to strings in place on the
    passed DataFrame. ``track`` is accepted but not used by this function.

    Returns:
        ``(is_valid, issues)`` where ``issues`` lists each problem found and
        ``is_valid`` is True only when that list is empty.
    """
    if df.empty:
        return False, ["DataFrame is empty"]

    problems = []

    # Required columns that are absent from the frame
    absent = [name for name in required_columns if name not in df.columns]
    if absent:
        problems.append(f"Missing columns: {', '.join(absent)}")

    # Normalise sample_id to string so ids compare consistently
    if "sample_id" in df.columns and not (df["sample_id"].dtype == "object"):
        try:
            df["sample_id"] = df["sample_id"].astype(str)
        except Exception:
            problems.append("Cannot convert sample_id to string")

    return not problems, problems
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
 
172
 
173
+ def calculate_track_coverage(predictions: pd.DataFrame, test_set: pd.DataFrame, track: str) -> Dict:
174
+ """Calculate coverage statistics for a specific track."""
 
 
175
 
176
  if track not in EVALUATION_TRACKS:
177
  return {"error": f"Unknown track: {track}"}
 
188
  if track_test_set.empty:
189
  return {"error": f"No test data available for {track} track"}
190
 
191
+ # Calculate coverage
192
+ pred_ids = set(predictions["sample_id"].astype(str))
193
+ test_ids = set(track_test_set["sample_id"].astype(str))
194
+
195
+ matching_ids = pred_ids & test_ids
196
+ coverage_rate = len(matching_ids) / len(test_ids)
197
 
198
  # Analyze by language pair
199
  pair_analysis = {}
 
202
  if src == tgt:
203
  continue
204
 
205
+ pair_test_data = track_test_set[
206
+ (track_test_set["source_language"] == src) &
207
+ (track_test_set["target_language"] == tgt)
208
  ]
209
 
210
+ if len(pair_test_data) > 0:
211
+ pair_test_ids = set(pair_test_data["sample_id"].astype(str))
212
+ pair_matching = pred_ids & pair_test_ids
213
+
214
  pair_analysis[f"{src}_to_{tgt}"] = {
215
+ "total": len(pair_test_data),
216
+ "covered": len(pair_matching),
217
+ "coverage_rate": len(pair_matching) / len(pair_test_data),
 
218
  }
219
 
 
 
 
 
220
  return {
221
  "track_name": track_config["name"],
222
+ "total_samples": len(track_test_set),
223
+ "covered_samples": len(matching_ids),
224
+ "coverage_rate": coverage_rate,
225
  "pair_analysis": pair_analysis,
 
226
  }
227
 
228
 
229
def generate_model_identifier(model_name: str, author: str, category: str) -> str:
    """Generate an identifier for a model.

    The identifier has the form ``{category}_{name}_{author}_{MMDD_HHMM}``.

    Args:
        model_name: Raw model name; passed through ``sanitize_model_name``.
        author: Author name. Characters other than word characters and "-"
            become "_", and the result is limited to 20 characters. A missing,
            blank or non-string author becomes "Anonymous".
        category: Model category. Values not present in ``MODEL_CATEGORIES``
            fall back to "community"; known values are cut to 10 characters.

    Returns:
        The identifier string.
    """
    clean_name = sanitize_model_name(model_name)

    # A whitespace-only author would otherwise produce an empty segment
    # (a double underscore in the id), so treat it like a missing author.
    author_text = author.strip() if isinstance(author, str) else ""
    clean_author = re.sub(r"[^\w\-]", "_", author_text)[:20] if author_text else "Anonymous"

    clean_category = category[:10] if category in MODEL_CATEGORIES else "community"

    # NOTE(review): the timestamp has minute resolution and no year, so two
    # submissions with the same name/author within a minute share an id.
    timestamp = datetime.datetime.now().strftime("%m%d_%H%M")

    return f"{clean_category}_{clean_name}_{clean_author}_{timestamp}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
 
238
 
239
def format_duration(seconds: float) -> str:
    """Render a duration in seconds as a short string with one decimal.

    Values below a minute use an "s" suffix, values below an hour are shown
    in minutes with an "m" suffix, and everything else in hours with "h".
    """
    if seconds < 60:
        return f"{seconds:.1f}s"
    if seconds < 3600:
        minutes = seconds / 60
        return f"{minutes:.1f}m"
    hours = seconds / 3600
    return f"{hours:.1f}h"
 
 
 
247
 
248
 
249
def truncate_text(text: str, max_length: int = 100, suffix: str = "...") -> str:
    """Truncate text to at most ``max_length`` characters, ending in ``suffix``.

    Args:
        text: Text to truncate; non-string values are converted with ``str``.
        max_length: Maximum length of the returned string.
        suffix: Marker appended when the text is cut.

    Returns:
        ``text`` unchanged when it already fits; otherwise a string of at most
        ``max_length`` characters. When ``max_length`` is too small to hold
        the suffix plus any text, as much of the suffix as fits is returned.
    """
    if not isinstance(text, str):
        text = str(text)

    if len(text) <= max_length:
        return text

    if max_length <= 0:
        return ""

    # Without this guard the slice index below goes negative and the result
    # ends up longer than max_length.
    if max_length <= len(suffix):
        return suffix[:max_length]

    return text[: max_length - len(suffix)] + suffix
258
+
259
+
260
def get_language_pair_display_name(src: str, tgt: str) -> str:
    """Build the "source → target" label for a language pair.

    Each code is looked up in ``LANGUAGE_NAMES``; a code without an entry is
    shown upper-cased instead.
    """
    source_label, target_label = (
        LANGUAGE_NAMES.get(code, code.upper()) for code in (src, tgt)
    )
    return f"{source_label} → {target_label}"
265
+
266
+
267
def validate_submission_completeness(
    predictions: pd.DataFrame, test_set: pd.DataFrame, track: str = None
) -> Dict:
    """Validate submission completeness.

    Compares the ``sample_id`` values (as strings) of ``predictions`` against
    those of ``test_set``.

    Args:
        predictions: Submitted predictions; must have a ``sample_id`` column.
        test_set: Reference test set; must have a ``sample_id`` column, plus
            ``source_language``/``target_language`` when ``track`` is used.
        track: Optional key of ``EVALUATION_TRACKS``. When it names a known
            track, the test set is restricted to rows whose source and target
            languages both belong to that track. Unknown names are ignored.

    Returns:
        A dict with ``is_complete``, ``missing_count``, ``extra_count``,
        ``missing_ids`` (at most 10, sorted) and ``coverage`` (fraction of
        required ids that were provided). On any error a dict with
        ``is_complete`` False and zeroed counts is returned.
    """
    if predictions.empty or test_set.empty:
        return {
            "is_complete": False,
            "missing_count": len(test_set) if not test_set.empty else 0,
            "extra_count": len(predictions) if not predictions.empty else 0,
            "missing_ids": [],
            "coverage": 0.0,
        }

    # If track specified, filter to track languages
    if track and track in EVALUATION_TRACKS:
        track_languages = EVALUATION_TRACKS[track]["languages"]
        test_set = test_set[
            (test_set["source_language"].isin(track_languages))
            & (test_set["target_language"].isin(track_languages))
        ]

    try:
        required_ids = set(test_set["sample_id"].astype(str))
        provided_ids = set(predictions["sample_id"].astype(str))

        missing_ids = required_ids - provided_ids
        extra_ids = provided_ids - required_ids
        matching_ids = provided_ids & required_ids

        return {
            # An empty requirement set (e.g. the track filter removed every
            # row) is not "complete"; this matches the empty-input case above
            # and the 0.0 coverage reported below.
            "is_complete": bool(required_ids) and not missing_ids,
            "missing_count": len(missing_ids),
            "extra_count": len(extra_ids),
            # Sort before slicing so the reported sample is stable across runs
            # (set iteration order of strings is not).
            "missing_ids": sorted(missing_ids)[:10],
            "coverage": len(matching_ids) / len(required_ids) if required_ids else 0.0,
        }

    except Exception as e:
        # Best-effort validation: report the problem and return a
        # "not complete" result rather than propagating the error.
        print(f"Error in submission completeness validation: {e}")
        return {
            "is_complete": False,
            "missing_count": 0,
            "extra_count": 0,
            "missing_ids": [],
            "coverage": 0.0,
        }
314
 
315
 
316
+ def get_model_summary_stats(model_results: Dict, track: str = None) -> Dict:
317
+ """Extract summary statistics from model evaluation results."""
318
 
319
  if not model_results or "tracks" not in model_results:
320
  return {}
 
328
  return {"error": f"No valid data for {track} track"}
329
 
330
  track_averages = track_data.get("track_averages", {})
 
331
  summary = track_data.get("summary", {})
332
 
333
  stats = {
 
338
  "chrf": track_averages.get("chrf", 0.0),
339
  "total_samples": summary.get("total_samples", 0),
340
  "language_pairs": summary.get("language_pairs_evaluated", 0),
 
341
  }
342
 
 
 
 
 
 
 
 
 
343
  return stats
344
 
345
  # Otherwise, return summary across all tracks
 
360
  "pairs": summary.get("language_pairs_evaluated", 0),
361
  }
362
 
363
+ return all_tracks_summary