akera commited on
Commit
62d1d8a
Β·
verified Β·
1 Parent(s): aed11c8

Update src/validation.py

Browse files
Files changed (1) hide show
  1. src/validation.py +160 -221
src/validation.py CHANGED
@@ -16,74 +16,49 @@ from config import (
16
 
17
  def detect_model_category(model_name: str, author: str, description: str) -> str:
18
  """Automatically detect model category based on name and metadata."""
19
-
20
  # Combine all text for analysis
21
  text_to_analyze = f"{model_name} {author} {description}".lower()
22
-
23
  # Category detection patterns
24
  detection_patterns = PREDICTION_FORMAT["category_detection"]
25
-
26
  # Check for specific patterns
27
- if any(
28
- pattern in text_to_analyze for pattern in detection_patterns.get("google", [])
29
- ):
30
  return "commercial"
31
-
32
- if any(
33
- pattern in text_to_analyze for pattern in detection_patterns.get("nllb", [])
34
- ):
35
  return "research"
36
-
37
  if any(pattern in text_to_analyze for pattern in detection_patterns.get("m2m", [])):
38
  return "research"
39
-
40
- if any(
41
- pattern in text_to_analyze for pattern in detection_patterns.get("baseline", [])
42
- ):
43
  return "baseline"
44
-
45
  # Check for research indicators
46
  research_indicators = [
47
- "university",
48
- "research",
49
- "paper",
50
- "arxiv",
51
- "acl",
52
- "emnlp",
53
- "naacl",
54
- "transformer",
55
- "bert",
56
- "gpt",
57
- "t5",
58
- "mbart",
59
- "academic",
60
  ]
61
  if any(indicator in text_to_analyze for indicator in research_indicators):
62
  return "research"
63
-
64
  # Check for commercial indicators
65
  commercial_indicators = [
66
- "google",
67
- "microsoft",
68
- "azure",
69
- "aws",
70
- "openai",
71
- "anthropic",
72
- "commercial",
73
- "api",
74
- "cloud",
75
- "translate",
76
  ]
77
  if any(indicator in text_to_analyze for indicator in commercial_indicators):
78
  return "commercial"
79
-
80
  # Default to community
81
  return "community"
82
 
83
 
84
  def validate_file_format_enhanced(file_content: bytes, filename: str) -> Dict:
85
  """Enhanced file format validation with stricter requirements."""
86
-
87
  try:
88
  # Determine file type
89
  if filename.endswith(".csv"):
@@ -98,7 +73,7 @@ def validate_file_format_enhanced(file_content: bytes, filename: str) -> Dict:
98
  "valid": False,
99
  "error": f"Unsupported file type. Use: {', '.join(PREDICTION_FORMAT['file_types'])}",
100
  }
101
-
102
  # Check required columns
103
  missing_cols = set(PREDICTION_FORMAT["required_columns"]) - set(df.columns)
104
  if missing_cols:
@@ -106,46 +81,38 @@ def validate_file_format_enhanced(file_content: bytes, filename: str) -> Dict:
106
  "valid": False,
107
  "error": f"Missing required columns: {', '.join(missing_cols)}",
108
  }
109
-
110
  # Basic data validation
111
  if len(df) == 0:
112
  return {"valid": False, "error": "File is empty"}
113
-
114
  # Enhanced validation checks
115
  validation_issues = []
116
-
117
  # Check for required data
118
  if df["sample_id"].isna().any():
119
  validation_issues.append("Missing sample_id values found")
120
-
121
  if df["prediction"].isna().any():
122
  na_count = df["prediction"].isna().sum()
123
- validation_issues.append(
124
- f"Missing prediction values found ({na_count} empty predictions)"
125
- )
126
-
127
  # Check for duplicates
128
  duplicates = df["sample_id"].duplicated()
129
  if duplicates.any():
130
  dup_count = duplicates.sum()
131
- validation_issues.append(
132
- f"Duplicate sample_id values found ({dup_count} duplicates)"
133
- )
134
-
135
  # Data type validation
136
- if not df["sample_id"].dtype == "object" and not df[
137
- "sample_id"
138
- ].dtype.name.startswith("str"):
139
  df["sample_id"] = df["sample_id"].astype(str)
140
-
141
  # Check sample_id format
142
  invalid_ids = ~df["sample_id"].str.match(r"salt_\d{6}", na=False)
143
  if invalid_ids.any():
144
  invalid_count = invalid_ids.sum()
145
- validation_issues.append(
146
- f"Invalid sample_id format found ({invalid_count} invalid IDs)"
147
- )
148
-
149
  # Return results
150
  if validation_issues:
151
  return {
@@ -155,55 +122,53 @@ def validate_file_format_enhanced(file_content: bytes, filename: str) -> Dict:
155
  "row_count": len(df),
156
  "columns": list(df.columns),
157
  }
158
-
159
  return {
160
  "valid": True,
161
  "dataframe": df,
162
  "row_count": len(df),
163
  "columns": list(df.columns),
164
  }
165
-
166
  except Exception as e:
167
  return {"valid": False, "error": f"Error parsing file: {str(e)}"}
168
 
169
 
170
  def validate_predictions_content_enhanced(predictions: pd.DataFrame) -> Dict:
171
  """Enhanced prediction content validation with stricter quality checks."""
172
-
173
  issues = []
174
  warnings = []
175
  quality_metrics = {}
176
-
177
  # Basic content checks
178
  empty_predictions = predictions["prediction"].str.strip().eq("").sum()
179
  if empty_predictions > 0:
180
  issues.append(f"{empty_predictions} empty predictions found")
181
-
182
  # Length analysis
183
  pred_lengths = predictions["prediction"].str.len()
184
  quality_metrics["avg_length"] = float(pred_lengths.mean())
185
  quality_metrics["std_length"] = float(pred_lengths.std())
186
-
187
  # Check for suspiciously short predictions
188
  short_predictions = (pred_lengths < 3).sum()
189
  if short_predictions > len(predictions) * 0.05: # More than 5%
190
  issues.append(f"{short_predictions} very short predictions (< 3 characters)")
191
-
192
  # Check for suspiciously long predictions
193
  long_predictions = (pred_lengths > 500).sum()
194
  if long_predictions > len(predictions) * 0.01: # More than 1%
195
  warnings.append(f"{long_predictions} very long predictions (> 500 characters)")
196
-
197
  # Check for repeated predictions (more stringent)
198
  duplicate_predictions = predictions["prediction"].duplicated().sum()
199
  duplicate_rate = duplicate_predictions / len(predictions)
200
  quality_metrics["duplicate_rate"] = float(duplicate_rate)
201
-
202
  if duplicate_rate > VALIDATION_CONFIG["quality_thresholds"]["max_duplicate_rate"]:
203
- issues.append(
204
- f"{duplicate_predictions} duplicate prediction texts ({duplicate_rate:.1%})"
205
- )
206
-
207
  # Check for placeholder text
208
  placeholder_patterns = [
209
  r"^(test|placeholder|todo|xxx|aaa|bbb)$",
@@ -211,53 +176,37 @@ def validate_predictions_content_enhanced(predictions: pd.DataFrame) -> Dict:
211
  r"^\d+$", # Just numbers
212
  r"^[^\w\s]*$", # Only punctuation
213
  ]
214
-
215
  placeholder_count = 0
216
  for pattern in placeholder_patterns:
217
- placeholder_matches = (
218
- predictions["prediction"]
219
- .str.match(pattern, flags=re.IGNORECASE, na=False)
220
- .sum()
221
- )
222
  placeholder_count += placeholder_matches
223
-
224
  if placeholder_count > len(predictions) * 0.02: # More than 2%
225
  issues.append(f"{placeholder_count} placeholder-like predictions detected")
226
-
227
  # Language detection (basic)
228
- non_ascii_rate = (
229
- predictions["prediction"].str.contains(r"[^\x00-\x7f]", na=False).mean()
230
- )
231
  quality_metrics["non_ascii_rate"] = float(non_ascii_rate)
232
-
233
  # Check for appropriate character distribution for African languages
234
  if non_ascii_rate < 0.1: # Less than 10% non-ASCII might indicate English-only
235
- warnings.append(
236
- "Low non-ASCII character rate - check if translations include local language scripts"
237
- )
238
-
239
  # Calculate overall quality score
240
  quality_score = 1.0
241
  quality_score -= len(issues) * 0.3 # Major penalty for issues
242
  quality_score -= len(warnings) * 0.1 # Minor penalty for warnings
243
- quality_score -= (
244
- max(0, duplicate_rate - 0.05) * 2
245
- ) # Penalty for excessive duplicates
246
-
247
  # Length appropriateness
248
- if (
249
- quality_metrics["avg_length"]
250
- < VALIDATION_CONFIG["quality_thresholds"]["min_avg_length"]
251
- ):
252
  quality_score -= 0.2
253
- elif (
254
- quality_metrics["avg_length"]
255
- > VALIDATION_CONFIG["quality_thresholds"]["max_avg_length"]
256
- ):
257
  quality_score -= 0.1
258
-
259
  quality_score = max(0.0, min(1.0, quality_score))
260
-
261
  return {
262
  "has_issues": len(issues) > 0,
263
  "issues": issues,
@@ -271,65 +220,64 @@ def validate_against_test_set_enhanced(
271
  predictions: pd.DataFrame, test_set: pd.DataFrame
272
  ) -> Dict:
273
  """Enhanced validation against test set with track-specific analysis."""
274
-
275
  # Convert IDs to string for comparison
276
  pred_ids = set(predictions["sample_id"].astype(str))
277
  test_ids = set(test_set["sample_id"].astype(str))
278
-
279
  # Check overall coverage
280
  missing_ids = test_ids - pred_ids
281
  extra_ids = pred_ids - test_ids
282
  matching_ids = pred_ids & test_ids
283
-
284
  overall_coverage = len(matching_ids) / len(test_ids)
285
-
286
  # Track-specific coverage analysis
287
  track_coverage = {}
288
-
289
  for track_name, track_config in EVALUATION_TRACKS.items():
290
  track_languages = track_config["languages"]
291
-
292
  # Filter test set to track languages
293
  track_test_set = test_set[
294
- (test_set["source_language"].isin(track_languages))
295
- & (test_set["target_language"].isin(track_languages))
296
  ]
297
-
298
  if len(track_test_set) == 0:
299
  continue
300
-
301
  track_test_ids = set(track_test_set["sample_id"].astype(str))
302
  track_matching_ids = pred_ids & track_test_ids
303
-
304
  track_coverage[track_name] = {
305
  "total_samples": len(track_test_set),
306
  "covered_samples": len(track_matching_ids),
307
  "coverage_rate": len(track_matching_ids) / len(track_test_set),
308
- "meets_minimum": len(track_matching_ids)
309
- >= VALIDATION_CONFIG["min_samples_per_track"][track_name],
310
  "min_required": VALIDATION_CONFIG["min_samples_per_track"][track_name],
311
  }
312
-
313
  # Language pair coverage analysis
314
  pair_coverage = {}
315
  for _, row in test_set.iterrows():
316
  pair_key = f"{row['source_language']}_{row['target_language']}"
317
  if pair_key not in pair_coverage:
318
  pair_coverage[pair_key] = {"total": 0, "covered": 0}
319
-
320
  pair_coverage[pair_key]["total"] += 1
321
  if str(row["sample_id"]) in pred_ids:
322
  pair_coverage[pair_key]["covered"] += 1
323
-
324
  # Calculate pair-wise coverage rates
325
  for pair_key in pair_coverage:
326
  pair_info = pair_coverage[pair_key]
327
  pair_info["coverage_rate"] = pair_info["covered"] / pair_info["total"]
328
-
329
  # Missing rate validation
330
  missing_rate = len(missing_ids) / len(test_ids)
331
  meets_missing_threshold = missing_rate <= VALIDATION_CONFIG["max_missing_rate"]
332
-
333
  return {
334
  "overall_coverage": overall_coverage,
335
  "missing_count": len(missing_ids),
@@ -345,36 +293,38 @@ def validate_against_test_set_enhanced(
345
  }
346
 
347
 
348
- def assess_statistical_adequacy(validation_result: Dict, model_category: str) -> Dict:
 
 
349
  """Assess statistical adequacy for scientific evaluation."""
350
-
351
  adequacy_assessment = {
352
  "overall_adequate": True,
353
  "track_adequacy": {},
354
  "recommendations": [],
355
  "statistical_power_estimate": {},
356
  }
357
-
358
  track_coverage = validation_result.get("track_coverage", {})
359
-
360
  for track_name, coverage_info in track_coverage.items():
361
  track_config = EVALUATION_TRACKS[track_name]
362
-
363
  # Sample size adequacy
364
  covered_samples = coverage_info["covered_samples"]
365
  min_required = coverage_info["min_required"]
366
-
367
  sample_adequate = covered_samples >= min_required
368
-
369
  # Coverage rate adequacy
370
  coverage_rate = coverage_info["coverage_rate"]
371
  coverage_adequate = coverage_rate >= 0.8 # 80% coverage minimum
372
-
373
  # Statistical power estimation (simplified)
374
  estimated_power = min(1.0, covered_samples / (min_required * 1.5))
375
-
376
  track_adequate = sample_adequate and coverage_adequate
377
-
378
  adequacy_assessment["track_adequacy"][track_name] = {
379
  "sample_adequate": sample_adequate,
380
  "coverage_adequate": coverage_adequate,
@@ -384,31 +334,28 @@ def assess_statistical_adequacy(validation_result: Dict, model_category: str) ->
384
  "coverage_rate": coverage_rate,
385
  "estimated_power": estimated_power,
386
  }
387
-
388
  if not track_adequate:
389
  adequacy_assessment["overall_adequate"] = False
390
-
391
  adequacy_assessment["statistical_power_estimate"][track_name] = estimated_power
392
-
393
  # Generate recommendations
394
  if not adequacy_assessment["overall_adequate"]:
395
  inadequate_tracks = [
396
- track
397
- for track, info in adequacy_assessment["track_adequacy"].items()
398
  if not info["overall_adequate"]
399
  ]
400
  adequacy_assessment["recommendations"].append(
401
  f"Insufficient samples for tracks: {', '.join(inadequate_tracks)}"
402
  )
403
-
404
  # Category-specific recommendations
405
- if model_category == "commercial" and not adequacy_assessment["track_adequacy"].get(
406
- "google_comparable", {}
407
- ).get("overall_adequate", False):
408
  adequacy_assessment["recommendations"].append(
409
  "Commercial models should ensure adequate coverage of Google-comparable track"
410
  )
411
-
412
  return adequacy_assessment
413
 
414
 
@@ -421,21 +368,19 @@ def generate_scientific_validation_report(
421
  detected_category: str = "community",
422
  ) -> str:
423
  """Generate comprehensive scientific validation report."""
424
-
425
  report = []
426
-
427
  # Header
428
  report.append(f"# πŸ”¬ Scientific Validation Report: {model_name or 'Submission'}")
429
  report.append("")
430
-
431
  # Model categorization
432
- category_info = MODEL_CATEGORIES.get(
433
- detected_category, MODEL_CATEGORIES["community"]
434
- )
435
  report.append(f"**Detected Model Category**: {category_info['name']}")
436
  report.append(f"**Category Description**: {category_info['description']}")
437
  report.append("")
438
-
439
  # File format validation
440
  if format_result["valid"]:
441
  report.append("βœ… **File Format**: Valid")
@@ -445,128 +390,117 @@ def generate_scientific_validation_report(
445
  report.append("❌ **File Format**: Invalid")
446
  report.append(f" - Error: {format_result['error']}")
447
  return "\n".join(report)
448
-
449
  # Content quality validation
450
  quality_score = content_result.get("quality_score", 0.0)
451
-
452
  if content_result["has_issues"]:
453
  report.append("❌ **Content Quality**: Issues Found")
454
  for issue in content_result["issues"]:
455
  report.append(f" - ❌ {issue}")
456
  else:
457
  report.append("βœ… **Content Quality**: Good")
458
-
459
  if content_result["warnings"]:
460
  for warning in content_result["warnings"]:
461
  report.append(f" - ⚠️ {warning}")
462
-
463
  report.append(f" - **Quality Score**: {quality_score:.2f}/1.00")
464
  report.append("")
465
-
466
  # Test set coverage validation
467
  overall_coverage = test_set_result["overall_coverage"]
468
  meets_threshold = test_set_result["meets_missing_threshold"]
469
-
470
  if overall_coverage == 1.0:
471
  report.append("βœ… **Test Set Coverage**: Complete")
472
  elif overall_coverage >= 0.95 and meets_threshold:
473
  report.append("βœ… **Test Set Coverage**: Adequate")
474
  else:
475
  report.append("❌ **Test Set Coverage**: Insufficient")
476
-
477
- report.append(
478
- f" - Coverage: {overall_coverage:.1%} ({test_set_result['matching_count']:,} / {test_set_result['matching_count'] + test_set_result['missing_count']:,})"
479
- )
480
  report.append(f" - Missing Rate: {test_set_result['missing_rate']:.1%}")
481
  report.append("")
482
-
483
  # Track-specific coverage analysis
484
  report.append("## πŸ“Š Track-Specific Analysis")
485
-
486
  track_coverage = test_set_result.get("track_coverage", {})
487
  for track_name, coverage_info in track_coverage.items():
488
  track_config = EVALUATION_TRACKS[track_name]
489
-
490
  status = "βœ…" if coverage_info["meets_minimum"] else "❌"
491
  report.append(f"### {status} {track_config['name']}")
492
-
493
- report.append(
494
- f" - **Samples**: {coverage_info['covered_samples']:,} / {coverage_info['total_samples']:,}"
495
- )
496
  report.append(f" - **Coverage**: {coverage_info['coverage_rate']:.1%}")
497
  report.append(f" - **Minimum Required**: {coverage_info['min_required']:,}")
498
- report.append(
499
- f" - **Status**: {'Adequate' if coverage_info['meets_minimum'] else 'Insufficient'}"
500
- )
501
  report.append("")
502
-
503
  # Statistical adequacy assessment
504
  report.append("## πŸ”¬ Statistical Adequacy Assessment")
505
-
506
  if adequacy_result["overall_adequate"]:
507
- report.append(
508
- "βœ… **Overall Assessment**: Statistically adequate for scientific evaluation"
509
- )
510
  else:
511
- report.append(
512
- "❌ **Overall Assessment**: Insufficient for rigorous scientific evaluation"
513
- )
514
-
515
  # Track adequacy details
516
  for track_name, track_adequacy in adequacy_result["track_adequacy"].items():
517
  track_config = EVALUATION_TRACKS[track_name]
518
  power = track_adequacy["estimated_power"]
519
-
520
  status = "βœ…" if track_adequacy["overall_adequate"] else "❌"
521
- report.append(
522
- f" - {status} **{track_config['name']}**: Statistical power β‰ˆ {power:.1%}"
523
- )
524
-
525
  # Recommendations
526
  if adequacy_result["recommendations"]:
527
  report.append("")
528
  report.append("## πŸ’‘ Recommendations")
529
  for rec in adequacy_result["recommendations"]:
530
  report.append(f" - {rec}")
531
-
532
  # Final verdict
533
  report.append("")
534
  all_checks_pass = (
535
- format_result["valid"]
536
- and not content_result["has_issues"]
537
- and overall_coverage >= 0.95
538
- and meets_threshold
539
- and adequacy_result["overall_adequate"]
540
  )
541
-
542
  if all_checks_pass:
543
  report.append("πŸŽ‰ **Final Verdict**: Ready for scientific evaluation!")
544
  elif format_result["valid"] and overall_coverage >= 0.8:
545
  report.append("⚠️ **Final Verdict**: Can be evaluated with limitations")
546
  else:
547
  report.append("❌ **Final Verdict**: Please address issues before submission")
548
-
549
  return "\n".join(report)
550
 
551
 
552
  def validate_submission_scientific(
553
- file_content: bytes,
554
- filename: str,
555
- test_set: pd.DataFrame,
556
  model_name: str = "",
557
  author: str = "",
558
- description: str = "",
559
  ) -> Dict:
560
  """Complete scientific validation pipeline for submissions."""
561
-
562
  # Step 1: Detect model category
563
  detected_category = detect_model_category(model_name, author, description)
564
-
565
  # Step 2: Enhanced file format validation
566
  format_result = validate_file_format_enhanced(file_content, filename)
567
  if not format_result["valid"]:
568
  return {
569
  "valid": False,
 
570
  "category": detected_category,
571
  "report": generate_scientific_validation_report(
572
  format_result, {}, {}, {}, model_name, detected_category
@@ -574,39 +508,43 @@ def validate_submission_scientific(
574
  "predictions": None,
575
  "adequacy": {},
576
  }
577
-
578
  predictions = format_result["dataframe"]
579
-
580
  # Step 3: Enhanced content validation
581
  content_result = validate_predictions_content_enhanced(predictions)
582
-
583
  # Step 4: Enhanced test set validation
584
  test_set_result = validate_against_test_set_enhanced(predictions, test_set)
585
-
586
  # Step 5: Statistical adequacy assessment
587
  adequacy_result = assess_statistical_adequacy(test_set_result, detected_category)
588
-
589
  # Step 6: Generate comprehensive report
590
  report = generate_scientific_validation_report(
591
- format_result,
592
- content_result,
593
- test_set_result,
594
- adequacy_result,
595
- model_name,
596
- detected_category,
597
  )
598
-
599
- # Overall validity determination
600
- is_valid = (
601
- format_result["valid"]
602
- and not content_result["has_issues"]
603
- and test_set_result["overall_coverage"] >= 0.95
604
- and test_set_result["meets_missing_threshold"]
605
- and adequacy_result["overall_adequate"]
606
  )
607
-
 
 
 
 
 
 
 
608
  return {
609
- "valid": is_valid,
 
610
  "category": detected_category,
611
  "coverage": test_set_result["overall_coverage"],
612
  "report": report,
@@ -619,5 +557,6 @@ def validate_submission_scientific(
619
  "validation_version": "2.0-scientific",
620
  "detected_category": detected_category,
621
  "statistical_adequacy": adequacy_result["overall_adequate"],
 
622
  },
623
- }
 
16
 
17
  def detect_model_category(model_name: str, author: str, description: str) -> str:
18
  """Automatically detect model category based on name and metadata."""
19
+
20
  # Combine all text for analysis
21
  text_to_analyze = f"{model_name} {author} {description}".lower()
22
+
23
  # Category detection patterns
24
  detection_patterns = PREDICTION_FORMAT["category_detection"]
25
+
26
  # Check for specific patterns
27
+ if any(pattern in text_to_analyze for pattern in detection_patterns.get("google", [])):
 
 
28
  return "commercial"
29
+
30
+ if any(pattern in text_to_analyze for pattern in detection_patterns.get("nllb", [])):
 
 
31
  return "research"
32
+
33
  if any(pattern in text_to_analyze for pattern in detection_patterns.get("m2m", [])):
34
  return "research"
35
+
36
+ if any(pattern in text_to_analyze for pattern in detection_patterns.get("baseline", [])):
 
 
37
  return "baseline"
38
+
39
  # Check for research indicators
40
  research_indicators = [
41
+ "university", "research", "paper", "arxiv", "acl", "emnlp", "naacl",
42
+ "transformer", "bert", "gpt", "t5", "mbart", "academic"
 
 
 
 
 
 
 
 
 
 
 
43
  ]
44
  if any(indicator in text_to_analyze for indicator in research_indicators):
45
  return "research"
46
+
47
  # Check for commercial indicators
48
  commercial_indicators = [
49
+ "google", "microsoft", "azure", "aws", "openai", "anthropic",
50
+ "commercial", "api", "cloud", "translate"
 
 
 
 
 
 
 
 
51
  ]
52
  if any(indicator in text_to_analyze for indicator in commercial_indicators):
53
  return "commercial"
54
+
55
  # Default to community
56
  return "community"
57
 
58
 
59
  def validate_file_format_enhanced(file_content: bytes, filename: str) -> Dict:
60
  """Enhanced file format validation with stricter requirements."""
61
+
62
  try:
63
  # Determine file type
64
  if filename.endswith(".csv"):
 
73
  "valid": False,
74
  "error": f"Unsupported file type. Use: {', '.join(PREDICTION_FORMAT['file_types'])}",
75
  }
76
+
77
  # Check required columns
78
  missing_cols = set(PREDICTION_FORMAT["required_columns"]) - set(df.columns)
79
  if missing_cols:
 
81
  "valid": False,
82
  "error": f"Missing required columns: {', '.join(missing_cols)}",
83
  }
84
+
85
  # Basic data validation
86
  if len(df) == 0:
87
  return {"valid": False, "error": "File is empty"}
88
+
89
  # Enhanced validation checks
90
  validation_issues = []
91
+
92
  # Check for required data
93
  if df["sample_id"].isna().any():
94
  validation_issues.append("Missing sample_id values found")
95
+
96
  if df["prediction"].isna().any():
97
  na_count = df["prediction"].isna().sum()
98
+ validation_issues.append(f"Missing prediction values found ({na_count} empty predictions)")
99
+
 
 
100
  # Check for duplicates
101
  duplicates = df["sample_id"].duplicated()
102
  if duplicates.any():
103
  dup_count = duplicates.sum()
104
+ validation_issues.append(f"Duplicate sample_id values found ({dup_count} duplicates)")
105
+
 
 
106
  # Data type validation
107
+ if not df["sample_id"].dtype == "object" and not df["sample_id"].dtype.name.startswith("str"):
 
 
108
  df["sample_id"] = df["sample_id"].astype(str)
109
+
110
  # Check sample_id format
111
  invalid_ids = ~df["sample_id"].str.match(r"salt_\d{6}", na=False)
112
  if invalid_ids.any():
113
  invalid_count = invalid_ids.sum()
114
+ validation_issues.append(f"Invalid sample_id format found ({invalid_count} invalid IDs)")
115
+
 
 
116
  # Return results
117
  if validation_issues:
118
  return {
 
122
  "row_count": len(df),
123
  "columns": list(df.columns),
124
  }
125
+
126
  return {
127
  "valid": True,
128
  "dataframe": df,
129
  "row_count": len(df),
130
  "columns": list(df.columns),
131
  }
132
+
133
  except Exception as e:
134
  return {"valid": False, "error": f"Error parsing file: {str(e)}"}
135
 
136
 
137
  def validate_predictions_content_enhanced(predictions: pd.DataFrame) -> Dict:
138
  """Enhanced prediction content validation with stricter quality checks."""
139
+
140
  issues = []
141
  warnings = []
142
  quality_metrics = {}
143
+
144
  # Basic content checks
145
  empty_predictions = predictions["prediction"].str.strip().eq("").sum()
146
  if empty_predictions > 0:
147
  issues.append(f"{empty_predictions} empty predictions found")
148
+
149
  # Length analysis
150
  pred_lengths = predictions["prediction"].str.len()
151
  quality_metrics["avg_length"] = float(pred_lengths.mean())
152
  quality_metrics["std_length"] = float(pred_lengths.std())
153
+
154
  # Check for suspiciously short predictions
155
  short_predictions = (pred_lengths < 3).sum()
156
  if short_predictions > len(predictions) * 0.05: # More than 5%
157
  issues.append(f"{short_predictions} very short predictions (< 3 characters)")
158
+
159
  # Check for suspiciously long predictions
160
  long_predictions = (pred_lengths > 500).sum()
161
  if long_predictions > len(predictions) * 0.01: # More than 1%
162
  warnings.append(f"{long_predictions} very long predictions (> 500 characters)")
163
+
164
  # Check for repeated predictions (more stringent)
165
  duplicate_predictions = predictions["prediction"].duplicated().sum()
166
  duplicate_rate = duplicate_predictions / len(predictions)
167
  quality_metrics["duplicate_rate"] = float(duplicate_rate)
168
+
169
  if duplicate_rate > VALIDATION_CONFIG["quality_thresholds"]["max_duplicate_rate"]:
170
+ issues.append(f"{duplicate_predictions} duplicate prediction texts ({duplicate_rate:.1%})")
171
+
 
 
172
  # Check for placeholder text
173
  placeholder_patterns = [
174
  r"^(test|placeholder|todo|xxx|aaa|bbb)$",
 
176
  r"^\d+$", # Just numbers
177
  r"^[^\w\s]*$", # Only punctuation
178
  ]
179
+
180
  placeholder_count = 0
181
  for pattern in placeholder_patterns:
182
+ placeholder_matches = predictions["prediction"].str.match(pattern, flags=re.IGNORECASE, na=False).sum()
 
 
 
 
183
  placeholder_count += placeholder_matches
184
+
185
  if placeholder_count > len(predictions) * 0.02: # More than 2%
186
  issues.append(f"{placeholder_count} placeholder-like predictions detected")
187
+
188
  # Language detection (basic)
189
+ non_ascii_rate = predictions["prediction"].str.contains(r"[^\x00-\x7f]", na=False).mean()
 
 
190
  quality_metrics["non_ascii_rate"] = float(non_ascii_rate)
191
+
192
  # Check for appropriate character distribution for African languages
193
  if non_ascii_rate < 0.1: # Less than 10% non-ASCII might indicate English-only
194
+ warnings.append("Low non-ASCII character rate - check if translations include local language scripts")
195
+
 
 
196
  # Calculate overall quality score
197
  quality_score = 1.0
198
  quality_score -= len(issues) * 0.3 # Major penalty for issues
199
  quality_score -= len(warnings) * 0.1 # Minor penalty for warnings
200
+ quality_score -= max(0, duplicate_rate - 0.05) * 2 # Penalty for excessive duplicates
201
+
 
 
202
  # Length appropriateness
203
+ if quality_metrics["avg_length"] < VALIDATION_CONFIG["quality_thresholds"]["min_avg_length"]:
 
 
 
204
  quality_score -= 0.2
205
+ elif quality_metrics["avg_length"] > VALIDATION_CONFIG["quality_thresholds"]["max_avg_length"]:
 
 
 
206
  quality_score -= 0.1
207
+
208
  quality_score = max(0.0, min(1.0, quality_score))
209
+
210
  return {
211
  "has_issues": len(issues) > 0,
212
  "issues": issues,
 
220
  predictions: pd.DataFrame, test_set: pd.DataFrame
221
  ) -> Dict:
222
  """Enhanced validation against test set with track-specific analysis."""
223
+
224
  # Convert IDs to string for comparison
225
  pred_ids = set(predictions["sample_id"].astype(str))
226
  test_ids = set(test_set["sample_id"].astype(str))
227
+
228
  # Check overall coverage
229
  missing_ids = test_ids - pred_ids
230
  extra_ids = pred_ids - test_ids
231
  matching_ids = pred_ids & test_ids
232
+
233
  overall_coverage = len(matching_ids) / len(test_ids)
234
+
235
  # Track-specific coverage analysis
236
  track_coverage = {}
237
+
238
  for track_name, track_config in EVALUATION_TRACKS.items():
239
  track_languages = track_config["languages"]
240
+
241
  # Filter test set to track languages
242
  track_test_set = test_set[
243
+ (test_set["source_language"].isin(track_languages)) &
244
+ (test_set["target_language"].isin(track_languages))
245
  ]
246
+
247
  if len(track_test_set) == 0:
248
  continue
249
+
250
  track_test_ids = set(track_test_set["sample_id"].astype(str))
251
  track_matching_ids = pred_ids & track_test_ids
252
+
253
  track_coverage[track_name] = {
254
  "total_samples": len(track_test_set),
255
  "covered_samples": len(track_matching_ids),
256
  "coverage_rate": len(track_matching_ids) / len(track_test_set),
257
+ "meets_minimum": len(track_matching_ids) >= VALIDATION_CONFIG["min_samples_per_track"][track_name],
 
258
  "min_required": VALIDATION_CONFIG["min_samples_per_track"][track_name],
259
  }
260
+
261
  # Language pair coverage analysis
262
  pair_coverage = {}
263
  for _, row in test_set.iterrows():
264
  pair_key = f"{row['source_language']}_{row['target_language']}"
265
  if pair_key not in pair_coverage:
266
  pair_coverage[pair_key] = {"total": 0, "covered": 0}
267
+
268
  pair_coverage[pair_key]["total"] += 1
269
  if str(row["sample_id"]) in pred_ids:
270
  pair_coverage[pair_key]["covered"] += 1
271
+
272
  # Calculate pair-wise coverage rates
273
  for pair_key in pair_coverage:
274
  pair_info = pair_coverage[pair_key]
275
  pair_info["coverage_rate"] = pair_info["covered"] / pair_info["total"]
276
+
277
  # Missing rate validation
278
  missing_rate = len(missing_ids) / len(test_ids)
279
  meets_missing_threshold = missing_rate <= VALIDATION_CONFIG["max_missing_rate"]
280
+
281
  return {
282
  "overall_coverage": overall_coverage,
283
  "missing_count": len(missing_ids),
 
293
  }
294
 
295
 
296
def assess_statistical_adequacy(
    validation_result: Dict, model_category: str
) -> Dict:
    """Assess statistical adequacy for scientific evaluation.

    For every evaluation track present in ``validation_result`` this checks
    two criteria: the number of covered samples meets the track's minimum,
    and the coverage rate is at least 80%. A simplified statistical-power
    estimate is computed per track, and human-readable recommendations are
    generated for inadequate tracks.

    Args:
        validation_result: Test-set coverage result; its "track_coverage"
            entry maps track name -> dict with "covered_samples",
            "min_required", and "coverage_rate".
        model_category: Detected category ("commercial", "research",
            "baseline", or "community"); commercial models get an extra
            check on the "google_comparable" track.

    Returns:
        Dict with "overall_adequate" (True only if every track passes),
        "track_adequacy" (per-track details), "recommendations" (list of
        strings), and "statistical_power_estimate" (track -> float).
    """
    adequacy_assessment = {
        "overall_adequate": True,
        "track_adequacy": {},
        "recommendations": [],
        "statistical_power_estimate": {},
    }

    track_coverage = validation_result.get("track_coverage", {})

    for track_name, coverage_info in track_coverage.items():
        # Sample size adequacy: enough covered items for this track.
        covered_samples = coverage_info["covered_samples"]
        min_required = coverage_info["min_required"]
        sample_adequate = covered_samples >= min_required

        # Coverage rate adequacy: at least 80% of the track must be covered.
        coverage_rate = coverage_info["coverage_rate"]
        coverage_adequate = coverage_rate >= 0.8

        # Simplified power estimate: grows linearly with sample size and
        # saturates at 1.0 once the sample reaches 1.5x the minimum.
        estimated_power = min(1.0, covered_samples / (min_required * 1.5))

        track_adequate = sample_adequate and coverage_adequate

        adequacy_assessment["track_adequacy"][track_name] = {
            "sample_adequate": sample_adequate,
            "coverage_adequate": coverage_adequate,
            # "overall_adequate" is read below and by the report generator.
            "overall_adequate": track_adequate,
            "covered_samples": covered_samples,
            "min_required": min_required,
            "coverage_rate": coverage_rate,
            "estimated_power": estimated_power,
        }

        if not track_adequate:
            adequacy_assessment["overall_adequate"] = False

        adequacy_assessment["statistical_power_estimate"][track_name] = estimated_power

    # Generate recommendations for any failing tracks.
    if not adequacy_assessment["overall_adequate"]:
        inadequate_tracks = [
            track
            for track, info in adequacy_assessment["track_adequacy"].items()
            if not info["overall_adequate"]
        ]
        adequacy_assessment["recommendations"].append(
            f"Insufficient samples for tracks: {', '.join(inadequate_tracks)}"
        )

    # Category-specific recommendation: commercial models are expected to
    # cover the Google-comparable track adequately.
    if model_category == "commercial" and not adequacy_assessment["track_adequacy"].get(
        "google_comparable", {}
    ).get("overall_adequate", False):
        adequacy_assessment["recommendations"].append(
            "Commercial models should ensure adequate coverage of Google-comparable track"
        )

    return adequacy_assessment
360
 
361
 
 
368
def generate_scientific_validation_report(
    format_result: Dict,
    content_result: Dict,
    test_set_result: Dict,
    adequacy_result: Dict,
    model_name: str = "",
    detected_category: str = "community",
) -> str:
    """Generate comprehensive scientific validation report.

    Builds a Markdown report covering file-format validity, content quality,
    test-set coverage, per-track coverage analysis, statistical adequacy,
    recommendations, and a final verdict.

    Args:
        format_result: File-format result; must contain "valid" and, when
            invalid, "error".
        content_result: Content-quality result ("has_issues", "issues",
            "warnings", "quality_score").
        test_set_result: Coverage result ("overall_coverage",
            "meets_missing_threshold", "matching_count", "missing_count",
            "missing_rate", "track_coverage").
        adequacy_result: Output of assess_statistical_adequacy().
        model_name: Optional display name used in the report header.
        detected_category: Detected category key; unknown keys fall back to
            the "community" category description.

    Returns:
        The report as one newline-joined Markdown string. An invalid file
        format short-circuits: only the header, category, and error lines
        are emitted.
    """
    report = []

    # Header
    report.append(f"# πŸ”¬ Scientific Validation Report: {model_name or 'Submission'}")
    report.append("")

    # Model categorization (unknown categories fall back to "community").
    category_info = MODEL_CATEGORIES.get(detected_category, MODEL_CATEGORIES["community"])
    report.append(f"**Detected Model Category**: {category_info['name']}")
    report.append(f"**Category Description**: {category_info['description']}")
    report.append("")

    # File format validation — an invalid file ends the report early since
    # the downstream results do not exist for it.
    if format_result["valid"]:
        report.append("βœ… **File Format**: Valid")
        # NOTE(review): the original emitted additional format detail lines
        # here that were not visible in this view — confirm against history.
    else:
        report.append("❌ **File Format**: Invalid")
        report.append(f" - Error: {format_result['error']}")
        return "\n".join(report)

    # Content quality validation
    quality_score = content_result.get("quality_score", 0.0)

    if content_result["has_issues"]:
        report.append("❌ **Content Quality**: Issues Found")
        for issue in content_result["issues"]:
            report.append(f" - ❌ {issue}")
    else:
        report.append("βœ… **Content Quality**: Good")

    if content_result["warnings"]:
        for warning in content_result["warnings"]:
            report.append(f" - ⚠️ {warning}")

    report.append(f" - **Quality Score**: {quality_score:.2f}/1.00")
    report.append("")

    # Test set coverage validation
    overall_coverage = test_set_result["overall_coverage"]
    meets_threshold = test_set_result["meets_missing_threshold"]

    if overall_coverage == 1.0:
        report.append("βœ… **Test Set Coverage**: Complete")
    elif overall_coverage >= 0.95 and meets_threshold:
        report.append("βœ… **Test Set Coverage**: Adequate")
    else:
        report.append("❌ **Test Set Coverage**: Insufficient")

    # Hoist the repeated lookups; total = matched + missing test items.
    total_items = test_set_result["matching_count"] + test_set_result["missing_count"]
    report.append(
        f" - Coverage: {overall_coverage:.1%} "
        f"({test_set_result['matching_count']:,} / {total_items:,})"
    )
    report.append(f" - Missing Rate: {test_set_result['missing_rate']:.1%}")
    report.append("")

    # Track-specific coverage analysis
    report.append("## πŸ“Š Track-Specific Analysis")

    track_coverage = test_set_result.get("track_coverage", {})
    for track_name, coverage_info in track_coverage.items():
        track_config = EVALUATION_TRACKS[track_name]

        status = "βœ…" if coverage_info["meets_minimum"] else "❌"
        report.append(f"### {status} {track_config['name']}")
        report.append(
            f" - **Samples**: {coverage_info['covered_samples']:,} / "
            f"{coverage_info['total_samples']:,}"
        )
        report.append(f" - **Coverage**: {coverage_info['coverage_rate']:.1%}")
        report.append(f" - **Minimum Required**: {coverage_info['min_required']:,}")
        report.append(
            f" - **Status**: {'Adequate' if coverage_info['meets_minimum'] else 'Insufficient'}"
        )
        report.append("")

    # Statistical adequacy assessment
    report.append("## πŸ”¬ Statistical Adequacy Assessment")

    if adequacy_result["overall_adequate"]:
        report.append("βœ… **Overall Assessment**: Statistically adequate for scientific evaluation")
    else:
        report.append("❌ **Overall Assessment**: Insufficient for rigorous scientific evaluation")

    # Track adequacy details with estimated statistical power.
    for track_name, track_adequacy in adequacy_result["track_adequacy"].items():
        track_config = EVALUATION_TRACKS[track_name]
        power = track_adequacy["estimated_power"]
        status = "βœ…" if track_adequacy["overall_adequate"] else "❌"
        report.append(f" - {status} **{track_config['name']}**: Statistical power β‰ˆ {power:.1%}")

    # Recommendations
    if adequacy_result["recommendations"]:
        report.append("")
        report.append("## πŸ’‘ Recommendations")
        for rec in adequacy_result["recommendations"]:
            report.append(f" - {rec}")

    # Final verdict: all strict checks must pass for a clean verdict;
    # a valid file with >= 80% coverage can still be evaluated with caveats.
    report.append("")
    all_checks_pass = (
        format_result["valid"]
        and not content_result["has_issues"]
        and overall_coverage >= 0.95
        and meets_threshold
        and adequacy_result["overall_adequate"]
    )

    if all_checks_pass:
        report.append("πŸŽ‰ **Final Verdict**: Ready for scientific evaluation!")
    elif format_result["valid"] and overall_coverage >= 0.8:
        report.append("⚠️ **Final Verdict**: Can be evaluated with limitations")
    else:
        report.append("❌ **Final Verdict**: Please address issues before submission")

    return "\n".join(report)
483
 
484
 
485
def validate_submission_scientific(
    file_content: bytes,
    filename: str,
    test_set: pd.DataFrame,
    model_name: str = "",
    author: str = "",
    description: str = "",
) -> Dict:
    """Complete scientific validation pipeline for submissions.

    Runs, in order: model-category detection, file-format validation,
    content-quality validation, test-set coverage validation, statistical
    adequacy assessment, and report generation.

    Args:
        file_content: Raw bytes of the uploaded predictions file.
        filename: Original filename (used by the format validator).
        test_set: Reference test set the predictions are matched against.
        model_name: Submitted model name (feeds category detection).
        author: Submitting author/organization (feeds category detection).
        description: Free-text model description (feeds category detection).

    Returns:
        Dict with "valid" (strict scientific validity), "can_evaluate"
        (more permissive evaluation eligibility), "category", "coverage",
        "report", "predictions", "adequacy", and nested metadata.
    """
    # Step 1: Detect model category
    detected_category = detect_model_category(model_name, author, description)

    # Step 2: Enhanced file format validation — bail out early on failure,
    # passing empty dicts so the report generator only renders the error.
    format_result = validate_file_format_enhanced(file_content, filename)
    if not format_result["valid"]:
        return {
            "valid": False,
            "can_evaluate": False,  # New field for evaluation eligibility
            "category": detected_category,
            "report": generate_scientific_validation_report(
                format_result, {}, {}, {}, model_name, detected_category
            ),
            "predictions": None,
            "adequacy": {},
        }

    predictions = format_result["dataframe"]

    # Step 3: Enhanced content validation
    content_result = validate_predictions_content_enhanced(predictions)

    # Step 4: Enhanced test set validation
    test_set_result = validate_against_test_set_enhanced(predictions, test_set)

    # Step 5: Statistical adequacy assessment
    adequacy_result = assess_statistical_adequacy(test_set_result, detected_category)

    # Step 6: Generate comprehensive report
    report = generate_scientific_validation_report(
        format_result,
        content_result,
        test_set_result,
        adequacy_result,
        model_name,
        detected_category,
    )

    # Overall validity determination (strict scientific standards)
    is_scientifically_valid = (
        format_result["valid"]
        and not content_result["has_issues"]
        and test_set_result["overall_coverage"] >= 0.95
        and test_set_result["meets_missing_threshold"]
        and adequacy_result["overall_adequate"]
    )

    # Evaluation eligibility (more permissive - can evaluate with limitations).
    # NOTE(review): the '"❌" in issue' filter only screens out issues whose
    # text embeds that marker; if validate_predictions_content_enhanced never
    # embeds it, this condition is vacuously true — confirm upstream.
    can_evaluate = (
        format_result["valid"]
        and test_set_result["overall_coverage"] >= 0.8  # 80% coverage minimum
        and not any("❌" in issue for issue in content_result.get("issues", []))
    )

    return {
        "valid": is_scientifically_valid,
        "can_evaluate": can_evaluate,  # New field
        "category": detected_category,
        "coverage": test_set_result["overall_coverage"],
        "report": report,
        "predictions": predictions,
        "adequacy": adequacy_result,
        # NOTE(review): the nested key name below was not visible in this
        # view; "metadata" is a reconstruction — confirm against consumers.
        "metadata": {
            "validation_version": "2.0-scientific",
            "detected_category": detected_category,
            "statistical_adequacy": adequacy_result["overall_adequate"],
            "evaluation_recommended": can_evaluate,
        },
    }