akera committed on
Commit
7827065
·
verified ·
1 Parent(s): 75faa66

Update src/validation.py

Browse files
Files changed (1) hide show
  1. src/validation.py +39 -173
src/validation.py CHANGED
@@ -11,7 +11,6 @@ from config import (
11
  MODEL_CATEGORIES,
12
  EVALUATION_TRACKS,
13
  ALL_UG40_LANGUAGES,
14
- SAMPLE_SIZE_RECOMMENDATIONS,
15
  )
16
 
17
 
@@ -57,8 +56,8 @@ def detect_model_category(model_name: str, author: str, description: str) -> str
57
  return "community"
58
 
59
 
60
- def validate_file_format_enhanced(file_content: bytes, filename: str) -> Dict:
61
- """Enhanced file format validation with stricter requirements."""
62
 
63
  try:
64
  # Determine file type
@@ -87,7 +86,7 @@ def validate_file_format_enhanced(file_content: bytes, filename: str) -> Dict:
87
  if len(df) == 0:
88
  return {"valid": False, "error": "File is empty"}
89
 
90
- # Enhanced validation checks
91
  validation_issues = []
92
 
93
  # Check for required data
@@ -105,7 +104,7 @@ def validate_file_format_enhanced(file_content: bytes, filename: str) -> Dict:
105
  validation_issues.append(f"Duplicate sample_id values found ({dup_count} duplicates)")
106
 
107
  # Data type validation
108
- if not df["sample_id"].dtype == "object" and not df["sample_id"].dtype.name.startswith("str"):
109
  df["sample_id"] = df["sample_id"].astype(str)
110
 
111
  # Check sample_id format
@@ -135,8 +134,8 @@ def validate_file_format_enhanced(file_content: bytes, filename: str) -> Dict:
135
  return {"valid": False, "error": f"Error parsing file: {str(e)}"}
136
 
137
 
138
- def validate_predictions_content_enhanced(predictions: pd.DataFrame) -> Dict:
139
- """Enhanced prediction content validation with stricter quality checks."""
140
 
141
  issues = []
142
  warnings = []
@@ -162,7 +161,7 @@ def validate_predictions_content_enhanced(predictions: pd.DataFrame) -> Dict:
162
  if long_predictions > len(predictions) * 0.01: # More than 1%
163
  warnings.append(f"{long_predictions} very long predictions (> 500 characters)")
164
 
165
- # Check for repeated predictions (more stringent)
166
  duplicate_predictions = predictions["prediction"].duplicated().sum()
167
  duplicate_rate = duplicate_predictions / len(predictions)
168
  quality_metrics["duplicate_rate"] = float(duplicate_rate)
@@ -186,14 +185,6 @@ def validate_predictions_content_enhanced(predictions: pd.DataFrame) -> Dict:
186
  if placeholder_count > len(predictions) * 0.02: # More than 2%
187
  issues.append(f"{placeholder_count} placeholder-like predictions detected")
188
 
189
- # Language detection (basic)
190
- non_ascii_rate = predictions["prediction"].str.contains(r"[^\x00-\x7f]", na=False).mean()
191
- quality_metrics["non_ascii_rate"] = float(non_ascii_rate)
192
-
193
- # Check for appropriate character distribution for African languages
194
- if non_ascii_rate < 0.1: # Less than 10% non-ASCII might indicate English-only
195
- warnings.append("Low non-ASCII character rate - check if translations include local language scripts")
196
-
197
  # Calculate overall quality score
198
  quality_score = 1.0
199
  quality_score -= len(issues) * 0.3 # Major penalty for issues
@@ -217,10 +208,10 @@ def validate_predictions_content_enhanced(predictions: pd.DataFrame) -> Dict:
217
  }
218
 
219
 
220
- def validate_against_test_set_enhanced(
221
  predictions: pd.DataFrame, test_set: pd.DataFrame
222
  ) -> Dict:
223
- """Enhanced validation against test set with track-specific analysis."""
224
 
225
  # Convert IDs to string for comparison
226
  pred_ids = set(predictions["sample_id"].astype(str))
@@ -259,22 +250,6 @@ def validate_against_test_set_enhanced(
259
  "min_required": VALIDATION_CONFIG["min_samples_per_track"][track_name],
260
  }
261
 
262
- # Language pair coverage analysis
263
- pair_coverage = {}
264
- for _, row in test_set.iterrows():
265
- pair_key = f"{row['source_language']}_{row['target_language']}"
266
- if pair_key not in pair_coverage:
267
- pair_coverage[pair_key] = {"total": 0, "covered": 0}
268
-
269
- pair_coverage[pair_key]["total"] += 1
270
- if str(row["sample_id"]) in pred_ids:
271
- pair_coverage[pair_key]["covered"] += 1
272
-
273
- # Calculate pair-wise coverage rates
274
- for pair_key in pair_coverage:
275
- pair_info = pair_coverage[pair_key]
276
- pair_info["coverage_rate"] = pair_info["covered"] / pair_info["total"]
277
-
278
  # Missing rate validation
279
  missing_rate = len(missing_ids) / len(test_ids)
280
  meets_missing_threshold = missing_rate <= VALIDATION_CONFIG["max_missing_rate"]
@@ -288,98 +263,29 @@ def validate_against_test_set_enhanced(
288
  "meets_missing_threshold": meets_missing_threshold,
289
  "is_complete": overall_coverage == 1.0,
290
  "track_coverage": track_coverage,
291
- "pair_coverage": pair_coverage,
292
  "missing_ids_sample": list(missing_ids)[:10],
293
  "extra_ids_sample": list(extra_ids)[:10],
294
  }
295
 
296
 
297
- def assess_statistical_adequacy(
298
- validation_result: Dict, model_category: str
299
- ) -> Dict:
300
- """Assess statistical adequacy for scientific evaluation."""
301
-
302
- adequacy_assessment = {
303
- "overall_adequate": True,
304
- "track_adequacy": {},
305
- "recommendations": [],
306
- "statistical_power_estimate": {},
307
- }
308
-
309
- track_coverage = validation_result.get("track_coverage", {})
310
-
311
- for track_name, coverage_info in track_coverage.items():
312
- track_config = EVALUATION_TRACKS[track_name]
313
-
314
- # Sample size adequacy
315
- covered_samples = coverage_info["covered_samples"]
316
- min_required = coverage_info["min_required"]
317
-
318
- sample_adequate = covered_samples >= min_required
319
-
320
- # Coverage rate adequacy
321
- coverage_rate = coverage_info["coverage_rate"]
322
- coverage_adequate = coverage_rate >= 0.8 # 80% coverage minimum
323
-
324
- # Statistical power estimation (simplified)
325
- estimated_power = min(1.0, covered_samples / (min_required * 1.5))
326
-
327
- track_adequate = sample_adequate and coverage_adequate
328
-
329
- adequacy_assessment["track_adequacy"][track_name] = {
330
- "sample_adequate": sample_adequate,
331
- "coverage_adequate": coverage_adequate,
332
- "overall_adequate": track_adequate,
333
- "covered_samples": covered_samples,
334
- "min_required": min_required,
335
- "coverage_rate": coverage_rate,
336
- "estimated_power": estimated_power,
337
- }
338
-
339
- if not track_adequate:
340
- adequacy_assessment["overall_adequate"] = False
341
-
342
- adequacy_assessment["statistical_power_estimate"][track_name] = estimated_power
343
-
344
- # Generate recommendations
345
- if not adequacy_assessment["overall_adequate"]:
346
- inadequate_tracks = [
347
- track for track, info in adequacy_assessment["track_adequacy"].items()
348
- if not info["overall_adequate"]
349
- ]
350
- adequacy_assessment["recommendations"].append(
351
- f"Insufficient samples for tracks: {', '.join(inadequate_tracks)}"
352
- )
353
-
354
- # Category-specific recommendations
355
- if model_category == "commercial" and not adequacy_assessment["track_adequacy"].get("google_comparable", {}).get("overall_adequate", False):
356
- adequacy_assessment["recommendations"].append(
357
- "Commercial models should ensure adequate coverage of Google-comparable track"
358
- )
359
-
360
- return adequacy_assessment
361
-
362
-
363
- def generate_scientific_validation_report(
364
  format_result: Dict,
365
  content_result: Dict,
366
  test_set_result: Dict,
367
- adequacy_result: Dict,
368
  model_name: str = "",
369
  detected_category: str = "community",
370
  ) -> str:
371
- """Generate comprehensive scientific validation report."""
372
 
373
  report = []
374
 
375
  # Header
376
- report.append(f"# 🔬 Scientific Validation Report: {model_name or 'Submission'}")
377
  report.append("")
378
 
379
  # Model categorization
380
  category_info = MODEL_CATEGORIES.get(detected_category, MODEL_CATEGORIES["community"])
381
  report.append(f"**Detected Model Category**: {category_info['name']}")
382
- report.append(f"**Category Description**: {category_info['description']}")
383
  report.append("")
384
 
385
  # File format validation
@@ -425,14 +331,14 @@ def generate_scientific_validation_report(
425
  report.append("")
426
 
427
  # Track-specific coverage analysis
428
- report.append("## 📊 Track-Specific Analysis")
429
 
430
  track_coverage = test_set_result.get("track_coverage", {})
431
  for track_name, coverage_info in track_coverage.items():
432
  track_config = EVALUATION_TRACKS[track_name]
433
 
434
  status = "✅" if coverage_info["meets_minimum"] else "❌"
435
- report.append(f"### {status} {track_config['name']}")
436
 
437
  report.append(f" - **Samples**: {coverage_info['covered_samples']:,} / {coverage_info['total_samples']:,}")
438
  report.append(f" - **Coverage**: {coverage_info['coverage_rate']:.1%}")
@@ -440,37 +346,12 @@ def generate_scientific_validation_report(
440
  report.append(f" - **Status**: {'Adequate' if coverage_info['meets_minimum'] else 'Insufficient'}")
441
  report.append("")
442
 
443
- # Statistical adequacy assessment
444
- report.append("## 🔬 Statistical Adequacy Assessment")
445
-
446
- if adequacy_result["overall_adequate"]:
447
- report.append("✅ **Overall Assessment**: Statistically adequate for scientific evaluation")
448
- else:
449
- report.append("❌ **Overall Assessment**: Insufficient for rigorous scientific evaluation")
450
-
451
- # Track adequacy details
452
- for track_name, track_adequacy in adequacy_result["track_adequacy"].items():
453
- track_config = EVALUATION_TRACKS[track_name]
454
- power = track_adequacy["estimated_power"]
455
-
456
- status = "✅" if track_adequacy["overall_adequate"] else "❌"
457
- report.append(f" - {status} **{track_config['name']}**: Statistical power ≈ {power:.1%}")
458
-
459
- # Recommendations
460
- if adequacy_result["recommendations"]:
461
- report.append("")
462
- report.append("## 💡 Recommendations")
463
- for rec in adequacy_result["recommendations"]:
464
- report.append(f" - {rec}")
465
-
466
  # Final verdict
467
- report.append("")
468
  all_checks_pass = (
469
  format_result["valid"] and
470
  not content_result["has_issues"] and
471
  overall_coverage >= 0.95 and
472
- meets_threshold and
473
- adequacy_result["overall_adequate"]
474
  )
475
 
476
  can_evaluate_with_limits = (
@@ -480,18 +361,17 @@ def generate_scientific_validation_report(
480
  )
481
 
482
  if all_checks_pass:
483
- report.append("🎉 **Final Verdict**: Ready for scientific evaluation!")
484
  elif can_evaluate_with_limits:
485
  report.append("⚠️ **Final Verdict**: Can be evaluated with limitations")
486
- report.append(" - Results will include notes about statistical limitations")
487
- report.append(" - Consider improving coverage/quality for publication-grade results")
488
  else:
489
  report.append("❌ **Final Verdict**: Please address critical issues before submission")
490
 
491
  return "\n".join(report)
492
 
493
 
494
- def validate_submission_scientific(
495
  file_content: bytes,
496
  filename: str,
497
  test_set: pd.DataFrame,
@@ -499,73 +379,59 @@ def validate_submission_scientific(
499
  author: str = "",
500
  description: str = ""
501
  ) -> Dict:
502
- """Complete scientific validation pipeline for submissions."""
503
 
504
  # Step 1: Detect model category
505
  detected_category = detect_model_category(model_name, author, description)
506
 
507
- # Step 2: Enhanced file format validation
508
- format_result = validate_file_format_enhanced(file_content, filename)
509
  if not format_result["valid"]:
510
  return {
511
  "valid": False,
512
- "can_evaluate": False, # New field for evaluation eligibility
513
  "category": detected_category,
514
- "report": generate_scientific_validation_report(
515
- format_result, {}, {}, {}, model_name, detected_category
516
  ),
517
  "predictions": None,
518
- "adequacy": {},
519
  }
520
 
521
  predictions = format_result["dataframe"]
522
 
523
- # Step 3: Enhanced content validation
524
- content_result = validate_predictions_content_enhanced(predictions)
525
 
526
- # Step 4: Enhanced test set validation
527
- test_set_result = validate_against_test_set_enhanced(predictions, test_set)
528
 
529
- # Step 5: Statistical adequacy assessment
530
- adequacy_result = assess_statistical_adequacy(test_set_result, detected_category)
531
-
532
- # Step 6: Generate comprehensive report
533
- report = generate_scientific_validation_report(
534
- format_result, content_result, test_set_result, adequacy_result,
535
- model_name, detected_category
536
  )
537
 
538
- # Overall validity determination (strict scientific standards)
539
- is_scientifically_valid = (
540
  format_result["valid"] and
541
  not content_result["has_issues"] and
542
  test_set_result["overall_coverage"] >= 0.95 and
543
- test_set_result["meets_missing_threshold"] and
544
- adequacy_result["overall_adequate"]
545
  )
546
 
547
- # Evaluation eligibility (more permissive - can evaluate with limitations)
548
  can_evaluate = (
549
  format_result["valid"] and
550
- test_set_result["overall_coverage"] >= 0.8 and # 80% coverage minimum
551
- not any("❌" in issue for issue in content_result.get("issues", [])) # No critical content issues
552
  )
553
 
554
  return {
555
- "valid": is_scientifically_valid,
556
- "can_evaluate": can_evaluate, # New field
557
  "category": detected_category,
558
  "coverage": test_set_result["overall_coverage"],
559
  "report": report,
560
  "predictions": predictions,
561
- "adequacy": adequacy_result,
562
  "quality_score": content_result.get("quality_score", 0.8),
563
  "track_coverage": test_set_result.get("track_coverage", {}),
564
- "scientific_metadata": {
565
- "validation_timestamp": pd.Timestamp.now().isoformat(),
566
- "validation_version": "2.0-scientific",
567
- "detected_category": detected_category,
568
- "statistical_adequacy": adequacy_result["overall_adequate"],
569
- "evaluation_recommended": can_evaluate,
570
- },
571
  }
 
11
  MODEL_CATEGORIES,
12
  EVALUATION_TRACKS,
13
  ALL_UG40_LANGUAGES,
 
14
  )
15
 
16
 
 
56
  return "community"
57
 
58
 
59
+ def validate_file_format(file_content: bytes, filename: str) -> Dict:
60
+ """Validate file format and structure."""
61
 
62
  try:
63
  # Determine file type
 
86
  if len(df) == 0:
87
  return {"valid": False, "error": "File is empty"}
88
 
89
+ # Validation checks
90
  validation_issues = []
91
 
92
  # Check for required data
 
104
  validation_issues.append(f"Duplicate sample_id values found ({dup_count} duplicates)")
105
 
106
  # Data type validation
107
+ if not df["sample_id"].dtype == "object":
108
  df["sample_id"] = df["sample_id"].astype(str)
109
 
110
  # Check sample_id format
 
134
  return {"valid": False, "error": f"Error parsing file: {str(e)}"}
135
 
136
 
137
+ def validate_predictions_content(predictions: pd.DataFrame) -> Dict:
138
+ """Validate prediction content quality."""
139
 
140
  issues = []
141
  warnings = []
 
161
  if long_predictions > len(predictions) * 0.01: # More than 1%
162
  warnings.append(f"{long_predictions} very long predictions (> 500 characters)")
163
 
164
+ # Check for repeated predictions
165
  duplicate_predictions = predictions["prediction"].duplicated().sum()
166
  duplicate_rate = duplicate_predictions / len(predictions)
167
  quality_metrics["duplicate_rate"] = float(duplicate_rate)
 
185
  if placeholder_count > len(predictions) * 0.02: # More than 2%
186
  issues.append(f"{placeholder_count} placeholder-like predictions detected")
187
 
 
 
 
 
 
 
 
 
188
  # Calculate overall quality score
189
  quality_score = 1.0
190
  quality_score -= len(issues) * 0.3 # Major penalty for issues
 
208
  }
209
 
210
 
211
+ def validate_against_test_set(
212
  predictions: pd.DataFrame, test_set: pd.DataFrame
213
  ) -> Dict:
214
+ """Validate predictions against test set."""
215
 
216
  # Convert IDs to string for comparison
217
  pred_ids = set(predictions["sample_id"].astype(str))
 
250
  "min_required": VALIDATION_CONFIG["min_samples_per_track"][track_name],
251
  }
252
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
  # Missing rate validation
254
  missing_rate = len(missing_ids) / len(test_ids)
255
  meets_missing_threshold = missing_rate <= VALIDATION_CONFIG["max_missing_rate"]
 
263
  "meets_missing_threshold": meets_missing_threshold,
264
  "is_complete": overall_coverage == 1.0,
265
  "track_coverage": track_coverage,
 
266
  "missing_ids_sample": list(missing_ids)[:10],
267
  "extra_ids_sample": list(extra_ids)[:10],
268
  }
269
 
270
 
271
+ def generate_validation_report(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
  format_result: Dict,
273
  content_result: Dict,
274
  test_set_result: Dict,
 
275
  model_name: str = "",
276
  detected_category: str = "community",
277
  ) -> str:
278
+ """Generate comprehensive validation report."""
279
 
280
  report = []
281
 
282
  # Header
283
+ report.append(f"### 🔬 Validation Report: {model_name or 'Submission'}")
284
  report.append("")
285
 
286
  # Model categorization
287
  category_info = MODEL_CATEGORIES.get(detected_category, MODEL_CATEGORIES["community"])
288
  report.append(f"**Detected Model Category**: {category_info['name']}")
 
289
  report.append("")
290
 
291
  # File format validation
 
331
  report.append("")
332
 
333
  # Track-specific coverage analysis
334
+ report.append("#### 📊 Track-Specific Analysis")
335
 
336
  track_coverage = test_set_result.get("track_coverage", {})
337
  for track_name, coverage_info in track_coverage.items():
338
  track_config = EVALUATION_TRACKS[track_name]
339
 
340
  status = "✅" if coverage_info["meets_minimum"] else "❌"
341
+ report.append(f"**{status} {track_config['name']}**:")
342
 
343
  report.append(f" - **Samples**: {coverage_info['covered_samples']:,} / {coverage_info['total_samples']:,}")
344
  report.append(f" - **Coverage**: {coverage_info['coverage_rate']:.1%}")
 
346
  report.append(f" - **Status**: {'Adequate' if coverage_info['meets_minimum'] else 'Insufficient'}")
347
  report.append("")
348
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
349
  # Final verdict
 
350
  all_checks_pass = (
351
  format_result["valid"] and
352
  not content_result["has_issues"] and
353
  overall_coverage >= 0.95 and
354
+ meets_threshold
 
355
  )
356
 
357
  can_evaluate_with_limits = (
 
361
  )
362
 
363
  if all_checks_pass:
364
+ report.append("🎉 **Final Verdict**: Ready for evaluation!")
365
  elif can_evaluate_with_limits:
366
  report.append("⚠️ **Final Verdict**: Can be evaluated with limitations")
367
+ report.append(" - Results will include notes about limitations")
 
368
  else:
369
  report.append("❌ **Final Verdict**: Please address critical issues before submission")
370
 
371
  return "\n".join(report)
372
 
373
 
374
+ def validate_submission(
375
  file_content: bytes,
376
  filename: str,
377
  test_set: pd.DataFrame,
 
379
  author: str = "",
380
  description: str = ""
381
  ) -> Dict:
382
+ """Complete validation pipeline for submissions."""
383
 
384
  # Step 1: Detect model category
385
  detected_category = detect_model_category(model_name, author, description)
386
 
387
+ # Step 2: File format validation
388
+ format_result = validate_file_format(file_content, filename)
389
  if not format_result["valid"]:
390
  return {
391
  "valid": False,
392
+ "can_evaluate": False,
393
  "category": detected_category,
394
+ "report": generate_validation_report(
395
+ format_result, {}, {}, model_name, detected_category
396
  ),
397
  "predictions": None,
 
398
  }
399
 
400
  predictions = format_result["dataframe"]
401
 
402
+ # Step 3: Content validation
403
+ content_result = validate_predictions_content(predictions)
404
 
405
+ # Step 4: Test set validation
406
+ test_set_result = validate_against_test_set(predictions, test_set)
407
 
408
+ # Step 5: Generate report
409
+ report = generate_validation_report(
410
+ format_result, content_result, test_set_result, model_name, detected_category
 
 
 
 
411
  )
412
 
413
+ # Overall validity determination
414
+ is_valid = (
415
  format_result["valid"] and
416
  not content_result["has_issues"] and
417
  test_set_result["overall_coverage"] >= 0.95 and
418
+ test_set_result["meets_missing_threshold"]
 
419
  )
420
 
421
+ # Evaluation eligibility (more permissive)
422
  can_evaluate = (
423
  format_result["valid"] and
424
+ test_set_result["overall_coverage"] >= 0.8 and
425
+ not any("❌" in issue for issue in content_result.get("issues", []))
426
  )
427
 
428
  return {
429
+ "valid": is_valid,
430
+ "can_evaluate": can_evaluate,
431
  "category": detected_category,
432
  "coverage": test_set_result["overall_coverage"],
433
  "report": report,
434
  "predictions": predictions,
 
435
  "quality_score": content_result.get("quality_score", 0.8),
436
  "track_coverage": test_set_result.get("track_coverage", {}),
 
 
 
 
 
 
 
437
  }