Spaces:
Running
Running
Update src/validation.py
Browse files- src/validation.py +39 -173
src/validation.py
CHANGED
@@ -11,7 +11,6 @@ from config import (
|
|
11 |
MODEL_CATEGORIES,
|
12 |
EVALUATION_TRACKS,
|
13 |
ALL_UG40_LANGUAGES,
|
14 |
-
SAMPLE_SIZE_RECOMMENDATIONS,
|
15 |
)
|
16 |
|
17 |
|
@@ -57,8 +56,8 @@ def detect_model_category(model_name: str, author: str, description: str) -> str
|
|
57 |
return "community"
|
58 |
|
59 |
|
60 |
-
def validate_file_format_enhanced(file_content: bytes, filename: str) -> Dict:
|
61 |
-
"""
|
62 |
|
63 |
try:
|
64 |
# Determine file type
|
@@ -87,7 +86,7 @@ def validate_file_format_enhanced(file_content: bytes, filename: str) -> Dict:
|
|
87 |
if len(df) == 0:
|
88 |
return {"valid": False, "error": "File is empty"}
|
89 |
|
90 |
-
#
|
91 |
validation_issues = []
|
92 |
|
93 |
# Check for required data
|
@@ -105,7 +104,7 @@ def validate_file_format_enhanced(file_content: bytes, filename: str) -> Dict:
|
|
105 |
validation_issues.append(f"Duplicate sample_id values found ({dup_count} duplicates)")
|
106 |
|
107 |
# Data type validation
|
108 |
-
if not df["sample_id"].dtype == "object"
|
109 |
df["sample_id"] = df["sample_id"].astype(str)
|
110 |
|
111 |
# Check sample_id format
|
@@ -135,8 +134,8 @@ def validate_file_format_enhanced(file_content: bytes, filename: str) -> Dict:
|
|
135 |
return {"valid": False, "error": f"Error parsing file: {str(e)}"}
|
136 |
|
137 |
|
138 |
-
def validate_predictions_content_enhanced(predictions: pd.DataFrame) -> Dict:
|
139 |
-
"""
|
140 |
|
141 |
issues = []
|
142 |
warnings = []
|
@@ -162,7 +161,7 @@ def validate_predictions_content_enhanced(predictions: pd.DataFrame) -> Dict:
|
|
162 |
if long_predictions > len(predictions) * 0.01: # More than 1%
|
163 |
warnings.append(f"{long_predictions} very long predictions (> 500 characters)")
|
164 |
|
165 |
-
# Check for repeated predictions
|
166 |
duplicate_predictions = predictions["prediction"].duplicated().sum()
|
167 |
duplicate_rate = duplicate_predictions / len(predictions)
|
168 |
quality_metrics["duplicate_rate"] = float(duplicate_rate)
|
@@ -186,14 +185,6 @@ def validate_predictions_content_enhanced(predictions: pd.DataFrame) -> Dict:
|
|
186 |
if placeholder_count > len(predictions) * 0.02: # More than 2%
|
187 |
issues.append(f"{placeholder_count} placeholder-like predictions detected")
|
188 |
|
189 |
-
# Language detection (basic)
|
190 |
-
non_ascii_rate = predictions["prediction"].str.contains(r"[^\x00-\x7f]", na=False).mean()
|
191 |
-
quality_metrics["non_ascii_rate"] = float(non_ascii_rate)
|
192 |
-
|
193 |
-
# Check for appropriate character distribution for African languages
|
194 |
-
if non_ascii_rate < 0.1: # Less than 10% non-ASCII might indicate English-only
|
195 |
-
warnings.append("Low non-ASCII character rate - check if translations include local language scripts")
|
196 |
-
|
197 |
# Calculate overall quality score
|
198 |
quality_score = 1.0
|
199 |
quality_score -= len(issues) * 0.3 # Major penalty for issues
|
@@ -217,10 +208,10 @@ def validate_predictions_content_enhanced(predictions: pd.DataFrame) -> Dict:
|
|
217 |
}
|
218 |
|
219 |
|
220 |
-
def validate_against_test_set_enhanced(
|
221 |
predictions: pd.DataFrame, test_set: pd.DataFrame
|
222 |
) -> Dict:
|
223 |
-
"""
|
224 |
|
225 |
# Convert IDs to string for comparison
|
226 |
pred_ids = set(predictions["sample_id"].astype(str))
|
@@ -259,22 +250,6 @@ def validate_against_test_set_enhanced(
|
|
259 |
"min_required": VALIDATION_CONFIG["min_samples_per_track"][track_name],
|
260 |
}
|
261 |
|
262 |
-
# Language pair coverage analysis
|
263 |
-
pair_coverage = {}
|
264 |
-
for _, row in test_set.iterrows():
|
265 |
-
pair_key = f"{row['source_language']}_{row['target_language']}"
|
266 |
-
if pair_key not in pair_coverage:
|
267 |
-
pair_coverage[pair_key] = {"total": 0, "covered": 0}
|
268 |
-
|
269 |
-
pair_coverage[pair_key]["total"] += 1
|
270 |
-
if str(row["sample_id"]) in pred_ids:
|
271 |
-
pair_coverage[pair_key]["covered"] += 1
|
272 |
-
|
273 |
-
# Calculate pair-wise coverage rates
|
274 |
-
for pair_key in pair_coverage:
|
275 |
-
pair_info = pair_coverage[pair_key]
|
276 |
-
pair_info["coverage_rate"] = pair_info["covered"] / pair_info["total"]
|
277 |
-
|
278 |
# Missing rate validation
|
279 |
missing_rate = len(missing_ids) / len(test_ids)
|
280 |
meets_missing_threshold = missing_rate <= VALIDATION_CONFIG["max_missing_rate"]
|
@@ -288,98 +263,29 @@ def validate_against_test_set_enhanced(
|
|
288 |
"meets_missing_threshold": meets_missing_threshold,
|
289 |
"is_complete": overall_coverage == 1.0,
|
290 |
"track_coverage": track_coverage,
|
291 |
-
"pair_coverage": pair_coverage,
|
292 |
"missing_ids_sample": list(missing_ids)[:10],
|
293 |
"extra_ids_sample": list(extra_ids)[:10],
|
294 |
}
|
295 |
|
296 |
|
297 |
-
def
|
298 |
-
validation_result: Dict, model_category: str
|
299 |
-
) -> Dict:
|
300 |
-
"""Assess statistical adequacy for scientific evaluation."""
|
301 |
-
|
302 |
-
adequacy_assessment = {
|
303 |
-
"overall_adequate": True,
|
304 |
-
"track_adequacy": {},
|
305 |
-
"recommendations": [],
|
306 |
-
"statistical_power_estimate": {},
|
307 |
-
}
|
308 |
-
|
309 |
-
track_coverage = validation_result.get("track_coverage", {})
|
310 |
-
|
311 |
-
for track_name, coverage_info in track_coverage.items():
|
312 |
-
track_config = EVALUATION_TRACKS[track_name]
|
313 |
-
|
314 |
-
# Sample size adequacy
|
315 |
-
covered_samples = coverage_info["covered_samples"]
|
316 |
-
min_required = coverage_info["min_required"]
|
317 |
-
|
318 |
-
sample_adequate = covered_samples >= min_required
|
319 |
-
|
320 |
-
# Coverage rate adequacy
|
321 |
-
coverage_rate = coverage_info["coverage_rate"]
|
322 |
-
coverage_adequate = coverage_rate >= 0.8 # 80% coverage minimum
|
323 |
-
|
324 |
-
# Statistical power estimation (simplified)
|
325 |
-
estimated_power = min(1.0, covered_samples / (min_required * 1.5))
|
326 |
-
|
327 |
-
track_adequate = sample_adequate and coverage_adequate
|
328 |
-
|
329 |
-
adequacy_assessment["track_adequacy"][track_name] = {
|
330 |
-
"sample_adequate": sample_adequate,
|
331 |
-
"coverage_adequate": coverage_adequate,
|
332 |
-
"overall_adequate": track_adequate,
|
333 |
-
"covered_samples": covered_samples,
|
334 |
-
"min_required": min_required,
|
335 |
-
"coverage_rate": coverage_rate,
|
336 |
-
"estimated_power": estimated_power,
|
337 |
-
}
|
338 |
-
|
339 |
-
if not track_adequate:
|
340 |
-
adequacy_assessment["overall_adequate"] = False
|
341 |
-
|
342 |
-
adequacy_assessment["statistical_power_estimate"][track_name] = estimated_power
|
343 |
-
|
344 |
-
# Generate recommendations
|
345 |
-
if not adequacy_assessment["overall_adequate"]:
|
346 |
-
inadequate_tracks = [
|
347 |
-
track for track, info in adequacy_assessment["track_adequacy"].items()
|
348 |
-
if not info["overall_adequate"]
|
349 |
-
]
|
350 |
-
adequacy_assessment["recommendations"].append(
|
351 |
-
f"Insufficient samples for tracks: {', '.join(inadequate_tracks)}"
|
352 |
-
)
|
353 |
-
|
354 |
-
# Category-specific recommendations
|
355 |
-
if model_category == "commercial" and not adequacy_assessment["track_adequacy"].get("google_comparable", {}).get("overall_adequate", False):
|
356 |
-
adequacy_assessment["recommendations"].append(
|
357 |
-
"Commercial models should ensure adequate coverage of Google-comparable track"
|
358 |
-
)
|
359 |
-
|
360 |
-
return adequacy_assessment
|
361 |
-
|
362 |
-
|
363 |
-
def generate_scientific_validation_report(
|
364 |
format_result: Dict,
|
365 |
content_result: Dict,
|
366 |
test_set_result: Dict,
|
367 |
-
adequacy_result: Dict,
|
368 |
model_name: str = "",
|
369 |
detected_category: str = "community",
|
370 |
) -> str:
|
371 |
-
"""Generate comprehensive
|
372 |
|
373 |
report = []
|
374 |
|
375 |
# Header
|
376 |
-
report.append(f"
|
377 |
report.append("")
|
378 |
|
379 |
# Model categorization
|
380 |
category_info = MODEL_CATEGORIES.get(detected_category, MODEL_CATEGORIES["community"])
|
381 |
report.append(f"**Detected Model Category**: {category_info['name']}")
|
382 |
-
report.append(f"**Category Description**: {category_info['description']}")
|
383 |
report.append("")
|
384 |
|
385 |
# File format validation
|
@@ -425,14 +331,14 @@ def generate_scientific_validation_report(
|
|
425 |
report.append("")
|
426 |
|
427 |
# Track-specific coverage analysis
|
428 |
-
report.append("
|
429 |
|
430 |
track_coverage = test_set_result.get("track_coverage", {})
|
431 |
for track_name, coverage_info in track_coverage.items():
|
432 |
track_config = EVALUATION_TRACKS[track_name]
|
433 |
|
434 |
status = "✅" if coverage_info["meets_minimum"] else "❌"
|
435 |
-
report.append(f"
|
436 |
|
437 |
report.append(f" - **Samples**: {coverage_info['covered_samples']:,} / {coverage_info['total_samples']:,}")
|
438 |
report.append(f" - **Coverage**: {coverage_info['coverage_rate']:.1%}")
|
@@ -440,37 +346,12 @@ def generate_scientific_validation_report(
|
|
440 |
report.append(f" - **Status**: {'Adequate' if coverage_info['meets_minimum'] else 'Insufficient'}")
|
441 |
report.append("")
|
442 |
|
443 |
-
# Statistical adequacy assessment
|
444 |
-
report.append("## 🔬 Statistical Adequacy Assessment")
|
445 |
-
|
446 |
-
if adequacy_result["overall_adequate"]:
|
447 |
-
report.append("✅ **Overall Assessment**: Statistically adequate for scientific evaluation")
|
448 |
-
else:
|
449 |
-
report.append("❌ **Overall Assessment**: Insufficient for rigorous scientific evaluation")
|
450 |
-
|
451 |
-
# Track adequacy details
|
452 |
-
for track_name, track_adequacy in adequacy_result["track_adequacy"].items():
|
453 |
-
track_config = EVALUATION_TRACKS[track_name]
|
454 |
-
power = track_adequacy["estimated_power"]
|
455 |
-
|
456 |
-
status = "✅" if track_adequacy["overall_adequate"] else "❌"
|
457 |
-
report.append(f" - {status} **{track_config['name']}**: Statistical power ≈ {power:.1%}")
|
458 |
-
|
459 |
-
# Recommendations
|
460 |
-
if adequacy_result["recommendations"]:
|
461 |
-
report.append("")
|
462 |
-
report.append("## 💡 Recommendations")
|
463 |
-
for rec in adequacy_result["recommendations"]:
|
464 |
-
report.append(f" - {rec}")
|
465 |
-
|
466 |
# Final verdict
|
467 |
-
report.append("")
|
468 |
all_checks_pass = (
|
469 |
format_result["valid"] and
|
470 |
not content_result["has_issues"] and
|
471 |
overall_coverage >= 0.95 and
|
472 |
-
meets_threshold
|
473 |
-
adequacy_result["overall_adequate"]
|
474 |
)
|
475 |
|
476 |
can_evaluate_with_limits = (
|
@@ -480,18 +361,17 @@ def generate_scientific_validation_report(
|
|
480 |
)
|
481 |
|
482 |
if all_checks_pass:
|
483 |
-
report.append("π **Final Verdict**: Ready for
|
484 |
elif can_evaluate_with_limits:
|
485 |
report.append("⚠️ **Final Verdict**: Can be evaluated with limitations")
|
486 |
-
report.append(" - Results will include notes about
|
487 |
-
report.append(" - Consider improving coverage/quality for publication-grade results")
|
488 |
else:
|
489 |
report.append("❌ **Final Verdict**: Please address critical issues before submission")
|
490 |
|
491 |
return "\n".join(report)
|
492 |
|
493 |
|
494 |
-
def validate_submission_scientific(
|
495 |
file_content: bytes,
|
496 |
filename: str,
|
497 |
test_set: pd.DataFrame,
|
@@ -499,73 +379,59 @@ def validate_submission_scientific(
|
|
499 |
author: str = "",
|
500 |
description: str = ""
|
501 |
) -> Dict:
|
502 |
-
"""Complete
|
503 |
|
504 |
# Step 1: Detect model category
|
505 |
detected_category = detect_model_category(model_name, author, description)
|
506 |
|
507 |
-
# Step 2:
|
508 |
-
format_result =
|
509 |
if not format_result["valid"]:
|
510 |
return {
|
511 |
"valid": False,
|
512 |
-
"can_evaluate": False,
|
513 |
"category": detected_category,
|
514 |
-
"report":
|
515 |
-
format_result, {}, {},
|
516 |
),
|
517 |
"predictions": None,
|
518 |
-
"adequacy": {},
|
519 |
}
|
520 |
|
521 |
predictions = format_result["dataframe"]
|
522 |
|
523 |
-
# Step 3:
|
524 |
-
content_result =
|
525 |
|
526 |
-
# Step 4:
|
527 |
-
test_set_result =
|
528 |
|
529 |
-
# Step 5:
|
530 |
-
|
531 |
-
|
532 |
-
# Step 6: Generate comprehensive report
|
533 |
-
report = generate_scientific_validation_report(
|
534 |
-
format_result, content_result, test_set_result, adequacy_result,
|
535 |
-
model_name, detected_category
|
536 |
)
|
537 |
|
538 |
-
# Overall validity determination
|
539 |
-
|
540 |
format_result["valid"] and
|
541 |
not content_result["has_issues"] and
|
542 |
test_set_result["overall_coverage"] >= 0.95 and
|
543 |
-
test_set_result["meets_missing_threshold"]
|
544 |
-
adequacy_result["overall_adequate"]
|
545 |
)
|
546 |
|
547 |
-
# Evaluation eligibility (more permissive
|
548 |
can_evaluate = (
|
549 |
format_result["valid"] and
|
550 |
-
test_set_result["overall_coverage"] >= 0.8 and
|
551 |
-
not any("❌" in issue for issue in content_result.get("issues", []))
|
552 |
)
|
553 |
|
554 |
return {
|
555 |
-
"valid":
|
556 |
-
"can_evaluate": can_evaluate,
|
557 |
"category": detected_category,
|
558 |
"coverage": test_set_result["overall_coverage"],
|
559 |
"report": report,
|
560 |
"predictions": predictions,
|
561 |
-
"adequacy": adequacy_result,
|
562 |
"quality_score": content_result.get("quality_score", 0.8),
|
563 |
"track_coverage": test_set_result.get("track_coverage", {}),
|
564 |
-
"scientific_metadata": {
|
565 |
-
"validation_timestamp": pd.Timestamp.now().isoformat(),
|
566 |
-
"validation_version": "2.0-scientific",
|
567 |
-
"detected_category": detected_category,
|
568 |
-
"statistical_adequacy": adequacy_result["overall_adequate"],
|
569 |
-
"evaluation_recommended": can_evaluate,
|
570 |
-
},
|
571 |
}
|
|
|
11 |
MODEL_CATEGORIES,
|
12 |
EVALUATION_TRACKS,
|
13 |
ALL_UG40_LANGUAGES,
|
|
|
14 |
)
|
15 |
|
16 |
|
|
|
56 |
return "community"
|
57 |
|
58 |
|
59 |
+
def validate_file_format(file_content: bytes, filename: str) -> Dict:
|
60 |
+
"""Validate file format and structure."""
|
61 |
|
62 |
try:
|
63 |
# Determine file type
|
|
|
86 |
if len(df) == 0:
|
87 |
return {"valid": False, "error": "File is empty"}
|
88 |
|
89 |
+
# Validation checks
|
90 |
validation_issues = []
|
91 |
|
92 |
# Check for required data
|
|
|
104 |
validation_issues.append(f"Duplicate sample_id values found ({dup_count} duplicates)")
|
105 |
|
106 |
# Data type validation
|
107 |
+
if not df["sample_id"].dtype == "object":
|
108 |
df["sample_id"] = df["sample_id"].astype(str)
|
109 |
|
110 |
# Check sample_id format
|
|
|
134 |
return {"valid": False, "error": f"Error parsing file: {str(e)}"}
|
135 |
|
136 |
|
137 |
+
def validate_predictions_content(predictions: pd.DataFrame) -> Dict:
|
138 |
+
"""Validate prediction content quality."""
|
139 |
|
140 |
issues = []
|
141 |
warnings = []
|
|
|
161 |
if long_predictions > len(predictions) * 0.01: # More than 1%
|
162 |
warnings.append(f"{long_predictions} very long predictions (> 500 characters)")
|
163 |
|
164 |
+
# Check for repeated predictions
|
165 |
duplicate_predictions = predictions["prediction"].duplicated().sum()
|
166 |
duplicate_rate = duplicate_predictions / len(predictions)
|
167 |
quality_metrics["duplicate_rate"] = float(duplicate_rate)
|
|
|
185 |
if placeholder_count > len(predictions) * 0.02: # More than 2%
|
186 |
issues.append(f"{placeholder_count} placeholder-like predictions detected")
|
187 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
188 |
# Calculate overall quality score
|
189 |
quality_score = 1.0
|
190 |
quality_score -= len(issues) * 0.3 # Major penalty for issues
|
|
|
208 |
}
|
209 |
|
210 |
|
211 |
+
def validate_against_test_set(
|
212 |
predictions: pd.DataFrame, test_set: pd.DataFrame
|
213 |
) -> Dict:
|
214 |
+
"""Validate predictions against test set."""
|
215 |
|
216 |
# Convert IDs to string for comparison
|
217 |
pred_ids = set(predictions["sample_id"].astype(str))
|
|
|
250 |
"min_required": VALIDATION_CONFIG["min_samples_per_track"][track_name],
|
251 |
}
|
252 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
253 |
# Missing rate validation
|
254 |
missing_rate = len(missing_ids) / len(test_ids)
|
255 |
meets_missing_threshold = missing_rate <= VALIDATION_CONFIG["max_missing_rate"]
|
|
|
263 |
"meets_missing_threshold": meets_missing_threshold,
|
264 |
"is_complete": overall_coverage == 1.0,
|
265 |
"track_coverage": track_coverage,
|
|
|
266 |
"missing_ids_sample": list(missing_ids)[:10],
|
267 |
"extra_ids_sample": list(extra_ids)[:10],
|
268 |
}
|
269 |
|
270 |
|
271 |
+
def generate_validation_report(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
272 |
format_result: Dict,
|
273 |
content_result: Dict,
|
274 |
test_set_result: Dict,
|
|
|
275 |
model_name: str = "",
|
276 |
detected_category: str = "community",
|
277 |
) -> str:
|
278 |
+
"""Generate comprehensive validation report."""
|
279 |
|
280 |
report = []
|
281 |
|
282 |
# Header
|
283 |
+
report.append(f"### 🔬 Validation Report: {model_name or 'Submission'}")
|
284 |
report.append("")
|
285 |
|
286 |
# Model categorization
|
287 |
category_info = MODEL_CATEGORIES.get(detected_category, MODEL_CATEGORIES["community"])
|
288 |
report.append(f"**Detected Model Category**: {category_info['name']}")
|
|
|
289 |
report.append("")
|
290 |
|
291 |
# File format validation
|
|
|
331 |
report.append("")
|
332 |
|
333 |
# Track-specific coverage analysis
|
334 |
+
report.append("#### 📊 Track-Specific Analysis")
|
335 |
|
336 |
track_coverage = test_set_result.get("track_coverage", {})
|
337 |
for track_name, coverage_info in track_coverage.items():
|
338 |
track_config = EVALUATION_TRACKS[track_name]
|
339 |
|
340 |
status = "✅" if coverage_info["meets_minimum"] else "❌"
|
341 |
+
report.append(f"**{status} {track_config['name']}**:")
|
342 |
|
343 |
report.append(f" - **Samples**: {coverage_info['covered_samples']:,} / {coverage_info['total_samples']:,}")
|
344 |
report.append(f" - **Coverage**: {coverage_info['coverage_rate']:.1%}")
|
|
|
346 |
report.append(f" - **Status**: {'Adequate' if coverage_info['meets_minimum'] else 'Insufficient'}")
|
347 |
report.append("")
|
348 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
349 |
# Final verdict
|
|
|
350 |
all_checks_pass = (
|
351 |
format_result["valid"] and
|
352 |
not content_result["has_issues"] and
|
353 |
overall_coverage >= 0.95 and
|
354 |
+
meets_threshold
|
|
|
355 |
)
|
356 |
|
357 |
can_evaluate_with_limits = (
|
|
|
361 |
)
|
362 |
|
363 |
if all_checks_pass:
|
364 |
+
report.append("🎉 **Final Verdict**: Ready for evaluation!")
|
365 |
elif can_evaluate_with_limits:
|
366 |
report.append("⚠️ **Final Verdict**: Can be evaluated with limitations")
|
367 |
+
report.append(" - Results will include notes about limitations")
|
|
|
368 |
else:
|
369 |
report.append("❌ **Final Verdict**: Please address critical issues before submission")
|
370 |
|
371 |
return "\n".join(report)
|
372 |
|
373 |
|
374 |
+
def validate_submission(
|
375 |
file_content: bytes,
|
376 |
filename: str,
|
377 |
test_set: pd.DataFrame,
|
|
|
379 |
author: str = "",
|
380 |
description: str = ""
|
381 |
) -> Dict:
|
382 |
+
"""Complete validation pipeline for submissions."""
|
383 |
|
384 |
# Step 1: Detect model category
|
385 |
detected_category = detect_model_category(model_name, author, description)
|
386 |
|
387 |
+
# Step 2: File format validation
|
388 |
+
format_result = validate_file_format(file_content, filename)
|
389 |
if not format_result["valid"]:
|
390 |
return {
|
391 |
"valid": False,
|
392 |
+
"can_evaluate": False,
|
393 |
"category": detected_category,
|
394 |
+
"report": generate_validation_report(
|
395 |
+
format_result, {}, {}, model_name, detected_category
|
396 |
),
|
397 |
"predictions": None,
|
|
|
398 |
}
|
399 |
|
400 |
predictions = format_result["dataframe"]
|
401 |
|
402 |
+
# Step 3: Content validation
|
403 |
+
content_result = validate_predictions_content(predictions)
|
404 |
|
405 |
+
# Step 4: Test set validation
|
406 |
+
test_set_result = validate_against_test_set(predictions, test_set)
|
407 |
|
408 |
+
# Step 5: Generate report
|
409 |
+
report = generate_validation_report(
|
410 |
+
format_result, content_result, test_set_result, model_name, detected_category
|
|
|
|
|
|
|
|
|
411 |
)
|
412 |
|
413 |
+
# Overall validity determination
|
414 |
+
is_valid = (
|
415 |
format_result["valid"] and
|
416 |
not content_result["has_issues"] and
|
417 |
test_set_result["overall_coverage"] >= 0.95 and
|
418 |
+
test_set_result["meets_missing_threshold"]
|
|
|
419 |
)
|
420 |
|
421 |
+
# Evaluation eligibility (more permissive)
|
422 |
can_evaluate = (
|
423 |
format_result["valid"] and
|
424 |
+
test_set_result["overall_coverage"] >= 0.8 and
|
425 |
+
not any("❌" in issue for issue in content_result.get("issues", []))
|
426 |
)
|
427 |
|
428 |
return {
|
429 |
+
"valid": is_valid,
|
430 |
+
"can_evaluate": can_evaluate,
|
431 |
"category": detected_category,
|
432 |
"coverage": test_set_result["overall_coverage"],
|
433 |
"report": report,
|
434 |
"predictions": predictions,
|
|
|
435 |
"quality_score": content_result.get("quality_score", 0.8),
|
436 |
"track_coverage": test_set_result.get("track_coverage", {}),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
437 |
}
|