Spaces:
Running
Running
Update src/validation.py
Browse files - src/validation.py +20 -9
src/validation.py
CHANGED
@@ -101,11 +101,17 @@ def validate_predictions_content(predictions: pd.DataFrame) -> Dict:
|
|
101 |
if duplicate_predictions > len(predictions) * 0.5: # More than 50%
|
102 |
warnings.append(f"{duplicate_predictions} duplicate prediction texts")
|
103 |
|
104 |
-
# Check for non-text content
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
109 |
|
110 |
return {
|
111 |
'has_issues': len(issues) > 0,
|
@@ -166,8 +172,7 @@ def generate_validation_report(
|
|
166 |
report = []
|
167 |
|
168 |
# Header
|
169 |
-
report.append(f"
|
170 |
-
report.append(f"Generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}")
|
171 |
report.append("")
|
172 |
|
173 |
# File format validation
|
@@ -244,7 +249,9 @@ def validate_submission_complete(file_content: bytes, filename: str, test_set: p
|
|
244 |
return {
|
245 |
'valid': False,
|
246 |
'report': generate_validation_report(format_result, {}, {}, model_name),
|
247 |
-
'predictions': None
|
|
|
|
|
248 |
}
|
249 |
|
250 |
predictions = format_result['dataframe']
|
@@ -270,5 +277,9 @@ def validate_submission_complete(file_content: bytes, filename: str, test_set: p
|
|
270 |
'coverage': test_set_result['overall_coverage'],
|
271 |
'report': report,
|
272 |
'predictions': predictions,
|
273 |
-
'pair_coverage': test_set_result['pair_coverage']
|
|
|
|
|
|
|
|
|
274 |
}
|
|
|
101 |
if duplicate_predictions > len(predictions) * 0.5: # More than 50%
|
102 |
warnings.append(f"{duplicate_predictions} duplicate prediction texts")
|
103 |
|
104 |
+
# Check for non-text content (more permissive regex for multiple languages)
|
105 |
+
# Allow Unicode characters for non-English languages
|
106 |
+
non_text_pattern = r'^[\w\s\'".,!?;:()\-àáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ]+$'
|
107 |
+
try:
|
108 |
+
non_text_predictions = ~predictions['prediction'].str.match(non_text_pattern, na=False)
|
109 |
+
unusual_char_count = non_text_predictions.sum()
|
110 |
+
if unusual_char_count > len(predictions) * 0.2: # More than 20%
|
111 |
+
warnings.append(f"{unusual_char_count} predictions may contain special characters")
|
112 |
+
except:
|
113 |
+
# Skip this check if regex fails
|
114 |
+
pass
|
115 |
|
116 |
return {
|
117 |
'has_issues': len(issues) > 0,
|
|
|
172 |
report = []
|
173 |
|
174 |
# Header
|
175 |
+
report.append(f"## Validation Report: {model_name or 'Submission'}")
|
|
|
176 |
report.append("")
|
177 |
|
178 |
# File format validation
|
|
|
249 |
return {
|
250 |
'valid': False,
|
251 |
'report': generate_validation_report(format_result, {}, {}, model_name),
|
252 |
+
'predictions': None,
|
253 |
+
'coverage': 0.0,
|
254 |
+
'pair_coverage': {}
|
255 |
}
|
256 |
|
257 |
predictions = format_result['dataframe']
|
|
|
277 |
'coverage': test_set_result['overall_coverage'],
|
278 |
'report': report,
|
279 |
'predictions': predictions,
|
280 |
+
'pair_coverage': test_set_result['pair_coverage'],
|
281 |
+
'quality_score': content_result.get('quality_score', 0.8),
|
282 |
+
'warnings': content_result.get('warnings', []),
|
283 |
+
'matching_count': test_set_result['matching_count'],
|
284 |
+
'missing_count': test_set_result['missing_count']
|
285 |
}
|