akera committed on
Commit
cb7f64d
·
verified ·
1 Parent(s): 8727da4

Update src/validation.py

Browse files
Files changed (1) hide show
  1. src/validation.py +20 -9
src/validation.py CHANGED
@@ -101,11 +101,17 @@ def validate_predictions_content(predictions: pd.DataFrame) -> Dict:
101
  if duplicate_predictions > len(predictions) * 0.5: # More than 50%
102
  warnings.append(f"{duplicate_predictions} duplicate prediction texts")
103
 
104
- # Check for non-text content
105
- non_text_pattern = r'^[A-Za-z\s\'".,!?;:()\-]+$'
106
- non_text_predictions = ~predictions['prediction'].str.match(non_text_pattern, na=False)
107
- if non_text_predictions.sum() > 0:
108
- warnings.append(f"{non_text_predictions.sum()} predictions contain unusual characters")
 
 
 
 
 
 
109
 
110
  return {
111
  'has_issues': len(issues) > 0,
@@ -166,8 +172,7 @@ def generate_validation_report(
166
  report = []
167
 
168
  # Header
169
- report.append(f"# Validation Report: {model_name or 'Submission'}")
170
- report.append(f"Generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}")
171
  report.append("")
172
 
173
  # File format validation
@@ -244,7 +249,9 @@ def validate_submission_complete(file_content: bytes, filename: str, test_set: p
244
  return {
245
  'valid': False,
246
  'report': generate_validation_report(format_result, {}, {}, model_name),
247
- 'predictions': None
 
 
248
  }
249
 
250
  predictions = format_result['dataframe']
@@ -270,5 +277,9 @@ def validate_submission_complete(file_content: bytes, filename: str, test_set: p
270
  'coverage': test_set_result['overall_coverage'],
271
  'report': report,
272
  'predictions': predictions,
273
- 'pair_coverage': test_set_result['pair_coverage']
 
 
 
 
274
  }
 
101
  if duplicate_predictions > len(predictions) * 0.5: # More than 50%
102
  warnings.append(f"{duplicate_predictions} duplicate prediction texts")
103
 
104
+ # Check for non-text content (more permissive regex for multiple languages)
105
+ # Allow Unicode characters for non-English languages
106
+ non_text_pattern = r'^[\w\s\'".,!?;:()\-àáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ]+$'
107
+ try:
108
+ non_text_predictions = ~predictions['prediction'].str.match(non_text_pattern, na=False)
109
+ unusual_char_count = non_text_predictions.sum()
110
+ if unusual_char_count > len(predictions) * 0.2: # More than 20%
111
+ warnings.append(f"{unusual_char_count} predictions may contain special characters")
112
+ except:
113
+ # Skip this check if regex fails
114
+ pass
115
 
116
  return {
117
  'has_issues': len(issues) > 0,
 
172
  report = []
173
 
174
  # Header
175
+ report.append(f"## Validation Report: {model_name or 'Submission'}")
 
176
  report.append("")
177
 
178
  # File format validation
 
249
  return {
250
  'valid': False,
251
  'report': generate_validation_report(format_result, {}, {}, model_name),
252
+ 'predictions': None,
253
+ 'coverage': 0.0,
254
+ 'pair_coverage': {}
255
  }
256
 
257
  predictions = format_result['dataframe']
 
277
  'coverage': test_set_result['overall_coverage'],
278
  'report': report,
279
  'predictions': predictions,
280
+ 'pair_coverage': test_set_result['pair_coverage'],
281
+ 'quality_score': content_result.get('quality_score', 0.8),
282
+ 'warnings': content_result.get('warnings', []),
283
+ 'matching_count': test_set_result['matching_count'],
284
+ 'missing_count': test_set_result['missing_count']
285
  }