akera committed on
Commit
ce36131
·
verified ·
1 Parent(s): 23201ae

Update src/validation.py

Browse files
Files changed (1) hide show
  1. src/validation.py +538 -200
src/validation.py CHANGED
@@ -4,179 +4,440 @@ import numpy as np
4
  from typing import Dict, List, Tuple, Optional
5
  import json
6
  import io
7
- from config import PREDICTION_FORMAT
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
- def validate_file_format(file_content: bytes, filename: str) -> Dict:
10
- """Validate uploaded file format and structure."""
11
-
12
  try:
13
  # Determine file type
14
- if filename.endswith('.csv'):
15
  df = pd.read_csv(io.BytesIO(file_content))
16
- elif filename.endswith('.tsv'):
17
- df = pd.read_csv(io.BytesIO(file_content), sep='\t')
18
- elif filename.endswith('.json'):
19
- data = json.loads(file_content.decode('utf-8'))
20
  df = pd.DataFrame(data)
21
  else:
22
  return {
23
- 'valid': False,
24
- 'error': f"Unsupported file type. Use: {', '.join(PREDICTION_FORMAT['file_types'])}"
25
  }
26
-
27
  # Check required columns
28
- missing_cols = set(PREDICTION_FORMAT['required_columns']) - set(df.columns)
29
  if missing_cols:
30
  return {
31
- 'valid': False,
32
- 'error': f"Missing required columns: {', '.join(missing_cols)}"
33
  }
34
-
35
  # Basic data validation
36
  if len(df) == 0:
37
- return {
38
- 'valid': False,
39
- 'error': "File is empty"
40
- }
41
-
42
  # Check for required data
43
- if df['sample_id'].isna().any():
44
- return {
45
- 'valid': False,
46
- 'error': "Missing sample_id values found"
47
- }
48
-
49
- if df['prediction'].isna().any():
50
- na_count = df['prediction'].isna().sum()
51
- return {
52
- 'valid': False,
53
- 'error': f"Missing prediction values found ({na_count} empty predictions)"
54
- }
55
-
56
  # Check for duplicates
57
- duplicates = df['sample_id'].duplicated()
58
  if duplicates.any():
59
  dup_count = duplicates.sum()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  return {
61
- 'valid': False,
62
- 'error': f"Duplicate sample_id values found ({dup_count} duplicates)"
 
 
 
63
  }
64
-
65
  return {
66
- 'valid': True,
67
- 'dataframe': df,
68
- 'row_count': len(df),
69
- 'columns': list(df.columns)
70
  }
71
-
72
  except Exception as e:
73
- return {
74
- 'valid': False,
75
- 'error': f"Error parsing file: {str(e)}"
76
- }
 
77
 
78
- def validate_predictions_content(predictions: pd.DataFrame) -> Dict:
79
- """Validate prediction content quality."""
80
-
81
  issues = []
82
  warnings = []
83
-
84
- # Check prediction text quality
85
- empty_predictions = predictions['prediction'].str.strip().eq('').sum()
 
86
  if empty_predictions > 0:
87
  issues.append(f"{empty_predictions} empty predictions found")
88
-
 
 
 
 
 
89
  # Check for suspiciously short predictions
90
- short_predictions = (predictions['prediction'].str.len() < 3).sum()
91
- if short_predictions > len(predictions) * 0.1: # More than 10%
92
- warnings.append(f"{short_predictions} very short predictions (< 3 characters)")
93
-
94
  # Check for suspiciously long predictions
95
- long_predictions = (predictions['prediction'].str.len() > 500).sum()
96
- if long_predictions > 0:
97
  warnings.append(f"{long_predictions} very long predictions (> 500 characters)")
98
-
99
- # Check for repeated predictions
100
- duplicate_predictions = predictions['prediction'].duplicated().sum()
101
- if duplicate_predictions > len(predictions) * 0.5: # More than 50%
102
- warnings.append(f"{duplicate_predictions} duplicate prediction texts")
103
-
104
- # Check for non-text content (more permissive regex for multiple languages)
105
- # Allow Unicode characters for non-English languages
106
- non_text_pattern = r'^[\w\s\'".,!?;:()\-àáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ]+$'
107
- try:
108
- non_text_predictions = ~predictions['prediction'].str.match(non_text_pattern, na=False)
109
- unusual_char_count = non_text_predictions.sum()
110
- if unusual_char_count > len(predictions) * 0.2: # More than 20%
111
- warnings.append(f"{unusual_char_count} predictions may contain special characters")
112
- except:
113
- # Skip this check if regex fails
114
- pass
115
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  return {
117
- 'has_issues': len(issues) > 0,
118
- 'issues': issues,
119
- 'warnings': warnings,
120
- 'quality_score': max(0, 1.0 - len(issues) * 0.2 - len(warnings) * 0.1)
 
121
  }
122
 
123
- def validate_against_test_set(predictions: pd.DataFrame, test_set: pd.DataFrame) -> Dict:
124
- """Validate predictions against the official test set."""
125
-
 
 
 
126
  # Convert IDs to string for comparison
127
- pred_ids = set(predictions['sample_id'].astype(str))
128
- test_ids = set(test_set['sample_id'].astype(str))
129
-
130
- # Check coverage
131
  missing_ids = test_ids - pred_ids
132
  extra_ids = pred_ids - test_ids
133
  matching_ids = pred_ids & test_ids
134
-
135
- coverage = len(matching_ids) / len(test_ids)
136
-
137
- # Detailed coverage by language pair
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  pair_coverage = {}
139
  for _, row in test_set.iterrows():
140
  pair_key = f"{row['source_language']}_{row['target_language']}"
141
  if pair_key not in pair_coverage:
142
- pair_coverage[pair_key] = {'total': 0, 'covered': 0}
143
-
144
- pair_coverage[pair_key]['total'] += 1
145
- if str(row['sample_id']) in pred_ids:
146
- pair_coverage[pair_key]['covered'] += 1
147
-
148
  # Calculate pair-wise coverage rates
149
  for pair_key in pair_coverage:
150
  pair_info = pair_coverage[pair_key]
151
- pair_info['coverage_rate'] = pair_info['covered'] / pair_info['total']
152
-
 
 
 
 
153
  return {
154
- 'overall_coverage': coverage,
155
- 'missing_count': len(missing_ids),
156
- 'extra_count': len(extra_ids),
157
- 'matching_count': len(matching_ids),
158
- 'is_complete': coverage == 1.0,
159
- 'pair_coverage': pair_coverage,
160
- 'missing_ids_sample': list(missing_ids)[:10], # First 10 for display
161
- 'extra_ids_sample': list(extra_ids)[:10]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  }
163
 
164
- def generate_validation_report(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  format_result: Dict,
166
- content_result: Dict,
167
  test_set_result: Dict,
168
- model_name: str = ""
 
 
169
  ) -> str:
170
- """Generate human-readable validation report."""
171
-
172
  report = []
173
-
174
  # Header
175
- report.append(f"## Validation Report: {model_name or 'Submission'}")
 
 
 
 
 
 
 
 
176
  report.append("")
177
-
178
  # File format validation
179
- if format_result['valid']:
180
  report.append("✅ **File Format**: Valid")
181
  report.append(f" - Rows: {format_result['row_count']:,}")
182
  report.append(f" - Columns: {', '.join(format_result['columns'])}")
@@ -184,102 +445,179 @@ def generate_validation_report(
184
  report.append("❌ **File Format**: Invalid")
185
  report.append(f" - Error: {format_result['error']}")
186
  return "\n".join(report)
187
-
188
- # Content validation
189
- if content_result['has_issues']:
190
- report.append("⚠️ **Content Quality**: Issues Found")
191
- for issue in content_result['issues']:
 
 
192
  report.append(f" - ❌ {issue}")
193
  else:
194
  report.append("✅ **Content Quality**: Good")
195
-
196
- if content_result['warnings']:
197
- for warning in content_result['warnings']:
198
  report.append(f" - ⚠️ {warning}")
199
-
200
- # Test set validation
201
- coverage = test_set_result['overall_coverage']
202
- if coverage == 1.0:
 
 
 
 
 
203
  report.append("✅ **Test Set Coverage**: Complete")
204
- elif coverage >= 0.95:
205
- report.append("⚠️ **Test Set Coverage**: Nearly Complete")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
  else:
207
- report.append("❌ **Test Set Coverage**: Incomplete")
208
-
209
- report.append(f" - Coverage: {coverage:.1%} ({test_set_result['matching_count']:,} / {test_set_result['matching_count'] + test_set_result['missing_count']:,})")
210
-
211
- if test_set_result['missing_count'] > 0:
212
- report.append(f" - Missing: {test_set_result['missing_count']:,} samples")
213
-
214
- if test_set_result['extra_count'] > 0:
215
- report.append(f" - Extra: {test_set_result['extra_count']:,} samples")
216
-
217
- # Language pair coverage
218
- pair_cov = test_set_result['pair_coverage']
219
- incomplete_pairs = [k for k, v in pair_cov.items() if v['coverage_rate'] < 1.0]
220
-
221
- if incomplete_pairs:
 
222
  report.append("")
223
- report.append("**Incomplete Language Pairs:**")
224
- for pair in incomplete_pairs[:5]: # Show first 5
225
- info = pair_cov[pair]
226
- src, tgt = pair.split('_')
227
- report.append(f" - {src}→{tgt}: {info['covered']}/{info['total']} ({info['coverage_rate']:.1%})")
228
-
229
- if len(incomplete_pairs) > 5:
230
- report.append(f" - ... and {len(incomplete_pairs) - 5} more pairs")
231
-
232
  # Final verdict
233
  report.append("")
234
- if format_result['valid'] and coverage >= 0.95 and not content_result['has_issues']:
235
- report.append("🎉 **Overall**: Ready for evaluation!")
236
- elif format_result['valid'] and coverage >= 0.8:
237
- report.append("⚠️ **Overall**: Can be evaluated with warnings")
 
 
 
 
 
 
 
 
238
  else:
239
- report.append("❌ **Overall**: Please fix issues before submission")
240
-
241
  return "\n".join(report)
242
 
243
- def validate_submission_complete(file_content: bytes, filename: str, test_set: pd.DataFrame, model_name: str = "") -> Dict:
244
- """Complete validation pipeline for a submission."""
245
-
246
- # Step 1: File format validation
247
- format_result = validate_file_format(file_content, filename)
248
- if not format_result['valid']:
 
 
 
 
 
 
 
 
 
 
 
249
  return {
250
- 'valid': False,
251
- 'report': generate_validation_report(format_result, {}, {}, model_name),
252
- 'predictions': None,
253
- 'coverage': 0.0,
254
- 'pair_coverage': {}
 
 
255
  }
256
-
257
- predictions = format_result['dataframe']
258
-
259
- # Step 2: Content validation
260
- content_result = validate_predictions_content(predictions)
261
-
262
- # Step 3: Test set validation
263
- test_set_result = validate_against_test_set(predictions, test_set)
264
-
265
- # Step 4: Generate report
266
- report = generate_validation_report(format_result, content_result, test_set_result, model_name)
267
-
268
- # Overall validity
 
 
 
 
 
 
 
 
 
 
269
  is_valid = (
270
- format_result['valid'] and
271
- not content_result['has_issues'] and
272
- test_set_result['overall_coverage'] >= 0.95
 
 
273
  )
274
-
275
  return {
276
- 'valid': is_valid,
277
- 'coverage': test_set_result['overall_coverage'],
278
- 'report': report,
279
- 'predictions': predictions,
280
- 'pair_coverage': test_set_result['pair_coverage'],
281
- 'quality_score': content_result.get('quality_score', 0.8),
282
- 'warnings': content_result.get('warnings', []),
283
- 'matching_count': test_set_result['matching_count'],
284
- 'missing_count': test_set_result['missing_count']
285
- }
 
 
 
 
 
 
4
  from typing import Dict, List, Tuple, Optional
5
  import json
6
  import io
7
+ import re
8
+ from config import (
9
+ PREDICTION_FORMAT,
10
+ VALIDATION_CONFIG,
11
+ MODEL_CATEGORIES,
12
+ EVALUATION_TRACKS,
13
+ ALL_UG40_LANGUAGES,
14
+ )
15
+
16
+
17
+ def detect_model_category(model_name: str, author: str, description: str) -> str:
18
+ """Automatically detect model category based on name and metadata."""
19
+
20
+ # Combine all text for analysis
21
+ text_to_analyze = f"{model_name} {author} {description}".lower()
22
+
23
+ # Category detection patterns
24
+ detection_patterns = PREDICTION_FORMAT["category_detection"]
25
+
26
+ # Check for specific patterns
27
+ if any(
28
+ pattern in text_to_analyze for pattern in detection_patterns.get("google", [])
29
+ ):
30
+ return "commercial"
31
+
32
+ if any(
33
+ pattern in text_to_analyze for pattern in detection_patterns.get("nllb", [])
34
+ ):
35
+ return "research"
36
+
37
+ if any(pattern in text_to_analyze for pattern in detection_patterns.get("m2m", [])):
38
+ return "research"
39
+
40
+ if any(
41
+ pattern in text_to_analyze for pattern in detection_patterns.get("baseline", [])
42
+ ):
43
+ return "baseline"
44
+
45
+ # Check for research indicators
46
+ research_indicators = [
47
+ "university",
48
+ "research",
49
+ "paper",
50
+ "arxiv",
51
+ "acl",
52
+ "emnlp",
53
+ "naacl",
54
+ "transformer",
55
+ "bert",
56
+ "gpt",
57
+ "t5",
58
+ "mbart",
59
+ "academic",
60
+ ]
61
+ if any(indicator in text_to_analyze for indicator in research_indicators):
62
+ return "research"
63
+
64
+ # Check for commercial indicators
65
+ commercial_indicators = [
66
+ "google",
67
+ "microsoft",
68
+ "azure",
69
+ "aws",
70
+ "openai",
71
+ "anthropic",
72
+ "commercial",
73
+ "api",
74
+ "cloud",
75
+ "translate",
76
+ ]
77
+ if any(indicator in text_to_analyze for indicator in commercial_indicators):
78
+ return "commercial"
79
+
80
+ # Default to community
81
+ return "community"
82
+
83
+
84
def validate_file_format_enhanced(file_content: bytes, filename: str) -> Dict:
    """Enhanced file format validation with stricter requirements.

    Args:
        file_content: Raw uploaded bytes.
        filename: Original filename; its extension selects the parser
            (.csv / .tsv / .json).

    Returns:
        Dict with "valid" plus either an "error" message or the parsed
        "dataframe", its "row_count", and "columns".
    """
    try:
        # Determine file type
        if filename.endswith(".csv"):
            df = pd.read_csv(io.BytesIO(file_content))
        elif filename.endswith(".tsv"):
            df = pd.read_csv(io.BytesIO(file_content), sep="\t")
        elif filename.endswith(".json"):
            data = json.loads(file_content.decode("utf-8"))
            df = pd.DataFrame(data)
        else:
            return {
                "valid": False,
                "error": f"Unsupported file type. Use: {', '.join(PREDICTION_FORMAT['file_types'])}",
            }

        # Check required columns
        missing_cols = set(PREDICTION_FORMAT["required_columns"]) - set(df.columns)
        if missing_cols:
            return {
                "valid": False,
                "error": f"Missing required columns: {', '.join(missing_cols)}",
            }

        # Basic data validation
        if len(df) == 0:
            return {"valid": False, "error": "File is empty"}

        # Enhanced validation checks — collect all problems, report them together.
        validation_issues = []

        # Check for required data
        if df["sample_id"].isna().any():
            validation_issues.append("Missing sample_id values found")

        if df["prediction"].isna().any():
            na_count = df["prediction"].isna().sum()
            validation_issues.append(
                f"Missing prediction values found ({na_count} empty predictions)"
            )

        # Check for duplicates
        duplicates = df["sample_id"].duplicated()
        if duplicates.any():
            dup_count = duplicates.sum()
            validation_issues.append(
                f"Duplicate sample_id values found ({dup_count} duplicates)"
            )

        # Normalize sample_id to string before pattern checks
        if df["sample_id"].dtype != "object" and not df[
            "sample_id"
        ].dtype.name.startswith("str"):
            df["sample_id"] = df["sample_id"].astype(str)

        # Check sample_id format. fullmatch anchors both ends of the pattern;
        # str.match only anchors the start, so IDs such as "salt_1234567" or
        # "salt_000001x" would previously slip through.
        invalid_ids = ~df["sample_id"].str.fullmatch(r"salt_\d{6}", na=False)
        if invalid_ids.any():
            invalid_count = invalid_ids.sum()
            validation_issues.append(
                f"Invalid sample_id format found ({invalid_count} invalid IDs)"
            )

        # Return results (the dataframe is included even on failure so callers
        # can show a preview alongside the error).
        if validation_issues:
            return {
                "valid": False,
                "error": "; ".join(validation_issues),
                "dataframe": df,
                "row_count": len(df),
                "columns": list(df.columns),
            }

        return {
            "valid": True,
            "dataframe": df,
            "row_count": len(df),
            "columns": list(df.columns),
        }

    except Exception as e:
        # Broad catch is deliberate: any parse failure becomes a user-facing error.
        return {"valid": False, "error": f"Error parsing file: {str(e)}"}
168
+
169
+
170
def validate_predictions_content_enhanced(predictions: pd.DataFrame) -> Dict:
    """Enhanced prediction content validation with stricter quality checks.

    Args:
        predictions: DataFrame with at least a string-valued "prediction" column.

    Returns:
        Dict with "has_issues", "issues", "warnings", "quality_score"
        (clamped to 0.0-1.0), and raw "quality_metrics".
    """
    issues = []
    warnings = []
    quality_metrics = {}

    # Basic content checks
    empty_predictions = predictions["prediction"].str.strip().eq("").sum()
    if empty_predictions > 0:
        issues.append(f"{empty_predictions} empty predictions found")

    # Length analysis
    pred_lengths = predictions["prediction"].str.len()
    quality_metrics["avg_length"] = float(pred_lengths.mean())
    quality_metrics["std_length"] = float(pred_lengths.std())

    # Check for suspiciously short predictions
    short_predictions = (pred_lengths < 3).sum()
    if short_predictions > len(predictions) * 0.05:  # More than 5%
        issues.append(f"{short_predictions} very short predictions (< 3 characters)")

    # Check for suspiciously long predictions
    long_predictions = (pred_lengths > 500).sum()
    if long_predictions > len(predictions) * 0.01:  # More than 1%
        warnings.append(f"{long_predictions} very long predictions (> 500 characters)")

    # Check for repeated predictions (more stringent)
    duplicate_predictions = predictions["prediction"].duplicated().sum()
    duplicate_rate = duplicate_predictions / len(predictions)
    quality_metrics["duplicate_rate"] = float(duplicate_rate)

    if duplicate_rate > VALIDATION_CONFIG["quality_thresholds"]["max_duplicate_rate"]:
        issues.append(
            f"{duplicate_predictions} duplicate prediction texts ({duplicate_rate:.1%})"
        )

    # Check for placeholder text. Row-wise masks are OR-ed so a prediction
    # matching several patterns is counted exactly once (summing per-pattern
    # match counts over-counted such rows). The punctuation pattern uses "+"
    # rather than "*" so empty strings — already reported above — are not
    # double-flagged here.
    placeholder_patterns = [
        r"^(test|placeholder|todo|xxx|aaa|bbb)$",
        r"^[a-z]{1,3}$",  # Very short gibberish
        r"^\d+$",  # Just numbers
        r"^[^\w\s]+$",  # Only punctuation
    ]

    placeholder_mask = pd.Series(False, index=predictions.index)
    for pattern in placeholder_patterns:
        placeholder_mask |= predictions["prediction"].str.match(
            pattern, flags=re.IGNORECASE, na=False
        )
    placeholder_count = int(placeholder_mask.sum())

    if placeholder_count > len(predictions) * 0.02:  # More than 2%
        issues.append(f"{placeholder_count} placeholder-like predictions detected")

    # Language detection (basic)
    non_ascii_rate = (
        predictions["prediction"].str.contains(r"[^\x00-\x7f]", na=False).mean()
    )
    quality_metrics["non_ascii_rate"] = float(non_ascii_rate)

    # Check for appropriate character distribution for African languages
    if non_ascii_rate < 0.1:  # Less than 10% non-ASCII might indicate English-only
        warnings.append(
            "Low non-ASCII character rate - check if translations include local language scripts"
        )

    # Calculate overall quality score
    quality_score = 1.0
    quality_score -= len(issues) * 0.3  # Major penalty for issues
    quality_score -= len(warnings) * 0.1  # Minor penalty for warnings
    quality_score -= (
        max(0, duplicate_rate - 0.05) * 2
    )  # Penalty for excessive duplicates

    # Length appropriateness
    if (
        quality_metrics["avg_length"]
        < VALIDATION_CONFIG["quality_thresholds"]["min_avg_length"]
    ):
        quality_score -= 0.2
    elif (
        quality_metrics["avg_length"]
        > VALIDATION_CONFIG["quality_thresholds"]["max_avg_length"]
    ):
        quality_score -= 0.1

    quality_score = max(0.0, min(1.0, quality_score))

    return {
        "has_issues": len(issues) > 0,
        "issues": issues,
        "warnings": warnings,
        "quality_score": quality_score,
        "quality_metrics": quality_metrics,
    }
268
 
269
+
270
def validate_against_test_set_enhanced(
    predictions: pd.DataFrame, test_set: pd.DataFrame
) -> Dict:
    """Enhanced validation against the test set with track-specific analysis.

    Args:
        predictions: DataFrame with a "sample_id" column.
        test_set: Official test set with "sample_id", "source_language",
            and "target_language" columns.

    Returns:
        Dict with overall coverage counts/rates, per-track coverage,
        per-language-pair coverage, and sampled missing/extra IDs.
    """
    # Convert IDs to string for comparison
    pred_ids = set(predictions["sample_id"].astype(str))
    test_ids = set(test_set["sample_id"].astype(str))

    # Check overall coverage (guard against an empty test set)
    missing_ids = test_ids - pred_ids
    extra_ids = pred_ids - test_ids
    matching_ids = pred_ids & test_ids

    overall_coverage = len(matching_ids) / len(test_ids) if test_ids else 0.0

    # Track-specific coverage analysis
    track_coverage = {}

    for track_name, track_config in EVALUATION_TRACKS.items():
        track_languages = track_config["languages"]

        # Filter test set to rows where both sides are track languages
        track_test_set = test_set[
            (test_set["source_language"].isin(track_languages))
            & (test_set["target_language"].isin(track_languages))
        ]

        if len(track_test_set) == 0:
            continue

        track_test_ids = set(track_test_set["sample_id"].astype(str))
        track_matching_ids = pred_ids & track_test_ids

        track_coverage[track_name] = {
            "total_samples": len(track_test_set),
            "covered_samples": len(track_matching_ids),
            "coverage_rate": len(track_matching_ids) / len(track_test_set),
            "meets_minimum": len(track_matching_ids)
            >= VALIDATION_CONFIG["min_samples_per_track"][track_name],
            "min_required": VALIDATION_CONFIG["min_samples_per_track"][track_name],
        }

    # Language pair coverage analysis — vectorized isin + groupby instead of
    # the previous per-row iterrows() loop (O(n) Python-level iteration).
    pair_coverage = {}
    if len(test_set) > 0:
        pair_keys = (
            test_set["source_language"].astype(str)
            + "_"
            + test_set["target_language"].astype(str)
        )
        covered_mask = test_set["sample_id"].astype(str).isin(pred_ids)
        for pair_key, mask in covered_mask.groupby(pair_keys):
            total = int(mask.size)
            covered = int(mask.sum())
            pair_coverage[pair_key] = {
                "total": total,
                "covered": covered,
                "coverage_rate": covered / total,
            }

    # Missing rate validation
    missing_rate = len(missing_ids) / len(test_ids) if test_ids else 0.0
    meets_missing_threshold = missing_rate <= VALIDATION_CONFIG["max_missing_rate"]

    return {
        "overall_coverage": overall_coverage,
        "missing_count": len(missing_ids),
        "extra_count": len(extra_ids),
        "matching_count": len(matching_ids),
        "missing_rate": missing_rate,
        "meets_missing_threshold": meets_missing_threshold,
        "is_complete": overall_coverage == 1.0,
        "track_coverage": track_coverage,
        "pair_coverage": pair_coverage,
        "missing_ids_sample": list(missing_ids)[:10],
        "extra_ids_sample": list(extra_ids)[:10],
    }
346
+
347
+
348
def assess_statistical_adequacy(validation_result: Dict, model_category: str) -> Dict:
    """Assess statistical adequacy for scientific evaluation.

    Args:
        validation_result: Output of the test-set validation step; only its
            "track_coverage" mapping is consulted.
        model_category: Detected category ("commercial", "research",
            "baseline", or "community").

    Returns:
        Dict with "overall_adequate", per-track "track_adequacy",
        "recommendations", and "statistical_power_estimate".
    """
    adequacy_assessment = {
        "overall_adequate": True,
        "track_adequacy": {},
        "recommendations": [],
        "statistical_power_estimate": {},
    }

    track_coverage = validation_result.get("track_coverage", {})

    # NOTE: the previous version looked up EVALUATION_TRACKS[track_name] here
    # and never used it — a dead lookup that also raised KeyError for any
    # track name absent from the config. Removed.
    for track_name, coverage_info in track_coverage.items():
        # Sample size adequacy
        covered_samples = coverage_info["covered_samples"]
        min_required = coverage_info["min_required"]
        sample_adequate = covered_samples >= min_required

        # Coverage rate adequacy (80% coverage minimum)
        coverage_rate = coverage_info["coverage_rate"]
        coverage_adequate = coverage_rate >= 0.8

        # Statistical power estimation (simplified heuristic). Guard the
        # min_required == 0 case, which previously divided by zero; a zero
        # requirement is trivially satisfied, so power is taken as full.
        if min_required > 0:
            estimated_power = min(1.0, covered_samples / (min_required * 1.5))
        else:
            estimated_power = 1.0

        track_adequate = sample_adequate and coverage_adequate

        adequacy_assessment["track_adequacy"][track_name] = {
            "sample_adequate": sample_adequate,
            "coverage_adequate": coverage_adequate,
            "overall_adequate": track_adequate,
            "covered_samples": covered_samples,
            "min_required": min_required,
            "coverage_rate": coverage_rate,
            "estimated_power": estimated_power,
        }

        if not track_adequate:
            adequacy_assessment["overall_adequate"] = False

        adequacy_assessment["statistical_power_estimate"][track_name] = estimated_power

    # Generate recommendations
    if not adequacy_assessment["overall_adequate"]:
        inadequate_tracks = [
            track
            for track, info in adequacy_assessment["track_adequacy"].items()
            if not info["overall_adequate"]
        ]
        adequacy_assessment["recommendations"].append(
            f"Insufficient samples for tracks: {', '.join(inadequate_tracks)}"
        )

    # Category-specific recommendations
    if model_category == "commercial" and not adequacy_assessment["track_adequacy"].get(
        "google_comparable", {}
    ).get("overall_adequate", False):
        adequacy_assessment["recommendations"].append(
            "Commercial models should ensure adequate coverage of Google-comparable track"
        )

    return adequacy_assessment
413
+
414
+
415
def generate_scientific_validation_report(
    format_result: Dict,
    content_result: Dict,
    test_set_result: Dict,
    adequacy_result: Dict,
    model_name: str = "",
    detected_category: str = "community",
) -> str:
    """Generate a comprehensive, human-readable scientific validation report.

    Returns the report as a Markdown string. If the file format is invalid,
    only the header and format section are emitted (early return), so the
    remaining result dicts may be empty in that case.
    """
    report = []

    # Header
    report.append(f"# 🔬 Scientific Validation Report: {model_name or 'Submission'}")
    report.append("")

    # Model categorization
    category_info = MODEL_CATEGORIES.get(
        detected_category, MODEL_CATEGORIES["community"]
    )
    report.append(f"**Detected Model Category**: {category_info['name']}")
    report.append(f"**Category Description**: {category_info['description']}")
    report.append("")

    # File format validation
    if format_result["valid"]:
        report.append("✅ **File Format**: Valid")
        report.append(f" - Rows: {format_result['row_count']:,}")
        report.append(f" - Columns: {', '.join(format_result['columns'])}")
    else:
        report.append("❌ **File Format**: Invalid")
        report.append(f" - Error: {format_result['error']}")
        return "\n".join(report)

    # Content quality validation
    quality_score = content_result.get("quality_score", 0.0)

    if content_result["has_issues"]:
        report.append("❌ **Content Quality**: Issues Found")
        for issue in content_result["issues"]:
            report.append(f" - ❌ {issue}")
    else:
        report.append("✅ **Content Quality**: Good")

    if content_result["warnings"]:
        for warning in content_result["warnings"]:
            report.append(f" - ⚠️ {warning}")

    report.append(f" - **Quality Score**: {quality_score:.2f}/1.00")
    report.append("")

    # Test set coverage validation
    overall_coverage = test_set_result["overall_coverage"]
    meets_threshold = test_set_result["meets_missing_threshold"]

    if overall_coverage == 1.0:
        report.append("✅ **Test Set Coverage**: Complete")
    elif overall_coverage >= 0.95 and meets_threshold:
        # Fixed: this status line was missing its ⚠️ marker, unlike every
        # other status line in the report.
        report.append("⚠️ **Test Set Coverage**: Adequate")
    else:
        report.append("❌ **Test Set Coverage**: Insufficient")

    report.append(
        f" - Coverage: {overall_coverage:.1%} ({test_set_result['matching_count']:,} / {test_set_result['matching_count'] + test_set_result['missing_count']:,})"
    )
    report.append(f" - Missing Rate: {test_set_result['missing_rate']:.1%}")
    report.append("")

    # Track-specific coverage analysis
    report.append("## 📊 Track-Specific Analysis")

    track_coverage = test_set_result.get("track_coverage", {})
    for track_name, coverage_info in track_coverage.items():
        track_config = EVALUATION_TRACKS[track_name]

        status = "✅" if coverage_info["meets_minimum"] else "❌"
        report.append(f"### {status} {track_config['name']}")

        report.append(
            f" - **Samples**: {coverage_info['covered_samples']:,} / {coverage_info['total_samples']:,}"
        )
        report.append(f" - **Coverage**: {coverage_info['coverage_rate']:.1%}")
        report.append(f" - **Minimum Required**: {coverage_info['min_required']:,}")
        report.append(
            f" - **Status**: {'Adequate' if coverage_info['meets_minimum'] else 'Insufficient'}"
        )
        report.append("")

    # Statistical adequacy assessment
    report.append("## 🔬 Statistical Adequacy Assessment")

    if adequacy_result["overall_adequate"]:
        report.append(
            "✅ **Overall Assessment**: Statistically adequate for scientific evaluation"
        )
    else:
        report.append(
            "❌ **Overall Assessment**: Insufficient for rigorous scientific evaluation"
        )

    # Track adequacy details
    for track_name, track_adequacy in adequacy_result["track_adequacy"].items():
        track_config = EVALUATION_TRACKS[track_name]
        power = track_adequacy["estimated_power"]

        status = "✅" if track_adequacy["overall_adequate"] else "❌"
        report.append(
            f" - {status} **{track_config['name']}**: Statistical power ≈ {power:.1%}"
        )

    # Recommendations
    if adequacy_result["recommendations"]:
        report.append("")
        report.append("## 💡 Recommendations")
        for rec in adequacy_result["recommendations"]:
            report.append(f" - {rec}")

    # Final verdict
    report.append("")
    all_checks_pass = (
        format_result["valid"]
        and not content_result["has_issues"]
        and overall_coverage >= 0.95
        and meets_threshold
        and adequacy_result["overall_adequate"]
    )

    if all_checks_pass:
        report.append("🎉 **Final Verdict**: Ready for scientific evaluation!")
    elif format_result["valid"] and overall_coverage >= 0.8:
        report.append("⚠️ **Final Verdict**: Can be evaluated with limitations")
    else:
        report.append("❌ **Final Verdict**: Please address issues before submission")

    return "\n".join(report)
550
 
551
+
552
def validate_submission_scientific(
    file_content: bytes,
    filename: str,
    test_set: pd.DataFrame,
    model_name: str = "",
    author: str = "",
    description: str = "",
) -> Dict:
    """Complete scientific validation pipeline for submissions.

    Runs category detection, format/content/test-set validation, and a
    statistical adequacy assessment, then bundles everything into one dict.
    """
    # Step 1: Detect model category
    detected_category = detect_model_category(model_name, author, description)

    # Step 2: Enhanced file format validation
    format_result = validate_file_format_enhanced(file_content, filename)
    if not format_result["valid"]:
        # Keep the failure payload key-compatible with the success payload so
        # callers can read "coverage"/"quality_score"/"track_coverage" without
        # branching (previously these keys were missing here → KeyError).
        return {
            "valid": False,
            "category": detected_category,
            "coverage": 0.0,
            "quality_score": 0.0,
            "track_coverage": {},
            "report": generate_scientific_validation_report(
                format_result, {}, {}, {}, model_name, detected_category
            ),
            "predictions": None,
            "adequacy": {},
        }

    predictions = format_result["dataframe"]

    # Step 3: Enhanced content validation
    content_result = validate_predictions_content_enhanced(predictions)

    # Step 4: Enhanced test set validation
    test_set_result = validate_against_test_set_enhanced(predictions, test_set)

    # Step 5: Statistical adequacy assessment
    adequacy_result = assess_statistical_adequacy(test_set_result, detected_category)

    # Step 6: Generate comprehensive report
    report = generate_scientific_validation_report(
        format_result,
        content_result,
        test_set_result,
        adequacy_result,
        model_name,
        detected_category,
    )

    # Overall validity determination
    is_valid = (
        format_result["valid"]
        and not content_result["has_issues"]
        and test_set_result["overall_coverage"] >= 0.95
        and test_set_result["meets_missing_threshold"]
        and adequacy_result["overall_adequate"]
    )

    return {
        "valid": is_valid,
        "category": detected_category,
        "coverage": test_set_result["overall_coverage"],
        "report": report,
        "predictions": predictions,
        "adequacy": adequacy_result,
        "quality_score": content_result.get("quality_score", 0.8),
        "track_coverage": test_set_result.get("track_coverage", {}),
        "scientific_metadata": {
            "validation_timestamp": pd.Timestamp.now().isoformat(),
            "validation_version": "2.0-scientific",
            "detected_category": detected_category,
            "statistical_adequacy": adequacy_result["overall_adequate"],
        },
    }