Spaces:
Sleeping
Sleeping
Update src/utils.py
Browse files- src/utils.py +128 -155
src/utils.py
CHANGED
@@ -13,6 +13,7 @@ from config import (
|
|
13 |
MODEL_CATEGORIES,
|
14 |
STATISTICAL_CONFIG,
|
15 |
METRICS_CONFIG,
|
|
|
16 |
)
|
17 |
|
18 |
|
@@ -40,7 +41,7 @@ def get_track_language_pairs(track: str) -> List[Tuple[str, str]]:
|
|
40 |
"""Get language pairs for a specific evaluation track."""
|
41 |
if track not in EVALUATION_TRACKS:
|
42 |
return []
|
43 |
-
|
44 |
track_languages = EVALUATION_TRACKS[track]["languages"]
|
45 |
pairs = []
|
46 |
for src in track_languages:
|
def sanitize_model_name(name: str) -> str:
    """Sanitize a model name for display and storage.

    Replaces disallowed characters with underscores, collapses repeated
    underscores, pads names shorter than 3 characters, prefixes reserved
    names, and caps the result at 50 characters. Empty or non-string
    input yields ``"Anonymous_Model"``.
    """
    if not isinstance(name, str) or not name:
        return "Anonymous_Model"

    # Keep word characters, hyphens and dots; everything else becomes "_".
    cleaned = re.sub(r"[^\w\-.]", "_", name.strip())
    # Collapse underscore runs, then drop any left at the edges.
    cleaned = re.sub(r"_+", "_", cleaned).strip("_")

    # Pad names that are too short to be meaningful.
    if len(cleaned) < 3:
        cleaned = f"Model_{cleaned}"

    # Disambiguate names that collide with reserved identifiers.
    if cleaned.lower() in ("admin", "test", "baseline", "google", "system"):
        cleaned = f"User_{cleaned}"

    return cleaned[:50]  # hard cap on length
|
94 |
|
95 |
|
96 |
-
def format_metric_value(
|
97 |
-
|
98 |
-
metric: str,
|
99 |
-
include_ci: bool = False,
|
100 |
-
ci_lower: float = None,
|
101 |
-
ci_upper: float = None,
|
102 |
-
) -> str:
|
103 |
"""Format metric value for display with optional confidence intervals."""
|
104 |
if pd.isna(value) or value is None:
|
105 |
return "N/A"
|
106 |
-
|
107 |
try:
|
108 |
precision = METRICS_CONFIG["display_precision"]
|
109 |
-
|
110 |
if metric == "coverage_rate":
|
111 |
formatted = f"{value:.{precision}%}"
|
112 |
elif metric in ["bleu"]:
|
@@ -116,14 +112,14 @@ def format_metric_value(
|
|
116 |
formatted = f"{min(value, 1.0):.{precision}f}"
|
117 |
else:
|
118 |
formatted = f"{value:.{precision}f}"
|
119 |
-
|
120 |
# Add confidence interval if requested
|
121 |
if include_ci and ci_lower is not None and ci_upper is not None:
|
122 |
ci_str = f" [{ci_lower:.{precision}f}, {ci_upper:.{precision}f}]"
|
123 |
formatted += ci_str
|
124 |
-
|
125 |
return formatted
|
126 |
-
|
127 |
except (ValueError, TypeError):
|
128 |
return str(value)
|
129 |
|
def calculate_effect_size(values1: List[float], values2: List[float]) -> float:
    """Compute the absolute Cohen's d effect size between two samples.

    Returns 0.0 when either sample has fewer than two finite values or
    when the pooled standard deviation is zero; any unexpected failure
    also yields 0.0 rather than raising.
    """
    if len(values1) < 2 or len(values2) < 2:
        return 0.0

    try:
        # Drop NaNs so they cannot poison the means/variances.
        a = np.asarray(values1, dtype=float)
        b = np.asarray(values2, dtype=float)
        a = a[~np.isnan(a)]
        b = b[~np.isnan(b)]

        n1, n2 = len(a), len(b)
        if n1 < 2 or n2 < 2:
            return 0.0

        # Pooled (unbiased, ddof=1) standard deviation across both samples.
        pooled_var = (
            (n1 - 1) * np.var(a, ddof=1) + (n2 - 1) * np.var(b, ddof=1)
        ) / (n1 + n2 - 2)
        pooled_std = np.sqrt(pooled_var)
        if pooled_std == 0:
            return 0.0

        # Cohen's d, reported as a magnitude.
        return abs((np.mean(a) - np.mean(b)) / pooled_std)
    except Exception:
        # Mirror the original contract: this helper never raises.
        return 0.0
|
163 |
|
@@ -165,7 +161,7 @@ def calculate_effect_size(values1: List[float], values2: List[float]) -> float:
|
|
165 |
def interpret_effect_size(effect_size: float) -> str:
|
166 |
"""Interpret effect size according to Cohen's conventions."""
|
167 |
thresholds = STATISTICAL_CONFIG["effect_size_thresholds"]
|
168 |
-
|
169 |
if effect_size < thresholds["small"]:
|
170 |
return "negligible"
|
171 |
elif effect_size < thresholds["medium"]:
|
@@ -182,28 +178,24 @@ def calculate_statistical_power(
|
|
182 |
"""Estimate statistical power for given effect size and sample sizes."""
|
183 |
if n1 < 2 or n2 < 2:
|
184 |
return 0.0
|
185 |
-
|
186 |
try:
|
187 |
# Simplified power calculation using t-test
|
188 |
# This is an approximation
|
189 |
df = n1 + n2 - 2
|
190 |
-
pooled_se = np.sqrt((1
|
191 |
-
|
192 |
# Critical t-value
|
193 |
-
t_critical = stats.t.ppf(1 - alpha
|
194 |
-
|
195 |
# Non-centrality parameter
|
196 |
ncp = effect_size / pooled_se
|
197 |
-
|
198 |
# Power (approximate)
|
199 |
-
power = (
|
200 |
-
|
201 |
-
- stats.t.cdf(t_critical, df, loc=ncp)
|
202 |
-
+ stats.t.cdf(-t_critical, df, loc=ncp)
|
203 |
-
)
|
204 |
-
|
205 |
return min(1.0, max(0.0, power))
|
206 |
-
|
207 |
except Exception:
|
208 |
return 0.0
|
209 |
|
@@ -211,16 +203,16 @@ def calculate_statistical_power(
|
|
211 |
def get_track_statistics(test_data: pd.DataFrame) -> Dict[str, Dict]:
|
212 |
"""Get comprehensive statistics about test data coverage for each track."""
|
213 |
track_stats = {}
|
214 |
-
|
215 |
for track_name, track_config in EVALUATION_TRACKS.items():
|
216 |
track_languages = track_config["languages"]
|
217 |
-
|
218 |
# Filter test data to track languages
|
219 |
track_data = test_data[
|
220 |
-
(test_data["source_language"].isin(track_languages))
|
221 |
-
|
222 |
]
|
223 |
-
|
224 |
if track_data.empty:
|
225 |
track_stats[track_name] = {
|
226 |
"total_samples": 0,
|
@@ -230,31 +222,29 @@ def get_track_statistics(test_data: pd.DataFrame) -> Dict[str, Dict]:
|
|
230 |
"adequacy_assessment": "insufficient",
|
231 |
}
|
232 |
continue
|
233 |
-
|
234 |
# Calculate pair-wise statistics
|
235 |
pair_counts = {}
|
236 |
for src in track_languages:
|
237 |
for tgt in track_languages:
|
238 |
if src == tgt:
|
239 |
continue
|
240 |
-
|
241 |
pair_data = track_data[
|
242 |
-
(track_data["source_language"] == src)
|
243 |
-
|
244 |
]
|
245 |
-
|
246 |
pair_key = f"{src}_to_{tgt}"
|
247 |
pair_counts[pair_key] = len(pair_data)
|
248 |
-
|
249 |
# Calculate adequacy
|
250 |
min_required = track_config["min_samples_per_pair"]
|
251 |
-
adequate_pairs = sum(
|
252 |
-
1 for count in pair_counts.values() if count >= min_required
|
253 |
-
)
|
254 |
total_possible_pairs = len(track_languages) * (len(track_languages) - 1)
|
255 |
-
|
256 |
adequacy_rate = adequate_pairs / max(total_possible_pairs, 1)
|
257 |
-
|
258 |
if adequacy_rate >= 0.8:
|
259 |
adequacy = "excellent"
|
260 |
elif adequacy_rate >= 0.6:
|
@@ -263,7 +253,7 @@ def get_track_statistics(test_data: pd.DataFrame) -> Dict[str, Dict]:
|
|
263 |
adequacy = "fair"
|
264 |
else:
|
265 |
adequacy = "insufficient"
|
266 |
-
|
267 |
track_stats[track_name] = {
|
268 |
"total_samples": len(track_data),
|
269 |
"language_pairs": len([k for k, v in pair_counts.items() if v > 0]),
|
@@ -273,7 +263,7 @@ def get_track_statistics(test_data: pd.DataFrame) -> Dict[str, Dict]:
|
|
273 |
"adequacy_rate": adequacy_rate,
|
274 |
"min_samples_per_pair": min_required,
|
275 |
}
|
276 |
-
|
277 |
return track_stats
|
278 |
|
279 |
|
@@ -281,7 +271,7 @@ def validate_submission_completeness_scientific(
|
|
281 |
predictions: pd.DataFrame, test_set: pd.DataFrame, track: str = None
|
282 |
) -> Dict:
|
283 |
"""Enhanced validation with track-specific analysis."""
|
284 |
-
|
285 |
if predictions.empty or test_set.empty:
|
286 |
return {
|
287 |
"is_complete": False,
|
@@ -291,23 +281,23 @@ def validate_submission_completeness_scientific(
|
|
291 |
"coverage": 0.0,
|
292 |
"track_analysis": {},
|
293 |
}
|
294 |
-
|
295 |
# If track specified, filter to track languages
|
296 |
if track and track in EVALUATION_TRACKS:
|
297 |
track_languages = EVALUATION_TRACKS[track]["languages"]
|
298 |
test_set = test_set[
|
299 |
-
(test_set["source_language"].isin(track_languages))
|
300 |
-
|
301 |
]
|
302 |
-
|
303 |
try:
|
304 |
required_ids = set(test_set["sample_id"].astype(str))
|
305 |
provided_ids = set(predictions["sample_id"].astype(str))
|
306 |
-
|
307 |
missing_ids = required_ids - provided_ids
|
308 |
extra_ids = provided_ids - required_ids
|
309 |
matching_ids = provided_ids & required_ids
|
310 |
-
|
311 |
base_result = {
|
312 |
"is_complete": len(missing_ids) == 0,
|
313 |
"missing_count": len(missing_ids),
|
@@ -315,14 +305,14 @@ def validate_submission_completeness_scientific(
|
|
315 |
"missing_ids": list(missing_ids)[:10],
|
316 |
"coverage": len(matching_ids) / len(required_ids) if required_ids else 0.0,
|
317 |
}
|
318 |
-
|
319 |
# Add track-specific analysis if requested
|
320 |
if track:
|
321 |
track_analysis = analyze_track_coverage(predictions, test_set, track)
|
322 |
base_result["track_analysis"] = track_analysis
|
323 |
-
|
324 |
return base_result
|
325 |
-
|
326 |
except Exception as e:
|
327 |
print(f"Error in submission completeness validation: {e}")
|
328 |
return {
|
@@ -339,38 +329,37 @@ def analyze_track_coverage(
|
|
339 |
predictions: pd.DataFrame, test_set: pd.DataFrame, track: str
|
340 |
) -> Dict:
|
341 |
"""Analyze coverage for a specific track."""
|
342 |
-
|
343 |
if track not in EVALUATION_TRACKS:
|
344 |
return {"error": f"Unknown track: {track}"}
|
345 |
-
|
346 |
track_config = EVALUATION_TRACKS[track]
|
347 |
track_languages = track_config["languages"]
|
348 |
-
|
349 |
# Filter test set to track languages
|
350 |
track_test_set = test_set[
|
351 |
-
(test_set["source_language"].isin(track_languages))
|
352 |
-
|
353 |
]
|
354 |
-
|
355 |
if track_test_set.empty:
|
356 |
return {"error": f"No test data available for {track} track"}
|
357 |
-
|
358 |
# Merge with predictions
|
359 |
-
merged = track_test_set.merge(
|
360 |
-
|
361 |
-
)
|
362 |
-
|
363 |
# Analyze by language pair
|
364 |
pair_analysis = {}
|
365 |
for src in track_languages:
|
366 |
for tgt in track_languages:
|
367 |
if src == tgt:
|
368 |
continue
|
369 |
-
|
370 |
pair_data = merged[
|
371 |
-
(merged["source_language"] == src) &
|
|
|
372 |
]
|
373 |
-
|
374 |
if len(pair_data) > 0:
|
375 |
covered = pair_data["prediction"].notna().sum()
|
376 |
pair_analysis[f"{src}_to_{tgt}"] = {
|
@@ -379,19 +368,18 @@ def analyze_track_coverage(
|
|
379 |
"coverage_rate": covered / len(pair_data),
|
380 |
"meets_minimum": covered >= track_config["min_samples_per_pair"],
|
381 |
}
|
382 |
-
|
383 |
# Overall track statistics
|
384 |
total_pairs = len(pair_analysis)
|
385 |
adequate_pairs = sum(1 for info in pair_analysis.values() if info["meets_minimum"])
|
386 |
-
|
387 |
return {
|
388 |
"track_name": track_config["name"],
|
389 |
"total_language_pairs": total_pairs,
|
390 |
"adequate_pairs": adequate_pairs,
|
391 |
"adequacy_rate": adequate_pairs / max(total_pairs, 1),
|
392 |
"pair_analysis": pair_analysis,
|
393 |
-
"overall_adequate": adequate_pairs
|
394 |
-
>= total_pairs * 0.8, # 80% of pairs adequate
|
395 |
}
|
396 |
|
397 |
|
@@ -399,40 +387,35 @@ def calculate_language_pair_coverage_scientific(
|
|
399 |
predictions: pd.DataFrame, test_set: pd.DataFrame
|
400 |
) -> Dict:
|
401 |
"""Calculate comprehensive language pair coverage with statistical metrics."""
|
402 |
-
|
403 |
if predictions.empty or test_set.empty:
|
404 |
return {}
|
405 |
-
|
406 |
try:
|
407 |
# Merge to get language info
|
408 |
-
merged = test_set.merge(
|
409 |
-
|
410 |
-
)
|
411 |
-
|
412 |
coverage = {}
|
413 |
for src in ALL_UG40_LANGUAGES:
|
414 |
for tgt in ALL_UG40_LANGUAGES:
|
415 |
if src == tgt:
|
416 |
continue
|
417 |
-
|
418 |
pair_data = merged[
|
419 |
-
(merged["source_language"] == src)
|
420 |
-
|
421 |
]
|
422 |
-
|
423 |
if len(pair_data) > 0:
|
424 |
predicted_count = pair_data["prediction"].notna().sum()
|
425 |
coverage_rate = predicted_count / len(pair_data)
|
426 |
-
|
427 |
# Determine which tracks include this pair
|
428 |
tracks_included = []
|
429 |
for track_name, track_config in EVALUATION_TRACKS.items():
|
430 |
-
if
|
431 |
-
src in track_config["languages"]
|
432 |
-
and tgt in track_config["languages"]
|
433 |
-
):
|
434 |
tracks_included.append(track_name)
|
435 |
-
|
436 |
coverage[f"{src}_{tgt}"] = {
|
437 |
"total": len(pair_data),
|
438 |
"predicted": predicted_count,
|
@@ -440,18 +423,17 @@ def calculate_language_pair_coverage_scientific(
|
|
440 |
"display_name": format_language_pair(src, tgt),
|
441 |
"tracks_included": tracks_included,
|
442 |
"google_comparable": (
|
443 |
-
src in GOOGLE_SUPPORTED_LANGUAGES
|
444 |
-
|
445 |
),
|
446 |
"statistical_adequacy": {
|
447 |
-
track: predicted_count
|
448 |
-
>= EVALUATION_TRACKS[track]["min_samples_per_pair"]
|
449 |
for track in tracks_included
|
450 |
},
|
451 |
}
|
452 |
-
|
453 |
return coverage
|
454 |
-
|
455 |
except Exception as e:
|
456 |
print(f"Error calculating language pair coverage: {e}")
|
457 |
return {}
|
def clean_text_for_evaluation(text: str) -> str:
    """Normalize text for evaluation: whitespace and common encoding fixes.

    Non-string input is stringified (``None`` becomes ``""``); runs of
    whitespace collapse to single spaces, and a few typographic Unicode
    characters are mapped to ASCII equivalents.
    """
    if not isinstance(text, str):
        return "" if text is None else str(text)

    # Collapse all runs of whitespace into single spaces.
    normalized = re.sub(r"\s+", " ", text.strip())

    # Map common problem characters to their ASCII equivalents.
    replacements = {
        "\u00a0": " ",   # non-breaking space
        "\u2019": "'",   # right single quotation mark
        "\u201c": '"',   # left double quotation mark
        "\u201d": '"',   # right double quotation mark
    }
    for bad, good in replacements.items():
        normalized = normalized.replace(bad, good)

    return normalized
|
489 |
|
490 |
def get_model_summary_stats_scientific(model_results: Dict, track: str = None) -> Dict:
|
491 |
"""Extract comprehensive summary statistics from model evaluation results."""
|
492 |
-
|
493 |
if not model_results or "tracks" not in model_results:
|
494 |
return {}
|
495 |
-
|
496 |
tracks = model_results["tracks"]
|
497 |
-
|
498 |
# If specific track requested
|
499 |
if track and track in tracks:
|
500 |
track_data = tracks[track]
|
501 |
if track_data.get("error"):
|
502 |
return {"error": f"No valid data for {track} track"}
|
503 |
-
|
504 |
track_averages = track_data.get("track_averages", {})
|
505 |
track_statistics = track_data.get("track_statistics", {})
|
506 |
summary = track_data.get("summary", {})
|
507 |
-
|
508 |
stats = {
|
509 |
"track": track,
|
510 |
"track_name": EVALUATION_TRACKS[track]["name"],
|
@@ -513,10 +495,9 @@ def get_model_summary_stats_scientific(model_results: Dict, track: str = None) -
|
|
513 |
"chrf": track_averages.get("chrf", 0.0),
|
514 |
"total_samples": summary.get("total_samples", 0),
|
515 |
"language_pairs": summary.get("language_pairs_evaluated", 0),
|
516 |
-
"statistical_adequacy": summary.get("total_samples", 0)
|
517 |
-
>= 100, # Simple threshold
|
518 |
}
|
519 |
-
|
520 |
# Add confidence intervals if available
|
521 |
if "quality_score" in track_statistics:
|
522 |
quality_stats = track_statistics["quality_score"]
|
@@ -524,27 +505,27 @@ def get_model_summary_stats_scientific(model_results: Dict, track: str = None) -
|
|
524 |
quality_stats.get("ci_lower", 0.0),
|
525 |
quality_stats.get("ci_upper", 0.0),
|
526 |
]
|
527 |
-
|
528 |
return stats
|
529 |
-
|
530 |
# Otherwise, return summary across all tracks
|
531 |
all_tracks_summary = {
|
532 |
"tracks_evaluated": len([t for t in tracks.values() if not t.get("error")]),
|
533 |
"total_tracks": len(EVALUATION_TRACKS),
|
534 |
"by_track": {},
|
535 |
}
|
536 |
-
|
537 |
for track_name, track_data in tracks.items():
|
538 |
if not track_data.get("error"):
|
539 |
track_averages = track_data.get("track_averages", {})
|
540 |
summary = track_data.get("summary", {})
|
541 |
-
|
542 |
all_tracks_summary["by_track"][track_name] = {
|
543 |
"quality_score": track_averages.get("quality_score", 0.0),
|
544 |
"samples": summary.get("total_samples", 0),
|
545 |
"pairs": summary.get("language_pairs_evaluated", 0),
|
546 |
}
|
547 |
-
|
548 |
return all_tracks_summary
|
549 |
|
550 |
|
@@ -553,12 +534,10 @@ def generate_model_identifier_scientific(
|
|
553 |
) -> str:
|
554 |
"""Generate a unique scientific identifier for a model."""
|
555 |
clean_name = sanitize_model_name(model_name)
|
556 |
-
clean_author = (
|
557 |
-
re.sub(r"[^\w\-]", "_", author.strip())[:20] if author else "Anonymous"
|
558 |
-
)
|
559 |
clean_category = category[:10] if category in MODEL_CATEGORIES else "community"
|
560 |
timestamp = datetime.datetime.now().strftime("%m%d_%H%M")
|
561 |
-
|
562 |
return f"{clean_category}_{clean_name}_{clean_author}_{timestamp}"
|
563 |
|
564 |
|
@@ -566,28 +545,26 @@ def validate_dataframe_structure_enhanced(
|
|
566 |
df: pd.DataFrame, required_columns: List[str], track: str = None
|
567 |
) -> Tuple[bool, List[str]]:
|
568 |
"""Enhanced DataFrame structure validation with track-specific checks."""
|
569 |
-
|
570 |
if df.empty:
|
571 |
return False, ["DataFrame is empty"]
|
572 |
-
|
573 |
issues = []
|
574 |
-
|
575 |
# Check required columns
|
576 |
missing_columns = [col for col in required_columns if col not in df.columns]
|
577 |
if missing_columns:
|
578 |
issues.append(f"Missing columns: {', '.join(missing_columns)}")
|
579 |
-
|
580 |
# Check for track-specific requirements
|
581 |
if track and track in EVALUATION_TRACKS:
|
582 |
track_config = EVALUATION_TRACKS[track]
|
583 |
min_samples = track_config.get("min_samples_per_pair", 10)
|
584 |
-
|
585 |
# Check sample size adequacy
|
586 |
if len(df) < min_samples * 5: # At least 5 pairs worth of data
|
587 |
-
issues.append(
|
588 |
-
|
589 |
-
)
|
590 |
-
|
591 |
# Check data types
|
592 |
if "sample_id" in df.columns:
|
593 |
if not df["sample_id"].dtype == "object":
|
@@ -595,7 +572,7 @@ def validate_dataframe_structure_enhanced(
|
|
595 |
df["sample_id"] = df["sample_id"].astype(str)
|
596 |
except Exception:
|
597 |
issues.append("Cannot convert sample_id to string")
|
598 |
-
|
599 |
return len(issues) == 0, issues
|
600 |
|
601 |
|
def truncate_text(text: str, max_length: int = 100, suffix: str = "...") -> str:
    """Truncate *text* to at most *max_length* characters.

    When truncation occurs the result ends with *suffix* and the whole
    string, suffix included, fits within *max_length*. Non-string input
    is stringified first.

    Fix: the original computed ``text[: max_length - len(suffix)]`` even
    when ``max_length <= len(suffix)``; the negative slice index then kept
    most of the text and the result came out *longer* than the requested
    budget. Guard that degenerate case with a hard slice instead.
    """
    if not isinstance(text, str):
        text = str(text)

    if len(text) <= max_length:
        return text

    # Degenerate budget: no room for the suffix, so just hard-truncate.
    if max_length <= len(suffix):
        return text[:max_length]

    return text[: max_length - len(suffix)] + suffix
|
621 |
|
622 |
|
def calculate_sample_size_recommendation(
    desired_power: float = 0.8, effect_size: float = 0.5, alpha: float = 0.05
) -> int:
    """Recommend a per-group sample size for statistical analysis.

    Uses the standard normal approximation for a two-sided two-sample
    t-test, ``n = 2 * ((z_alpha + z_beta) / d) ** 2`` (Cohen's
    conventions), rounded up and floored at 10. Any numerical failure
    falls back to a default of 50.
    """
    try:
        # z-values for the two-sided significance level and desired power.
        z_alpha = stats.norm.ppf(1 - alpha / 2)
        z_beta = stats.norm.ppf(desired_power)

        # Approximate per-group size, rounded up, never below 10.
        n_per_group = 2 * ((z_alpha + z_beta) / effect_size) ** 2
        return max(10, int(np.ceil(n_per_group)))
    except Exception:
        # Numerical issues (e.g. a zero effect size) yield the default.
        return 50
|
643 |
|
@@ -646,33 +623,29 @@ def assess_model_category_appropriateness(
|
|
646 |
model_name: str, category: str, performance_data: Dict
|
647 |
) -> Dict:
|
648 |
"""Assess if the detected/assigned model category is appropriate."""
|
649 |
-
|
650 |
assessment = {
|
651 |
"category": category,
|
652 |
"appropriate": True,
|
653 |
"confidence": 1.0,
|
654 |
"recommendations": [],
|
655 |
}
|
656 |
-
|
657 |
# Check for category mismatches based on performance
|
658 |
if category == "baseline" and performance_data:
|
659 |
# Baselines shouldn't perform too well
|
660 |
quality_scores = []
|
661 |
for track_data in performance_data.get("tracks", {}).values():
|
662 |
if not track_data.get("error"):
|
663 |
-
quality_scores.append(
|
664 |
-
|
665 |
-
|
666 |
-
|
667 |
-
if (
|
668 |
-
quality_scores and max(quality_scores) > 0.7
|
669 |
-
): # High performance for baseline
|
670 |
assessment["appropriate"] = False
|
671 |
assessment["confidence"] = 0.3
|
672 |
assessment["recommendations"].append(
|
673 |
"High performance suggests this might not be a baseline model"
|
674 |
)
|
675 |
-
|
676 |
# Check for commercial model expectations
|
677 |
if category == "commercial":
|
678 |
# Commercial models should have good Google-comparable performance
|
@@ -683,5 +656,5 @@ def assess_model_category_appropriateness(
|
|
683 |
assessment["recommendations"].append(
|
684 |
"Low performance unexpected for commercial systems"
|
685 |
)
|
686 |
-
|
687 |
-
return assessment
|
|
|
13 |
MODEL_CATEGORIES,
|
14 |
STATISTICAL_CONFIG,
|
15 |
METRICS_CONFIG,
|
16 |
+
SAMPLE_SIZE_RECOMMENDATIONS,
|
17 |
)
|
18 |
|
19 |
|
|
|
41 |
"""Get language pairs for a specific evaluation track."""
|
42 |
if track not in EVALUATION_TRACKS:
|
43 |
return []
|
44 |
+
|
45 |
track_languages = EVALUATION_TRACKS[track]["languages"]
|
46 |
pairs = []
|
47 |
for src in track_languages:
|
|
|
74 |
"""Sanitize model name for display and storage with enhanced validation."""
|
75 |
if not name or not isinstance(name, str):
|
76 |
return "Anonymous_Model"
|
77 |
+
|
78 |
# Remove special characters, limit length
|
79 |
name = re.sub(r"[^\w\-.]", "_", name.strip())
|
80 |
# Remove multiple consecutive underscores
|
81 |
name = re.sub(r"_+", "_", name)
|
82 |
# Remove leading/trailing underscores
|
83 |
name = name.strip("_")
|
84 |
+
|
85 |
# Ensure minimum length
|
86 |
if len(name) < 3:
|
87 |
name = f"Model_{name}"
|
88 |
+
|
89 |
# Check for reserved names
|
90 |
reserved_names = ["admin", "test", "baseline", "google", "system"]
|
91 |
if name.lower() in reserved_names:
|
92 |
name = f"User_{name}"
|
93 |
+
|
94 |
return name[:50] # Limit to 50 characters
|
95 |
|
96 |
|
97 |
+
def format_metric_value(value: float, metric: str, include_ci: bool = False,
|
98 |
+
ci_lower: float = None, ci_upper: float = None) -> str:
|
|
|
|
|
|
|
|
|
|
|
99 |
"""Format metric value for display with optional confidence intervals."""
|
100 |
if pd.isna(value) or value is None:
|
101 |
return "N/A"
|
102 |
+
|
103 |
try:
|
104 |
precision = METRICS_CONFIG["display_precision"]
|
105 |
+
|
106 |
if metric == "coverage_rate":
|
107 |
formatted = f"{value:.{precision}%}"
|
108 |
elif metric in ["bleu"]:
|
|
|
112 |
formatted = f"{min(value, 1.0):.{precision}f}"
|
113 |
else:
|
114 |
formatted = f"{value:.{precision}f}"
|
115 |
+
|
116 |
# Add confidence interval if requested
|
117 |
if include_ci and ci_lower is not None and ci_upper is not None:
|
118 |
ci_str = f" [{ci_lower:.{precision}f}, {ci_upper:.{precision}f}]"
|
119 |
formatted += ci_str
|
120 |
+
|
121 |
return formatted
|
122 |
+
|
123 |
except (ValueError, TypeError):
|
124 |
return str(value)
|
125 |
|
|
|
128 |
"""Calculate Cohen's d effect size between two groups."""
|
129 |
if len(values1) < 2 or len(values2) < 2:
|
130 |
return 0.0
|
131 |
+
|
132 |
try:
|
133 |
values1 = np.array(values1)
|
134 |
values2 = np.array(values2)
|
135 |
+
|
136 |
# Remove NaN values
|
137 |
values1 = values1[~np.isnan(values1)]
|
138 |
values2 = values2[~np.isnan(values2)]
|
139 |
+
|
140 |
if len(values1) < 2 or len(values2) < 2:
|
141 |
return 0.0
|
142 |
+
|
143 |
# Calculate pooled standard deviation
|
144 |
n1, n2 = len(values1), len(values2)
|
145 |
pooled_std = np.sqrt(
|
146 |
((n1 - 1) * np.var(values1, ddof=1) + (n2 - 1) * np.var(values2, ddof=1))
|
147 |
/ (n1 + n2 - 2)
|
148 |
)
|
149 |
+
|
150 |
if pooled_std == 0:
|
151 |
return 0.0
|
152 |
+
|
153 |
# Cohen's d
|
154 |
effect_size = (np.mean(values1) - np.mean(values2)) / pooled_std
|
155 |
return abs(effect_size)
|
156 |
+
|
157 |
except Exception:
|
158 |
return 0.0
|
159 |
|
|
|
161 |
def interpret_effect_size(effect_size: float) -> str:
|
162 |
"""Interpret effect size according to Cohen's conventions."""
|
163 |
thresholds = STATISTICAL_CONFIG["effect_size_thresholds"]
|
164 |
+
|
165 |
if effect_size < thresholds["small"]:
|
166 |
return "negligible"
|
167 |
elif effect_size < thresholds["medium"]:
|
|
|
178 |
"""Estimate statistical power for given effect size and sample sizes."""
|
179 |
if n1 < 2 or n2 < 2:
|
180 |
return 0.0
|
181 |
+
|
182 |
try:
|
183 |
# Simplified power calculation using t-test
|
184 |
# This is an approximation
|
185 |
df = n1 + n2 - 2
|
186 |
+
pooled_se = np.sqrt((1/n1) + (1/n2))
|
187 |
+
|
188 |
# Critical t-value
|
189 |
+
t_critical = stats.t.ppf(1 - alpha/2, df)
|
190 |
+
|
191 |
# Non-centrality parameter
|
192 |
ncp = effect_size / pooled_se
|
193 |
+
|
194 |
# Power (approximate)
|
195 |
+
power = 1 - stats.t.cdf(t_critical, df, loc=ncp) + stats.t.cdf(-t_critical, df, loc=ncp)
|
196 |
+
|
|
|
|
|
|
|
|
|
197 |
return min(1.0, max(0.0, power))
|
198 |
+
|
199 |
except Exception:
|
200 |
return 0.0
|
201 |
|
|
|
203 |
def get_track_statistics(test_data: pd.DataFrame) -> Dict[str, Dict]:
|
204 |
"""Get comprehensive statistics about test data coverage for each track."""
|
205 |
track_stats = {}
|
206 |
+
|
207 |
for track_name, track_config in EVALUATION_TRACKS.items():
|
208 |
track_languages = track_config["languages"]
|
209 |
+
|
210 |
# Filter test data to track languages
|
211 |
track_data = test_data[
|
212 |
+
(test_data["source_language"].isin(track_languages)) &
|
213 |
+
(test_data["target_language"].isin(track_languages))
|
214 |
]
|
215 |
+
|
216 |
if track_data.empty:
|
217 |
track_stats[track_name] = {
|
218 |
"total_samples": 0,
|
|
|
222 |
"adequacy_assessment": "insufficient",
|
223 |
}
|
224 |
continue
|
225 |
+
|
226 |
# Calculate pair-wise statistics
|
227 |
pair_counts = {}
|
228 |
for src in track_languages:
|
229 |
for tgt in track_languages:
|
230 |
if src == tgt:
|
231 |
continue
|
232 |
+
|
233 |
pair_data = track_data[
|
234 |
+
(track_data["source_language"] == src) &
|
235 |
+
(track_data["target_language"] == tgt)
|
236 |
]
|
237 |
+
|
238 |
pair_key = f"{src}_to_{tgt}"
|
239 |
pair_counts[pair_key] = len(pair_data)
|
240 |
+
|
241 |
# Calculate adequacy
|
242 |
min_required = track_config["min_samples_per_pair"]
|
243 |
+
adequate_pairs = sum(1 for count in pair_counts.values() if count >= min_required)
|
|
|
|
|
244 |
total_possible_pairs = len(track_languages) * (len(track_languages) - 1)
|
245 |
+
|
246 |
adequacy_rate = adequate_pairs / max(total_possible_pairs, 1)
|
247 |
+
|
248 |
if adequacy_rate >= 0.8:
|
249 |
adequacy = "excellent"
|
250 |
elif adequacy_rate >= 0.6:
|
|
|
253 |
adequacy = "fair"
|
254 |
else:
|
255 |
adequacy = "insufficient"
|
256 |
+
|
257 |
track_stats[track_name] = {
|
258 |
"total_samples": len(track_data),
|
259 |
"language_pairs": len([k for k, v in pair_counts.items() if v > 0]),
|
|
|
263 |
"adequacy_rate": adequacy_rate,
|
264 |
"min_samples_per_pair": min_required,
|
265 |
}
|
266 |
+
|
267 |
return track_stats
|
268 |
|
269 |
|
|
|
271 |
predictions: pd.DataFrame, test_set: pd.DataFrame, track: str = None
|
272 |
) -> Dict:
|
273 |
"""Enhanced validation with track-specific analysis."""
|
274 |
+
|
275 |
if predictions.empty or test_set.empty:
|
276 |
return {
|
277 |
"is_complete": False,
|
|
|
281 |
"coverage": 0.0,
|
282 |
"track_analysis": {},
|
283 |
}
|
284 |
+
|
285 |
# If track specified, filter to track languages
|
286 |
if track and track in EVALUATION_TRACKS:
|
287 |
track_languages = EVALUATION_TRACKS[track]["languages"]
|
288 |
test_set = test_set[
|
289 |
+
(test_set["source_language"].isin(track_languages)) &
|
290 |
+
(test_set["target_language"].isin(track_languages))
|
291 |
]
|
292 |
+
|
293 |
try:
|
294 |
required_ids = set(test_set["sample_id"].astype(str))
|
295 |
provided_ids = set(predictions["sample_id"].astype(str))
|
296 |
+
|
297 |
missing_ids = required_ids - provided_ids
|
298 |
extra_ids = provided_ids - required_ids
|
299 |
matching_ids = provided_ids & required_ids
|
300 |
+
|
301 |
base_result = {
|
302 |
"is_complete": len(missing_ids) == 0,
|
303 |
"missing_count": len(missing_ids),
|
|
|
305 |
"missing_ids": list(missing_ids)[:10],
|
306 |
"coverage": len(matching_ids) / len(required_ids) if required_ids else 0.0,
|
307 |
}
|
308 |
+
|
309 |
# Add track-specific analysis if requested
|
310 |
if track:
|
311 |
track_analysis = analyze_track_coverage(predictions, test_set, track)
|
312 |
base_result["track_analysis"] = track_analysis
|
313 |
+
|
314 |
return base_result
|
315 |
+
|
316 |
except Exception as e:
|
317 |
print(f"Error in submission completeness validation: {e}")
|
318 |
return {
|
|
|
329 |
predictions: pd.DataFrame, test_set: pd.DataFrame, track: str
|
330 |
) -> Dict:
|
331 |
"""Analyze coverage for a specific track."""
|
332 |
+
|
333 |
if track not in EVALUATION_TRACKS:
|
334 |
return {"error": f"Unknown track: {track}"}
|
335 |
+
|
336 |
track_config = EVALUATION_TRACKS[track]
|
337 |
track_languages = track_config["languages"]
|
338 |
+
|
339 |
# Filter test set to track languages
|
340 |
track_test_set = test_set[
|
341 |
+
(test_set["source_language"].isin(track_languages)) &
|
342 |
+
(test_set["target_language"].isin(track_languages))
|
343 |
]
|
344 |
+
|
345 |
if track_test_set.empty:
|
346 |
return {"error": f"No test data available for {track} track"}
|
347 |
+
|
348 |
# Merge with predictions
|
349 |
+
merged = track_test_set.merge(predictions, on="sample_id", how="left", suffixes=("", "_pred"))
|
350 |
+
|
|
|
|
|
351 |
# Analyze by language pair
|
352 |
pair_analysis = {}
|
353 |
for src in track_languages:
|
354 |
for tgt in track_languages:
|
355 |
if src == tgt:
|
356 |
continue
|
357 |
+
|
358 |
pair_data = merged[
|
359 |
+
(merged["source_language"] == src) &
|
360 |
+
(merged["target_language"] == tgt)
|
361 |
]
|
362 |
+
|
363 |
if len(pair_data) > 0:
|
364 |
covered = pair_data["prediction"].notna().sum()
|
365 |
pair_analysis[f"{src}_to_{tgt}"] = {
|
|
|
368 |
"coverage_rate": covered / len(pair_data),
|
369 |
"meets_minimum": covered >= track_config["min_samples_per_pair"],
|
370 |
}
|
371 |
+
|
372 |
# Overall track statistics
|
373 |
total_pairs = len(pair_analysis)
|
374 |
adequate_pairs = sum(1 for info in pair_analysis.values() if info["meets_minimum"])
|
375 |
+
|
376 |
return {
|
377 |
"track_name": track_config["name"],
|
378 |
"total_language_pairs": total_pairs,
|
379 |
"adequate_pairs": adequate_pairs,
|
380 |
"adequacy_rate": adequate_pairs / max(total_pairs, 1),
|
381 |
"pair_analysis": pair_analysis,
|
382 |
+
"overall_adequate": adequate_pairs >= total_pairs * 0.8, # 80% of pairs adequate
|
|
|
383 |
}
|
384 |
|
385 |
|
|
|
387 |
predictions: pd.DataFrame, test_set: pd.DataFrame
|
388 |
) -> Dict:
|
389 |
"""Calculate comprehensive language pair coverage with statistical metrics."""
|
390 |
+
|
391 |
if predictions.empty or test_set.empty:
|
392 |
return {}
|
393 |
+
|
394 |
try:
|
395 |
# Merge to get language info
|
396 |
+
merged = test_set.merge(predictions, on="sample_id", how="left", suffixes=("", "_pred"))
|
397 |
+
|
|
|
|
|
398 |
coverage = {}
|
399 |
for src in ALL_UG40_LANGUAGES:
|
400 |
for tgt in ALL_UG40_LANGUAGES:
|
401 |
if src == tgt:
|
402 |
continue
|
403 |
+
|
404 |
pair_data = merged[
|
405 |
+
(merged["source_language"] == src) &
|
406 |
+
(merged["target_language"] == tgt)
|
407 |
]
|
408 |
+
|
409 |
if len(pair_data) > 0:
|
410 |
predicted_count = pair_data["prediction"].notna().sum()
|
411 |
coverage_rate = predicted_count / len(pair_data)
|
412 |
+
|
413 |
# Determine which tracks include this pair
|
414 |
tracks_included = []
|
415 |
for track_name, track_config in EVALUATION_TRACKS.items():
|
416 |
+
if src in track_config["languages"] and tgt in track_config["languages"]:
|
|
|
|
|
|
|
417 |
tracks_included.append(track_name)
|
418 |
+
|
419 |
coverage[f"{src}_{tgt}"] = {
|
420 |
"total": len(pair_data),
|
421 |
"predicted": predicted_count,
|
|
|
423 |
"display_name": format_language_pair(src, tgt),
|
424 |
"tracks_included": tracks_included,
|
425 |
"google_comparable": (
|
426 |
+
src in GOOGLE_SUPPORTED_LANGUAGES and
|
427 |
+
tgt in GOOGLE_SUPPORTED_LANGUAGES
|
428 |
),
|
429 |
"statistical_adequacy": {
|
430 |
+
track: predicted_count >= EVALUATION_TRACKS[track]["min_samples_per_pair"]
|
|
|
431 |
for track in tracks_included
|
432 |
},
|
433 |
}
|
434 |
+
|
435 |
return coverage
|
436 |
+
|
437 |
except Exception as e:
|
438 |
print(f"Error calculating language pair coverage: {e}")
|
439 |
return {}
|
|
|
456 |
"""Clean text for evaluation, handling common encoding issues."""
|
457 |
if not isinstance(text, str):
|
458 |
return str(text) if text is not None else ""
|
459 |
+
|
460 |
# Remove extra whitespace
|
461 |
text = re.sub(r"\s+", " ", text.strip())
|
462 |
+
|
463 |
# Handle common encoding issues
|
464 |
text = text.replace("\u00a0", " ") # Non-breaking space
|
465 |
text = text.replace("\u2019", "'") # Right single quotation mark
|
466 |
text = text.replace("\u201c", '"') # Left double quotation mark
|
467 |
text = text.replace("\u201d", '"') # Right double quotation mark
|
468 |
+
|
469 |
return text
|
470 |
|
471 |
|
472 |
def get_model_summary_stats_scientific(model_results: Dict, track: str = None) -> Dict:
    """Extract comprehensive summary statistics from model evaluation results.

    Args:
        model_results: Evaluation results; must contain a "tracks" mapping of
            track name -> per-track result dict.
        track: If given and present in the results, return detailed statistics
            for that single track; otherwise return a cross-track summary.

    Returns:
        Single-track mode: dict with track averages, sample counts, an
        adequacy flag, and (when available) a confidence interval for the
        quality score; or {"error": ...} if that track failed evaluation.
        Cross-track mode: {"tracks_evaluated", "total_tracks", "by_track"}.
        Returns {} when the input has no "tracks" key.
    """
    if not model_results or "tracks" not in model_results:
        return {}

    tracks = model_results["tracks"]

    # Detailed view for one requested track.
    if track and track in tracks:
        track_data = tracks[track]
        if track_data.get("error"):
            return {"error": f"No valid data for {track} track"}

        track_averages = track_data.get("track_averages", {})
        track_statistics = track_data.get("track_statistics", {})
        summary = track_data.get("summary", {})

        # Hoisted: used both for reporting and for the adequacy check below.
        total_samples = summary.get("total_samples", 0)

        stats = {
            "track": track,
            "track_name": EVALUATION_TRACKS[track]["name"],
            # NOTE(review): the quality_score/bleu entries were reconstructed
            # from an elided span of the source — confirm key names.
            "quality_score": track_averages.get("quality_score", 0.0),
            "bleu": track_averages.get("bleu", 0.0),
            "chrf": track_averages.get("chrf", 0.0),
            "total_samples": total_samples,
            "language_pairs": summary.get("language_pairs_evaluated", 0),
            # Simple fixed threshold; tracks also define per-pair minimums.
            "statistical_adequacy": total_samples >= 100,
        }

        # Attach the confidence interval for quality score when it was computed.
        if "quality_score" in track_statistics:
            quality_stats = track_statistics["quality_score"]
            # NOTE(review): key name reconstructed from an elided source line —
            # confirm consumers read "quality_score_ci".
            stats["quality_score_ci"] = [
                quality_stats.get("ci_lower", 0.0),
                quality_stats.get("ci_upper", 0.0),
            ]

        return stats

    # Otherwise, summarize across all successfully evaluated tracks.
    all_tracks_summary = {
        "tracks_evaluated": len([t for t in tracks.values() if not t.get("error")]),
        "total_tracks": len(EVALUATION_TRACKS),
        "by_track": {},
    }

    for track_name, track_data in tracks.items():
        if track_data.get("error"):
            continue  # failed tracks contribute nothing to the per-track map

        track_averages = track_data.get("track_averages", {})
        summary = track_data.get("summary", {})

        all_tracks_summary["by_track"][track_name] = {
            "quality_score": track_averages.get("quality_score", 0.0),
            "samples": summary.get("total_samples", 0),
            "pairs": summary.get("language_pairs_evaluated", 0),
        }

    return all_tracks_summary
|
530 |
|
531 |
|
|
|
534 |
) -> str:
|
535 |
"""Generate a unique scientific identifier for a model."""
|
536 |
clean_name = sanitize_model_name(model_name)
|
537 |
+
clean_author = re.sub(r"[^\w\-]", "_", author.strip())[:20] if author else "Anonymous"
|
|
|
|
|
538 |
clean_category = category[:10] if category in MODEL_CATEGORIES else "community"
|
539 |
timestamp = datetime.datetime.now().strftime("%m%d_%H%M")
|
540 |
+
|
541 |
return f"{clean_category}_{clean_name}_{clean_author}_{timestamp}"
|
542 |
|
543 |
|
|
|
545 |
df: pd.DataFrame, required_columns: List[str], track: str = None
|
546 |
) -> Tuple[bool, List[str]]:
|
547 |
"""Enhanced DataFrame structure validation with track-specific checks."""
|
548 |
+
|
549 |
if df.empty:
|
550 |
return False, ["DataFrame is empty"]
|
551 |
+
|
552 |
issues = []
|
553 |
+
|
554 |
# Check required columns
|
555 |
missing_columns = [col for col in required_columns if col not in df.columns]
|
556 |
if missing_columns:
|
557 |
issues.append(f"Missing columns: {', '.join(missing_columns)}")
|
558 |
+
|
559 |
# Check for track-specific requirements
|
560 |
if track and track in EVALUATION_TRACKS:
|
561 |
track_config = EVALUATION_TRACKS[track]
|
562 |
min_samples = track_config.get("min_samples_per_pair", 10)
|
563 |
+
|
564 |
# Check sample size adequacy
|
565 |
if len(df) < min_samples * 5: # At least 5 pairs worth of data
|
566 |
+
issues.append(f"Insufficient samples for {track} track (minimum ~{min_samples * 5})")
|
567 |
+
|
|
|
|
|
568 |
# Check data types
|
569 |
if "sample_id" in df.columns:
|
570 |
if not df["sample_id"].dtype == "object":
|
|
|
572 |
df["sample_id"] = df["sample_id"].astype(str)
|
573 |
except Exception:
|
574 |
issues.append("Cannot convert sample_id to string")
|
575 |
+
|
576 |
return len(issues) == 0, issues
|
577 |
|
578 |
|
|
|
590 |
"""Truncate text to specified length with suffix."""
|
591 |
if not isinstance(text, str):
|
592 |
text = str(text)
|
593 |
+
|
594 |
if len(text) <= max_length:
|
595 |
return text
|
596 |
+
|
597 |
return text[: max_length - len(suffix)] + suffix
|
598 |
|
599 |
|
|
|
601 |
desired_power: float = 0.8, effect_size: float = 0.5, alpha: float = 0.05
|
602 |
) -> int:
|
603 |
"""Calculate recommended sample size for statistical analysis."""
|
604 |
+
|
605 |
try:
|
606 |
# Simplified sample size calculation for t-test
|
607 |
# This is an approximation using Cohen's conventions
|
608 |
+
|
609 |
z_alpha = stats.norm.ppf(1 - alpha / 2)
|
610 |
z_beta = stats.norm.ppf(desired_power)
|
611 |
+
|
612 |
# Sample size per group
|
613 |
n_per_group = 2 * ((z_alpha + z_beta) / effect_size) ** 2
|
614 |
+
|
615 |
# Round up to nearest integer
|
616 |
return max(10, int(np.ceil(n_per_group)))
|
617 |
+
|
618 |
except Exception:
|
619 |
return 50 # Default fallback
|
620 |
|
|
|
623 |
model_name: str, category: str, performance_data: Dict
|
624 |
) -> Dict:
|
625 |
"""Assess if the detected/assigned model category is appropriate."""
|
626 |
+
|
627 |
assessment = {
|
628 |
"category": category,
|
629 |
"appropriate": True,
|
630 |
"confidence": 1.0,
|
631 |
"recommendations": [],
|
632 |
}
|
633 |
+
|
634 |
# Check for category mismatches based on performance
|
635 |
if category == "baseline" and performance_data:
|
636 |
# Baselines shouldn't perform too well
|
637 |
quality_scores = []
|
638 |
for track_data in performance_data.get("tracks", {}).values():
|
639 |
if not track_data.get("error"):
|
640 |
+
quality_scores.append(track_data.get("track_averages", {}).get("quality_score", 0))
|
641 |
+
|
642 |
+
if quality_scores and max(quality_scores) > 0.7: # High performance for baseline
|
|
|
|
|
|
|
|
|
643 |
assessment["appropriate"] = False
|
644 |
assessment["confidence"] = 0.3
|
645 |
assessment["recommendations"].append(
|
646 |
"High performance suggests this might not be a baseline model"
|
647 |
)
|
648 |
+
|
649 |
# Check for commercial model expectations
|
650 |
if category == "commercial":
|
651 |
# Commercial models should have good Google-comparable performance
|
|
|
656 |
assessment["recommendations"].append(
|
657 |
"Low performance unexpected for commercial systems"
|
658 |
)
|
659 |
+
|
660 |
+
return assessment
|