Update src/evaluation.py
src/evaluation.py CHANGED (+73 -296)
@@ -8,24 +8,21 @@ from collections import defaultdict
 from transformers.models.whisper.english_normalizer import BasicTextNormalizer
 from typing import Dict, List, Tuple, Optional
 from scipy import stats
-from scipy.stats import bootstrap
 import warnings
 from config import (
     ALL_UG40_LANGUAGES,
     GOOGLE_SUPPORTED_LANGUAGES,
     METRICS_CONFIG,
-    STATISTICAL_CONFIG,
     EVALUATION_TRACKS,
     MODEL_CATEGORIES,
-    SAMPLE_SIZE_RECOMMENDATIONS,
 )
-from src.utils import get_all_language_pairs
+from src.utils import get_all_language_pairs
 
 warnings.filterwarnings("ignore", category=RuntimeWarning)
 
 
 def calculate_sentence_metrics(reference: str, prediction: str) -> Dict[str, float]:
-    """Calculate all metrics for a single sentence pair
+    """Calculate all metrics for a single sentence pair."""
 
     # Handle empty predictions
     if not prediction or not isinstance(prediction, str):
@@ -75,28 +72,17 @@ def calculate_sentence_metrics(reference: str, prediction: str) -> Dict[str, float]:
     except:
         metrics["wer"] = 1.0
 
-    # Length ratio
-    try:
-        if len(ref_norm) > 0:
-            metrics["len_ratio"] = len(pred_norm) / len(ref_norm)
-        else:
-            metrics["len_ratio"] = 1.0 if len(pred_norm) == 0 else float("inf")
-    except:
-        metrics["len_ratio"] = 1.0
-
     # ROUGE scores
     try:
         scorer = rouge_scorer.RougeScorer(
-            ["rouge1", "
+            ["rouge1", "rougeL"], use_stemmer=True
         )
         rouge_scores = scorer.score(ref_norm, pred_norm)
 
         metrics["rouge1"] = rouge_scores["rouge1"].fmeasure
-        metrics["rouge2"] = rouge_scores["rouge2"].fmeasure
         metrics["rougeL"] = rouge_scores["rougeL"].fmeasure
     except:
         metrics["rouge1"] = 0.0
-        metrics["rouge2"] = 0.0
         metrics["rougeL"] = 0.0
 
     # Quality score (composite metric)
@@ -116,130 +102,53 @@ def calculate_sentence_metrics(reference: str, prediction: str) -> Dict[str, float]:
     return metrics
 
 
-def 
-    """Calculate 
+def calculate_confidence_interval(values: List[float], confidence_level: float = 0.95) -> Tuple[float, float, float]:
+    """Calculate mean and confidence interval for a list of values."""
 
     if not values or len(values) == 0:
-        return {
-            "mean": 0.0,
-            "std": 0.0,
-            "median": 0.0,
-            "ci_lower": 0.0,
-            "ci_upper": 0.0,
-            "n_samples": 0,
-        }
+        return 0.0, 0.0, 0.0
 
     values = np.array(values)
     values = values[~np.isnan(values)]  # Remove NaN values
 
     if len(values) == 0:
-        return {
-            "mean": 0.0,
-            "std": 0.0,
-            "median": 0.0,
-            "ci_lower": 0.0,
-            "ci_upper": 0.0,
-            "n_samples": 0,
-        }
+        return 0.0, 0.0, 0.0
 
-    stats_dict = {
-        "mean": float(np.mean(values)),
-        "std": float(np.std(values, ddof=1)) if len(values) > 1 else 0.0,
-        "median": float(np.median(values)),
-        "n_samples": len(values),
-    }
-
-    # Calculate confidence intervals using bootstrap if enough samples
-    if len(values) >= STATISTICAL_CONFIG["min_samples_for_ci"]:
-        try:
-            confidence_level = STATISTICAL_CONFIG["confidence_level"]
-
-            # Bootstrap confidence interval
-            def mean_func(x):
-                return np.mean(x)
-
-            res = bootstrap(
-                (values,),
-                mean_func,
-                n_resamples=STATISTICAL_CONFIG["bootstrap_samples"],
-                confidence_level=confidence_level,
-                random_state=42,
-            )
-
-            stats_dict["ci_lower"] = float(res.confidence_interval.low)
-            stats_dict["ci_upper"] = float(res.confidence_interval.high)
-
-        except Exception as e:
-            # Fallback to t-distribution CI
-            try:
-                alpha = 1 - confidence_level
-                t_val = stats.t.ppf(1 - alpha / 2, len(values) - 1)
-                margin = t_val * stats_dict["std"] / np.sqrt(len(values))
-                stats_dict["ci_lower"] = stats_dict["mean"] - margin
-                stats_dict["ci_upper"] = stats_dict["mean"] + margin
-            except:
-                stats_dict["ci_lower"] = stats_dict["mean"]
-                stats_dict["ci_upper"] = stats_dict["mean"]
-    else:
-        stats_dict["ci_lower"] = stats_dict["mean"]
-        stats_dict["ci_upper"] = stats_dict["mean"]
-
-    return stats_dict
-
-
-def perform_significance_test(
-    values1: List[float], values2: List[float], metric_name: str
-) -> Dict[str, float]:
-    """Perform statistical significance test between two groups."""
-
-    if len(
-
-    values1 = np.array(values1)
-    values2 = np.array(values2)
-
-    # Remove NaN values
-    values1 = values1[~np.isnan(values1)]
-    values2 = values2[~np.isnan(values2)]
-
-    if len(values1) < 2 or len(values2) < 2:
-        return {"p_value": 1.0, "effect_size": 0.0, "significant": False}
+    mean_val = float(np.mean(values))
+
+    if len(values) < METRICS_CONFIG["min_samples_for_ci"]:
+        # Not enough samples for meaningful CI
+        return mean_val, mean_val, mean_val
 
     try:
-        #
-        (
-            (len(values2) - 1) * np.var(values2, ddof=1)) /
-            (len(values1) + len(values2) - 2)
-        )
-
-        effect_size = 0.0
-
-        significance_level = EVALUATION_TRACKS["google_comparable"]["significance_level"]
-        significant = p_value < significance_level
-
-        return {"p_value": 1.0, "effect_size": 0.0, "significant": False}
+        # Bootstrap confidence interval
+        n_bootstrap = min(METRICS_CONFIG["bootstrap_samples"], 1000)
+        bootstrap_means = []
+
+        for _ in range(n_bootstrap):
+            bootstrap_sample = np.random.choice(values, size=len(values), replace=True)
+            bootstrap_means.append(np.mean(bootstrap_sample))
+
+        alpha = 1 - confidence_level
+        ci_lower = np.percentile(bootstrap_means, 100 * alpha / 2)
+        ci_upper = np.percentile(bootstrap_means, 100 * (1 - alpha / 2))
+
+        return mean_val, float(ci_lower), float(ci_upper)
+
+    except Exception:
+        # Fallback to t-distribution CI
+        try:
+            std_err = stats.sem(values)
+            h = std_err * stats.t.ppf((1 + confidence_level) / 2, len(values) - 1)
+            return mean_val, mean_val - h, mean_val + h
+        except:
+            return mean_val, mean_val, mean_val
 
 
 def evaluate_predictions_by_track(
     predictions: pd.DataFrame, test_set: pd.DataFrame, track: str
 ) -> Dict:
-    """Evaluate predictions for a specific track
+    """Evaluate predictions for a specific track."""
 
     print(f"🔄 Evaluating for {track} track...")
 
@@ -277,7 +186,7 @@ def evaluate_predictions_by_track
 
     sample_df = pd.DataFrame(sample_metrics)
 
-    # Aggregate by language pairs
+    # Aggregate by language pairs
     pair_metrics = {}
     overall_metrics = defaultdict(list)
 
@@ -292,36 +201,44 @@
             (sample_df["target_language"] == tgt_lang)
         ]
 
-        if len(pair_data) >= 
+        if len(pair_data) >= MIN_SAMPLES_PER_PAIR:
            pair_key = f"{src_lang}_to_{tgt_lang}"
            pair_metrics[pair_key] = {}
 
-            # Calculate 
-            for metric in (
-                METRICS_CONFIG["primary_metrics"] + METRICS_CONFIG["secondary_metrics"]
-            ):
+            # Calculate statistics for each metric
+            for metric in METRICS_CONFIG["primary_metrics"] + METRICS_CONFIG["secondary_metrics"]:
                if metric in pair_data.columns:
                    values = pair_data[metric].replace([np.inf, -np.inf], np.nan).dropna()
 
                    if len(values) > 0:
-                        pair_metrics[pair_key][metric] = 
+                        mean_val, ci_lower, ci_upper = calculate_confidence_interval(values.tolist())
+                        pair_metrics[pair_key][metric] = {
+                            "mean": mean_val,
+                            "ci_lower": ci_lower,
+                            "ci_upper": ci_upper,
+                            "std": float(np.std(values)) if len(values) > 1 else 0.0,
+                            "count": len(values)
+                        }
 
                        # Add to overall metrics for track-level statistics
-                        overall_metrics[metric].append(
+                        overall_metrics[metric].append(mean_val)
 
            pair_metrics[pair_key]["sample_count"] = len(pair_data)
-            pair_metrics[pair_key]["languages"] = f"{src_lang}-{tgt_lang}"
 
     # Calculate track-level aggregated statistics
     track_averages = {}
+    track_confidence = {}
 
     for metric in overall_metrics:
        if overall_metrics[metric]:
-            track_averages[metric] = 
+            mean_val, ci_lower, ci_upper = calculate_confidence_interval(overall_metrics[metric])
+            track_averages[metric] = mean_val
+            track_confidence[metric] = {
+                "mean": mean_val,
+                "ci_lower": ci_lower,
+                "ci_upper": ci_upper,
+                "std": float(np.std(overall_metrics[metric])) if len(overall_metrics[metric]) > 1 else 0.0
+            }
 
     # Generate evaluation summary
     summary = {
@@ -331,15 +248,12 @@
        "language_pairs_evaluated": len([k for k in pair_metrics if pair_metrics[k].get("sample_count", 0) > 0]),
        "languages_covered": len(set(sample_df["source_language"]) | set(sample_df["target_language"])),
        "min_samples_per_pair": track_config["min_samples_per_pair"],
-        "statistical_power": track_config["statistical_power"],
-        "significance_level": track_config["significance_level"],
    }
 
    return {
-        "sample_metrics": sample_df,
        "pair_metrics": pair_metrics,
        "track_averages": track_averages,
-        "
+        "track_confidence": track_confidence,
        "summary": summary,
        "evaluated_samples": len(sample_df),
        "track": track,
@@ -347,12 +261,12 @@
    }
 
 
-def evaluate_predictions_scientific(
+def evaluate_predictions(
    predictions: pd.DataFrame, test_set: pd.DataFrame, model_category: str = "community"
 ) -> Dict:
-    """Comprehensive evaluation across all tracks
+    """Comprehensive evaluation across all tracks."""
 
-    print("🔬 Starting 
+    print("🔬 Starting evaluation...")
 
    # Validate model category
    if model_category not in MODEL_CATEGORIES:
@@ -362,8 +276,7 @@ def evaluate_predictions_scientific(
        "model_category": model_category,
        "category_info": MODEL_CATEGORIES[model_category],
        "tracks": {},
-        "
-        "scientific_metadata": {
+        "metadata": {
            "evaluation_timestamp": pd.Timestamp.now().isoformat(),
            "total_samples_submitted": len(predictions),
            "total_samples_available": len(test_set),
@@ -375,120 +288,24 @@
        track_result = evaluate_predictions_by_track(predictions, test_set, track_name)
        evaluation_results["tracks"][track_name] = track_result
 
-    # Cross-track consistency analysis
-    evaluation_results["cross_track_analysis"] = analyze_cross_track_consistency(
-        evaluation_results["tracks"]
-    )
-
    return evaluation_results
 
 
-def analyze_cross_track_consistency(track_results: Dict) -> Dict:
-    """
-
-    consistency_analysis = {
-        "track_correlations": {},
-        "performance_stability": {},
-        "language_coverage_analysis": {},
-    }
-
-    # Extract quality scores from each track for correlation analysis
-    track_scores = {}
-    for track_name, track_data in track_results.items():
-        if track_data.get("track_averages") and "quality_score" in track_data["track_averages"]:
-            track_scores[track_name] = track_data["track_averages"]["quality_score"]
-
-    # Calculate pairwise correlations (would need more data points for meaningful correlation)
-    if len(track_scores) >= 2:
-        track_names = list(track_scores.keys())
-        for i, track1 in enumerate(track_names):
-            for track2 in track_names[i + 1:]:
-                # This would be more meaningful with multiple models
-                consistency_analysis["track_correlations"][f"{track1}_vs_{track2}"] = {
-                    "score_difference": abs(track_scores[track1] - track_scores[track2]),
-                    "relative_performance": track_scores[track1] / max(track_scores[track2], 0.001),
-                }
-
-    # Language coverage analysis
-    for track_name, track_data in track_results.items():
-        if track_data.get("summary"):
-            summary = track_data["summary"]
-            consistency_analysis["language_coverage_analysis"][track_name] = {
-                "coverage_rate": summary["language_pairs_evaluated"] / max(summary.get("total_possible_pairs", 1), 1),
-                "samples_per_pair": summary["total_samples"] / max(summary["language_pairs_evaluated"], 1),
-                "statistical_adequacy": summary["total_samples"] >= EVALUATION_TRACKS[track_name]["min_samples_per_pair"] * summary["language_pairs_evaluated"],
-            }
-
-    return consistency_analysis
-
-
-def compare_models_statistically(
-    model1_results: Dict, model2_results: Dict, track: str = "google_comparable"
-) -> Dict:
-    """Perform statistical comparison between two models on a specific track."""
-
-    if track not in model1_results.get("tracks", {}) or track not in model2_results.get("tracks", {}):
-        return {"error": f"Track {track} not available for both models"}
-
-    track1_data = model1_results["tracks"][track]
-    track2_data = model2_results["tracks"][track]
-
-    if track1_data.get("error") or track2_data.get("error"):
-        return {"error": "One or both models have evaluation errors"}
-
-    comparison_results = {
-        "track": track,
-        "model1_category": model1_results.get("model_category", "unknown"),
-        "model2_category": model2_results.get("model_category", "unknown"),
-        "metric_comparisons": {},
-        "language_pair_comparisons": {},
-        "overall_significance": {},
-    }
-
-    # Compare each metric
-    for metric in METRICS_CONFIG["primary_metrics"] + METRICS_CONFIG["secondary_metrics"]:
-        if (metric in track1_data.get("track_statistics", {}) and
-            metric in track2_data.get("track_statistics", {})):
-
-            # Extract sample-level data for this metric from both models
-            # This would require access to the original sample metrics
-            # For now, we'll use the aggregated statistics
-
-            stats1 = track1_data["track_statistics"][metric]
-            stats2 = track2_data["track_statistics"][metric]
-
-            # Create comparison summary
-            comparison_results["metric_comparisons"][metric] = {
-                "model1_mean": stats1["mean"],
-                "model1_ci": [stats1["ci_lower"], stats1["ci_upper"]],
-                "model2_mean": stats2["mean"],
-                "model2_ci": [stats2["ci_lower"], stats2["ci_upper"]],
-                "difference": stats1["mean"] - stats2["mean"],
-                "ci_overlap": not (stats1["ci_upper"] < stats2["ci_lower"] or
-                                   stats2["ci_upper"] < stats1["ci_lower"]),
-            }
-
-    return comparison_results
-
-
-def generate_scientific_report(
-    results: Dict, model_name: str = "", baseline_results: Dict = None
-) -> str:
-    """Generate a comprehensive scientific evaluation report."""
+def generate_evaluation_report(results: Dict, model_name: str = "") -> str:
+    """Generate a comprehensive evaluation report."""
 
    if any(track_data.get("error") for track_data in results.get("tracks", {}).values()):
-        return f"❌ **Evaluation Error**: Unable to complete 
+        return f"❌ **Evaluation Error**: Unable to complete evaluation"
 
    report = []
 
    # Header
-    report.append(f"
+    report.append(f"### 🔬 Evaluation Report: {model_name or 'Model'}")
    report.append("")
 
    # Model categorization
    category_info = results.get("category_info", {})
    report.append(f"**Model Category**: {category_info.get('name', 'Unknown')}")
-    report.append(f"**Category Description**: {category_info.get('description', 'N/A')}")
    report.append("")
 
    # Track-by-track analysis
@@ -498,73 +315,33 @@
 
        track_config = EVALUATION_TRACKS[track_name]
        summary = track_data.get("summary", {})
+        track_averages = track_data.get("track_averages", {})
+        track_confidence = track_data.get("track_confidence", {})
 
-        report.append(f"
-        report.append(f"*{track_config['description']}*")
+        report.append(f"#### {track_config['name']}")
        report.append("")
 
        # Summary statistics
-        report.append("
+        report.append("**Summary Statistics:**")
        report.append(f"- **Samples Evaluated**: {summary.get('total_samples', 0):,}")
        report.append(f"- **Language Pairs**: {summary.get('language_pairs_evaluated', 0)}")
        report.append(f"- **Languages Covered**: {summary.get('languages_covered', 0)}")
-        report.append(f"- **Statistical Power**: {track_config['statistical_power']}")
        report.append("")
 
        # Primary metrics with confidence intervals
-        report.append("
+        report.append("**Primary Metrics (95% Confidence Intervals):**")
        for metric in METRICS_CONFIG["primary_metrics"]:
-            if metric in 
-                stats = 
+            if metric in track_confidence:
+                stats = track_confidence[metric]
                mean_val = stats["mean"]
                ci_lower = stats["ci_lower"]
                ci_upper = stats["ci_upper"]
 
                report.append(f"- **{metric.upper()}**: {mean_val:.4f} [{ci_lower:.4f}, {ci_upper:.4f}]")
        report.append("")
-
-        # Statistical adequacy assessment
-        min_required = track_config["min_samples_per_pair"] * summary.get("language_pairs_evaluated", 0)
-        adequacy = "✅ Adequate" if summary.get("total_samples", 0) >= min_required else "⚠️ Limited"
-        report.append(f"**Statistical Adequacy**: {adequacy}")
-        report.append("")
-
-    # Cross-track analysis
-    cross_track = results.get("cross_track_analysis", {})
-    if cross_track:
-        report.append("## 🔄 Cross-Track Consistency Analysis")
-
-        coverage_analysis = cross_track.get("language_coverage_analysis", {})
-        for track_name, coverage_info in coverage_analysis.items():
-            adequacy = "✅ Statistically adequate" if coverage_info.get("statistical_adequacy") else "⚠️ Limited statistical power"
-            report.append(f"- **{track_name}**: {adequacy}")
-
-        report.append("")
-
-    # Baseline comparison if available
-    if baseline_results:
-        report.append("## 📈 Baseline Comparison")
-        # This would include detailed statistical comparisons
-        report.append("*Statistical comparison with baseline models*")
-        report.append("")
-
-    # Scientific recommendations
-    report.append("## 💡 Scientific Recommendations")
-
-    if total_samples < SAMPLE_SIZE_RECOMMENDATIONS["publication_quality"]:
-        report.append("- ⚠️ Consider collecting more evaluation samples for publication-quality results")
-
-    google_track = results.get("tracks", {}).get("google_comparable", {})
-    if not google_track.get("error") and google_track.get("summary", {}).get("total_samples", 0) > 100:
-        report.append("- ✅ Sufficient data for comparison with commercial systems")
-
-    report.append("")
-
-    return "\n".join(report)
+    return "\n".join(report)
+
+
+# Backwards compatibility
+MIN_SAMPLES_PER_PAIR = 10
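Below is a minimal usage sketch of the refactored API (`evaluate_predictions` plus `generate_evaluation_report`). Only the function signatures and the `model_category` default come from the code above; the file paths and CSV layout are illustrative assumptions, not confirmed by this diff.

```python
# Hypothetical driver script; paths and column layout are assumptions.
import pandas as pd

from src.evaluation import evaluate_predictions, generate_evaluation_report

test_set = pd.read_csv("data/test_set.csv")            # assumed location of the reference set
predictions = pd.read_csv("submissions/my_model.csv")  # assumed location of a submission

# Runs every track, aggregating per-pair metrics with bootstrap confidence intervals.
results = evaluate_predictions(predictions, test_set, model_category="community")

# Renders the markdown report (summary statistics plus primary metrics with 95% CIs).
print(generate_evaluation_report(results, model_name="my_model"))
```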