Spaces:
Sleeping
Sleeping
Update src/utils.py
Browse files- src/utils.py +563 -134
src/utils.py
CHANGED
@@ -2,8 +2,19 @@
|
|
2 |
import re
|
3 |
import datetime
|
4 |
import pandas as pd
|
5 |
-
|
6 |
-
from
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
def get_all_language_pairs() -> List[Tuple[str, str]]:
|
9 |
"""Get all possible UG40 language pairs."""
|
@@ -14,6 +25,7 @@ def get_all_language_pairs() -> List[Tuple[str, str]]:
|
|
14 |
pairs.append((src, tgt))
|
15 |
return pairs
|
16 |
|
|
|
17 |
def get_google_comparable_pairs() -> List[Tuple[str, str]]:
|
18 |
"""Get language pairs that can be compared with Google Translate."""
|
19 |
pairs = []
|
@@ -23,220 +35,569 @@ def get_google_comparable_pairs() -> List[Tuple[str, str]]:
|
|
23 |
pairs.append((src, tgt))
|
24 |
return pairs
|
25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
def format_language_pair(src: str, tgt: str) -> str:
|
27 |
"""Format language pair for display."""
|
28 |
src_name = LANGUAGE_NAMES.get(src, src.upper())
|
29 |
tgt_name = LANGUAGE_NAMES.get(tgt, tgt.upper())
|
30 |
return f"{src_name} → {tgt_name}"
|
31 |
|
|
|
32 |
def validate_language_code(lang: str) -> bool:
|
33 |
"""Validate if language code is supported."""
|
34 |
return lang in ALL_UG40_LANGUAGES
|
35 |
|
|
|
36 |
def create_submission_id() -> str:
|
37 |
-
"""Create unique submission ID."""
|
38 |
-
|
|
|
|
|
|
|
39 |
|
40 |
def sanitize_model_name(name: str) -> str:
|
41 |
-
"""Sanitize model name for display and storage."""
|
42 |
if not name or not isinstance(name, str):
|
43 |
return "Anonymous_Model"
|
44 |
-
|
45 |
# Remove special characters, limit length
|
46 |
-
name = re.sub(r
|
47 |
# Remove multiple consecutive underscores
|
48 |
-
name = re.sub(r
|
49 |
# Remove leading/trailing underscores
|
50 |
-
name = name.strip(
|
51 |
-
|
52 |
# Ensure minimum length
|
53 |
if len(name) < 3:
|
54 |
name = f"Model_{name}"
|
55 |
-
|
|
|
|
|
|
|
|
|
|
|
56 |
return name[:50] # Limit to 50 characters
|
57 |
|
58 |
-
|
59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
if pd.isna(value) or value is None:
|
61 |
return "N/A"
|
62 |
-
|
63 |
try:
|
64 |
-
precision =
|
65 |
-
|
66 |
-
if metric ==
|
67 |
-
|
68 |
-
elif metric in [
|
69 |
-
|
70 |
-
elif metric in [
|
71 |
# Cap error rates at 1.0 for display
|
72 |
-
|
73 |
else:
|
74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
except (ValueError, TypeError):
|
76 |
return str(value)
|
77 |
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
try:
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
106 |
|
107 |
-
def validate_submission_completeness(predictions: pd.DataFrame, test_set: pd.DataFrame) -> Dict:
|
108 |
-
"""Validate that submission covers all required samples."""
|
109 |
-
|
110 |
if predictions.empty or test_set.empty:
|
111 |
return {
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
|
|
117 |
}
|
118 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
try:
|
120 |
-
required_ids = set(test_set[
|
121 |
-
provided_ids = set(predictions[
|
122 |
-
|
123 |
missing_ids = required_ids - provided_ids
|
124 |
extra_ids = provided_ids - required_ids
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
|
|
132 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
133 |
except Exception as e:
|
134 |
-
print(f"Error
|
135 |
return {
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
|
|
141 |
}
|
142 |
|
143 |
-
|
144 |
-
|
145 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
146 |
if predictions.empty or test_set.empty:
|
147 |
return {}
|
148 |
-
|
149 |
try:
|
150 |
# Merge to get language info
|
151 |
-
merged = test_set.merge(
|
152 |
-
|
|
|
|
|
153 |
coverage = {}
|
154 |
for src in ALL_UG40_LANGUAGES:
|
155 |
for tgt in ALL_UG40_LANGUAGES:
|
156 |
-
if src
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
]
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
171 |
return coverage
|
|
|
172 |
except Exception as e:
|
173 |
print(f"Error calculating language pair coverage: {e}")
|
174 |
return {}
|
175 |
|
|
|
176 |
def safe_divide(numerator: float, denominator: float, default: float = 0.0) -> float:
|
177 |
"""Safely divide two numbers, handling edge cases."""
|
178 |
try:
|
179 |
if denominator == 0 or pd.isna(denominator) or pd.isna(numerator):
|
180 |
return default
|
181 |
result = numerator / denominator
|
182 |
-
if pd.isna(result) or not
|
183 |
return default
|
184 |
return float(result)
|
185 |
except (TypeError, ValueError, ZeroDivisionError):
|
186 |
return default
|
187 |
|
|
|
188 |
def clean_text_for_evaluation(text: str) -> str:
|
189 |
"""Clean text for evaluation, handling common encoding issues."""
|
190 |
if not isinstance(text, str):
|
191 |
return str(text) if text is not None else ""
|
192 |
-
|
193 |
# Remove extra whitespace
|
194 |
-
text = re.sub(r
|
195 |
-
|
196 |
# Handle common encoding issues
|
197 |
-
text = text.replace(
|
198 |
-
text = text.replace(
|
199 |
-
text = text.replace(
|
200 |
-
text = text.replace(
|
201 |
-
|
202 |
return text
|
203 |
|
204 |
-
|
205 |
-
|
206 |
-
|
|
|
|
|
207 |
return {}
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
221 |
}
|
222 |
|
223 |
-
|
224 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
225 |
clean_name = sanitize_model_name(model_name)
|
226 |
-
clean_author =
|
|
|
|
|
|
|
227 |
timestamp = datetime.datetime.now().strftime("%m%d_%H%M")
|
228 |
-
return f"{clean_name}_{clean_author}_{timestamp}"
|
229 |
|
230 |
-
|
231 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
232 |
if df.empty:
|
233 |
return False, ["DataFrame is empty"]
|
234 |
-
|
|
|
|
|
|
|
235 |
missing_columns = [col for col in required_columns if col not in df.columns]
|
236 |
if missing_columns:
|
237 |
-
|
238 |
-
|
239 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
240 |
|
241 |
def format_duration(seconds: float) -> str:
|
242 |
"""Format duration in seconds to human-readable format."""
|
@@ -247,12 +608,80 @@ def format_duration(seconds: float) -> str:
|
|
247 |
else:
|
248 |
return f"{seconds/3600:.1f}h"
|
249 |
|
|
|
250 |
def truncate_text(text: str, max_length: int = 100, suffix: str = "...") -> str:
|
251 |
"""Truncate text to specified length with suffix."""
|
252 |
if not isinstance(text, str):
|
253 |
text = str(text)
|
254 |
-
|
255 |
if len(text) <= max_length:
|
256 |
return text
|
257 |
-
|
258 |
-
return text[:max_length - len(suffix)] + suffix
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
import re
|
3 |
import datetime
|
4 |
import pandas as pd
|
5 |
+
import numpy as np
|
6 |
+
from typing import Dict, List, Tuple, Set, Optional, Union
|
7 |
+
from scipy import stats
|
8 |
+
from config import (
|
9 |
+
ALL_UG40_LANGUAGES,
|
10 |
+
GOOGLE_SUPPORTED_LANGUAGES,
|
11 |
+
LANGUAGE_NAMES,
|
12 |
+
EVALUATION_TRACKS,
|
13 |
+
MODEL_CATEGORIES,
|
14 |
+
STATISTICAL_CONFIG,
|
15 |
+
METRICS_CONFIG,
|
16 |
+
)
|
17 |
+
|
18 |
|
19 |
def get_all_language_pairs() -> List[Tuple[str, str]]:
|
20 |
"""Get all possible UG40 language pairs."""
|
|
|
25 |
pairs.append((src, tgt))
|
26 |
return pairs
|
27 |
|
28 |
+
|
29 |
def get_google_comparable_pairs() -> List[Tuple[str, str]]:
|
30 |
"""Get language pairs that can be compared with Google Translate."""
|
31 |
pairs = []
|
|
|
35 |
pairs.append((src, tgt))
|
36 |
return pairs
|
37 |
|
38 |
+
|
39 |
+
def get_track_language_pairs(track: str) -> List[Tuple[str, str]]:
    """Get the ordered language pairs for a specific evaluation track.

    Args:
        track: Key into ``EVALUATION_TRACKS``.

    Returns:
        Every ``(source, target)`` combination of the track's configured
        languages with ``source != target``, in nested-loop order. An empty
        list is returned when the track is not configured.
    """
    if track not in EVALUATION_TRACKS:
        return []

    track_languages = EVALUATION_TRACKS[track]["languages"]
    return [
        (src, tgt)
        for src in track_languages
        for tgt in track_languages
        if src != tgt
    ]
|
51 |
+
|
52 |
+
|
53 |
def format_language_pair(src: str, tgt: str) -> str:
    """Format a language pair for display.

    Args:
        src: Source language code.
        tgt: Target language code.

    Returns:
        ``"<source name> → <target name>"``. A code that has no entry in
        ``LANGUAGE_NAMES`` is shown upper-cased instead of by name.
    """
    src_name = LANGUAGE_NAMES.get(src, src.upper())
    tgt_name = LANGUAGE_NAMES.get(tgt, tgt.upper())
    return f"{src_name} → {tgt_name}"
|
58 |
|
59 |
+
|
60 |
def validate_language_code(lang: str) -> bool:
    """Return True when ``lang`` is one of the codes in ``ALL_UG40_LANGUAGES``."""
    return lang in ALL_UG40_LANGUAGES
|
63 |
|
64 |
+
|
65 |
def create_submission_id() -> str:
    """Create a submission ID from the current local time plus a random suffix.

    Returns:
        A string of the form ``sub_YYYYMMDD_HHMMSS_NNNN`` where ``NNNN`` is a
        random four-digit number. The suffix only lowers the chance of a
        collision between submissions made in the same second; it does not
        guarantee uniqueness.
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    # randint's upper bound is exclusive, so 10000 is needed to allow 9999.
    random_suffix = str(np.random.randint(1000, 10000))
    return f"sub_{timestamp}_{random_suffix}"
|
70 |
+
|
71 |
|
72 |
def sanitize_model_name(name: str) -> str:
    """Sanitize a model name for display and storage.

    Args:
        name: Raw, user-supplied model name.

    Returns:
        A name of at most 50 characters containing only word characters,
        hyphens and dots. ``"Anonymous_Model"`` is returned when ``name`` is
        empty, not a string, or contains nothing usable after cleaning.
    """
    if not name or not isinstance(name, str):
        return "Anonymous_Model"

    # Replace every character that is not a word character, hyphen or dot.
    name = re.sub(r"[^\w\-.]", "_", name.strip())
    # Collapse runs of underscores, then drop them from both ends.
    name = re.sub(r"_+", "_", name)
    name = name.strip("_")

    # A name made only of special characters is empty at this point; prefixing
    # it would yield "Model_" with a dangling underscore.
    if not name:
        return "Anonymous_Model"

    # Ensure minimum length
    if len(name) < 3:
        name = f"Model_{name}"

    # Prefix names that collide with reserved identifiers.
    reserved_names = {"admin", "test", "baseline", "google", "system"}
    if name.lower() in reserved_names:
        name = f"User_{name}"

    return name[:50]  # Limit to 50 characters
|
94 |
|
95 |
+
|
96 |
+
def format_metric_value(
    value: float,
    metric: str,
    include_ci: bool = False,
    ci_lower: Optional[float] = None,
    ci_upper: Optional[float] = None,
) -> str:
    """Format a metric value for display, optionally with a confidence interval.

    Args:
        value: Metric value; ``None``/NaN is rendered as ``"N/A"``.
        metric: Metric name. ``"coverage_rate"`` is shown as a percentage,
            ``"bleu"`` with two decimals, and ``"cer"``/``"wer"`` are capped
            at 1.0. Everything else uses ``METRICS_CONFIG["display_precision"]``
            decimals.
        include_ci: Append ``" [lower, upper]"`` when both bounds are given.
        ci_lower: Lower confidence bound.
        ci_upper: Upper confidence bound.

    Returns:
        The formatted string, or ``str(value)`` when formatting fails.
    """
    if value is None or pd.isna(value):
        return "N/A"

    try:
        precision = METRICS_CONFIG["display_precision"]

        shown = value
        if metric == "coverage_rate":
            spec = f".{precision}%"
        elif metric == "bleu":
            spec = ".2f"
        else:
            spec = f".{precision}f"
            if metric in ("cer", "wer"):
                # Cap error rates at 1.0 for display
                shown = min(value, 1.0)

        formatted = format(shown, spec)

        # The bounds use the same format spec as the value so that, e.g., a
        # percentage is not followed by an interval expressed as a fraction.
        if include_ci and ci_lower is not None and ci_upper is not None:
            formatted += f" [{format(ci_lower, spec)}, {format(ci_upper, spec)}]"

        return formatted

    except (ValueError, TypeError):
        return str(value)
|
129 |
|
130 |
+
|
131 |
+
def calculate_effect_size(values1: List[float], values2: List[float]) -> float:
    """Calculate the magnitude of Cohen's d between two groups.

    Args:
        values1: Observations of the first group.
        values2: Observations of the second group.

    Returns:
        ``|mean1 - mean2| / pooled_std`` as a Python float. ``0.0`` is
        returned when either group has fewer than two usable values, when the
        pooled standard deviation is zero or not finite, or when the inputs
        cannot be interpreted as numbers.
    """
    if len(values1) < 2 or len(values2) < 2:
        return 0.0

    try:
        # dtype=float turns None into NaN, so missing entries are dropped by
        # the NaN filter below instead of making np.isnan raise.
        group1 = np.asarray(values1, dtype=float)
        group2 = np.asarray(values2, dtype=float)
    except (TypeError, ValueError):
        return 0.0

    # Remove NaN values
    group1 = group1[~np.isnan(group1)]
    group2 = group2[~np.isnan(group2)]

    n1, n2 = group1.size, group2.size
    if n1 < 2 or n2 < 2:
        return 0.0

    # Pooled standard deviation from the unbiased (ddof=1) variances.
    pooled_var = (
        (n1 - 1) * np.var(group1, ddof=1) + (n2 - 1) * np.var(group2, ddof=1)
    ) / (n1 + n2 - 2)
    pooled_std = np.sqrt(pooled_var)

    if pooled_std == 0 or not np.isfinite(pooled_std):
        return 0.0

    return float(abs((np.mean(group1) - np.mean(group2)) / pooled_std))
|
163 |
+
|
164 |
+
|
165 |
+
def interpret_effect_size(effect_size: float) -> str:
    """Interpret an effect size according to Cohen's conventions.

    The cut-offs are read from
    ``STATISTICAL_CONFIG["effect_size_thresholds"]`` (keys ``"small"``,
    ``"medium"`` and ``"large"``).

    Args:
        effect_size: Cohen's d; only its magnitude is considered.

    Returns:
        One of ``"negligible"``, ``"small"``, ``"medium"`` or ``"large"``.
        A NaN effect size is reported as ``"negligible"``.
    """
    thresholds = STATISTICAL_CONFIG["effect_size_thresholds"]

    # The sign of d only encodes direction, so classify by magnitude.
    magnitude = abs(effect_size)

    # Every comparison with NaN is False, which would otherwise fall through
    # to the final branch and label an undefined effect "large".
    if np.isnan(magnitude) or magnitude < thresholds["small"]:
        return "negligible"
    if magnitude < thresholds["medium"]:
        return "small"
    if magnitude < thresholds["large"]:
        return "medium"
    return "large"
|
177 |
+
|
178 |
+
|
179 |
+
def calculate_statistical_power(
    effect_size: float, n1: int, n2: int, alpha: float = 0.05
) -> float:
    """Estimate the power of a two-sided, two-sample t-test.

    Args:
        effect_size: Standardised mean difference (Cohen's d); the sign is
            ignored because the test is two-sided.
        n1: Sample size of the first group.
        n2: Sample size of the second group.
        alpha: Significance level of the test.

    Returns:
        Power clamped to ``[0.0, 1.0]``; ``0.0`` when either group has fewer
        than two samples or the computation fails.
    """
    if n1 < 2 or n2 < 2:
        return 0.0

    try:
        df = n1 + n2 - 2
        pooled_se = np.sqrt((1 / n1) + (1 / n2))

        # Critical t-value
        t_critical = stats.t.ppf(1 - alpha / 2, df)

        # Non-centrality parameter
        ncp = abs(effect_size) / pooled_se

        # Under the alternative hypothesis the test statistic follows a
        # non-central t distribution with `df` degrees of freedom.
        power = (
            1
            - stats.nct.cdf(t_critical, df, ncp)
            + stats.nct.cdf(-t_critical, df, ncp)
        )

        if not np.isfinite(power):
            # Fall back to the location-shifted central-t approximation if the
            # non-central CDF could not be evaluated for these arguments.
            power = (
                1
                - stats.t.cdf(t_critical, df, loc=ncp)
                + stats.t.cdf(-t_critical, df, loc=ncp)
            )

        if not np.isfinite(power):
            return 0.0

        return float(min(1.0, max(0.0, power)))

    except (TypeError, ValueError, ArithmeticError):
        return 0.0
|
209 |
+
|
210 |
+
|
211 |
+
def get_track_statistics(test_data: pd.DataFrame) -> Dict[str, Dict]:
    """Summarise how well the test data covers each evaluation track.

    Args:
        test_data: Test set with ``source_language`` and ``target_language``
            columns.

    Returns:
        A dict keyed by track name. Every entry has the same keys:
        ``total_samples``, ``language_pairs`` (pairs with at least one
        sample), ``samples_per_pair`` and ``coverage_matrix`` (both map
        ``"<src>_to_<tgt>"`` to a row count), ``adequacy_assessment``,
        ``adequacy_rate`` and ``min_samples_per_pair``.
    """
    track_stats = {}

    for track_name, track_config in EVALUATION_TRACKS.items():
        track_languages = track_config["languages"]

        # Keep only rows whose source AND target belong to this track.
        in_track = test_data["source_language"].isin(track_languages) & test_data[
            "target_language"
        ].isin(track_languages)
        track_data = test_data[in_track]

        if track_data.empty:
            # Same key set as the populated result below, so consumers can
            # read any field without checking which branch produced it.
            track_stats[track_name] = {
                "total_samples": 0,
                "language_pairs": 0,
                "samples_per_pair": {},
                "coverage_matrix": {},
                "adequacy_assessment": "insufficient",
                "adequacy_rate": 0.0,
                "min_samples_per_pair": track_config.get("min_samples_per_pair"),
            }
            continue

        # One grouped pass instead of re-filtering the frame for every pair.
        rows_per_pair = (
            track_data.groupby(["source_language", "target_language"])
            .size()
            .to_dict()
        )
        pair_counts = {
            f"{src}_to_{tgt}": int(rows_per_pair.get((src, tgt), 0))
            for src in track_languages
            for tgt in track_languages
            if src != tgt
        }

        # Calculate adequacy
        min_required = track_config["min_samples_per_pair"]
        adequate_pairs = sum(
            1 for count in pair_counts.values() if count >= min_required
        )
        total_possible_pairs = len(track_languages) * (len(track_languages) - 1)
        adequacy_rate = adequate_pairs / max(total_possible_pairs, 1)

        if adequacy_rate >= 0.8:
            adequacy = "excellent"
        elif adequacy_rate >= 0.6:
            adequacy = "good"
        elif adequacy_rate >= 0.4:
            adequacy = "fair"
        else:
            adequacy = "insufficient"

        track_stats[track_name] = {
            "total_samples": len(track_data),
            "language_pairs": sum(1 for count in pair_counts.values() if count > 0),
            "samples_per_pair": pair_counts,
            # Separate copy so mutating one mapping cannot alter the other.
            "coverage_matrix": dict(pair_counts),
            "adequacy_assessment": adequacy,
            "adequacy_rate": adequacy_rate,
            "min_samples_per_pair": min_required,
        }

    return track_stats
|
278 |
+
|
279 |
+
|
280 |
+
def validate_submission_completeness_scientific(
    predictions: pd.DataFrame, test_set: pd.DataFrame, track: Optional[str] = None
) -> Dict:
    """Check that a submission covers the required samples.

    Sample IDs are compared as strings.

    Args:
        predictions: Submitted predictions with a ``sample_id`` column.
        test_set: Reference test set with ``sample_id``, ``source_language``
            and ``target_language`` columns.
        track: Optional track name. When it is a key of
            ``EVALUATION_TRACKS`` the test set is first restricted to that
            track's languages; any truthy value also triggers a per-track
            coverage analysis.

    Returns:
        A dict with ``is_complete``, ``missing_count``, ``extra_count``,
        ``missing_ids`` (at most ten, sorted), ``coverage`` and
        ``track_analysis`` (empty unless ``track`` is given).
    """
    if predictions.empty or test_set.empty:
        return {
            "is_complete": False,
            "missing_count": len(test_set) if not test_set.empty else 0,
            "extra_count": len(predictions) if not predictions.empty else 0,
            "missing_ids": [],
            "coverage": 0.0,
            "track_analysis": {},
        }

    # If track specified, filter to track languages
    if track and track in EVALUATION_TRACKS:
        track_languages = EVALUATION_TRACKS[track]["languages"]
        test_set = test_set[
            (test_set["source_language"].isin(track_languages))
            & (test_set["target_language"].isin(track_languages))
        ]

    try:
        required_ids = set(test_set["sample_id"].astype(str))
        provided_ids = set(predictions["sample_id"].astype(str))

        missing_ids = required_ids - provided_ids
        extra_ids = provided_ids - required_ids
        matching_ids = provided_ids & required_ids

        base_result = {
            # Track filtering can leave nothing to cover; an empty requirement
            # must not be reported as a complete submission.
            "is_complete": bool(required_ids) and not missing_ids,
            "missing_count": len(missing_ids),
            "extra_count": len(extra_ids),
            # Sorted so the reported sample is stable between runs.
            "missing_ids": sorted(missing_ids)[:10],
            "coverage": len(matching_ids) / len(required_ids) if required_ids else 0.0,
            # Always present, matching the empty-input and error results.
            "track_analysis": {},
        }

        # Add track-specific analysis if requested
        if track:
            base_result["track_analysis"] = analyze_track_coverage(
                predictions, test_set, track
            )

        return base_result

    except Exception as e:
        # Best-effort boundary: report the failure as an incomplete submission.
        print(f"Error in submission completeness validation: {e}")
        return {
            "is_complete": False,
            "missing_count": 0,
            "extra_count": 0,
            "missing_ids": [],
            "coverage": 0.0,
            "track_analysis": {},
        }
|
336 |
|
337 |
+
|
338 |
+
def analyze_track_coverage(
    predictions: pd.DataFrame, test_set: pd.DataFrame, track: str
) -> Dict:
    """Analyze how well a submission covers one evaluation track.

    Args:
        predictions: Submitted predictions with ``sample_id`` and
            ``prediction`` columns.
        test_set: Reference test set with ``sample_id``, ``source_language``
            and ``target_language`` columns.
        track: Key into ``EVALUATION_TRACKS``.

    Returns:
        ``{"error": ...}`` when the track is unknown or has no test rows.
        Otherwise a dict with ``track_name``, ``total_language_pairs``,
        ``adequate_pairs``, ``adequacy_rate``, ``pair_analysis`` (per
        ``"<src>_to_<tgt>"``: ``total``, ``covered``, ``coverage_rate``,
        ``meets_minimum``) and ``overall_adequate``. All numbers are plain
        Python values.
    """
    if track not in EVALUATION_TRACKS:
        return {"error": f"Unknown track: {track}"}

    track_config = EVALUATION_TRACKS[track]
    track_languages = track_config["languages"]

    # Filter test set to track languages
    track_test_set = test_set[
        (test_set["source_language"].isin(track_languages))
        & (test_set["target_language"].isin(track_languages))
    ]

    if track_test_set.empty:
        return {"error": f"No test data available for {track} track"}

    min_required = track_config["min_samples_per_pair"]

    # Join on string IDs: merging an object column with an integer column
    # raises in pandas, and sample IDs are compared as strings elsewhere.
    merged = track_test_set.assign(
        sample_id=track_test_set["sample_id"].astype(str)
    ).merge(
        predictions.assign(sample_id=predictions["sample_id"].astype(str)),
        on="sample_id",
        how="left",
        suffixes=("", "_pred"),
    )

    # One grouped pass: "size" counts rows, "count" counts non-null predictions.
    by_pair = (
        merged.groupby(["source_language", "target_language"])["prediction"]
        .agg(total="size", covered="count")
        .to_dict("index")
    )

    # Analyze by language pair, keeping the configured language order.
    pair_analysis = {}
    for src in track_languages:
        for tgt in track_languages:
            if src == tgt:
                continue

            counts = by_pair.get((src, tgt))
            if not counts or counts["total"] == 0:
                continue

            total = int(counts["total"])
            covered = int(counts["covered"])
            pair_analysis[f"{src}_to_{tgt}"] = {
                "total": total,
                "covered": covered,
                "coverage_rate": covered / total,
                "meets_minimum": covered >= min_required,
            }

    # Overall track statistics
    total_pairs = len(pair_analysis)
    adequate_pairs = sum(1 for info in pair_analysis.values() if info["meets_minimum"])

    return {
        "track_name": track_config["name"],
        "total_language_pairs": total_pairs,
        "adequate_pairs": adequate_pairs,
        "adequacy_rate": adequate_pairs / max(total_pairs, 1),
        "pair_analysis": pair_analysis,
        # 80% of pairs adequate; a track with no analysable pair is not adequate.
        "overall_adequate": total_pairs > 0 and adequate_pairs >= total_pairs * 0.8,
    }
|
396 |
+
|
397 |
+
|
398 |
+
def calculate_language_pair_coverage_scientific(
    predictions: pd.DataFrame, test_set: pd.DataFrame
) -> Dict:
    """Calculate per-language-pair coverage of a submission.

    Args:
        predictions: Submitted predictions with ``sample_id`` and
            ``prediction`` columns.
        test_set: Reference test set with ``sample_id``, ``source_language``
            and ``target_language`` columns.

    Returns:
        A dict keyed by ``"<src>_<tgt>"`` for every pair of
        ``ALL_UG40_LANGUAGES`` that has test rows. Each entry holds ``total``,
        ``predicted``, ``coverage``, ``display_name``, ``tracks_included``,
        ``google_comparable`` and ``statistical_adequacy`` (per included
        track). An empty dict is returned for empty input or on any error.
    """
    if predictions.empty or test_set.empty:
        return {}

    try:
        # Merge to get language info. Join on string IDs: merging an object
        # column with an integer column raises in pandas.
        merged = test_set.assign(sample_id=test_set["sample_id"].astype(str)).merge(
            predictions.assign(sample_id=predictions["sample_id"].astype(str)),
            on="sample_id",
            how="left",
            suffixes=("", "_pred"),
        )

        # One grouped pass instead of filtering the frame once per pair:
        # "size" counts rows, "count" counts non-null predictions.
        by_pair = (
            merged.groupby(["source_language", "target_language"])["prediction"]
            .agg(total="size", predicted="count")
            .to_dict("index")
        )

        coverage = {}
        for src in ALL_UG40_LANGUAGES:
            for tgt in ALL_UG40_LANGUAGES:
                if src == tgt:
                    continue

                counts = by_pair.get((src, tgt))
                if not counts or counts["total"] == 0:
                    continue

                total = int(counts["total"])
                predicted_count = int(counts["predicted"])

                # Determine which tracks include this pair
                tracks_included = [
                    track_name
                    for track_name, track_config in EVALUATION_TRACKS.items()
                    if src in track_config["languages"]
                    and tgt in track_config["languages"]
                ]

                coverage[f"{src}_{tgt}"] = {
                    "total": total,
                    "predicted": predicted_count,
                    "coverage": predicted_count / total,
                    "display_name": format_language_pair(src, tgt),
                    "tracks_included": tracks_included,
                    "google_comparable": (
                        src in GOOGLE_SUPPORTED_LANGUAGES
                        and tgt in GOOGLE_SUPPORTED_LANGUAGES
                    ),
                    "statistical_adequacy": {
                        track: predicted_count
                        >= EVALUATION_TRACKS[track]["min_samples_per_pair"]
                        for track in tracks_included
                    },
                }

        return coverage

    except Exception as e:
        # Best-effort boundary: coverage is informational, so report and go on.
        print(f"Error calculating language pair coverage: {e}")
        return {}
|
458 |
|
459 |
+
|
460 |
def safe_divide(numerator: float, denominator: float, default: float = 0.0) -> float:
    """Safely divide two numbers, handling edge cases.

    Args:
        numerator: Dividend.
        denominator: Divisor.
        default: Value returned whenever the quotient is not a finite number.

    Returns:
        ``numerator / denominator`` as a float, or ``default`` when the
        denominator is zero, either operand is missing (NaN/None), the
        operands are not numeric, or the result overflows or is not finite.
    """
    try:
        if denominator == 0 or pd.isna(denominator) or pd.isna(numerator):
            return default
        result = float(numerator / denominator)
        # np.isfinite is False for NaN as well as for +/-inf.
        if not np.isfinite(result):
            return default
        return result
    except (TypeError, ValueError, ArithmeticError):
        # ArithmeticError covers ZeroDivisionError and OverflowError; the
        # latter is raised when a huge int quotient does not fit in a float.
        return default
|
471 |
|
472 |
+
|
473 |
def clean_text_for_evaluation(text: str) -> str:
    """Clean text for evaluation, handling common encoding issues.

    Args:
        text: Text to clean. Non-string values are converted with ``str``;
            ``None`` and float NaN become an empty string.

    Returns:
        The text with surrounding whitespace removed, internal whitespace
        runs collapsed to a single space, and curly quotes replaced by their
        ASCII equivalents.
    """
    if text is None:
        return ""

    if not isinstance(text, str):
        # A float NaN compares unequal to itself; converting it with str()
        # would inject the literal word "nan" into the evaluated text.
        if isinstance(text, float) and text != text:
            return ""
        return str(text)

    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text.strip())

    # Handle common encoding issues in a single pass
    replacements = str.maketrans(
        {
            "\u00a0": " ",  # Non-breaking space
            "\u2018": "'",  # Left single quotation mark
            "\u2019": "'",  # Right single quotation mark
            "\u201c": '"',  # Left double quotation mark
            "\u201d": '"',  # Right double quotation mark
        }
    )
    return text.translate(replacements)
|
488 |
|
489 |
+
|
490 |
+
def get_model_summary_stats_scientific(
    model_results: Dict, track: Optional[str] = None
) -> Dict:
    """Extract summary statistics from model evaluation results.

    Args:
        model_results: Evaluation results with a ``"tracks"`` mapping of
            track name to per-track data.
        track: Optional track name. When it is present in the results a
            single-track summary is returned; otherwise (including when the
            requested track is absent) a summary across all tracks is
            returned.

    Returns:
        ``{}`` when there are no results, ``{"error": ...}`` when the
        requested track is flagged with an error, otherwise the summary dict.
    """
    if not model_results or "tracks" not in model_results:
        return {}

    tracks = model_results["tracks"]

    # If specific track requested
    if track and track in tracks:
        track_data = tracks[track]
        if track_data.get("error"):
            return {"error": f"No valid data for {track} track"}

        track_averages = track_data.get("track_averages", {})
        track_statistics = track_data.get("track_statistics", {})
        summary = track_data.get("summary", {})
        total_samples = summary.get("total_samples", 0)

        # Named track_summary rather than `stats` so that the module-level
        # scipy `stats` import is not shadowed.
        track_summary = {
            "track": track,
            # Fall back to the key itself for tracks missing from the config.
            "track_name": EVALUATION_TRACKS.get(track, {}).get("name", track),
            "quality_score": track_averages.get("quality_score", 0.0),
            "bleu": track_averages.get("bleu", 0.0),
            "chrf": track_averages.get("chrf", 0.0),
            "total_samples": total_samples,
            "language_pairs": summary.get("language_pairs_evaluated", 0),
            "statistical_adequacy": total_samples >= 100,  # Simple threshold
        }

        # Add confidence intervals if available
        if "quality_score" in track_statistics:
            quality_stats = track_statistics["quality_score"]
            track_summary["confidence_interval"] = [
                quality_stats.get("ci_lower", 0.0),
                quality_stats.get("ci_upper", 0.0),
            ]

        return track_summary

    # Otherwise, return summary across all tracks
    all_tracks_summary = {
        "tracks_evaluated": sum(1 for t in tracks.values() if not t.get("error")),
        "total_tracks": len(EVALUATION_TRACKS),
        "by_track": {},
    }

    for track_name, track_data in tracks.items():
        if track_data.get("error"):
            continue

        track_averages = track_data.get("track_averages", {})
        summary = track_data.get("summary", {})
        all_tracks_summary["by_track"][track_name] = {
            "quality_score": track_averages.get("quality_score", 0.0),
            "samples": summary.get("total_samples", 0),
            "pairs": summary.get("language_pairs_evaluated", 0),
        }

    return all_tracks_summary
|
549 |
+
|
550 |
+
|
551 |
+
def generate_model_identifier_scientific(
    model_name: str, author: str, category: str
) -> str:
    """Generate an identifier for a model submission.

    Args:
        model_name: Raw model name; cleaned with ``sanitize_model_name``.
        author: Author name; characters other than word characters and
            hyphens are replaced by underscores and the result is cut to 20
            characters. ``"Anonymous"`` is used when nothing usable remains.
        category: Model category; cut to 10 characters when it is a key of
            ``MODEL_CATEGORIES``, otherwise ``"community"``.

    Returns:
        ``"<category>_<name>_<author>_<MMDD_HHMM>"``. The timestamp has
        minute resolution and no year, so identifiers are only unique within
        those limits.
    """
    clean_name = sanitize_model_name(model_name)

    clean_author = (
        re.sub(r"[^\w\-]", "_", str(author).strip())[:20] if author else ""
    )
    # An author made only of whitespace or punctuation would otherwise leave
    # an empty or underscore-only segment in the identifier.
    if not clean_author.strip("_"):
        clean_author = "Anonymous"

    clean_category = category[:10] if category in MODEL_CATEGORIES else "community"
    timestamp = datetime.datetime.now().strftime("%m%d_%H%M")

    return f"{clean_category}_{clean_name}_{clean_author}_{timestamp}"
|
563 |
+
|
564 |
+
|
565 |
+
def validate_dataframe_structure_enhanced(
    df: pd.DataFrame, required_columns: List[str], track: Optional[str] = None
) -> Tuple[bool, List[str]]:
    """Validate DataFrame structure, with optional track-specific checks.

    Note:
        As a side effect, a non-object ``sample_id`` column is converted to
        strings in place on ``df``.

    Args:
        df: DataFrame to validate.
        required_columns: Column names that must be present.
        track: Optional key into ``EVALUATION_TRACKS``; when it is a known
            track, the row count is checked against five times the track's
            ``min_samples_per_pair`` (default 10).

    Returns:
        ``(is_valid, issues)`` where ``issues`` lists every problem found.
    """
    if df is None or df.empty:
        return False, ["DataFrame is empty"]

    issues = []

    # Check required columns
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        issues.append(f"Missing columns: {', '.join(missing_columns)}")

    # Check for track-specific requirements
    if track and track in EVALUATION_TRACKS:
        min_samples = EVALUATION_TRACKS[track].get("min_samples_per_pair", 10)
        required_rows = min_samples * 5  # At least 5 pairs worth of data

        if len(df) < required_rows:
            issues.append(
                f"Insufficient samples for {track} track (minimum ~{required_rows})"
            )

    # Sample IDs are handled as strings; coerce numeric IDs in place.
    if "sample_id" in df.columns and df["sample_id"].dtype != "object":
        try:
            df["sample_id"] = df["sample_id"].astype(str)
        except (TypeError, ValueError):
            issues.append("Cannot convert sample_id to string")

    return not issues, issues
|
600 |
+
|
601 |
|
602 |
def format_duration(seconds: float) -> str:
|
603 |
"""Format duration in seconds to human-readable format."""
|
|
|
608 |
else:
|
609 |
return f"{seconds/3600:.1f}h"
|
610 |
|
611 |
+
|
612 |
def truncate_text(text: str, max_length: int = 100, suffix: str = "...") -> str:
    """Truncate text to at most ``max_length`` characters, ending in ``suffix``.

    Args:
        text: Text to shorten; non-string values are converted with ``str``.
        max_length: Maximum length of the returned string.
        suffix: Marker appended to shortened text; it counts towards
            ``max_length``.

    Returns:
        ``text`` unchanged when it already fits, otherwise a string of at
        most ``max_length`` characters.
    """
    if not isinstance(text, str):
        text = str(text)

    if len(text) <= max_length:
        return text

    if max_length <= 0:
        return ""

    # When the suffix alone does not fit, a negative slice bound would keep
    # almost the whole text; return as much of the suffix as is allowed.
    if max_length <= len(suffix):
        return suffix[:max_length]

    return text[: max_length - len(suffix)] + suffix
|
621 |
+
|
622 |
+
|
623 |
+
def calculate_sample_size_recommendation(
    desired_power: float = 0.8, effect_size: float = 0.5, alpha: float = 0.05
) -> int:
    """Recommend a per-group sample size for a two-sided, two-sample test.

    Uses the normal approximation ``n = 2 * ((z_alpha + z_beta) / d) ** 2``.

    Args:
        desired_power: Target power, strictly between 0 and 1.
        effect_size: Expected standardised effect (Cohen's d); only its
            magnitude is used.
        alpha: Significance level, strictly between 0 and 1.

    Returns:
        The recommended sample size per group, never below 10. ``50`` is
        returned as a fallback when the inputs are unusable.
    """
    fallback = 50  # Default fallback

    try:
        magnitude = abs(float(effect_size))
        # Reject unusable inputs up front rather than relying on NaN/inf
        # propagating into an exception further down.
        if magnitude == 0 or not (0 < alpha < 1) or not (0 < desired_power < 1):
            return fallback

        z_alpha = stats.norm.ppf(1 - alpha / 2)
        z_beta = stats.norm.ppf(desired_power)

        # Sample size per group
        n_per_group = 2 * ((z_alpha + z_beta) / magnitude) ** 2

        # Round up to nearest integer
        return max(10, int(np.ceil(n_per_group)))

    except (TypeError, ValueError, ArithmeticError):
        return fallback
|
643 |
+
|
644 |
+
|
645 |
+
def assess_model_category_appropriateness(
    model_name: str, category: str, performance_data: Dict
) -> Dict:
    """Assess whether a model's assigned category fits its performance.

    Args:
        model_name: Model name (not used by the current checks).
        category: Assigned category; only ``"baseline"`` and ``"commercial"``
            trigger checks.
        performance_data: Evaluation results with a ``"tracks"`` mapping, or
            ``None``/empty when no results are available.

    Returns:
        A dict with ``category``, ``appropriate``, ``confidence`` and
        ``recommendations``.
    """
    assessment = {
        "category": category,
        "appropriate": True,
        "confidence": 1.0,
        "recommendations": [],
    }

    # Missing performance data means there is nothing to compare against.
    tracks = (performance_data or {}).get("tracks", {})

    # Check for category mismatches based on performance
    if category == "baseline":
        # Baselines shouldn't perform too well
        quality_scores = [
            track_data.get("track_averages", {}).get("quality_score", 0)
            for track_data in tracks.values()
            if not track_data.get("error")
        ]

        if quality_scores and max(quality_scores) > 0.7:  # High for a baseline
            assessment["appropriate"] = False
            assessment["confidence"] = 0.3
            assessment["recommendations"].append(
                "High performance suggests this might not be a baseline model"
            )

    # Check for commercial model expectations
    if category == "commercial":
        # Commercial models should have good Google-comparable performance.
        # Judge only when that track was actually evaluated: an absent track
        # is not evidence of low quality.
        google_track = tracks.get("google_comparable")
        if google_track and not google_track.get("error"):
            quality = google_track.get("track_averages", {}).get("quality_score", 0)
            if quality < 0.3:  # Poor performance for commercial
                assessment["recommendations"].append(
                    "Low performance unexpected for commercial systems"
                )

    return assessment
|