akera commited on
Commit
a411078
·
verified ·
1 Parent(s): ce36131

Update src/utils.py

Browse files
Files changed (1) hide show
  1. src/utils.py +563 -134
src/utils.py CHANGED
@@ -2,8 +2,19 @@
2
  import re
3
  import datetime
4
  import pandas as pd
5
- from typing import Dict, List, Tuple, Set, Optional
6
- from config import ALL_UG40_LANGUAGES, LANGUAGE_NAMES, GOOGLE_SUPPORTED_LANGUAGES, DISPLAY_CONFIG
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  def get_all_language_pairs() -> List[Tuple[str, str]]:
9
  """Get all possible UG40 language pairs."""
@@ -14,6 +25,7 @@ def get_all_language_pairs() -> List[Tuple[str, str]]:
14
  pairs.append((src, tgt))
15
  return pairs
16
 
 
17
  def get_google_comparable_pairs() -> List[Tuple[str, str]]:
18
  """Get language pairs that can be compared with Google Translate."""
19
  pairs = []
@@ -23,220 +35,569 @@ def get_google_comparable_pairs() -> List[Tuple[str, str]]:
23
  pairs.append((src, tgt))
24
  return pairs
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  def format_language_pair(src: str, tgt: str) -> str:
27
  """Format language pair for display."""
28
  src_name = LANGUAGE_NAMES.get(src, src.upper())
29
  tgt_name = LANGUAGE_NAMES.get(tgt, tgt.upper())
30
  return f"{src_name} → {tgt_name}"
31
 
 
32
  def validate_language_code(lang: str) -> bool:
33
  """Validate if language code is supported."""
34
  return lang in ALL_UG40_LANGUAGES
35
 
 
36
  def create_submission_id() -> str:
37
- """Create unique submission ID."""
38
- return datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")[:-3]
 
 
 
39
 
40
  def sanitize_model_name(name: str) -> str:
41
- """Sanitize model name for display and storage."""
42
  if not name or not isinstance(name, str):
43
  return "Anonymous_Model"
44
-
45
  # Remove special characters, limit length
46
- name = re.sub(r'[^\w\-.]', '_', name.strip())
47
  # Remove multiple consecutive underscores
48
- name = re.sub(r'_+', '_', name)
49
  # Remove leading/trailing underscores
50
- name = name.strip('_')
51
-
52
  # Ensure minimum length
53
  if len(name) < 3:
54
  name = f"Model_{name}"
55
-
 
 
 
 
 
56
  return name[:50] # Limit to 50 characters
57
 
58
- def format_metric_value(value: float, metric: str) -> str:
59
- """Format metric value for display with appropriate precision."""
 
 
 
 
 
 
 
60
  if pd.isna(value) or value is None:
61
  return "N/A"
62
-
63
  try:
64
- precision = DISPLAY_CONFIG['decimal_places'].get(metric, 4)
65
-
66
- if metric == 'coverage_rate':
67
- return f"{value:.{precision}%}"
68
- elif metric in ['bleu']:
69
- return f"{value:.{precision}f}"
70
- elif metric in ['cer', 'wer'] and value > 1:
71
  # Cap error rates at 1.0 for display
72
- return f"{min(value, 1.0):.{precision}f}"
73
  else:
74
- return f"{value:.{precision}f}"
 
 
 
 
 
 
 
 
75
  except (ValueError, TypeError):
76
  return str(value)
77
 
78
- def get_language_pair_stats(test_data: pd.DataFrame) -> Dict[str, Dict]:
79
- """Get statistics about language pair coverage in test data."""
80
- if test_data.empty:
81
- return {}
82
-
83
- stats = {}
84
-
85
  try:
86
- for src in ALL_UG40_LANGUAGES:
87
- for tgt in ALL_UG40_LANGUAGES:
88
- if src != tgt:
89
- pair_data = test_data[
90
- (test_data['source_language'] == src) &
91
- (test_data['target_language'] == tgt)
92
- ]
93
-
94
- stats[f"{src}_{tgt}"] = {
95
- 'count': len(pair_data),
96
- 'google_comparable': src in GOOGLE_SUPPORTED_LANGUAGES and tgt in GOOGLE_SUPPORTED_LANGUAGES,
97
- 'display_name': format_language_pair(src, tgt),
98
- 'source_language': src,
99
- 'target_language': tgt
100
- }
101
- except Exception as e:
102
- print(f"Error calculating language pair stats: {e}")
103
- return {}
104
-
105
- return stats
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
- def validate_submission_completeness(predictions: pd.DataFrame, test_set: pd.DataFrame) -> Dict:
108
- """Validate that submission covers all required samples."""
109
-
110
  if predictions.empty or test_set.empty:
111
  return {
112
- 'is_complete': False,
113
- 'missing_count': len(test_set) if not test_set.empty else 0,
114
- 'extra_count': len(predictions) if not predictions.empty else 0,
115
- 'missing_ids': [],
116
- 'coverage': 0.0
 
117
  }
118
-
 
 
 
 
 
 
 
 
119
  try:
120
- required_ids = set(test_set['sample_id'].astype(str))
121
- provided_ids = set(predictions['sample_id'].astype(str))
122
-
123
  missing_ids = required_ids - provided_ids
124
  extra_ids = provided_ids - required_ids
125
-
126
- return {
127
- 'is_complete': len(missing_ids) == 0,
128
- 'missing_count': len(missing_ids),
129
- 'extra_count': len(extra_ids),
130
- 'missing_ids': list(missing_ids)[:10], # First 10 for display
131
- 'coverage': len(provided_ids & required_ids) / len(required_ids) if required_ids else 0.0
 
132
  }
 
 
 
 
 
 
 
 
133
  except Exception as e:
134
- print(f"Error validating submission completeness: {e}")
135
  return {
136
- 'is_complete': False,
137
- 'missing_count': 0,
138
- 'extra_count': 0,
139
- 'missing_ids': [],
140
- 'coverage': 0.0
 
141
  }
142
 
143
- def calculate_language_pair_coverage(predictions: pd.DataFrame, test_set: pd.DataFrame) -> Dict:
144
- """Calculate coverage by language pair."""
145
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  if predictions.empty or test_set.empty:
147
  return {}
148
-
149
  try:
150
  # Merge to get language info
151
- merged = test_set.merge(predictions, on='sample_id', how='left', suffixes=('', '_pred'))
152
-
 
 
153
  coverage = {}
154
  for src in ALL_UG40_LANGUAGES:
155
  for tgt in ALL_UG40_LANGUAGES:
156
- if src != tgt:
157
- pair_data = merged[
158
- (merged['source_language'] == src) &
159
- (merged['target_language'] == tgt)
160
- ]
161
-
162
- if len(pair_data) > 0:
163
- predicted_count = pair_data['prediction'].notna().sum()
164
- coverage[f"{src}_{tgt}"] = {
165
- 'total': len(pair_data),
166
- 'predicted': predicted_count,
167
- 'coverage': predicted_count / len(pair_data),
168
- 'display_name': format_language_pair(src, tgt)
169
- }
170
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
  return coverage
 
172
  except Exception as e:
173
  print(f"Error calculating language pair coverage: {e}")
174
  return {}
175
 
 
176
  def safe_divide(numerator: float, denominator: float, default: float = 0.0) -> float:
177
  """Safely divide two numbers, handling edge cases."""
178
  try:
179
  if denominator == 0 or pd.isna(denominator) or pd.isna(numerator):
180
  return default
181
  result = numerator / denominator
182
- if pd.isna(result) or not pd.isfinite(result):
183
  return default
184
  return float(result)
185
  except (TypeError, ValueError, ZeroDivisionError):
186
  return default
187
 
 
188
  def clean_text_for_evaluation(text: str) -> str:
189
  """Clean text for evaluation, handling common encoding issues."""
190
  if not isinstance(text, str):
191
  return str(text) if text is not None else ""
192
-
193
  # Remove extra whitespace
194
- text = re.sub(r'\s+', ' ', text.strip())
195
-
196
  # Handle common encoding issues
197
- text = text.replace('\u00a0', ' ') # Non-breaking space
198
- text = text.replace('\u2019', "'") # Right single quotation mark
199
- text = text.replace('\u201c', '"') # Left double quotation mark
200
- text = text.replace('\u201d', '"') # Right double quotation mark
201
-
202
  return text
203
 
204
- def get_model_summary_stats(model_results: Dict) -> Dict:
205
- """Extract summary statistics from model evaluation results."""
206
- if not model_results or 'averages' not in model_results:
 
 
207
  return {}
208
-
209
- averages = model_results.get('averages', {})
210
- summary = model_results.get('summary', {})
211
-
212
- return {
213
- 'quality_score': averages.get('quality_score', 0.0),
214
- 'bleu': averages.get('bleu', 0.0),
215
- 'chrf': averages.get('chrf', 0.0),
216
- 'rouge1': averages.get('rouge1', 0.0),
217
- 'rougeL': averages.get('rougeL', 0.0),
218
- 'total_samples': summary.get('total_samples', 0),
219
- 'language_pairs': summary.get('language_pairs_covered', 0),
220
- 'google_pairs': summary.get('google_comparable_pairs', 0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
  }
222
 
223
- def generate_model_identifier(model_name: str, author: str) -> str:
224
- """Generate a unique identifier for a model."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
  clean_name = sanitize_model_name(model_name)
226
- clean_author = re.sub(r'[^\w\-]', '_', author.strip())[:20] if author else "Anonymous"
 
 
 
227
  timestamp = datetime.datetime.now().strftime("%m%d_%H%M")
228
- return f"{clean_name}_{clean_author}_{timestamp}"
229
 
230
- def validate_dataframe_structure(df: pd.DataFrame, required_columns: List[str]) -> Tuple[bool, List[str]]:
231
- """Validate that a DataFrame has the required structure."""
 
 
 
 
 
 
232
  if df.empty:
233
  return False, ["DataFrame is empty"]
234
-
 
 
 
235
  missing_columns = [col for col in required_columns if col not in df.columns]
236
  if missing_columns:
237
- return False, [f"Missing columns: {', '.join(missing_columns)}"]
238
-
239
- return True, []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
 
241
  def format_duration(seconds: float) -> str:
242
  """Format duration in seconds to human-readable format."""
@@ -247,12 +608,80 @@ def format_duration(seconds: float) -> str:
247
  else:
248
  return f"{seconds/3600:.1f}h"
249
 
 
250
  def truncate_text(text: str, max_length: int = 100, suffix: str = "...") -> str:
251
  """Truncate text to specified length with suffix."""
252
  if not isinstance(text, str):
253
  text = str(text)
254
-
255
  if len(text) <= max_length:
256
  return text
257
-
258
- return text[:max_length - len(suffix)] + suffix
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import re
3
  import datetime
4
  import pandas as pd
5
+ import numpy as np
6
+ from typing import Dict, List, Tuple, Set, Optional, Union
7
+ from scipy import stats
8
+ from config import (
9
+ ALL_UG40_LANGUAGES,
10
+ GOOGLE_SUPPORTED_LANGUAGES,
11
+ LANGUAGE_NAMES,
12
+ EVALUATION_TRACKS,
13
+ MODEL_CATEGORIES,
14
+ STATISTICAL_CONFIG,
15
+ METRICS_CONFIG,
16
+ )
17
+
18
 
19
  def get_all_language_pairs() -> List[Tuple[str, str]]:
20
  """Get all possible UG40 language pairs."""
 
25
  pairs.append((src, tgt))
26
  return pairs
27
 
28
+
29
  def get_google_comparable_pairs() -> List[Tuple[str, str]]:
30
  """Get language pairs that can be compared with Google Translate."""
31
  pairs = []
 
35
  pairs.append((src, tgt))
36
  return pairs
37
 
38
+
39
def get_track_language_pairs(track: str) -> List[Tuple[str, str]]:
    """Return all ordered (source, target) pairs for an evaluation track.

    Unknown track names yield an empty list rather than raising.
    """
    if track not in EVALUATION_TRACKS:
        return []

    langs = EVALUATION_TRACKS[track]["languages"]
    return [(src, tgt) for src in langs for tgt in langs if src != tgt]
51
+
52
+
53
def format_language_pair(src: str, tgt: str) -> str:
    """Render a language pair as 'Source → Target' using display names.

    Codes missing from LANGUAGE_NAMES fall back to the uppercased code.
    """
    names = [LANGUAGE_NAMES.get(code, code.upper()) for code in (src, tgt)]
    return " → ".join(names)
58
 
59
+
60
def validate_language_code(lang: str) -> bool:
    """Check whether a language code belongs to the UG40 set."""
    is_supported = lang in ALL_UG40_LANGUAGES
    return is_supported
63
 
64
+
65
def create_submission_id() -> str:
    """Create a unique submission ID: ``sub_<YYYYMMDD_HHMMSS>_<4 digits>``.

    The random suffix guards against collisions between submissions created
    within the same second.

    Returns:
        Identifier of the form ``sub_20240131_235959_1234``.
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    # np.random.randint has an EXCLUSIVE upper bound; use 10000 so the full
    # 4-digit range 1000-9999 is reachable (the old bound of 9999 meant
    # 9999 itself could never be drawn).
    random_suffix = str(np.random.randint(1000, 10000))
    return f"sub_{timestamp}_{random_suffix}"
70
+
71
 
72
def sanitize_model_name(name: str) -> str:
    """Normalise a user-supplied model name for display and storage.

    Non-string or empty input becomes "Anonymous_Model". The result only
    contains word characters, hyphens and dots, is padded when shorter than
    3 characters, capped at 50, and reserved names receive a "User_" prefix.
    """
    if not isinstance(name, str) or not name:
        return "Anonymous_Model"

    # Collapse anything outside [\w\-.] into single underscores and trim
    # underscores from both ends.
    cleaned = re.sub(r"[^\w\-.]", "_", name.strip())
    cleaned = re.sub(r"_+", "_", cleaned).strip("_")

    # Pad very short names so they stay identifiable.
    if len(cleaned) < 3:
        cleaned = f"Model_{cleaned}"

    # Reserved words get a distinguishing prefix.
    if cleaned.lower() in {"admin", "test", "baseline", "google", "system"}:
        cleaned = f"User_{cleaned}"

    return cleaned[:50]  # Limit to 50 characters
94
 
95
+
96
def format_metric_value(
    value: float,
    metric: str,
    include_ci: bool = False,
    ci_lower: Optional[float] = None,
    ci_upper: Optional[float] = None,
) -> str:
    """Format a metric value for display, optionally with a confidence interval.

    Args:
        value: Raw metric value; NaN/None renders as "N/A".
        metric: Metric key; selects percent vs. fixed-point formatting.
        include_ci: When True and both bounds are given, append "[lo, hi]".
        ci_lower: Lower confidence bound, if available.
        ci_upper: Upper confidence bound, if available.

    Returns:
        Human-readable string; falls back to ``str(value)`` when formatting
        fails.
    """
    if pd.isna(value) or value is None:
        return "N/A"

    try:
        # Defensive lookup: a missing config key raised KeyError before,
        # which the except clause below did not catch.
        precision = METRICS_CONFIG.get("display_precision", 4)

        if metric == "coverage_rate":
            formatted = f"{value:.{precision}%}"
        elif metric in ("bleu",):
            formatted = f"{value:.2f}"
        elif metric in ("cer", "wer") and value > 1:
            # Cap error rates at 1.0 for display
            formatted = f"{min(value, 1.0):.{precision}f}"
        else:
            formatted = f"{value:.{precision}f}"

        # Append the confidence interval only when both bounds are present.
        if include_ci and ci_lower is not None and ci_upper is not None:
            formatted += f" [{ci_lower:.{precision}f}, {ci_upper:.{precision}f}]"

        return formatted

    except (ValueError, TypeError):
        return str(value)
129
 
130
+
131
def calculate_effect_size(values1: List[float], values2: List[float]) -> float:
    """Compute the absolute Cohen's d between two samples.

    NaNs are dropped first; groups with fewer than two usable values, a
    zero pooled standard deviation, or any unexpected failure yield 0.0.
    """
    if len(values1) < 2 or len(values2) < 2:
        return 0.0

    try:
        a = np.asarray(values1)
        b = np.asarray(values2)

        # Drop NaNs before computing any statistics.
        a = a[~np.isnan(a)]
        b = b[~np.isnan(b)]
        if min(len(a), len(b)) < 2:
            return 0.0

        n_a, n_b = len(a), len(b)
        # Pooled variance from the two unbiased sample variances.
        pooled_var = (
            (n_a - 1) * np.var(a, ddof=1) + (n_b - 1) * np.var(b, ddof=1)
        ) / (n_a + n_b - 2)
        pooled_std = np.sqrt(pooled_var)
        if pooled_std == 0:
            return 0.0

        # Cohen's d, reported as a magnitude.
        return abs((np.mean(a) - np.mean(b)) / pooled_std)

    except Exception:
        # Matches the module's defensive style: statistics helpers never raise.
        return 0.0
163
+
164
+
165
def interpret_effect_size(effect_size: float) -> str:
    """Map a Cohen's d magnitude onto Cohen's verbal categories."""
    cuts = STATISTICAL_CONFIG["effect_size_thresholds"]

    # Ascending thresholds: return the first bucket the value falls under.
    for bound, label in (
        (cuts["small"], "negligible"),
        (cuts["medium"], "small"),
        (cuts["large"], "medium"),
    ):
        if effect_size < bound:
            return label
    return "large"
177
+
178
+
179
def calculate_statistical_power(
    effect_size: float, n1: int, n2: int, alpha: float = 0.05
) -> float:
    """Approximate the power of a two-sample t-test.

    Uses a shifted (central) t distribution as a rough stand-in for the
    noncentral t, so the result is an estimate only. Degenerate sample
    sizes or numerical failures return 0.0; the result is clamped to [0, 1].
    """
    if n1 < 2 or n2 < 2:
        return 0.0

    try:
        dof = n1 + n2 - 2
        se = np.sqrt(1 / n1 + 1 / n2)

        # Two-sided critical value at level alpha.
        crit = stats.t.ppf(1 - alpha / 2, dof)
        shift = effect_size / se

        # P(|T| > crit) when T is centred at `shift` (approximation).
        upper_tail = 1 - stats.t.cdf(crit, dof, loc=shift)
        lower_tail = stats.t.cdf(-crit, dof, loc=shift)
        return min(1.0, max(0.0, upper_tail + lower_tail))

    except Exception:
        return 0.0
209
+
210
+
211
def get_track_statistics(test_data: pd.DataFrame) -> Dict[str, Dict]:
    """Get comprehensive statistics about test data coverage for each track.

    For every track in EVALUATION_TRACKS, slices ``test_data`` to the
    track's languages, counts samples per ordered language pair, and grades
    adequacy by the fraction of pairs meeting the track's
    ``min_samples_per_pair`` (>=0.8 excellent, >=0.6 good, >=0.4 fair,
    else insufficient).

    Args:
        test_data: Frame with ``source_language`` and ``target_language``
            columns (assumed present — raises KeyError otherwise).

    Returns:
        Mapping of track name to its coverage statistics dict.
    """
    track_stats = {}

    for track_name, track_config in EVALUATION_TRACKS.items():
        track_languages = track_config["languages"]

        # Filter test data to track languages
        track_data = test_data[
            (test_data["source_language"].isin(track_languages))
            & (test_data["target_language"].isin(track_languages))
        ]

        if track_data.empty:
            # No data at all for this track: emit an empty, "insufficient" entry.
            track_stats[track_name] = {
                "total_samples": 0,
                "language_pairs": 0,
                "samples_per_pair": {},
                "coverage_matrix": {},
                "adequacy_assessment": "insufficient",
            }
            continue

        # Calculate pair-wise statistics
        pair_counts = {}
        for src in track_languages:
            for tgt in track_languages:
                if src == tgt:
                    continue

                pair_data = track_data[
                    (track_data["source_language"] == src)
                    & (track_data["target_language"] == tgt)
                ]

                pair_key = f"{src}_to_{tgt}"
                pair_counts[pair_key] = len(pair_data)

        # Calculate adequacy
        min_required = track_config["min_samples_per_pair"]
        adequate_pairs = sum(
            1 for count in pair_counts.values() if count >= min_required
        )
        total_possible_pairs = len(track_languages) * (len(track_languages) - 1)

        adequacy_rate = adequate_pairs / max(total_possible_pairs, 1)

        if adequacy_rate >= 0.8:
            adequacy = "excellent"
        elif adequacy_rate >= 0.6:
            adequacy = "good"
        elif adequacy_rate >= 0.4:
            adequacy = "fair"
        else:
            adequacy = "insufficient"

        # NOTE(review): "samples_per_pair" and "coverage_matrix" reference
        # the SAME dict object — mutating one mutates the other.
        track_stats[track_name] = {
            "total_samples": len(track_data),
            "language_pairs": len([k for k, v in pair_counts.items() if v > 0]),
            "samples_per_pair": pair_counts,
            "coverage_matrix": pair_counts,
            "adequacy_assessment": adequacy,
            "adequacy_rate": adequacy_rate,
            "min_samples_per_pair": min_required,
        }

    return track_stats
278
+
279
+
280
def validate_submission_completeness_scientific(
    predictions: pd.DataFrame, test_set: pd.DataFrame, track: Optional[str] = None
) -> Dict:
    """Enhanced validation with track-specific analysis.

    Compares ``predictions`` against ``test_set`` by ``sample_id`` (both
    coerced to str) and reports completeness, missing/extra counts and
    coverage. When ``track`` names a known evaluation track, the test set
    is first narrowed to that track's languages and a per-pair
    ``track_analysis`` (from ``analyze_track_coverage``) is attached.

    NOTE(review): on the success path the ``track_analysis`` key is present
    only when ``track`` was supplied, so the result schema varies — callers
    should use ``.get("track_analysis", {})``.
    """

    if predictions.empty or test_set.empty:
        return {
            "is_complete": False,
            "missing_count": len(test_set) if not test_set.empty else 0,
            "extra_count": len(predictions) if not predictions.empty else 0,
            "missing_ids": [],
            "coverage": 0.0,
            "track_analysis": {},
        }

    # If track specified, filter to track languages
    if track and track in EVALUATION_TRACKS:
        track_languages = EVALUATION_TRACKS[track]["languages"]
        test_set = test_set[
            (test_set["source_language"].isin(track_languages))
            & (test_set["target_language"].isin(track_languages))
        ]

    try:
        required_ids = set(test_set["sample_id"].astype(str))
        provided_ids = set(predictions["sample_id"].astype(str))

        missing_ids = required_ids - provided_ids
        extra_ids = provided_ids - required_ids
        matching_ids = provided_ids & required_ids

        base_result = {
            "is_complete": len(missing_ids) == 0,
            "missing_count": len(missing_ids),
            "extra_count": len(extra_ids),
            # Only the first 10 missing ids are surfaced, for display.
            "missing_ids": list(missing_ids)[:10],
            "coverage": len(matching_ids) / len(required_ids) if required_ids else 0.0,
        }

        # Add track-specific analysis if requested
        if track:
            track_analysis = analyze_track_coverage(predictions, test_set, track)
            base_result["track_analysis"] = track_analysis

        return base_result

    except Exception as e:
        print(f"Error in submission completeness validation: {e}")
        return {
            "is_complete": False,
            "missing_count": 0,
            "extra_count": 0,
            "missing_ids": [],
            "coverage": 0.0,
            "track_analysis": {},
        }
336
 
337
+
338
def analyze_track_coverage(
    predictions: pd.DataFrame, test_set: pd.DataFrame, track: str
) -> Dict:
    """Analyze coverage for a specific track.

    Left-joins ``predictions`` onto the track's slice of ``test_set`` by
    ``sample_id`` and reports, per ordered language pair, how many samples
    received a non-null ``prediction`` and whether that count meets the
    track's ``min_samples_per_pair`` threshold.

    Returns:
        Per-pair analysis plus track-level adequacy, or ``{"error": ...}``
        for an unknown track or a track with no test data.
    """

    if track not in EVALUATION_TRACKS:
        return {"error": f"Unknown track: {track}"}

    track_config = EVALUATION_TRACKS[track]
    track_languages = track_config["languages"]

    # Filter test set to track languages
    track_test_set = test_set[
        (test_set["source_language"].isin(track_languages))
        & (test_set["target_language"].isin(track_languages))
    ]

    if track_test_set.empty:
        return {"error": f"No test data available for {track} track"}

    # Merge with predictions; left join keeps unpredicted samples as NaN
    merged = track_test_set.merge(
        predictions, on="sample_id", how="left", suffixes=("", "_pred")
    )

    # Analyze by language pair
    pair_analysis = {}
    for src in track_languages:
        for tgt in track_languages:
            if src == tgt:
                continue

            pair_data = merged[
                (merged["source_language"] == src) & (merged["target_language"] == tgt)
            ]

            if len(pair_data) > 0:
                covered = pair_data["prediction"].notna().sum()
                pair_analysis[f"{src}_to_{tgt}"] = {
                    "total": len(pair_data),
                    "covered": covered,
                    "coverage_rate": covered / len(pair_data),
                    "meets_minimum": covered >= track_config["min_samples_per_pair"],
                }

    # Overall track statistics
    total_pairs = len(pair_analysis)
    adequate_pairs = sum(1 for info in pair_analysis.values() if info["meets_minimum"])

    return {
        "track_name": track_config["name"],
        "total_language_pairs": total_pairs,
        "adequate_pairs": adequate_pairs,
        "adequacy_rate": adequate_pairs / max(total_pairs, 1),
        "pair_analysis": pair_analysis,
        "overall_adequate": adequate_pairs
        >= total_pairs * 0.8,  # 80% of pairs adequate
    }
396
+
397
+
398
def calculate_language_pair_coverage_scientific(
    predictions: pd.DataFrame, test_set: pd.DataFrame
) -> Dict:
    """Calculate comprehensive language pair coverage with statistical metrics.

    Left-joins ``predictions`` onto ``test_set`` by ``sample_id``, then for
    every ordered UG40 pair that appears in the test set reports totals,
    predicted counts, the tracks that include the pair, Google
    comparability, and per-track statistical adequacy.

    Returns:
        Mapping ``"{src}_{tgt}" -> coverage info``; ``{}`` for empty input
        or on any internal error (which is printed, not raised).
    """

    if predictions.empty or test_set.empty:
        return {}

    try:
        # Merge to get language info
        merged = test_set.merge(
            predictions, on="sample_id", how="left", suffixes=("", "_pred")
        )

        coverage = {}
        for src in ALL_UG40_LANGUAGES:
            for tgt in ALL_UG40_LANGUAGES:
                if src == tgt:
                    continue

                pair_data = merged[
                    (merged["source_language"] == src)
                    & (merged["target_language"] == tgt)
                ]

                if len(pair_data) > 0:
                    predicted_count = pair_data["prediction"].notna().sum()
                    coverage_rate = predicted_count / len(pair_data)

                    # Determine which tracks include this pair
                    tracks_included = []
                    for track_name, track_config in EVALUATION_TRACKS.items():
                        if (
                            src in track_config["languages"]
                            and tgt in track_config["languages"]
                        ):
                            tracks_included.append(track_name)

                    coverage[f"{src}_{tgt}"] = {
                        "total": len(pair_data),
                        "predicted": predicted_count,
                        "coverage": coverage_rate,
                        "display_name": format_language_pair(src, tgt),
                        "tracks_included": tracks_included,
                        "google_comparable": (
                            src in GOOGLE_SUPPORTED_LANGUAGES
                            and tgt in GOOGLE_SUPPORTED_LANGUAGES
                        ),
                        # Adequacy is judged against each including track's
                        # own min_samples_per_pair threshold.
                        "statistical_adequacy": {
                            track: predicted_count
                            >= EVALUATION_TRACKS[track]["min_samples_per_pair"]
                            for track in tracks_included
                        },
                    }

        return coverage

    except Exception as e:
        print(f"Error calculating language pair coverage: {e}")
        return {}
458
 
459
+
460
def safe_divide(numerator: float, denominator: float, default: float = 0.0) -> float:
    """Divide ``numerator`` by ``denominator``, returning ``default`` on failure.

    Failure cases: zero or NaN operands, a non-finite quotient, or operands
    that do not support division at all.
    """
    try:
        bad_inputs = denominator == 0 or pd.isna(denominator) or pd.isna(numerator)
        if bad_inputs:
            return default

        quotient = numerator / denominator
        if pd.isna(quotient) or not np.isfinite(quotient):
            return default
        return float(quotient)
    except (TypeError, ValueError, ZeroDivisionError):
        return default
471
 
472
+
473
# Single-pass replacement table for characters that commonly survive bad
# encodings: non-breaking space and "smart" quotation marks.
_CLEAN_TRANSLATION = str.maketrans(
    {
        "\u00a0": " ",  # Non-breaking space
        "\u2019": "'",  # Right single quotation mark
        "\u201c": '"',  # Left double quotation mark
        "\u201d": '"',  # Right double quotation mark
    }
)


def clean_text_for_evaluation(text: str) -> str:
    """Clean text for evaluation, handling common encoding issues.

    Collapses runs of whitespace to single spaces, trims the ends, and
    normalises non-breaking spaces and curly quotes in one translate pass.
    Non-string input is stringified; None becomes "".
    """
    if not isinstance(text, str):
        return str(text) if text is not None else ""

    # Normalise problem characters first, then collapse whitespace. \s
    # already matches U+00A0 in Python 3, so the order does not change
    # the result versus replacing after the collapse.
    text = text.translate(_CLEAN_TRANSLATION)
    return re.sub(r"\s+", " ", text.strip())
488
 
489
+
490
def get_model_summary_stats_scientific(
    model_results: Dict, track: Optional[str] = None
) -> Dict:
    """Extract comprehensive summary statistics from model evaluation results.

    Args:
        model_results: Evaluation payload; must contain a "tracks" mapping,
            otherwise an empty dict is returned.
        track: When given and present in the results, return that track's
            summary; otherwise return an across-tracks overview.

    Returns:
        A per-track summary dict, ``{"error": ...}`` for a broken track, an
        across-tracks overview, or ``{}`` when the payload is unusable.
    """

    if not model_results or "tracks" not in model_results:
        return {}

    tracks = model_results["tracks"]

    # If a specific track is requested, summarise just that track.
    if track and track in tracks:
        track_data = tracks[track]
        if track_data.get("error"):
            return {"error": f"No valid data for {track} track"}

        track_averages = track_data.get("track_averages", {})
        track_statistics = track_data.get("track_statistics", {})
        summary = track_data.get("summary", {})

        # Renamed from `stats`: that local shadowed the module-level
        # `from scipy import stats` import.
        track_summary = {
            "track": track,
            "track_name": EVALUATION_TRACKS[track]["name"],
            "quality_score": track_averages.get("quality_score", 0.0),
            "bleu": track_averages.get("bleu", 0.0),
            "chrf": track_averages.get("chrf", 0.0),
            "total_samples": summary.get("total_samples", 0),
            "language_pairs": summary.get("language_pairs_evaluated", 0),
            "statistical_adequacy": summary.get("total_samples", 0)
            >= 100,  # Simple threshold
        }

        # Add confidence intervals if available
        if "quality_score" in track_statistics:
            quality_stats = track_statistics["quality_score"]
            track_summary["confidence_interval"] = [
                quality_stats.get("ci_lower", 0.0),
                quality_stats.get("ci_upper", 0.0),
            ]

        return track_summary

    # Otherwise, return summary across all tracks
    all_tracks_summary = {
        "tracks_evaluated": len([t for t in tracks.values() if not t.get("error")]),
        "total_tracks": len(EVALUATION_TRACKS),
        "by_track": {},
    }

    for track_name, track_data in tracks.items():
        if not track_data.get("error"):
            track_averages = track_data.get("track_averages", {})
            summary = track_data.get("summary", {})

            all_tracks_summary["by_track"][track_name] = {
                "quality_score": track_averages.get("quality_score", 0.0),
                "samples": summary.get("total_samples", 0),
                "pairs": summary.get("language_pairs_evaluated", 0),
            }

    return all_tracks_summary
549
+
550
+
551
def generate_model_identifier_scientific(
    model_name: str, author: str, category: str
) -> str:
    """Build a unique identifier: ``<category>_<model>_<author>_<MMDD_HHMM>``.

    Unknown categories collapse to "community" and a missing author to
    "Anonymous"; the model name is sanitised via sanitize_model_name.
    """
    safe_model = sanitize_model_name(model_name)

    if author:
        safe_author = re.sub(r"[^\w\-]", "_", author.strip())[:20]
    else:
        safe_author = "Anonymous"

    safe_category = category[:10] if category in MODEL_CATEGORIES else "community"
    stamp = datetime.datetime.now().strftime("%m%d_%H%M")

    return f"{safe_category}_{safe_model}_{safe_author}_{stamp}"
563
+
564
+
565
def validate_dataframe_structure_enhanced(
    df: pd.DataFrame, required_columns: List[str], track: Optional[str] = None
) -> Tuple[bool, List[str]]:
    """Validate a DataFrame's structure, with optional track-specific checks.

    Args:
        df: Frame to validate. NOTE: as a side effect, a non-string
            ``sample_id`` column is converted to ``str`` in place (existing
            behavior, kept for callers that rely on it).
        required_columns: Column names that must be present.
        track: Optional evaluation track; enables sample-size adequacy checks.

    Returns:
        ``(is_valid, issues)`` where ``issues`` lists human-readable problems.
    """

    if df.empty:
        return False, ["DataFrame is empty"]

    issues = []

    # Check required columns
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        issues.append(f"Missing columns: {', '.join(missing_columns)}")

    # Check for track-specific requirements
    if track and track in EVALUATION_TRACKS:
        track_config = EVALUATION_TRACKS[track]
        min_samples = track_config.get("min_samples_per_pair", 10)

        # Check sample size adequacy
        if len(df) < min_samples * 5:  # At least 5 pairs worth of data
            issues.append(
                f"Insufficient samples for {track} track (minimum ~{min_samples * 5})"
            )

    # Check data types (`!=` replaces the old `not ... ==` double negation)
    if "sample_id" in df.columns:
        if df["sample_id"].dtype != "object":
            try:
                df["sample_id"] = df["sample_id"].astype(str)
            except Exception:
                issues.append("Cannot convert sample_id to string")

    return len(issues) == 0, issues
600
+
601
 
602
  def format_duration(seconds: float) -> str:
603
  """Format duration in seconds to human-readable format."""
 
608
  else:
609
  return f"{seconds/3600:.1f}h"
610
 
611
+
612
def truncate_text(text: str, max_length: int = 100, suffix: str = "...") -> str:
    """Truncate ``text`` to at most ``max_length`` characters with ``suffix``.

    Non-string input is stringified first. Fixes an edge case where
    ``max_length <= len(suffix)`` produced a negative slice and output
    LONGER than the limit; the result is now never longer than max_length.
    """
    if not isinstance(text, str):
        text = str(text)

    if len(text) <= max_length:
        return text

    if max_length <= len(suffix):
        # No room for the suffix at all — hard cut instead of overflowing.
        return text[:max_length]

    return text[: max_length - len(suffix)] + suffix
621
+
622
+
623
def calculate_sample_size_recommendation(
    desired_power: float = 0.8, effect_size: float = 0.5, alpha: float = 0.05
) -> int:
    """Recommend a per-group sample size for a two-sample comparison.

    Uses the standard normal-approximation formula
    ``n = 2 * ((z_{1-alpha/2} + z_{power}) / d)^2``, rounded up and floored
    at 10. Any numerical failure (e.g. a zero effect size) falls back to 50.
    """
    try:
        # Critical values from the standard normal distribution.
        z_crit = stats.norm.ppf(1 - alpha / 2)
        z_power = stats.norm.ppf(desired_power)

        per_group = 2 * ((z_crit + z_power) / effect_size) ** 2
        return max(10, int(np.ceil(per_group)))

    except Exception:
        return 50  # Default fallback
643
+
644
+
645
def assess_model_category_appropriateness(
    model_name: str, category: str, performance_data: Dict
) -> Dict:
    """Assess if the detected/assigned model category is appropriate.

    Args:
        model_name: Model name (currently unused; kept for interface
            stability).
        category: Assigned category, e.g. "baseline" or "commercial".
        performance_data: Evaluation payload with a "tracks" mapping; may
            be None or empty.

    Returns:
        Dict with keys "category", "appropriate", "confidence" and
        "recommendations".
    """

    assessment = {
        "category": category,
        "appropriate": True,
        "confidence": 1.0,
        "recommendations": [],
    }

    # Check for category mismatches based on performance
    if category == "baseline" and performance_data:
        # Baselines shouldn't perform too well
        quality_scores = []
        for track_data in performance_data.get("tracks", {}).values():
            if not track_data.get("error"):
                quality_scores.append(
                    track_data.get("track_averages", {}).get("quality_score", 0)
                )

        if quality_scores and max(quality_scores) > 0.7:
            # High performance for baseline
            assessment["appropriate"] = False
            assessment["confidence"] = 0.3
            assessment["recommendations"].append(
                "High performance suggests this might not be a baseline model"
            )

    # Check for commercial model expectations.
    # Fix: guard against a None/empty payload — the baseline branch already
    # did, but this branch raised AttributeError on None.
    if category == "commercial" and performance_data:
        # Commercial models should have good Google-comparable performance
        google_track = performance_data.get("tracks", {}).get("google_comparable", {})
        if not google_track.get("error"):
            quality = google_track.get("track_averages", {}).get("quality_score", 0)
            if quality < 0.3:  # Poor performance for commercial
                assessment["recommendations"].append(
                    "Low performance unexpected for commercial systems"
                )

    return assessment