akera committed
Commit ad7599c · verified · Parent: f0df659

Update src/leaderboard.py

Files changed (1):
  1. src/leaderboard.py +536 -341

src/leaderboard.py CHANGED
@@ -5,438 +5,633 @@ import json
 import datetime
 from typing import Dict, List, Optional, Tuple
 import os
-from config import LEADERBOARD_DATASET, HF_TOKEN, ALL_UG40_LANGUAGES, GOOGLE_SUPPORTED_LANGUAGES
-from src.utils import create_submission_id, sanitize_model_name, get_all_language_pairs, get_google_comparable_pairs
 
-def initialize_leaderboard() -> pd.DataFrame:
-    """Initialize empty leaderboard DataFrame."""
-
     columns = {
-        'submission_id': [],
-        'model_name': [],
-        'author': [],
-        'submission_date': [],
-        'model_type': [],
-        'description': [],
-
-        # Primary metrics
-        'quality_score': [],
-        'bleu': [],
-        'chrf': [],
-
-        # Secondary metrics
-        'rouge1': [],
-        'rouge2': [],
-        'rougeL': [],
-        'cer': [],
-        'wer': [],
-        'len_ratio': [],
-
-        # Google comparable metrics
-        'google_quality_score': [],
-        'google_bleu': [],
-        'google_chrf': [],
-
-        # Coverage info
-        'total_samples': [],
-        'language_pairs_covered': [],
-        'google_pairs_covered': [],
-        'coverage_rate': [],
-
-        # Detailed results
-        'detailed_metrics': [],  # JSON string
-        'validation_report': [],
-
         # Metadata
-        'evaluation_date': [],
-        'leaderboard_version': []
     }
-
     return pd.DataFrame(columns)
 
-def load_leaderboard() -> pd.DataFrame:
-    """Load current leaderboard from HuggingFace dataset."""
-
     try:
-        print("Loading leaderboard...")
-        dataset = load_dataset(LEADERBOARD_DATASET, split='train')
         df = dataset.to_pandas()
-
         # Ensure all required columns exist
-        required_columns = list(initialize_leaderboard().columns)
         for col in required_columns:
             if col not in df.columns:
-                if col in ['quality_score', 'bleu', 'chrf', 'rouge1', 'rouge2', 'rougeL',
-                           'cer', 'wer', 'len_ratio', 'google_quality_score', 'google_bleu',
-                           'google_chrf', 'total_samples', 'language_pairs_covered',
-                           'google_pairs_covered', 'coverage_rate']:
                     df[col] = 0.0
-                elif col in ['leaderboard_version']:
-                    df[col] = 1
                 else:
-                    df[col] = ''
-
-        print(f"Loaded leaderboard with {len(df)} entries")
         return df
-
     except Exception as e:
-        print(f"Could not load leaderboard: {e}")
-        print("Initializing empty leaderboard...")
-        return initialize_leaderboard()
 
-def save_leaderboard(df: pd.DataFrame) -> bool:
-    """Save leaderboard to HuggingFace dataset."""
-
     try:
         # Clean data before saving
         df_clean = df.copy()
-
         # Ensure numeric columns are proper types
-        numeric_columns = ['quality_score', 'bleu', 'chrf', 'rouge1', 'rouge2', 'rougeL',
-                           'cer', 'wer', 'len_ratio', 'google_quality_score', 'google_bleu',
-                           'google_chrf', 'total_samples', 'language_pairs_covered',
-                           'google_pairs_covered', 'coverage_rate', 'leaderboard_version']
-
         for col in numeric_columns:
             if col in df_clean.columns:
-                df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce').fillna(0.0)
-
         # Convert to dataset
         dataset = Dataset.from_pandas(df_clean)
-
         # Push to hub
         dataset.push_to_hub(
-            LEADERBOARD_DATASET,
            token=HF_TOKEN,
-            commit_message=f"Update leaderboard - {datetime.datetime.now().isoformat()[:19]}"
        )
-
-        print("Leaderboard saved successfully!")
         return True
-
     except Exception as e:
-        print(f"Error saving leaderboard: {e}")
         return False
 
-def add_model_to_leaderboard(
    model_name: str,
    author: str,
    evaluation_results: Dict,
-    validation_info: Dict,
-    model_type: str = "",
-    description: str = ""
 ) -> pd.DataFrame:
-    """
-    Add new model results to leaderboard, with JSON-safe detailed_metrics.
-    """
     # Load current leaderboard
-    df = load_leaderboard()
 
     # Remove existing entry if present
-    existing_mask = df['model_name'] == model_name
     if existing_mask.any():
         df = df[~existing_mask]
 
-    # Safely serialize evaluation_results by dropping non-JSON types
-    safe_results = evaluation_results.copy()
-    # Remove sample_metrics DataFrame which isn't JSON serializable
-    if 'sample_metrics' in safe_results:
-        safe_results.pop('sample_metrics')
 
-    detailed_json = json.dumps(safe_results)
-
-    # Extract metrics
-    averages = evaluation_results.get('averages', {})
-    google_averages = evaluation_results.get('google_comparable_averages', {})
-    summary = evaluation_results.get('summary', {})
 
     # Prepare new entry
     new_entry = {
-        'submission_id': create_submission_id(),
-        'model_name': sanitize_model_name(model_name),
-        'author': author[:100] if author else 'Anonymous',
-        'submission_date': datetime.datetime.now().isoformat(),
-        'model_type': model_type[:50] if model_type else 'unknown',
-        'description': description[:500] if description else '',
-
-        # Primary metrics
-        'quality_score': float(averages.get('quality_score', 0.0)),
-        'bleu': float(averages.get('bleu', 0.0)),
-        'chrf': float(averages.get('chrf', 0.0)),
-
-        # Secondary metrics
-        'rouge1': float(averages.get('rouge1', 0.0)),
-        'rouge2': float(averages.get('rouge2', 0.0)),
-        'rougeL': float(averages.get('rougeL', 0.0)),
-        'cer': float(averages.get('cer', 0.0)),
-        'wer': float(averages.get('wer', 0.0)),
-        'len_ratio': float(averages.get('len_ratio', 0.0)),
-
-        # Google comparable metrics
-        'google_quality_score': float(google_averages.get('quality_score', 0.0)),
-        'google_bleu': float(google_averages.get('bleu', 0.0)),
-        'google_chrf': float(google_averages.get('chrf', 0.0)),
-
-        # Coverage info
-        'total_samples': int(summary.get('total_samples', 0)),
-        'language_pairs_covered': int(summary.get('language_pairs_covered', 0)),
-        'google_pairs_covered': int(summary.get('google_comparable_pairs', 0)),
-        'coverage_rate': float(validation_info.get('coverage', 0.0)),
-
-        # Detailed results (JSON string)
-        'detailed_metrics': detailed_json,
-        'validation_report': validation_info.get('report', ''),
-
         # Metadata
-        'evaluation_date': datetime.datetime.now().isoformat(),
-        'leaderboard_version': 1
     }
 
     # Convert to DataFrame and append
     new_row_df = pd.DataFrame([new_entry])
     updated_df = pd.concat([df, new_row_df], ignore_index=True)
-    updated_df = updated_df.sort_values('quality_score', ascending=False).reset_index(drop=True)
 
     # Save to hub
-    save_leaderboard(updated_df)
 
     return updated_df
 
-def prepare_leaderboard_display(df: pd.DataFrame) -> pd.DataFrame:
-    """Prepare leaderboard for display by formatting and selecting appropriate columns."""
-
     if df.empty:
         return df
-
-    # Select columns for display (exclude detailed_metrics and validation_report)
-    display_columns = [
-        'model_name', 'author', 'submission_date', 'model_type',
-        'quality_score', 'bleu', 'chrf',
-        'rouge1', 'rougeL',
-        'total_samples', 'language_pairs_covered', 'google_pairs_covered',
-        'coverage_rate'
     ]
-
     # Only include columns that exist
-    available_columns = [col for col in display_columns if col in df.columns]
     display_df = df[available_columns].copy()
-
     # Format numeric columns
     numeric_format = {
-        'quality_score': '{:.4f}',
-        'bleu': '{:.2f}',
-        'chrf': '{:.4f}',
-        'rouge1': '{:.4f}',
-        'rougeL': '{:.4f}',
-        'coverage_rate': '{:.1%}',
     }
-
     for col, fmt in numeric_format.items():
         if col in display_df.columns:
-            display_df[col] = display_df[col].apply(lambda x: fmt.format(float(x)) if pd.notnull(x) else "0.0000")
-
     # Format submission date
-    if 'submission_date' in display_df.columns:
-        display_df['submission_date'] = pd.to_datetime(display_df['submission_date']).dt.strftime('%Y-%m-%d %H:%M')
-
     # Rename columns for better display
     column_renames = {
-        'model_name': 'Model Name',
-        'author': 'Author',
-        'submission_date': 'Submitted',
-        'model_type': 'Type',
-        'quality_score': 'Quality Score',
-        'bleu': 'BLEU',
-        'chrf': 'ChrF',
-        'rouge1': 'ROUGE-1',
-        'rougeL': 'ROUGE-L',
-        'total_samples': 'Samples',
-        'language_pairs_covered': 'Lang Pairs',
-        'google_pairs_covered': 'Google Pairs',
-        'coverage_rate': 'Coverage'
     }
-
     display_df = display_df.rename(columns=column_renames)
-
     return display_df
 
-def get_leaderboard_stats(df: pd.DataFrame) -> Dict:
-    """Get summary statistics for the leaderboard."""
-
     if df.empty:
         return {
-            'total_models': 0,
-            'avg_quality_score': 0.0,
-            'best_model': None,
-            'latest_submission': None,
-            'google_comparable_models': 0,
-            'coverage_distribution': {},
-            'language_pair_coverage': {}
         }
-
-    # Basic stats
     stats = {
-        'total_models': len(df),
-        'avg_quality_score': float(df['quality_score'].mean()),
-        'best_model': {
-            'name': df.iloc[0]['model_name'],
-            'score': float(df.iloc[0]['quality_score']),
-            'author': df.iloc[0]['author']
-        } if len(df) > 0 else None,
-        'latest_submission': df['submission_date'].max() if len(df) > 0 else None
     }
-
-    # Google comparable models
-    stats['google_comparable_models'] = int((df['google_pairs_covered'] > 0).sum())
-
-    # Coverage distribution
-    coverage_bins = pd.cut(df['coverage_rate'], bins=[0, 0.5, 0.8, 0.95, 1.0],
-                           labels=['<50%', '50-80%', '80-95%', '95-100%'])
-    stats['coverage_distribution'] = coverage_bins.value_counts().to_dict()
-
-    # Language pair coverage
-    if len(df) > 0:
-        stats['avg_pairs_covered'] = float(df['language_pairs_covered'].mean())
-        stats['max_pairs_covered'] = int(df['language_pairs_covered'].max())
-        stats['total_possible_pairs'] = len(get_all_language_pairs())
-
     return stats
 
-def filter_leaderboard(
-    df: pd.DataFrame,
-    search_query: str = "",
-    model_type: str = "",
-    min_coverage: float = 0.0,
-    google_comparable_only: bool = False,
-    top_n: int = None
-) -> pd.DataFrame:
-    """Filter leaderboard based on various criteria."""
-
-    filtered_df = df.copy()
-
-    # Text search
-    if search_query:
-        query_lower = search_query.lower()
-        mask = (
-            filtered_df['model_name'].str.lower().str.contains(query_lower, na=False) |
-            filtered_df['author'].str.lower().str.contains(query_lower, na=False) |
-            filtered_df['description'].str.lower().str.contains(query_lower, na=False)
-        )
-        filtered_df = filtered_df[mask]
-
-    # Model type filter
-    if model_type and model_type != "all":
-        filtered_df = filtered_df[filtered_df['model_type'] == model_type]
-
-    # Coverage filter
-    if min_coverage > 0:
-        filtered_df = filtered_df[filtered_df['coverage_rate'] >= min_coverage]
-
-    # Google comparable filter
-    if google_comparable_only:
-        filtered_df = filtered_df[filtered_df['google_pairs_covered'] > 0]
-
-    # Top N filter
-    if top_n:
-        filtered_df = filtered_df.head(top_n)
-
-    return filtered_df
-
-def get_model_comparison(df: pd.DataFrame, model_names: List[str]) -> Dict:
-    """Get detailed comparison between specific models."""
-
-    models = df[df['model_name'].isin(model_names)]
-
     if len(models) == 0:
-        return {'error': 'No models found'}
-
     comparison = {
-        'models': [],
-        'metrics_comparison': {},
-        'detailed_results': {}
     }
-
-    # Extract basic info for each model
-    for _, model in models.iterrows():
-        comparison['models'].append({
-            'name': model['model_name'],
-            'author': model['author'],
-            'submission_date': model['submission_date'],
-            'model_type': model['model_type']
-        })
-
-        # Parse detailed metrics if available
-        try:
-            detailed = json.loads(model['detailed_metrics'])
-            comparison['detailed_results'][model['model_name']] = detailed
-        except:
-            comparison['detailed_results'][model['model_name']] = {}
-
-    # Compare metrics
-    metrics = ['quality_score', 'bleu', 'chrf', 'rouge1', 'rougeL', 'cer', 'wer']
-    for metric in metrics:
-        if metric in models.columns:
-            comparison['metrics_comparison'][metric] = {
-                model_name: float(score)
-                for model_name, score in zip(models['model_name'], models[metric])
            }
-
     return comparison
 
-def export_leaderboard(df: pd.DataFrame, format: str = 'csv', include_detailed: bool = False) -> str:
-    """Export leaderboard in specified format."""
-
     timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
-
-    # Select columns for export
-    if include_detailed:
-        export_df = df.copy()
     else:
-        basic_columns = [
-            'model_name', 'author', 'submission_date', 'model_type',
-            'quality_score', 'bleu', 'chrf', 'rouge1', 'rougeL',
-            'total_samples', 'language_pairs_covered', 'coverage_rate'
-        ]
-        export_df = df[basic_columns].copy()
-
-    if format == 'csv':
-        filename = f"salt_leaderboard_{timestamp}.csv"
         export_df.to_csv(filename, index=False)
-    elif format == 'json':
-        filename = f"salt_leaderboard_{timestamp}.json"
-        export_df.to_json(filename, orient='records', indent=2)
-    elif format == 'xlsx':
-        filename = f"salt_leaderboard_{timestamp}.xlsx"
        export_df.to_excel(filename, index=False)
    else:
        raise ValueError(f"Unsupported format: {format}")
-
-    return filename
 
-def get_ranking_history(df: pd.DataFrame, model_name: str) -> Dict:
-    """Get ranking history for a specific model (if multiple submissions)."""
-
-    model_entries = df[df['model_name'] == model_name].sort_values('submission_date')
-
-    if len(model_entries) == 0:
-        return {'error': 'Model not found'}
-
-    history = []
-    for _, entry in model_entries.iterrows():
-        # Calculate rank at time of submission
-        submission_date = entry['submission_date']
-        historical_df = df[df['submission_date'] <= submission_date]
-        rank = (historical_df['quality_score'] > entry['quality_score']).sum() + 1
-
-        history.append({
-            'submission_date': submission_date,
-            'quality_score': float(entry['quality_score']),
-            'rank': int(rank),
-            'total_models': len(historical_df)
-        })
-
-    return {
-        'model_name': model_name,
-        'history': history,
-        'current_rank': history[-1]['rank'] if history else None
-    }
 
 import datetime
 from typing import Dict, List, Optional, Tuple
 import os
+import numpy as np
+from config import (
+    LEADERBOARD_DATASET,
+    HF_TOKEN,
+    EVALUATION_TRACKS,
+    MODEL_CATEGORIES,
+    STATISTICAL_CONFIG,
+    METRICS_CONFIG,
+)
+from src.utils import create_submission_id, sanitize_model_name
+
+
+def initialize_scientific_leaderboard() -> pd.DataFrame:
+    """Initialize empty scientific leaderboard DataFrame with all required columns."""
 
     columns = {
+        # Basic information
+        "submission_id": [],
+        "model_name": [],
+        "author": [],
+        "submission_date": [],
+        "model_category": [],
+        "description": [],
+        # Track-specific quality scores
+        "google_comparable_quality": [],
+        "ug40_complete_quality": [],
+        "language_pair_matrix_quality": [],
+        # Track-specific BLEU scores
+        "google_comparable_bleu": [],
+        "ug40_complete_bleu": [],
+        "language_pair_matrix_bleu": [],
+        # Track-specific ChrF scores
+        "google_comparable_chrf": [],
+        "ug40_complete_chrf": [],
+        "language_pair_matrix_chrf": [],
+        # Statistical metadata
+        "google_comparable_ci_lower": [],
+        "google_comparable_ci_upper": [],
+        "ug40_complete_ci_lower": [],
+        "ug40_complete_ci_upper": [],
+        "language_pair_matrix_ci_lower": [],
+        "language_pair_matrix_ci_upper": [],
+        # Coverage information
+        "google_comparable_samples": [],
+        "ug40_complete_samples": [],
+        "language_pair_matrix_samples": [],
+        "google_comparable_pairs": [],
+        "ug40_complete_pairs": [],
+        "language_pair_matrix_pairs": [],
+        # Statistical adequacy flags
+        "google_comparable_adequate": [],
+        "ug40_complete_adequate": [],
+        "language_pair_matrix_adequate": [],
+        # Detailed results (JSON strings)
+        "detailed_google_comparable": [],
+        "detailed_ug40_complete": [],
+        "detailed_language_pair_matrix": [],
+        "cross_track_analysis": [],
         # Metadata
+        "evaluation_date": [],
+        "leaderboard_version": [],
+        "scientific_adequacy_score": [],
     }
+
     return pd.DataFrame(columns)
 
+
+def load_scientific_leaderboard() -> pd.DataFrame:
+    """Load current scientific leaderboard from HuggingFace dataset."""
+
     try:
+        print("📥 Loading scientific leaderboard...")
+        dataset = load_dataset(LEADERBOARD_DATASET + "-scientific", split="train")
         df = dataset.to_pandas()
+
         # Ensure all required columns exist
+        required_columns = list(initialize_scientific_leaderboard().columns)
         for col in required_columns:
             if col not in df.columns:
+                if "quality" in col or "bleu" in col or "chrf" in col or "ci_" in col:
+                    df[col] = 0.0
+                elif "samples" in col or "pairs" in col:
+                    df[col] = 0
+                elif "adequate" in col:
+                    df[col] = False
+                elif col == "scientific_adequacy_score":
                    df[col] = 0.0
+                elif col == "leaderboard_version":
+                    df[col] = 2  # Scientific version
                else:
+                    df[col] = ""
+
+        print(f"Loaded scientific leaderboard with {len(df)} entries")
         return df
+
     except Exception as e:
+        print(f"⚠️ Could not load scientific leaderboard: {e}")
+        print("🔄 Initializing empty scientific leaderboard...")
+        return initialize_scientific_leaderboard()
+
+
+def save_scientific_leaderboard(df: pd.DataFrame) -> bool:
+    """Save scientific leaderboard to HuggingFace dataset."""
 
     try:
         # Clean data before saving
         df_clean = df.copy()
+
         # Ensure numeric columns are proper types
+        numeric_columns = [
+            col
+            for col in df_clean.columns
+            if any(
+                x in col
+                for x in [
+                    "quality",
+                    "bleu",
+                    "chrf",
+                    "ci_",
+                    "samples",
+                    "pairs",
+                    "adequacy",
+                ]
+            )
+        ]
+
         for col in numeric_columns:
             if col in df_clean.columns:
+                if "adequate" in col:
+                    df_clean[col] = df_clean[col].astype(bool)
+                else:
+                    df_clean[col] = pd.to_numeric(
+                        df_clean[col], errors="coerce"
+                    ).fillna(0.0)
+
         # Convert to dataset
         dataset = Dataset.from_pandas(df_clean)
+
         # Push to hub
         dataset.push_to_hub(
+            LEADERBOARD_DATASET + "-scientific",
            token=HF_TOKEN,
+            commit_message=f"Update scientific leaderboard - {datetime.datetime.now().isoformat()[:19]}",
        )
+
+        print(" Scientific leaderboard saved successfully!")
         return True
+
     except Exception as e:
+        print(f"Error saving scientific leaderboard: {e}")
         return False
 
+
+def add_model_to_scientific_leaderboard(
    model_name: str,
    author: str,
    evaluation_results: Dict,
+    model_category: str = "community",
+    description: str = "",
 ) -> pd.DataFrame:
+    """Add new model results to scientific leaderboard."""
+
     # Load current leaderboard
+    df = load_scientific_leaderboard()
 
     # Remove existing entry if present
+    existing_mask = df["model_name"] == model_name
     if existing_mask.any():
         df = df[~existing_mask]
 
+    # Extract track results
+    tracks = evaluation_results.get("tracks", {})
+    cross_track = evaluation_results.get("cross_track_analysis", {})
 
+    # Calculate scientific adequacy score
+    adequacy_score = calculate_scientific_adequacy_score(evaluation_results)
 
     # Prepare new entry
     new_entry = {
+        "submission_id": create_submission_id(),
+        "model_name": sanitize_model_name(model_name),
+        "author": author[:100] if author else "Anonymous",
+        "submission_date": datetime.datetime.now().isoformat(),
+        "model_category": (
+            model_category if model_category in MODEL_CATEGORIES else "community"
+        ),
+        "description": description[:500] if description else "",
+        # Extract track-specific metrics
+        **extract_track_metrics(tracks),
+        # Statistical metadata
+        **extract_statistical_metadata(tracks),
+        # Coverage information
+        **extract_coverage_information(tracks),
+        # Adequacy flags
+        **extract_adequacy_flags(tracks),
+        # Detailed results (JSON strings)
+        **serialize_detailed_results(tracks, cross_track),
         # Metadata
+        "evaluation_date": datetime.datetime.now().isoformat(),
+        "leaderboard_version": 2,
+        "scientific_adequacy_score": adequacy_score,
     }
 
     # Convert to DataFrame and append
     new_row_df = pd.DataFrame([new_entry])
     updated_df = pd.concat([df, new_row_df], ignore_index=True)
 
     # Save to hub
+    save_scientific_leaderboard(updated_df)
 
     return updated_df
 
+
+def extract_track_metrics(tracks: Dict) -> Dict:
+    """Extract primary metrics from each track."""
+
+    metrics = {}
+
+    for track_name in EVALUATION_TRACKS.keys():
+        track_data = tracks.get(track_name, {})
+        track_averages = track_data.get("track_averages", {})
+
+        # Quality score
+        metrics[f"{track_name}_quality"] = float(
+            track_averages.get("quality_score", 0.0)
+        )
+
+        # BLEU score
+        metrics[f"{track_name}_bleu"] = float(track_averages.get("bleu", 0.0))
+
+        # ChrF score
+        metrics[f"{track_name}_chrf"] = float(track_averages.get("chrf", 0.0))
+
+    return metrics
+
+
+def extract_statistical_metadata(tracks: Dict) -> Dict:
+    """Extract confidence intervals from each track."""
+
+    metadata = {}
+
+    for track_name in EVALUATION_TRACKS.keys():
+        track_data = tracks.get(track_name, {})
+        track_statistics = track_data.get("track_statistics", {})
+
+        quality_stats = track_statistics.get("quality_score", {})
+        metadata[f"{track_name}_ci_lower"] = float(quality_stats.get("ci_lower", 0.0))
+        metadata[f"{track_name}_ci_upper"] = float(quality_stats.get("ci_upper", 0.0))
+
+    return metadata
+
+
+def extract_coverage_information(tracks: Dict) -> Dict:
+    """Extract coverage information from each track."""
+
+    coverage = {}
+
+    for track_name in EVALUATION_TRACKS.keys():
+        track_data = tracks.get(track_name, {})
+        summary = track_data.get("summary", {})
+
+        coverage[f"{track_name}_samples"] = int(summary.get("total_samples", 0))
+        coverage[f"{track_name}_pairs"] = int(
+            summary.get("language_pairs_evaluated", 0)
+        )
+
+    return coverage
+
+
+def extract_adequacy_flags(tracks: Dict) -> Dict:
+    """Extract statistical adequacy flags for each track."""
+
+    adequacy = {}
+
+    for track_name in EVALUATION_TRACKS.keys():
+        track_data = tracks.get(track_name, {})
+        summary = track_data.get("summary", {})
+
+        min_required = EVALUATION_TRACKS[track_name][
+            "min_samples_per_pair"
+        ] * summary.get("language_pairs_evaluated", 0)
+        is_adequate = summary.get("total_samples", 0) >= min_required
+
+        adequacy[f"{track_name}_adequate"] = bool(is_adequate)
+
+    return adequacy
+
+
+def serialize_detailed_results(tracks: Dict, cross_track: Dict) -> Dict:
+    """Serialize detailed results for storage."""
+
+    detailed = {}
+
+    for track_name in EVALUATION_TRACKS.keys():
+        track_data = tracks.get(track_name, {})
+
+        # Remove non-serializable data
+        safe_track_data = {}
+        for key, value in track_data.items():
+            if key != "sample_metrics":  # Skip large DataFrames
+                safe_track_data[key] = value
+
+        detailed[f"detailed_{track_name}"] = json.dumps(safe_track_data)
+
+    detailed["cross_track_analysis"] = json.dumps(cross_track)
+
+    return detailed
+
+
+def calculate_scientific_adequacy_score(evaluation_results: Dict) -> float:
+    """Calculate overall scientific adequacy score (0-1)."""
+
+    tracks = evaluation_results.get("tracks", {})
+
+    adequacy_scores = []
+
+    for track_name in EVALUATION_TRACKS.keys():
+        track_data = tracks.get(track_name, {})
+        summary = track_data.get("summary", {})
+
+        if track_data.get("error"):
+            adequacy_scores.append(0.0)
+            continue
+
+        # Sample size adequacy
+        min_required = EVALUATION_TRACKS[track_name][
+            "min_samples_per_pair"
+        ] * summary.get("language_pairs_evaluated", 0)
+        sample_adequacy = min(
+            summary.get("total_samples", 0) / max(min_required, 1), 1.0
+        )
+
+        # Coverage adequacy
+        total_possible_pairs = len(EVALUATION_TRACKS[track_name]["languages"]) * (
+            len(EVALUATION_TRACKS[track_name]["languages"]) - 1
+        )
+        coverage_adequacy = summary.get("language_pairs_evaluated", 0) / max(
+            total_possible_pairs, 1
+        )
+
+        # Track adequacy
+        track_adequacy = (sample_adequacy + coverage_adequacy) / 2
+        adequacy_scores.append(track_adequacy)
+
+    return float(np.mean(adequacy_scores))
+
+
+def get_track_leaderboard(
+    df: pd.DataFrame,
+    track: str,
+    metric: str = "quality",
+    category_filter: str = "all",
+    min_adequacy: float = 0.0,
+) -> pd.DataFrame:
+    """Get leaderboard for a specific track with filtering."""
+
+    if df.empty:
+        return df
+
+    track_quality_col = f"{track}_{metric}"
+    track_adequate_col = f"{track}_adequate"
+
+    # Filter by adequacy
+    if min_adequacy > 0:
+        adequacy_mask = df["scientific_adequacy_score"] >= min_adequacy
+        df = df[adequacy_mask]
+
+    # Filter by category
+    if category_filter != "all":
+        df = df[df["model_category"] == category_filter]
+
+    # Filter to models that have this track
+    valid_mask = (df[track_quality_col] > 0) & df[track_adequate_col]
+    df = df[valid_mask]
+
+    if df.empty:
+        return df
+
+    # Sort by track-specific metric
+    df = df.sort_values(track_quality_col, ascending=False).reset_index(drop=True)
+
+    return df
+
+
+def prepare_track_leaderboard_display(df: pd.DataFrame, track: str) -> pd.DataFrame:
+    """Prepare track-specific leaderboard for display."""
+
     if df.empty:
         return df
+
+    # Select relevant columns for this track
+    base_columns = ["model_name", "author", "submission_date", "model_category"]
+
+    track_columns = [
+        f"{track}_quality",
+        f"{track}_bleu",
+        f"{track}_chrf",
+        f"{track}_ci_lower",
+        f"{track}_ci_upper",
+        f"{track}_samples",
+        f"{track}_pairs",
+        f"{track}_adequate",
    ]
+
     # Only include columns that exist
+    available_columns = [
+        col for col in base_columns + track_columns if col in df.columns
+    ]
     display_df = df[available_columns].copy()
+
     # Format numeric columns
     numeric_format = {
+        f"{track}_quality": "{:.4f}",
+        f"{track}_bleu": "{:.2f}",
+        f"{track}_chrf": "{:.4f}",
+        f"{track}_ci_lower": "{:.4f}",
+        f"{track}_ci_upper": "{:.4f}",
    }
+
     for col, fmt in numeric_format.items():
         if col in display_df.columns:
+            display_df[col] = display_df[col].apply(
+                lambda x: fmt.format(float(x)) if pd.notnull(x) else "0.0000"
+            )
+
+    # Format confidence intervals
+    if (
+        f"{track}_ci_lower" in display_df.columns
+        and f"{track}_ci_upper" in display_df.columns
+    ):
+        display_df[f"{track}_confidence_interval"] = (
+            "["
+            + display_df[f"{track}_ci_lower"]
+            + ", "
+            + display_df[f"{track}_ci_upper"]
+            + "]"
+        )
+        # Remove individual CI columns for cleaner display
+        display_df = display_df.drop(columns=[f"{track}_ci_lower", f"{track}_ci_upper"])
+
     # Format submission date
+    if "submission_date" in display_df.columns:
+        display_df["submission_date"] = pd.to_datetime(
+            display_df["submission_date"]
+        ).dt.strftime("%Y-%m-%d")
+
     # Rename columns for better display
+    track_name = EVALUATION_TRACKS[track]["name"].split()[0]  # First word
     column_renames = {
+        "model_name": "Model Name",
+        "author": "Author",
+        "submission_date": "Submitted",
+        "model_category": "Category",
+        f"{track}_quality": f"{track_name} Quality",
+        f"{track}_bleu": f"{track_name} BLEU",
+        f"{track}_chrf": f"{track_name} ChrF",
+        f"{track}_confidence_interval": "95% CI",
+        f"{track}_samples": "Samples",
+        f"{track}_pairs": "Pairs",
+        f"{track}_adequate": "Adequate",
    }
+
     display_df = display_df.rename(columns=column_renames)
+
     return display_df
 
+
+def get_scientific_leaderboard_stats(df: pd.DataFrame, track: str = None) -> Dict:
+    """Get comprehensive statistics for the scientific leaderboard."""
+
     if df.empty:
         return {
+            "total_models": 0,
+            "models_by_category": {},
+            "track_statistics": {},
+            "adequacy_distribution": {},
+            "best_models_by_track": {},
        }
+
     stats = {
+        "total_models": len(df),
+        "models_by_category": df["model_category"].value_counts().to_dict(),
+        "adequacy_distribution": {},
+        "track_statistics": {},
+        "best_models_by_track": {},
    }
+
+    # Adequacy distribution
+    adequacy_bins = pd.cut(
+        df["scientific_adequacy_score"],
+        bins=[0, 0.3, 0.6, 0.8, 1.0],
+        labels=["Poor", "Fair", "Good", "Excellent"],
+    )
+    stats["adequacy_distribution"] = adequacy_bins.value_counts().to_dict()
+
+    # Track-specific statistics
+    for track_name in EVALUATION_TRACKS.keys():
+        quality_col = f"{track_name}_quality"
+        adequate_col = f"{track_name}_adequate"
+
+        if quality_col in df.columns and adequate_col in df.columns:
+            track_models = df[df[adequate_col] & (df[quality_col] > 0)]
+
+            if len(track_models) > 0:
+                stats["track_statistics"][track_name] = {
+                    "participating_models": len(track_models),
+                    "avg_quality": float(track_models[quality_col].mean()),
+                    "std_quality": float(track_models[quality_col].std()),
+                    "best_quality": float(track_models[quality_col].max()),
+                }
+
+                # Best model for this track
+                best_model = track_models.loc[track_models[quality_col].idxmax()]
+                stats["best_models_by_track"][track_name] = {
+                    "name": best_model["model_name"],
+                    "category": best_model["model_category"],
+                    "quality": float(best_model[quality_col]),
+                }
+
     return stats
 
+
+def perform_fair_comparison(
+    df: pd.DataFrame, model_names: List[str], shared_pairs_only: bool = True
+) -> Dict:
+    """Perform fair comparison between models using only shared language pairs."""
+
+    models = df[df["model_name"].isin(model_names)]
+
     if len(models) == 0:
+        return {"error": "No models found"}
+
     comparison = {
+        "models": list(models["model_name"]),
+        "fair_comparison_possible": True,
+        "track_comparisons": {},
+        "statistical_significance": {},
+        "recommendations": [],
    }
+
+    # Check if fair comparison is possible
+    categories = models["model_category"].unique()
+    if len(categories) > 1:
+        comparison["recommendations"].append(
+            "⚠️ Comparing models from different categories - interpret results carefully"
+        )
+
+    # For each track, compare models
+    for track_name in EVALUATION_TRACKS.keys():
+        quality_col = f"{track_name}_quality"
+        adequate_col = f"{track_name}_adequate"
+
+        track_models = models[models[adequate_col] & (models[quality_col] > 0)]
+
+        if len(track_models) >= 2:
+            comparison["track_comparisons"][track_name] = {
+                "participating_models": len(track_models),
+                "quality_scores": dict(
+                    zip(track_models["model_name"], track_models[quality_col])
+                ),
+                "confidence_intervals": {},
            }
+
+            # Extract confidence intervals
+            for _, model in track_models.iterrows():
+                ci_lower = model.get(f"{track_name}_ci_lower", 0)
+                ci_upper = model.get(f"{track_name}_ci_upper", 0)
+                comparison["track_comparisons"][track_name]["confidence_intervals"][
+                    model["model_name"]
+                ] = [ci_lower, ci_upper]
+
     return comparison
 
+
+def export_scientific_leaderboard(
+    df: pd.DataFrame,
+    track: str = "all",
+    format: str = "csv",
+    include_detailed: bool = False,
+) -> str:
+    """Export scientific leaderboard in specified format."""
+
     timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+
+    if track != "all":
+        # Export specific track
+        export_df = prepare_track_leaderboard_display(df, track)
+        filename_prefix = f"salt_leaderboard_{track}_{timestamp}"
    else:
+        # Export all tracks
+        if include_detailed:
+            export_df = df.copy()
+        else:
+            # Select essential columns
+            essential_columns = [
+                "model_name",
+                "author",
+                "submission_date",
+                "model_category",
+                "scientific_adequacy_score",
+            ]
+
+            # Add track-specific quality scores
+            for track_name in EVALUATION_TRACKS.keys():
+                essential_columns.extend(
+                    [
+                        f"{track_name}_quality",
+                        f"{track_name}_adequate",
+                    ]
+                )
+
+            available_columns = [col for col in essential_columns if col in df.columns]
+            export_df = df[available_columns].copy()
+
+        filename_prefix = f"salt_leaderboard_scientific_{timestamp}"
+
+    # Export in specified format
+    if format == "csv":
+        filename = f"{filename_prefix}.csv"
        export_df.to_csv(filename, index=False)
+    elif format == "json":
+        filename = f"{filename_prefix}.json"
+        export_df.to_json(filename, orient="records", indent=2)
+    elif format == "xlsx":
+        filename = f"{filename_prefix}.xlsx"
        export_df.to_excel(filename, index=False)
    else:
        raise ValueError(f"Unsupported format: {format}")
 
+    return filename
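
For reviewers: a minimal usage sketch of the API this commit introduces, not part of the diff itself. Assumptions are hedged in the comments: it presumes `config.EVALUATION_TRACKS` defines a "google_comparable" track (implied by the column names above), that `HF_TOKEN` has write access to the leaderboard dataset, and that a real `evaluation_results` dict from the evaluation pipeline is available; the empty dicts below are placeholders only.

# Hypothetical walkthrough of the new scientific-leaderboard flow (sketch, not the committed code).
from src.leaderboard import (
    add_model_to_scientific_leaderboard,
    get_track_leaderboard,
    prepare_track_leaderboard_display,
    export_scientific_leaderboard,
)

# Placeholder evaluation output; a real run would supply per-track averages, statistics, and summaries.
my_results = {"tracks": {}, "cross_track_analysis": {}}

# Append (or replace) a model entry and push the updated dataset to the Hub.
df = add_model_to_scientific_leaderboard(
    model_name="demo-mt-model",          # hypothetical model name
    author="Jane Doe",                   # hypothetical author
    evaluation_results=my_results,
    model_category="community",
    description="Demo submission",
)

# Rank statistically adequate models on one track and format the display table.
ranked = get_track_leaderboard(df, track="google_comparable", metric="quality")
table = prepare_track_leaderboard_display(ranked, track="google_comparable")

# Export that track as CSV.
path = export_scientific_leaderboard(df, track="google_comparable", format="csv")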