akera committed on
Commit
423834f
·
verified ·
1 Parent(s): 4fa2f10

Rename src/leaderboard.py to src/evaluation.py

Browse files
Files changed (2) hide show
  1. src/evaluation.py +403 -0
  2. src/leaderboard.py +0 -183
src/evaluation.py ADDED
@@ -0,0 +1,403 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/evaluation.py
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sacrebleu.metrics import BLEU, CHRF
5
+ from rouge_score import rouge_scorer
6
+ import Levenshtein
7
+ from collections import defaultdict
8
+ from transformers.models.whisper.english_normalizer import BasicTextNormalizer
9
+ from typing import Dict, List, Tuple
10
+ from config import ALL_UG40_LANGUAGES, GOOGLE_SUPPORTED_LANGUAGES, METRICS_CONFIG
11
+ from src.utils import get_all_language_pairs, get_google_comparable_pairs
12
+
13
def calculate_sentence_metrics(reference: str, prediction: str) -> Dict[str, float]:
    """Calculate all metrics for a single sentence pair.

    Both texts are passed through ``BasicTextNormalizer`` before scoring.
    Each metric is computed independently; if one scorer raises, only that
    metric falls back to its worst-case default so a single bad sample never
    aborts a whole evaluation run.

    Args:
        reference: Ground-truth translation. Falsy or non-string values are
            treated as the empty string.
        prediction: Model output. Falsy or non-string values (e.g. NaN from a
            DataFrame) are treated as the empty string.

    Returns:
        Dict with keys ``bleu`` (0-100 scale), ``chrf`` (0-1), ``cer``,
        ``wer``, ``len_ratio``, ``rouge1``, ``rouge2``, ``rougeL`` and the
        composite ``quality_score`` (0-1).
    """
    # Treat missing / non-string inputs as empty text.
    if not prediction or not isinstance(prediction, str):
        prediction = ""
    if not reference or not isinstance(reference, str):
        reference = ""

    normalizer = BasicTextNormalizer()
    pred_norm = normalizer(prediction)
    ref_norm = normalizer(reference)

    metrics = {}

    # BLEU score (sacrebleu reports it on a 0-100 scale).
    # ``except Exception`` rather than a bare ``except`` so that
    # KeyboardInterrupt / SystemExit still propagate.
    try:
        bleu = BLEU(effective_order=True)
        metrics['bleu'] = bleu.sentence_score(pred_norm, [ref_norm]).score
    except Exception:
        metrics['bleu'] = 0.0

    # ChrF score, rescaled to 0-1.
    try:
        chrf = CHRF()
        metrics['chrf'] = chrf.sentence_score(pred_norm, [ref_norm]).score / 100.0
    except Exception:
        metrics['chrf'] = 0.0

    # Character Error Rate (CER): edit distance over reference length.
    try:
        if len(ref_norm) > 0:
            metrics['cer'] = Levenshtein.distance(ref_norm, pred_norm) / len(ref_norm)
        else:
            # Empty reference: any output is a full error, no output is perfect.
            metrics['cer'] = 1.0 if len(pred_norm) > 0 else 0.0
    except Exception:
        metrics['cer'] = 1.0

    # Word Error Rate (WER): same as CER but over whitespace tokens.
    try:
        ref_words = ref_norm.split()
        pred_words = pred_norm.split()
        if len(ref_words) > 0:
            metrics['wer'] = Levenshtein.distance(ref_words, pred_words) / len(ref_words)
        else:
            metrics['wer'] = 1.0 if len(pred_words) > 0 else 0.0
    except Exception:
        metrics['wer'] = 1.0

    # Length ratio (prediction chars / reference chars).
    # NOTE(review): an empty reference with a non-empty prediction yields
    # ``inf``, which propagates into any mean computed over this metric.
    if len(ref_norm) > 0:
        metrics['len_ratio'] = len(pred_norm) / len(ref_norm)
    else:
        metrics['len_ratio'] = 1.0 if len(pred_norm) == 0 else float('inf')

    # ROUGE F-measures.
    try:
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        rouge_scores = scorer.score(ref_norm, pred_norm)
        metrics['rouge1'] = rouge_scores['rouge1'].fmeasure
        metrics['rouge2'] = rouge_scores['rouge2'].fmeasure
        metrics['rougeL'] = rouge_scores['rougeL'].fmeasure
    except Exception:
        metrics['rouge1'] = 0.0
        metrics['rouge2'] = 0.0
        metrics['rougeL'] = 0.0

    # Quality score: unweighted mean of six components, each mapped to 0-1
    # with "higher is better" orientation.
    quality_components = [
        metrics['bleu'] / 100.0,           # Normalize BLEU to 0-1
        metrics['chrf'],
        1.0 - min(metrics['cer'], 1.0),    # Invert error rates, capped at 1
        1.0 - min(metrics['wer'], 1.0),
        metrics['rouge1'],
        metrics['rougeL'],
    ]
    # Plain float (not a numpy scalar) to match the declared return type.
    metrics['quality_score'] = float(np.mean(quality_components))

    return metrics
101
+
102
def evaluate_predictions(predictions: pd.DataFrame, test_set: pd.DataFrame) -> Dict:
    """Evaluate predictions against test set targets.

    Predictions are inner-joined to the test set on ``sample_id``, scored
    sample by sample with ``calculate_sentence_metrics``, then averaged per
    language pair. Overall and Google-comparable averages are macro-averages:
    the mean of the per-pair means, so every pair has equal weight.

    Args:
        predictions: Frame with at least ``sample_id`` and ``prediction``.
        test_set: Frame with at least ``sample_id``, ``target_text``,
            ``source_language`` and ``target_language``.

    Returns:
        On success, a dict with ``sample_metrics`` (DataFrame),
        ``pair_metrics``, ``averages``, ``google_comparable_averages``,
        ``summary``, ``evaluated_samples`` and ``error`` set to ``None``.
        On failure, a dict with an ``error`` message and
        ``evaluated_samples`` set to 0.
    """
    print("Starting evaluation...")

    # Report schema problems through the same error-dict channel as the
    # "no matches" case instead of letting an opaque KeyError escape.
    for frame_name, frame in (('predictions', predictions), ('test set', test_set)):
        if 'sample_id' not in frame.columns:
            return {
                'error': f"Missing 'sample_id' column in {frame_name}",
                'evaluated_samples': 0
            }

    # Merge predictions with test set (which contains targets)
    merged = test_set.merge(
        predictions,
        on='sample_id',
        how='inner',
        suffixes=('', '_pred')
    )

    if len(merged) == 0:
        return {
            'error': 'No matching samples found between predictions and test set',
            'evaluated_samples': 0
        }

    required = ('target_text', 'prediction', 'source_language', 'target_language')
    missing = [column for column in required if column not in merged.columns]
    if missing:
        return {
            'error': f"Missing required column(s): {', '.join(missing)}",
            'evaluated_samples': 0
        }

    print(f"Evaluating {len(merged)} samples...")

    # Calculate metrics for each sample
    sample_metrics = []
    for _, row in merged.iterrows():
        metrics = calculate_sentence_metrics(row['target_text'], row['prediction'])
        metrics['sample_id'] = row['sample_id']
        metrics['source_language'] = row['source_language']
        metrics['target_language'] = row['target_language']
        metrics['google_comparable'] = row.get('google_comparable', False)
        sample_metrics.append(metrics)

    sample_df = pd.DataFrame(sample_metrics)

    # Split the samples by (source, target) once, instead of re-filtering the
    # whole frame for every candidate language pair.
    samples_by_pair = {
        key: frame
        for key, frame in sample_df.groupby(['source_language', 'target_language'], sort=False)
    }
    google_languages = set(GOOGLE_SUPPORTED_LANGUAGES)
    metric_names = [
        metric
        for metric in METRICS_CONFIG['primary_metrics'] + METRICS_CONFIG['secondary_metrics']
        if metric in sample_df.columns
    ]

    pair_metrics = {}
    overall_metrics = defaultdict(list)
    google_comparable_metrics = defaultdict(list)
    google_pair_count = 0

    # Walk the configured languages so only known pairs are reported, in
    # configuration order.
    for src_lang in ALL_UG40_LANGUAGES:
        for tgt_lang in ALL_UG40_LANGUAGES:
            if src_lang == tgt_lang:
                continue
            pair_data = samples_by_pair.get((src_lang, tgt_lang))
            if pair_data is None:
                continue

            is_google_pair = src_lang in google_languages and tgt_lang in google_languages
            if is_google_pair:
                google_pair_count += 1

            pair_result = {}
            for metric in metric_names:
                avg_value = float(pair_data[metric].mean())
                pair_result[metric] = avg_value
                overall_metrics[metric].append(avg_value)
                if is_google_pair:
                    google_comparable_metrics[metric].append(avg_value)

            pair_result['sample_count'] = len(pair_data)
            pair_metrics[f"{src_lang}_to_{tgt_lang}"] = pair_result

    # Every list was created by an append, so none of them is empty.
    averages = {metric: float(np.mean(values)) for metric, values in overall_metrics.items()}
    google_averages = {
        metric: float(np.mean(values)) for metric, values in google_comparable_metrics.items()
    }

    # Generate evaluation summary
    summary = {
        'total_samples': len(sample_df),
        'language_pairs_covered': len(pair_metrics),
        'google_comparable_pairs': google_pair_count,
        'primary_metrics': {metric: averages.get(metric, 0.0)
                            for metric in METRICS_CONFIG['primary_metrics']},
        'secondary_metrics': {metric: averages.get(metric, 0.0)
                              for metric in METRICS_CONFIG['secondary_metrics']}
    }

    return {
        'sample_metrics': sample_df,
        'pair_metrics': pair_metrics,
        'averages': averages,
        'google_comparable_averages': google_averages,
        'summary': summary,
        'evaluated_samples': len(sample_df),
        'error': None
    }
208
+
209
def _improvement_over_baseline(metric: str, user_score: float, baseline_score: float) -> float:
    """Return the signed improvement; positive always means the user is better."""
    # For error metrics (cer, wer), lower is better, so the sign is flipped.
    if metric in ('cer', 'wer'):
        return baseline_score - user_score
    return user_score - baseline_score


def compare_with_baseline(results: Dict, baseline_results: Dict = None) -> Dict:
    """Compare results with baseline (e.g., Google Translate).

    Args:
        results: Evaluation output holding ``averages`` and ``pair_metrics``.
        baseline_results: Same structure for the baseline. ``None`` or an
            empty dict both mean that no baseline is available.

    Returns:
        ``{'comparison_available': False, 'message': ...}`` when there is no
        baseline. Otherwise a dict with ``overall_comparison``,
        ``pair_comparisons`` (only pairs whose two languages are both in
        ``GOOGLE_SUPPORTED_LANGUAGES``), ``better_pairs`` and ``worse_pairs``.
    """
    # An empty dict counts as "no baseline": treating it as data would fail
    # on the missing 'averages' key further down.
    if not baseline_results:
        return {
            'comparison_available': False,
            'message': 'No baseline available for comparison'
        }

    comparison = {
        'comparison_available': True,
        'overall_comparison': {},
        'pair_comparisons': {},
        'better_pairs': [],
        'worse_pairs': []
    }

    user_averages = results.get('averages', {})
    baseline_averages = baseline_results.get('averages', {})
    user_pairs = results.get('pair_metrics', {})
    baseline_pairs = baseline_results.get('pair_metrics', {})
    primary_metrics = METRICS_CONFIG['primary_metrics']

    # Compare overall metrics
    for metric in primary_metrics:
        if metric not in user_averages or metric not in baseline_averages:
            continue
        user_score = user_averages[metric]
        baseline_score = baseline_averages[metric]
        improvement = _improvement_over_baseline(metric, user_score, baseline_score)

        comparison['overall_comparison'][metric] = {
            'user_score': user_score,
            'baseline_score': baseline_score,
            'improvement': improvement,
            # Floor the denominator so a zero baseline cannot divide by zero.
            'improvement_percent': (improvement / max(baseline_score, 0.001)) * 100
        }

    # Compare by language pairs (only Google comparable ones)
    google_pairs = [k for k in user_pairs
                    if '_to_' in k and
                    k.split('_to_')[0] in GOOGLE_SUPPORTED_LANGUAGES and
                    k.split('_to_')[1] in GOOGLE_SUPPORTED_LANGUAGES]

    for pair in google_pairs:
        if pair not in baseline_pairs:
            continue

        pair_comparison = {}
        for metric in primary_metrics:
            if metric not in user_pairs[pair] or metric not in baseline_pairs[pair]:
                continue
            user_score = user_pairs[pair][metric]
            baseline_score = baseline_pairs[pair][metric]
            pair_comparison[metric] = {
                'user_score': user_score,
                'baseline_score': baseline_score,
                'improvement': _improvement_over_baseline(metric, user_score, baseline_score)
            }

        comparison['pair_comparisons'][pair] = pair_comparison

        # A pair is better or worse only if quality_score moved by more than
        # 0.01; without a quality_score comparison it stays unclassified.
        quality_improvement = pair_comparison.get('quality_score', {}).get('improvement', 0)
        if quality_improvement > 0.01:
            comparison['better_pairs'].append(pair)
        elif quality_improvement < -0.01:
            comparison['worse_pairs'].append(pair)

    return comparison
283
+
284
def generate_evaluation_report(results: Dict, model_name: str = "", comparison: Dict = None) -> str:
    """Generate human-readable evaluation report.

    Args:
        results: Evaluation output. If it carries a truthy ``error`` only a
            one-line error message is returned; otherwise ``summary`` and
            ``pair_metrics`` are read.
        model_name: Name used in the report title; defaults to "Submission".
        comparison: Optional baseline comparison. It is used only when its
            ``comparison_available`` flag is truthy.

    Returns:
        The report as a Markdown string.
    """
    if results.get('error'):
        return f"❌ **Evaluation Error**: {results['error']}"

    has_comparison = bool(comparison and comparison.get('comparison_available'))

    def pair_line(pair: str, score: float) -> str:
        # Keys look like "<src>_to_<tgt>"; split on the first separator only.
        src, _, tgt = pair.partition('_to_')
        return f"- **{src} → {tgt}**: {score:.3f}"

    report = []

    # Header
    report.append(f"# Evaluation Report: {model_name or 'Submission'}")
    report.append(f"Generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}")
    report.append("")

    # Summary
    summary = results['summary']
    report.append("## 📊 Summary")
    report.append(f"- **Total Samples Evaluated**: {summary['total_samples']:,}")
    report.append(f"- **Language Pairs Covered**: {summary['language_pairs_covered']}")
    report.append(f"- **Google Comparable Pairs**: {summary['google_comparable_pairs']}")
    report.append("")

    # Primary metrics (BLEU is on a 0-100 scale, hence two decimals)
    report.append("## 🎯 Primary Metrics")
    for metric, value in summary['primary_metrics'].items():
        formatted_value = f"{value:.2f}" if metric == 'bleu' else f"{value:.4f}"
        report.append(f"- **{metric.upper()}**: {formatted_value}")

    # Quality ranking (if comparison available)
    if has_comparison:
        quality_comp = comparison.get('overall_comparison', {}).get('quality_score', {})
        if quality_comp:
            improvement = quality_comp.get('improvement', 0)
            if improvement > 0.01:
                report.append(f" - 🟢 **{improvement:.3f}** better than baseline")
            elif improvement < -0.01:
                report.append(f" - 🔴 **{abs(improvement):.3f}** worse than baseline")
            else:
                report.append(" - 🟡 Similar to baseline")

    report.append("")

    # Secondary metrics
    report.append("## 📈 Secondary Metrics")
    for metric, value in summary['secondary_metrics'].items():
        report.append(f"- **{metric.upper()}**: {value:.4f}")
    report.append("")

    # Language pair performance (top 5 and bottom 3)
    pair_metrics = results['pair_metrics']
    if pair_metrics:
        ranked_pairs = sorted(
            ((k, v.get('quality_score', 0)) for k, v in pair_metrics.items()
             if v.get('sample_count', 0) > 0),
            key=lambda item: item[1],
            reverse=True
        )
        top_pairs, remaining_pairs = ranked_pairs[:5], ranked_pairs[5:]

        report.append("## 🏆 Best Performing Language Pairs")
        report.extend(pair_line(pair, score) for pair, score in top_pairs)

        if remaining_pairs:
            report.append("")
            report.append("## 📉 Challenging Language Pairs")
            # Draw the bottom entries from outside the top 5 so a pair is
            # never listed as both best and challenging.
            report.extend(pair_line(pair, score) for pair, score in remaining_pairs[-3:])

    # Comparison with baseline
    if has_comparison:
        report.append("")
        report.append("## 🔍 Comparison with Baseline")

        better_pairs = comparison.get('better_pairs', [])
        worse_pairs = comparison.get('worse_pairs', [])
        # Honour an explicit count if the caller gave one; otherwise count
        # the pairs that were actually compared.
        total_comparable = comparison.get(
            'google_comparable_pairs',
            len(comparison.get('pair_comparisons', {}))
        )

        if total_comparable > 0:
            report.append(f"- **Better than baseline**: {len(better_pairs)}/{total_comparable} pairs")
            report.append(f"- **Worse than baseline**: {len(worse_pairs)}/{total_comparable} pairs")

        if better_pairs:
            report.append(" - Strong pairs: " + ", ".join(better_pairs[:3]))

        if worse_pairs:
            report.append(" - Weak pairs: " + ", ".join(worse_pairs[:3]))

    return "\n".join(report)
374
+
375
def create_sample_analysis(results: Dict, n_samples: int = 10) -> pd.DataFrame:
    """Create sample analysis showing best and worst translations.

    Args:
        results: Evaluation output; its ``sample_metrics`` frame is read.
        n_samples: Upper bound on returned rows. Up to ``n_samples // 2``
            best and ``n_samples // 2`` worst samples are selected.

    Returns:
        Frame with columns ``sample_id``, ``source_language``,
        ``target_language``, ``quality_score``, ``bleu``, ``chrf`` and
        ``category`` ("Best" or "Worst"), best rows first. An empty frame is
        returned when there is nothing to analyse.
    """
    sample_df = results.get('sample_metrics')
    if sample_df is None or len(sample_df) == 0 or 'quality_score' not in sample_df.columns:
        return pd.DataFrame()

    per_side = max(n_samples // 2, 0)
    # With few samples, split them between the two groups rather than
    # reporting the same rows as both best and worst.
    n_best = min(per_side, (len(sample_df) + 1) // 2)
    n_worst = min(per_side, len(sample_df) - n_best)

    best_samples = sample_df.nlargest(n_best, 'quality_score')
    # Pick the worst rows from what is left, so ties cannot select a row twice.
    # NOTE(review): assumes the frame's index labels are unique.
    leftover = sample_df.loc[~sample_df.index.isin(best_samples.index)]
    worst_samples = leftover.nsmallest(n_worst, 'quality_score')

    non_empty = [frame for frame in (best_samples, worst_samples) if len(frame) > 0]
    if non_empty:
        analysis_samples = pd.concat(non_empty)
    else:
        analysis_samples = sample_df.iloc[0:0].copy()

    # Add category
    analysis_samples['category'] = ['Best'] * len(best_samples) + ['Worst'] * len(worst_samples)

    return analysis_samples[['sample_id', 'source_language', 'target_language',
                             'quality_score', 'bleu', 'chrf', 'category']]
394
+
395
def get_google_translate_baseline() -> Dict:
    """Get Google Translate baseline results (if available).

    Returns:
        An empty dict for now, meaning that no baseline is available.
    """
    # TODO: load pre-computed Google Translate results once they exist.
    # The previous try / bare-except wrapper was removed: returning a literal
    # cannot raise, so the handler was unreachable.
    return {}
src/leaderboard.py DELETED
@@ -1,183 +0,0 @@
1
- # src/leaderboard.py
2
- import pandas as pd
3
- from datasets import Dataset, load_dataset
4
- from huggingface_hub import hf_hub_download, upload_file
5
- import json
6
- import datetime
7
- from typing import Dict, List, Optional
8
- import os
9
- from config import LEADERBOARD_DATASET, HF_TOKEN
10
- from src.utils import format_model_name, create_submission_id
11
-
12
def initialize_leaderboard() -> Dataset:
    """Build a leaderboard dataset that has every column but no rows."""
    column_names = (
        'submission_id',
        'model_path',
        'model_display_name',
        'author',
        'submission_date',
        'bleu',
        'chrf',
        'quality_score',
        'cer',
        'wer',
        'rouge1',
        'rouge2',
        'rougeL',
        'len_ratio',
        'detailed_metrics',
        'evaluation_samples',
        'model_type',
    )
    # Each column gets its own empty list.
    return Dataset.from_dict({name: [] for name in column_names})
34
-
35
def load_leaderboard() -> pd.DataFrame:
    """Fetch the leaderboard from the HuggingFace dataset as a DataFrame.

    Columns missing from the stored dataset are added: numeric columns are
    filled with 0.0, all others with an empty string. If loading fails for
    any reason the error is printed and an empty leaderboard is returned.
    """
    schema = [
        'submission_id', 'model_path', 'model_display_name', 'author',
        'submission_date', 'bleu', 'chrf', 'quality_score', 'cer', 'wer',
        'rouge1', 'rouge2', 'rougeL', 'len_ratio', 'detailed_metrics',
        'evaluation_samples', 'model_type'
    ]
    numeric_columns = [
        'bleu', 'chrf', 'quality_score', 'cer', 'wer',
        'rouge1', 'rouge2', 'rougeL', 'len_ratio', 'evaluation_samples'
    ]

    try:
        frame = load_dataset(LEADERBOARD_DATASET, split='train').to_pandas()

        # Backfill any column the stored dataset does not have yet.
        for name in schema:
            if name in frame.columns:
                continue
            frame[name] = 0.0 if name in numeric_columns else ''

        return frame

    except Exception as e:
        print(f"Error loading leaderboard: {e}")
        print("Initializing empty leaderboard...")
        return initialize_leaderboard().to_pandas()
62
-
63
def save_leaderboard(df: pd.DataFrame) -> bool:
    """Push the leaderboard DataFrame to the HuggingFace dataset.

    Returns:
        True when the push succeeded, False when any exception was raised
        (the error is printed, not re-raised).
    """
    try:
        commit_message = f"Update leaderboard - {datetime.datetime.now().isoformat()}"
        Dataset.from_pandas(df).push_to_hub(
            LEADERBOARD_DATASET,
            token=HF_TOKEN,
            commit_message=commit_message
        )
    except Exception as e:
        print(f"Error saving leaderboard: {e}")
        return False

    print("Leaderboard saved successfully!")
    return True
82
-
83
def add_model_results(
    model_path: str,
    author: str,
    metrics: Dict,
    detailed_metrics: Dict,
    evaluation_samples: int,
    model_type: str
) -> pd.DataFrame:
    """Insert or replace a model's results, then save the leaderboard.

    An existing entry with the same ``model_path`` is replaced. The result is
    sorted by ``quality_score`` (descending) and passed to
    ``save_leaderboard`` before being returned.
    """
    board = load_leaderboard()

    # Drop the previous entry for this model, if there is one.
    same_model = board['model_path'] == model_path
    if same_model.any():
        print(f"Model {model_path} already exists. Updating with new results.")
        board = board[~same_model]

    score_columns = (
        'bleu', 'chrf', 'quality_score', 'cer', 'wer',
        'rouge1', 'rouge2', 'rougeL', 'len_ratio',
    )
    entry = {
        'submission_id': create_submission_id(),
        'model_path': model_path,
        'model_display_name': format_model_name(model_path),
        'author': author,
        'submission_date': datetime.datetime.now().isoformat(),
        # Metrics missing from the input default to 0.0.
        **{name: metrics.get(name, 0.0) for name in score_columns},
        'detailed_metrics': json.dumps(detailed_metrics),
        'evaluation_samples': evaluation_samples,
        'model_type': model_type,
    }

    updated = pd.concat([board, pd.DataFrame([entry])], ignore_index=True)
    updated = updated.sort_values('quality_score', ascending=False).reset_index(drop=True)

    save_leaderboard(updated)

    return updated
134
-
135
def get_leaderboard_summary(df: pd.DataFrame) -> Dict:
    """Get summary statistics for the leaderboard.

    Args:
        df: Leaderboard frame with ``quality_score``, ``model_display_name``
            and ``submission_date`` columns. It need not be sorted.

    Returns:
        Dict with ``total_models``, ``avg_quality_score``, ``best_model``
        (display name of the highest-scoring row) and ``latest_submission``.
        Placeholder values are returned for an empty frame.
    """
    if df.empty:
        return {
            'total_models': 0,
            'avg_quality_score': 0.0,
            'best_model': 'None',
            'latest_submission': 'None'
        }

    # Work positionally so duplicate or non-default index labels are harmless.
    scores = pd.to_numeric(df['quality_score'], errors='coerce').reset_index(drop=True)
    # Find the best row by score instead of assuming row 0 is the best: the
    # frame may be unsorted or filtered. Ties resolve to the first row.
    best_position = int(scores.idxmax()) if scores.notna().any() else 0

    return {
        'total_models': len(df),
        'avg_quality_score': float(scores.mean()),
        'best_model': df.iloc[best_position]['model_display_name'],
        'latest_submission': df['submission_date'].max()
    }
151
-
152
def get_top_models(df: pd.DataFrame, n: int = 10) -> pd.DataFrame:
    """Get top N models by quality score.

    Args:
        df: Leaderboard frame with a ``quality_score`` column.
        n: Maximum number of rows to return.

    Returns:
        Up to ``n`` rows of ``df``, highest ``quality_score`` first.
    """
    if df.empty:
        return df.copy()

    # DataFrame.nlargest rejects object-dtype columns, so rank a numeric copy
    # of the scores and select the matching rows by position.
    scores = pd.to_numeric(df['quality_score'], errors='coerce').reset_index(drop=True)
    return df.iloc[scores.nlargest(n).index.to_numpy()]
155
-
156
def search_models(df: pd.DataFrame, query: str) -> pd.DataFrame:
    """Search models by name or author.

    The match is a case-insensitive literal substring test against
    ``model_display_name``, ``author`` and ``model_path``.

    Args:
        df: Leaderboard frame.
        query: Text to look for. An empty query returns ``df`` unchanged.

    Returns:
        The rows of ``df`` where at least one of the three columns matches.
    """
    if not query:
        return df

    query = query.lower()
    # regex=False: the query is user text, not a pattern. Treated as a regex,
    # input such as "c++" or "(" raises re.error or matches the wrong rows.
    mask = (
        df['model_display_name'].str.lower().str.contains(query, na=False, regex=False) |
        df['author'].str.lower().str.contains(query, na=False, regex=False) |
        df['model_path'].str.lower().str.contains(query, na=False, regex=False)
    )

    return df[mask]
169
-
170
def export_results(df: pd.DataFrame, format: str = 'csv') -> str:
    """Write the leaderboard to a timestamped file in the working directory.

    Args:
        df: Leaderboard frame to export.
        format: Either 'csv' or 'json'.

    Returns:
        The name of the file that was written.

    Raises:
        ValueError: If ``format`` is neither 'csv' nor 'json'.
    """
    if format not in ('csv', 'json'):
        raise ValueError(f"Unsupported format: {format}")

    stamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"salt_leaderboard_{stamp}.{format}"

    if format == 'csv':
        df.to_csv(filename, index=False)
    else:
        df.to_json(filename, orient='records', indent=2)

    return filename