akera committed (verified)
Commit 24ebdcb · Parent(s): 6e3baf0

Update src/leaderboard.py

Files changed (1):
  1. src/leaderboard.py +114 -29
src/leaderboard.py CHANGED
@@ -118,6 +118,88 @@ def save_leaderboard(df: pd.DataFrame) -> bool:
         print(f"Error saving leaderboard: {e}")
         return False
 
+# def add_model_to_leaderboard(
+#     model_name: str,
+#     author: str,
+#     evaluation_results: Dict,
+#     validation_info: Dict,
+#     model_type: str = "",
+#     description: str = ""
+# ) -> pd.DataFrame:
+#     """Add new model results to leaderboard."""
+
+#     # Load current leaderboard
+#     df = load_leaderboard()
+
+#     # Check if model already exists
+#     existing_mask = df['model_name'] == model_name
+#     if existing_mask.any():
+#         print(f"Model '{model_name}' already exists. Updating...")
+#         df = df[~existing_mask]  # Remove existing entry
+
+#     # Extract metrics
+#     averages = evaluation_results.get('averages', {})
+#     google_averages = evaluation_results.get('google_comparable_averages', {})
+#     summary = evaluation_results.get('summary', {})
+
+#     # Create new entry
+#     new_entry = {
+#         'submission_id': create_submission_id(),
+#         'model_name': sanitize_model_name(model_name),
+#         'author': author[:100] if author else 'Anonymous',
+#         'submission_date': datetime.datetime.now().isoformat(),
+#         'model_type': model_type[:50] if model_type else 'unknown',
+#         'description': description[:500] if description else '',
+
+#         # Primary metrics
+#         'quality_score': float(averages.get('quality_score', 0.0)),
+#         'bleu': float(averages.get('bleu', 0.0)),
+#         'chrf': float(averages.get('chrf', 0.0)),
+
+#         # Secondary metrics
+#         'rouge1': float(averages.get('rouge1', 0.0)),
+#         'rouge2': float(averages.get('rouge2', 0.0)),
+#         'rougeL': float(averages.get('rougeL', 0.0)),
+#         'cer': float(averages.get('cer', 0.0)),
+#         'wer': float(averages.get('wer', 0.0)),
+#         'len_ratio': float(averages.get('len_ratio', 0.0)),
+
+#         # Google comparable metrics
+#         'google_quality_score': float(google_averages.get('quality_score', 0.0)),
+#         'google_bleu': float(google_averages.get('bleu', 0.0)),
+#         'google_chrf': float(google_averages.get('chrf', 0.0)),
+
+#         # Coverage info
+#         'total_samples': int(summary.get('total_samples', 0)),
+#         'language_pairs_covered': int(summary.get('language_pairs_covered', 0)),
+#         'google_pairs_covered': int(summary.get('google_comparable_pairs', 0)),
+#         'coverage_rate': float(validation_info.get('coverage', 0.0)),
+
+#         # Detailed results
+#         'detailed_metrics': json.dumps(evaluation_results),
+#         'validation_report': validation_info.get('report', ''),
+
+#         # Metadata
+#         'evaluation_date': datetime.datetime.now().isoformat(),
+#         'leaderboard_version': 1
+#     }
+
+#     # Add to dataframe
+#     new_row_df = pd.DataFrame([new_entry])
+#     updated_df = pd.concat([df, new_row_df], ignore_index=True)
+
+#     # Sort by quality score (descending)
+#     updated_df = updated_df.sort_values('quality_score', ascending=False).reset_index(drop=True)
+
+#     # Save updated leaderboard
+#     if save_leaderboard(updated_df):
+#         print(f"Added '{model_name}' to leaderboard")
+#         return updated_df
+#     else:
+#         print("Failed to save leaderboard")
+#         return df
+
+
 def add_model_to_leaderboard(
     model_name: str,
     author: str,
@@ -126,23 +208,31 @@ def add_model_to_leaderboard(
     model_type: str = "",
     description: str = ""
 ) -> pd.DataFrame:
-    """Add new model results to leaderboard."""
-    
+    """
+    Add new model results to leaderboard, with JSON-safe detailed_metrics.
+    """
     # Load current leaderboard
     df = load_leaderboard()
-    
-    # Check if model already exists
+
+    # Remove existing entry if present
     existing_mask = df['model_name'] == model_name
     if existing_mask.any():
-        print(f"Model '{model_name}' already exists. Updating...")
-        df = df[~existing_mask]  # Remove existing entry
-    
+        df = df[~existing_mask]
+
+    # Safely serialize evaluation_results by dropping non-JSON types
+    safe_results = evaluation_results.copy()
+    # Remove sample_metrics DataFrame which isn't JSON serializable
+    if 'sample_metrics' in safe_results:
+        safe_results.pop('sample_metrics')
+
+    detailed_json = json.dumps(safe_results)
+
     # Extract metrics
     averages = evaluation_results.get('averages', {})
     google_averages = evaluation_results.get('google_comparable_averages', {})
     summary = evaluation_results.get('summary', {})
-    
-    # Create new entry
+
+    # Prepare new entry
     new_entry = {
         'submission_id': create_submission_id(),
         'model_name': sanitize_model_name(model_name),
@@ -150,12 +240,12 @@ def add_model_to_leaderboard(
         'submission_date': datetime.datetime.now().isoformat(),
         'model_type': model_type[:50] if model_type else 'unknown',
         'description': description[:500] if description else '',
-        
+
         # Primary metrics
         'quality_score': float(averages.get('quality_score', 0.0)),
        'bleu': float(averages.get('bleu', 0.0)),
         'chrf': float(averages.get('chrf', 0.0)),
-        
+
         # Secondary metrics
         'rouge1': float(averages.get('rouge1', 0.0)),
         'rouge2': float(averages.get('rouge2', 0.0)),
@@ -163,41 +253,36 @@ def add_model_to_leaderboard(
         'cer': float(averages.get('cer', 0.0)),
         'wer': float(averages.get('wer', 0.0)),
         'len_ratio': float(averages.get('len_ratio', 0.0)),
-        
+
         # Google comparable metrics
         'google_quality_score': float(google_averages.get('quality_score', 0.0)),
         'google_bleu': float(google_averages.get('bleu', 0.0)),
         'google_chrf': float(google_averages.get('chrf', 0.0)),
-        
+
         # Coverage info
         'total_samples': int(summary.get('total_samples', 0)),
         'language_pairs_covered': int(summary.get('language_pairs_covered', 0)),
         'google_pairs_covered': int(summary.get('google_comparable_pairs', 0)),
         'coverage_rate': float(validation_info.get('coverage', 0.0)),
-        
-        # Detailed results
-        'detailed_metrics': json.dumps(evaluation_results),
+
+        # Detailed results (JSON string)
+        'detailed_metrics': detailed_json,
         'validation_report': validation_info.get('report', ''),
-        
+
         # Metadata
         'evaluation_date': datetime.datetime.now().isoformat(),
         'leaderboard_version': 1
     }
-    
-    # Add to dataframe
+
+    # Convert to DataFrame and append
     new_row_df = pd.DataFrame([new_entry])
     updated_df = pd.concat([df, new_row_df], ignore_index=True)
-    
-    # Sort by quality score (descending)
     updated_df = updated_df.sort_values('quality_score', ascending=False).reset_index(drop=True)
-    
-    # Save updated leaderboard
-    if save_leaderboard(updated_df):
-        print(f"Added '{model_name}' to leaderboard")
-        return updated_df
-    else:
-        print("Failed to save leaderboard")
-        return df
+
+    # Save to hub
+    save_leaderboard(updated_df)
+
+    return updated_df
 
 def get_leaderboard_stats(df: pd.DataFrame) -> Dict:
     """Get summary statistics for the leaderboard."""