oliver-aizip and kai-aizip committed
Commit c93381b · verified · 1 Parent(s): 32ae078

Implemented elo (#6)

- Implemented elo (8878917366baff6fee97ed2bf699891a4e393b0e)


Co-authored-by: Kai <[email protected]>

Files changed (1)
  1. utils/leaderboard.py +311 -17
utils/leaderboard.py CHANGED
@@ -1,19 +1,99 @@
 import os
 import pandas as pd
-import random
-from .models import model_names
+import math
+from datetime import datetime
+
+# Default K-factor (determines how much a single match affects ratings)
+DEFAULT_K_FACTOR = 32
+
+# Default starting Elo
+DEFAULT_ELO = 1500
+
+# Mapping of model names to their Hugging Face URLs
+model_to_hf = {
+    "Qwen2.5-1.5b-Instruct": "https://huggingface.co/qwen/qwen2.5-1.5b-instruct",
+    "Qwen2.5-3b-Instruct": "https://huggingface.co/qwen/qwen2.5-3b-instruct",
+    # Add more models and their HF links here
+}
+
+def calculate_elo_changes(winner_rating, loser_rating, k_factor=DEFAULT_K_FACTOR, draw=False):
+    """
+    Calculate Elo rating changes for two models.
+
+    Parameters:
+    - winner_rating: Winner's current rating
+    - loser_rating: Loser's current rating
+    - k_factor: How much a single match affects ratings
+    - draw: Whether the match was a draw
+
+    Returns:
+    - (winner_change, loser_change): Rating changes to apply
+    """
+    # Calculate expected scores (probability of winning)
+    expected_winner = 1 / (1 + 10 ** ((loser_rating - winner_rating) / 400))
+    expected_loser = 1 / (1 + 10 ** ((winner_rating - loser_rating) / 400))
+
+    if draw:
+        # For a draw, both get 0.5 points
+        actual_winner = 0.5
+        actual_loser = 0.5
+    else:
+        # For a win, winner gets 1 point, loser gets 0
+        actual_winner = 1.0
+        actual_loser = 0.0
+
+    # Calculate rating changes
+    winner_change = k_factor * (actual_winner - expected_winner)
+    loser_change = k_factor * (actual_loser - expected_loser)
+
+    return winner_change, loser_change
+
+def calculate_confidence_interval(elo_rating, num_games, confidence=0.95):
+    """
+    Calculate a confidence interval for an Elo rating.
+
+    Parameters:
+    - elo_rating: The current Elo rating
+    - num_games: Number of games played
+    - confidence: Confidence level (default: 0.95 for 95% confidence)
+
+    Returns:
+    - margin: The margin of error for the confidence interval
+    """
+    if num_games == 0:
+        return float('inf')
+
+    # Z-score for the given confidence level (1.96 for 95% confidence)
+    z = 1.96 if confidence == 0.95 else 1.645 if confidence == 0.90 else 2.576 if confidence == 0.99 else 1.96
+
+    # Standard deviation of the Elo rating
+    # The factor 400/sqrt(num_games) is a common approximation
+    std_dev = 400 / math.sqrt(num_games)
+
+    # Margin of error
+    margin = z * std_dev
+
+    return margin
 
 def load_leaderboard_data():
     """
     Loads the leaderboard data from the leaderboard CSV file.
     Returns the data in a format compatible with the application.
     """
-    # Initialize the results structure
-    results = {"wins": {}, "losses": {}, "ties": {}, "votes": 0}
+    # Initialize the results structure with both win/loss/tie counts and Elo ratings
+    results = {
+        "wins": {},
+        "losses": {},
+        "ties": {},
+        "votes": 0,
+        "elo": {},
+        "games_played": {},
+        "last_updated": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    }
 
     try:
         # Define the path to the CSV file for leaderboard
-        csv_path = os.path.join('utils', 'arena_df_leaderboard.csv')
+        csv_path = os.path.join('utils', 'arena_elo_leaderboard.csv')
 
         # Check if the file exists and load it
         if os.path.exists(csv_path):
@@ -25,52 +105,266 @@ def load_leaderboard_data():
                 results["wins"][model] = row['wins']
                 results["losses"][model] = row['losses']
                 results["ties"][model] = row['ties']
+                results["elo"][model] = row['elo']
+                results["games_played"][model] = row['games_played']
 
             # Calculate total votes
             for model in results["wins"].keys():
                 results["votes"] += results["wins"][model] + results["losses"][model] + results["ties"][model] // 2
         else:
-            # If file doesn't exist, pre-populate with some data
+            # If file doesn't exist, pre-populate with some reasonable data
+            from .models import model_names
             for model in model_names:
-                results["wins"][model] = random.randint(0, 10)
-                results["losses"][model] = random.randint(0, 10)
-                results["ties"][model] = random.randint(0, 5)
+                results["wins"][model] = 0
+                results["losses"][model] = 0
+                results["ties"][model] = 0
+                results["elo"][model] = DEFAULT_ELO  # Start everyone at 1500 Elo
+                results["games_played"][model] = 0
 
-            # Calculate total votes
-            for model in model_names:
-                results["votes"] += results["wins"][model] + results["losses"][model] + results["ties"][model] // 2
-
         return results
     except Exception as e:
         print(f"Error loading leaderboard data: {e}")
         # Return the initialized structure if file can't be loaded
        return results
 
+def update_elo_ratings(results, model_a, model_b, winner, k_factor=DEFAULT_K_FACTOR):
+    """
+    Updates Elo ratings based on a match result.
+
+    Parameters:
+    - results: The current leaderboard results dictionary
+    - model_a: Name of model A
+    - model_b: Name of model B
+    - winner: 'left' for model A, 'right' for model B, 'tie' for a tie, 'neither' for no winner
+    - k_factor: How much this match affects ratings
+
+    Returns:
+    - Updated results dictionary
+    """
+    # Initialize ratings if not present
+    if model_a not in results["elo"]:
+        results["elo"][model_a] = DEFAULT_ELO
+        results["games_played"][model_a] = 0
+
+    if model_b not in results["elo"]:
+        results["elo"][model_b] = DEFAULT_ELO
+        results["games_played"][model_b] = 0
+
+    # Get current ratings
+    rating_a = results["elo"][model_a]
+    rating_b = results["elo"][model_b]
+
+    # Handle different winning scenarios
+    if winner == 'left':
+        # Model A won
+        change_a, change_b = calculate_elo_changes(rating_a, rating_b, k_factor, draw=False)
+        results["wins"][model_a] = results["wins"].get(model_a, 0) + 1
+        results["losses"][model_b] = results["losses"].get(model_b, 0) + 1
+    elif winner == 'right':
+        # Model B won
+        change_b, change_a = calculate_elo_changes(rating_b, rating_a, k_factor, draw=False)
+        results["wins"][model_b] = results["wins"].get(model_b, 0) + 1
+        results["losses"][model_a] = results["losses"].get(model_a, 0) + 1
+    elif winner == 'tie':
+        # It's a tie
+        change_a, change_b = calculate_elo_changes(rating_a, rating_b, k_factor, draw=True)
+        results["ties"][model_a] = results["ties"].get(model_a, 0) + 1
+        results["ties"][model_b] = results["ties"].get(model_b, 0) + 1
+    else:  # 'neither' case - no winner
+        # No rating changes, but still log the game
+        change_a, change_b = 0, 0
+
+    # Apply rating changes
+    results["elo"][model_a] = rating_a + change_a
+    results["elo"][model_b] = rating_b + change_b
+
+    # Update games played counters
+    results["games_played"][model_a] = results["games_played"].get(model_a, 0) + 1
+    results["games_played"][model_b] = results["games_played"].get(model_b, 0) + 1
+
+    # Update timestamp
+    results["last_updated"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+    return results
+
 def save_leaderboard_data(results):
     """
     Saves the current leaderboard results back to the CSV file.
 
     Parameters:
-    - results: The results dictionary containing wins, losses, ties, and votes
+    - results: The results dictionary with wins, losses, ties, elo, etc.
     """
     try:
         # Define the path to the CSV file
-        csv_path = os.path.join('utils', 'arena_df_leaderboard.csv')
+        csv_path = os.path.join('utils', 'arena_elo_leaderboard.csv')
 
         # Convert the results dictionary to a DataFrame
         data = []
-        for model in results["wins"].keys():
+        for model in results["elo"].keys():
+            # Calculate confidence interval
+            games_played = results["games_played"].get(model, 0)
+            confidence_interval = calculate_confidence_interval(results["elo"][model], games_played)
+
             data.append({
                 'model': model,
+                'elo': round(results["elo"].get(model, DEFAULT_ELO), 1),
                 'wins': results["wins"].get(model, 0),
                 'losses': results["losses"].get(model, 0),
-                'ties': results["ties"].get(model, 0)
+                'ties': results["ties"].get(model, 0),
+                'games_played': results["games_played"].get(model, 0),
+                'confidence_interval': round(confidence_interval, 1)
             })
 
         df = pd.DataFrame(data)
 
+        # Sort by Elo rating (descending)
+        df = df.sort_values(by='elo', ascending=False)
+
         # Save to CSV
         df.to_csv(csv_path, index=False)
         print(f"Leaderboard data saved successfully to {csv_path}")
     except Exception as e:
         print(f"Error saving leaderboard data: {e}")
+
+def generate_leaderboard_html(results):
+    """
+    Generate HTML for displaying the leaderboard with Elo ratings.
+
+    Parameters:
+    - results: The current leaderboard results dictionary
+
+    Returns:
+    - HTML string for the leaderboard
+    """
+    # Prepare model data for the HTML table
+    model_data = []
+    for model in results["elo"]:
+        elo = results["elo"].get(model, DEFAULT_ELO)
+        wins = results["wins"].get(model, 0)
+        losses = results["losses"].get(model, 0)
+        ties = results["ties"].get(model, 0)
+        total_comparisons = wins + losses + ties
+        win_rate = (wins + 0.5 * ties) / total_comparisons if total_comparisons > 0 else 0.0
+
+        # Calculate confidence interval
+        games_played = results["games_played"].get(model, 0)
+        confidence = calculate_confidence_interval(elo, games_played)
+
+        model_data.append({
+            "model": model,
+            "elo": elo,
+            "wins": wins,
+            "losses": losses,
+            "ties": ties,
+            "comparisons": total_comparisons,
+            "win_rate": win_rate,
+            "confidence": confidence
+        })
+
+    # Sort by Elo rating
+    model_data.sort(key=lambda x: x["elo"], reverse=True)
+
+    # Start building HTML table
+    html = """
+    <table class="leaderboard-table">
+        <thead>
+            <tr>
+                <th class="centered">Rank</th>
+                <th>Model</th>
+                <th>Elo Rating</th>
+                <th class="centered">Win Rate (%)</th>
+                <th class="centered">Wins</th>
+                <th class="centered">Losses</th>
+                <th class="centered">Ties</th>
+                <th class="centered">Comparisons</th>
+            </tr>
+        </thead>
+        <tbody>
+    """
+
+    # Add rows to the HTML table
+    for rank, data in enumerate(model_data, 1):
+        model = data["model"]
+        elo = data["elo"]
+        wins = data["wins"]
+        losses = data["losses"]
+        ties = data["ties"]
+        comparisons = data["comparisons"]
+        win_rate = data["win_rate"]
+        confidence = data["confidence"]
+
+        # Create model link if in the mapping
+        if model in model_to_hf:
+            model_html = f'<a href="{model_to_hf[model]}" target="_blank" rel="noopener noreferrer" class="model-link">{model}<span class="external-icon">↗</span></a>'
+        else:
+            model_html = model
+
+        # Format Elo with confidence interval
+        elo_html = f"{elo:.1f} <span class='confidence-value'>± {confidence:.1f}</span>"
+
+        # Add row to table
+        html += f"""
+        <tr>
+            <td class="centered"><strong>{rank}</strong></td>
+            <td>{model_html}</td>
+            <td class="elo-col">{elo_html}</td>
+            <td class="centered">{win_rate:.1%}</td>
+            <td class="centered">{wins}</td>
+            <td class="centered">{losses}</td>
+            <td class="centered">{ties}</td>
+            <td class="centered">{comparisons}</td>
+        </tr>
+        """
+
+    # Close the HTML table
+    html += """
+        </tbody>
+    </table>
+    """
+
+    return html
+
+def submit_vote_with_elo(m_a, m_b, winner, feedback, current_results):
+    """
+    Enhanced version of submit_vote that calculates and applies Elo rating changes.
+    This replaces the original submit_vote_fixed function.
+
+    Parameters:
+    - m_a: Model A name
+    - m_b: Model B name
+    - winner: 'left', 'right', 'tie', or 'neither'
+    - feedback: List of feedback options selected
+    - current_results: The current leaderboard state
+
+    Returns:
+    - Updated results and UI components
+    """
+    if winner is None:
+        print("Warning: Submit called without a winner selected.")
+        return {}
+
+    # Update Elo ratings
+    updated_results = update_elo_ratings(current_results.copy(), m_a, m_b, winner)
+
+    # Update vote count
+    updated_results["votes"] = updated_results.get("votes", 0) + 1
+
+    # Save updated results
+    save_leaderboard_data(updated_results)
+
+    # Generate HTML leaderboard
+    leaderboard_html = generate_leaderboard_html(updated_results)
+
+    # Import gradio for the gr.update objects
+    import gradio as gr
+
+    return [
+        True, updated_results,
+        gr.update(interactive=False), gr.update(interactive=False),
+        gr.update(interactive=False), gr.update(interactive=False),
+        gr.update(interactive=False), gr.update(visible=True),
+        gr.update(visible=False), gr.update(visible=True),
+        gr.update(interactive=False), gr.update(value=leaderboard_html, visible=True),
+        gr.update(elem_classes=["results-revealed"]),
+        gr.update(interactive=True), gr.update(value=m_a), gr.update(value=m_b)
+    ]