Implemented elo (#6)
Commit 8878917366baff6fee97ed2bf699891a4e393b0e
Co-authored-by: Kai <[email protected]>

utils/leaderboard.py  +311 -17
@@ -1,19 +1,99 @@
 import os
 import pandas as pd
-import
-from
+import math
+from datetime import datetime
+
+# Default K-factor (determines how much a single match affects ratings)
+DEFAULT_K_FACTOR = 32
+
+# Default starting Elo
+DEFAULT_ELO = 1500
+
+# Mapping of model names to their Hugging Face URLs
+model_to_hf = {
+    "Qwen2.5-1.5b-Instruct": "https://huggingface.co/qwen/qwen2.5-1.5b-instruct",
+    "Qwen2.5-3b-Instruct": "https://huggingface.co/qwen/qwen2.5-3b-instruct",
+    # Add more models and their HF links here
+}
+
+def calculate_elo_changes(winner_rating, loser_rating, k_factor=DEFAULT_K_FACTOR, draw=False):
+    """
+    Calculate Elo rating changes for two models.
+
+    Parameters:
+    - winner_rating: Winner's current rating
+    - loser_rating: Loser's current rating
+    - k_factor: How much a single match affects ratings
+    - draw: Whether the match was a draw
+
+    Returns:
+    - (winner_change, loser_change): Rating changes to apply
+    """
+    # Calculate expected scores (probability of winning)
+    expected_winner = 1 / (1 + 10 ** ((loser_rating - winner_rating) / 400))
+    expected_loser = 1 / (1 + 10 ** ((winner_rating - loser_rating) / 400))
+
+    if draw:
+        # For a draw, both get 0.5 points
+        actual_winner = 0.5
+        actual_loser = 0.5
+    else:
+        # For a win, winner gets 1 point, loser gets 0
+        actual_winner = 1.0
+        actual_loser = 0.0
+
+    # Calculate rating changes
+    winner_change = k_factor * (actual_winner - expected_winner)
+    loser_change = k_factor * (actual_loser - expected_loser)
+
+    return winner_change, loser_change
+
+def calculate_confidence_interval(elo_rating, num_games, confidence=0.95):
+    """
+    Calculate a confidence interval for an Elo rating.
+
+    Parameters:
+    - elo_rating: The current Elo rating
+    - num_games: Number of games played
+    - confidence: Confidence level (default: 0.95 for 95% confidence)
+
+    Returns:
+    - margin: The margin of error for the confidence interval
+    """
+    if num_games == 0:
+        return float('inf')
+
+    # Z-score for the given confidence level (1.96 for 95% confidence)
+    z = 1.96 if confidence == 0.95 else 1.645 if confidence == 0.90 else 2.576 if confidence == 0.99 else 1.96
+
+    # Standard deviation of the Elo rating
+    # The factor 400/sqrt(num_games) is a common approximation
+    std_dev = 400 / math.sqrt(num_games)
+
+    # Margin of error
+    margin = z * std_dev
+
+    return margin
 
 def load_leaderboard_data():
     """
     Loads the leaderboard data from the leaderboard CSV file.
     Returns the data in a format compatible with the application.
     """
-    # Initialize the results structure
-    results = {
+    # Initialize the results structure with both win/loss/tie counts and Elo ratings
+    results = {
+        "wins": {},
+        "losses": {},
+        "ties": {},
+        "votes": 0,
+        "elo": {},
+        "games_played": {},
+        "last_updated": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    }
 
     try:
         # Define the path to the CSV file for leaderboard
-        csv_path = os.path.join('utils', '
+        csv_path = os.path.join('utils', 'arena_elo_leaderboard.csv')
 
         # Check if the file exists and load it
         if os.path.exists(csv_path):
@@ -25,52 +105,266 @@ def load_leaderboard_data():
                 results["wins"][model] = row['wins']
                 results["losses"][model] = row['losses']
                 results["ties"][model] = row['ties']
+                results["elo"][model] = row['elo']
+                results["games_played"][model] = row['games_played']
 
             # Calculate total votes
             for model in results["wins"].keys():
                 results["votes"] += results["wins"][model] + results["losses"][model] + results["ties"][model] // 2
         else:
-            # If file doesn't exist, pre-populate with some data
+            # If file doesn't exist, pre-populate with some reasonable data
+            from .models import model_names
             for model in model_names:
-                results["wins"][model] =
-                results["losses"][model] =
-                results["ties"][model] =
+                results["wins"][model] = 0
+                results["losses"][model] = 0
+                results["ties"][model] = 0
+                results["elo"][model] = DEFAULT_ELO  # Start everyone at 1500 Elo
+                results["games_played"][model] = 0
 
-            # Calculate total votes
-            for model in model_names:
-                results["votes"] += results["wins"][model] + results["losses"][model] + results["ties"][model] // 2
-
         return results
     except Exception as e:
         print(f"Error loading leaderboard data: {e}")
         # Return the initialized structure if file can't be loaded
         return results
 
+def update_elo_ratings(results, model_a, model_b, winner, k_factor=DEFAULT_K_FACTOR):
+    """
+    Updates Elo ratings based on a match result.
+
+    Parameters:
+    - results: The current leaderboard results dictionary
+    - model_a: Name of model A
+    - model_b: Name of model B
+    - winner: 'left' for model A, 'right' for model B, 'tie' for a tie, 'neither' for no winner
+    - k_factor: How much this match affects ratings
+
+    Returns:
+    - Updated results dictionary
+    """
+    # Initialize ratings if not present
+    if model_a not in results["elo"]:
+        results["elo"][model_a] = DEFAULT_ELO
+        results["games_played"][model_a] = 0
+
+    if model_b not in results["elo"]:
+        results["elo"][model_b] = DEFAULT_ELO
+        results["games_played"][model_b] = 0
+
+    # Get current ratings
+    rating_a = results["elo"][model_a]
+    rating_b = results["elo"][model_b]
+
+    # Handle different winning scenarios
+    if winner == 'left':
+        # Model A won
+        change_a, change_b = calculate_elo_changes(rating_a, rating_b, k_factor, draw=False)
+        results["wins"][model_a] = results["wins"].get(model_a, 0) + 1
+        results["losses"][model_b] = results["losses"].get(model_b, 0) + 1
+    elif winner == 'right':
+        # Model B won
+        change_b, change_a = calculate_elo_changes(rating_b, rating_a, k_factor, draw=False)
+        results["wins"][model_b] = results["wins"].get(model_b, 0) + 1
+        results["losses"][model_a] = results["losses"].get(model_a, 0) + 1
+    elif winner == 'tie':
+        # It's a tie
+        change_a, change_b = calculate_elo_changes(rating_a, rating_b, k_factor, draw=True)
+        results["ties"][model_a] = results["ties"].get(model_a, 0) + 1
+        results["ties"][model_b] = results["ties"].get(model_b, 0) + 1
+    else:  # 'neither' case - no winner
+        # No rating changes, but still log the game
+        change_a, change_b = 0, 0
+
+    # Apply rating changes
+    results["elo"][model_a] = rating_a + change_a
+    results["elo"][model_b] = rating_b + change_b
+
+    # Update games played counters
+    results["games_played"][model_a] = results["games_played"].get(model_a, 0) + 1
+    results["games_played"][model_b] = results["games_played"].get(model_b, 0) + 1
+
+    # Update timestamp
+    results["last_updated"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+    return results
+
 def save_leaderboard_data(results):
     """
     Saves the current leaderboard results back to the CSV file.
 
     Parameters:
-    - results: The results dictionary
+    - results: The results dictionary with wins, losses, ties, elo, etc.
     """
     try:
         # Define the path to the CSV file
-        csv_path = os.path.join('utils', '
+        csv_path = os.path.join('utils', 'arena_elo_leaderboard.csv')
 
         # Convert the results dictionary to a DataFrame
         data = []
-        for model in results["
+        for model in results["elo"].keys():
+            # Calculate confidence interval
+            games_played = results["games_played"].get(model, 0)
+            confidence_interval = calculate_confidence_interval(results["elo"][model], games_played)
+
            data.append({
                 'model': model,
+                'elo': round(results["elo"].get(model, DEFAULT_ELO), 1),
                 'wins': results["wins"].get(model, 0),
                 'losses': results["losses"].get(model, 0),
-                'ties': results["ties"].get(model, 0)
+                'ties': results["ties"].get(model, 0),
+                'games_played': results["games_played"].get(model, 0),
+                'confidence_interval': round(confidence_interval, 1)
            })
 
         df = pd.DataFrame(data)
 
+        # Sort by Elo rating (descending)
+        df = df.sort_values(by='elo', ascending=False)
+
         # Save to CSV
         df.to_csv(csv_path, index=False)
         print(f"Leaderboard data saved successfully to {csv_path}")
     except Exception as e:
         print(f"Error saving leaderboard data: {e}")
+
+def generate_leaderboard_html(results):
+    """
+    Generate HTML for displaying the leaderboard with Elo ratings.
+
+    Parameters:
+    - results: The current leaderboard results dictionary
+
+    Returns:
+    - HTML string for the leaderboard
+    """
+    # Prepare model data for the HTML table
+    model_data = []
+    for model in results["elo"]:
+        elo = results["elo"].get(model, DEFAULT_ELO)
+        wins = results["wins"].get(model, 0)
+        losses = results["losses"].get(model, 0)
+        ties = results["ties"].get(model, 0)
+        total_comparisons = wins + losses + ties
+        win_rate = (wins + 0.5 * ties) / total_comparisons if total_comparisons > 0 else 0.0
+
+        # Calculate confidence interval
+        games_played = results["games_played"].get(model, 0)
+        confidence = calculate_confidence_interval(elo, games_played)
+
+        model_data.append({
+            "model": model,
+            "elo": elo,
+            "wins": wins,
+            "losses": losses,
+            "ties": ties,
+            "comparisons": total_comparisons,
+            "win_rate": win_rate,
+            "confidence": confidence
+        })
+
+    # Sort by Elo rating
+    model_data.sort(key=lambda x: x["elo"], reverse=True)
+
+    # Start building HTML table
+    html = """
+    <table class="leaderboard-table">
+        <thead>
+            <tr>
+                <th class="centered">Rank</th>
+                <th>Model</th>
+                <th>Elo Rating</th>
+                <th class="centered">Win Rate (%)</th>
+                <th class="centered">Wins</th>
+                <th class="centered">Losses</th>
+                <th class="centered">Ties</th>
+                <th class="centered">Comparisons</th>
+            </tr>
+        </thead>
+        <tbody>
+    """
+
+    # Add rows to the HTML table
+    for rank, data in enumerate(model_data, 1):
+        model = data["model"]
+        elo = data["elo"]
+        wins = data["wins"]
+        losses = data["losses"]
+        ties = data["ties"]
+        comparisons = data["comparisons"]
+        win_rate = data["win_rate"]
+        confidence = data["confidence"]
+
+        # Create model link if in the mapping
+        if model in model_to_hf:
+            model_html = f'<a href="{model_to_hf[model]}" target="_blank" rel="noopener noreferrer" class="model-link">{model}<span class="external-icon">↗</span></a>'
+        else:
+            model_html = model
+
+        # Format Elo with confidence interval
+        elo_html = f"{elo:.1f} <span class='confidence-value'>± {confidence:.1f}</span>"
+
+        # Add row to table
+        html += f"""
+        <tr>
+            <td class="centered"><strong>{rank}</strong></td>
+            <td>{model_html}</td>
+            <td class="elo-col">{elo_html}</td>
+            <td class="centered">{win_rate:.1%}</td>
+            <td class="centered">{wins}</td>
+            <td class="centered">{losses}</td>
+            <td class="centered">{ties}</td>
+            <td class="centered">{comparisons}</td>
+        </tr>
+        """
+
+    # Close the HTML table
+    html += """
+        </tbody>
+    </table>
+    """
+
+    return html
+
+def submit_vote_with_elo(m_a, m_b, winner, feedback, current_results):
+    """
+    Enhanced version of submit_vote that calculates and applies Elo rating changes.
+    This replaces the original submit_vote_fixed function.
+
+    Parameters:
+    - m_a: Model A name
+    - m_b: Model B name
+    - winner: 'left', 'right', 'tie', or 'neither'
+    - feedback: List of feedback options selected
+    - current_results: The current leaderboard state
+
+    Returns:
+    - Updated results and UI components
+    """
+    if winner is None:
+        print("Warning: Submit called without a winner selected.")
+        return {}
+
+    # Update Elo ratings
+    updated_results = update_elo_ratings(current_results.copy(), m_a, m_b, winner)
+
+    # Update vote count
+    updated_results["votes"] = updated_results.get("votes", 0) + 1
+
+    # Save updated results
+    save_leaderboard_data(updated_results)
+
+    # Generate HTML leaderboard
+    leaderboard_html = generate_leaderboard_html(updated_results)
+
+    # Import gradio for the gr.update objects
+    import gradio as gr
+
+    return [
+        True, updated_results,
+        gr.update(interactive=False), gr.update(interactive=False),
+        gr.update(interactive=False), gr.update(interactive=False),
+        gr.update(interactive=False), gr.update(visible=True),
+        gr.update(visible=False), gr.update(visible=True),
+        gr.update(interactive=False), gr.update(value=leaderboard_html, visible=True),
+        gr.update(elem_classes=["results-revealed"]),
+        gr.update(interactive=True), gr.update(value=m_a), gr.update(value=m_b)
+    ]
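A quick sanity check of the Elo math introduced above: two models at the default 1500 rating have an expected score of 0.5 against each other, so with the default K-factor of 32 the winner gains exactly 16 points and the loser drops 16, and after a single game the 95% confidence margin is still 1.96 × 400/√1 ≈ 784. The sketch below is not part of the commit; it assumes the module is importable as `utils.leaderboard` and feeds `update_elo_ratings` a hand-built `results` dict (shaped like the one `load_leaderboard_data` initializes) instead of reading the CSV:

from utils.leaderboard import (
    calculate_confidence_interval,
    calculate_elo_changes,
    update_elo_ratings,
)

# Equal ratings -> expected score 0.5, so the winner gains K * (1 - 0.5) = +16.
gain, loss = calculate_elo_changes(1500, 1500)
assert (gain, loss) == (16.0, -16.0)

# Minimal results structure; update_elo_ratings fills in missing models at 1500.
results = {"wins": {}, "losses": {}, "ties": {}, "votes": 0,
           "elo": {}, "games_played": {}}

# One vote where model A ('left') beats model B.
results = update_elo_ratings(results, "Qwen2.5-1.5b-Instruct",
                             "Qwen2.5-3b-Instruct", "left")
print(results["elo"])
# {'Qwen2.5-1.5b-Instruct': 1516.0, 'Qwen2.5-3b-Instruct': 1484.0}

# The margin shrinks as 1/sqrt(games): ~784 after 1 game, ~78.4 after 100.
print(calculate_confidence_interval(1516.0, 1))    # 784.0
print(calculate_confidence_interval(1516.0, 100))  # 78.4

Since a single match can move a rating by at most K points, K = 32 makes early ratings volatile, which is why the leaderboard pairs each Elo value with the ± confidence margin rather than showing the rating alone.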