Update app.py
app.py CHANGED
@@ -4,6 +4,7 @@ import pandas as pd
 import os
 import threading
 import time
+import numpy as np
 from utils.data_loader import get_random_example
 from utils.models import generate_summaries, model_names
 from utils.ui_helpers import toggle_context_display, update_feedback, get_context_html
@@ -20,6 +21,46 @@ feedback_options = {
         "Model B: Incomplete", "Model B: Hallucinate", "Model B: Irrelevant", "Model B: Incorrect refusal (if applicable)"]
 }
 
+def weighted_sample_without_replacement(population, weights, k=2):
+    """
+    Performs a weighted random sampling without replacement.
+
+    Args:
+        population: The list of items to sample from
+        weights: The weight for each item
+        k: Number of items to sample
+
+    Returns:
+        A list of k sampled items
+    """
+    if len(population) <= k:
+        return population
+
+    # Convert weights to numpy array for efficient operations
+    weights = np.array(weights)
+
+    # Create a copy of the population and weights
+    remaining_population = population.copy()
+    remaining_weights = weights.copy()
+
+    selected = []
+
+    for _ in range(k):
+        # Normalize weights so they sum to 1
+        normalized_weights = remaining_weights / remaining_weights.sum()
+
+        # Randomly select one item based on weights
+        selected_idx = np.random.choice(len(remaining_population), p=normalized_weights)
+
+        # Add the selected item to our result
+        selected.append(remaining_population[selected_idx])
+
+        # Remove the selected item from the pool
+        remaining_population.pop(selected_idx)
+        remaining_weights = np.delete(remaining_weights, selected_idx)
+
+    return selected
+
 def load_context(set_interrupt=False):
     if set_interrupt:
         generation_interrupt.set()
@@ -62,7 +103,30 @@ def generate_model_summaries(example):
         return result
 
     try:
-
+        # Get current leaderboard data to determine model usage counts
+        leaderboard_data = load_leaderboard_data()
+
+        # Calculate weights using inverse weighting
+        # Weight = K / (games_played + C)
+        K = 100 # Scaling factor
+        C = 5 # Smoothing constant
+
+        weights = []
+        model_list = []
+
+        for model in model_names:
+            # Get games played for the model, default to 0 if not found
+            games_played = leaderboard_data["games_played"].get(model, 0)
+
+            # Calculate weight using inverse formula
+            weight = K / (games_played + C)
+
+            weights.append(weight)
+            model_list.append(model)
+
+        # Select two models using weighted sampling without replacement
+        selected_models = weighted_sample_without_replacement(model_list, weights, k=2)
+        m_a_name, m_b_name = selected_models
 
        result["model_a"] = m_a_name
        result["model_b"] = m_b_name
@@ -234,13 +298,8 @@ with gr.Blocks(theme=gr.themes.Default(
     gr.Markdown("""
     🏟️ This arena evaluates SLMs on document QA tasks with retrieved context. They should provide **grounded, comprehensive** answers or **properly decline** when information is insufficient.
 
-    1️⃣ **Review the query and context**
-
-    2️⃣ **Compare answers** generated by two different models working with the same query and context
-
-    3️⃣ **Vote for the better response** or select 'Tie/Neither' if appropriate
+    📝 Instructions: 1. **Review the query and context**. 2. **Compare answers** generated by two different models. 3. **Vote for the better response** or select 'Tie/Neither' if appropriate.
 
-    > **Note:** Highlights are abbreviated contexts based on ground truth (via GPT-4o). Full Context shows the actual text provided to the models.
     """)
 
     gr.HTML("<hr>")
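
For intuition, the sketch below (not part of this commit) works through the inverse weighting used above, weight = K / (games_played + C), and shows how it favors models with fewer recorded games. The model names and game counts are hypothetical, and numpy's built-in weighted sampling without replacement stands in for the app's weighted_sample_without_replacement helper.

import numpy as np

# Hypothetical per-model game counts (illustration only, not real leaderboard data)
games_played = {"model-x": 120, "model-y": 40, "model-z": 3}

K, C = 100, 5  # same scaling factor and smoothing constant as in the commit

models = list(games_played)
weights = np.array([K / (games_played[m] + C) for m in models])
probs = weights / weights.sum()

for model, p in zip(models, probs):
    print(f"{model}: weight={K / (games_played[model] + C):.2f}, first-draw probability={p:.2f}")
# model-z has played far fewer games, so it is roughly 15x more likely
# to be drawn first than model-x (about 0.81 vs. 0.05 here).

# Draw a pair of models without replacement; numpy's p/replace=False option
# approximates the commit's iterative renormalize-and-draw loop.
pair = np.random.choice(models, size=2, replace=False, p=probs)
print("Selected pair:", list(pair))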