Update app.py
app.py CHANGED
@@ -4,6 +4,7 @@ import pandas as pd
 import os
 import threading
 import time
+import numpy as np
 from utils.data_loader import get_random_example
 from utils.models import generate_summaries, model_names
 from utils.ui_helpers import toggle_context_display, update_feedback, get_context_html
@@ -20,6 +21,46 @@ feedback_options = {
         "Model B: Incomplete", "Model B: Hallucinate", "Model B: Irrelevant", "Model B: Incorrect refusal (if applicable)"]
 }
 
+def weighted_sample_without_replacement(population, weights, k=2):
+    """
+    Performs a weighted random sampling without replacement.
+
+    Args:
+        population: The list of items to sample from
+        weights: The weight for each item
+        k: Number of items to sample
+
+    Returns:
+        A list of k sampled items
+    """
+    if len(population) <= k:
+        return population
+
+    # Convert weights to numpy array for efficient operations
+    weights = np.array(weights)
+
+    # Create a copy of the population and weights
+    remaining_population = population.copy()
+    remaining_weights = weights.copy()
+
+    selected = []
+
+    for _ in range(k):
+        # Normalize weights so they sum to 1
+        normalized_weights = remaining_weights / remaining_weights.sum()
+
+        # Randomly select one item based on weights
+        selected_idx = np.random.choice(len(remaining_population), p=normalized_weights)
+
+        # Add the selected item to our result
+        selected.append(remaining_population[selected_idx])
+
+        # Remove the selected item from the pool
+        remaining_population.pop(selected_idx)
+        remaining_weights = np.delete(remaining_weights, selected_idx)
+
+    return selected
+
 def load_context(set_interrupt=False):
     if set_interrupt:
         generation_interrupt.set()
@@ -62,7 +103,30 @@ def generate_model_summaries(example):
         return result
 
     try:
-
+        # Get current leaderboard data to determine model usage counts
+        leaderboard_data = load_leaderboard_data()
+
+        # Calculate weights using inverse weighting
+        # Weight = K / (games_played + C)
+        K = 100 # Scaling factor
+        C = 5 # Smoothing constant
+
+        weights = []
+        model_list = []
+
+        for model in model_names:
+            # Get games played for the model, default to 0 if not found
+            games_played = leaderboard_data["games_played"].get(model, 0)
+
+            # Calculate weight using inverse formula
+            weight = K / (games_played + C)
+
+            weights.append(weight)
+            model_list.append(model)
+
+        # Select two models using weighted sampling without replacement
+        selected_models = weighted_sample_without_replacement(model_list, weights, k=2)
+        m_a_name, m_b_name = selected_models
 
        result["model_a"] = m_a_name
        result["model_b"] = m_b_name
@@ -234,13 +298,8 @@ with gr.Blocks(theme=gr.themes.Default(
     gr.Markdown("""
     🏟️ This arena evaluates SLMs on document QA tasks with retrieved context. They should provide **grounded, comprehensive** answers or **properly decline** when information is insufficient.
 
-    1️⃣ **Review the query and context**
-
-    2️⃣ **Compare answers** generated by two different models working with the same query and context
-
-    3️⃣ **Vote for the better response** or select 'Tie/Neither' if appropriate
+    📝 Instructions: 1. **Review the query and context**. 2. **Compare answers** generated by two different models. 3. **Vote for the better response** or select 'Tie/Neither' if appropriate.
 
-    > **Note:** Highlights are abbreviated contexts based on ground truth (via GPT-4o). Full Context shows the actual text provided to the models.
     """)
 
     gr.HTML("<hr>")
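
For intuition, the sketch below (not part of this commit) works through the inverse weighting used above, weight = K / (games_played + C), and shows how it favors models with fewer recorded games. The model names and game counts are hypothetical, and numpy's built-in weighted sampling without replacement stands in for the app's weighted_sample_without_replacement helper.

import numpy as np

# Hypothetical per-model game counts (illustration only, not real leaderboard data)
games_played = {"model-x": 120, "model-y": 40, "model-z": 3}

K, C = 100, 5  # same scaling factor and smoothing constant as in the commit

models = list(games_played)
weights = np.array([K / (games_played[m] + C) for m in models])
probs = weights / weights.sum()

for model, p in zip(models, probs):
    print(f"{model}: weight={K / (games_played[model] + C):.2f}, first-draw probability={p:.2f}")
# model-z has played far fewer games, so it is roughly 15x more likely
# to be drawn first than model-x (about 0.81 vs. 0.05 here).

# Draw a pair of models without replacement; numpy's p/replace=False option
# approximates the commit's iterative renormalize-and-draw loop.
pair = np.random.choice(models, size=2, replace=False, p=probs)
print("Selected pair:", list(pair))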