Update app.py

app.py CHANGED
@@ -4,6 +4,7 @@ import pandas as pd
 import os
 import threading
 import time
+import numpy as np
 from utils.data_loader import get_random_example
 from utils.models import generate_summaries, model_names
 from utils.ui_helpers import toggle_context_display, update_feedback, get_context_html
@@ -20,6 +21,46 @@ feedback_options = {
            "Model B: Incomplete", "Model B: Hallucinate", "Model B: Irrelevant", "Model B: Incorrect refusal (if applicable)"]
 }
 
+def weighted_sample_without_replacement(population, weights, k=2):
+    """
+    Performs a weighted random sampling without replacement.
+
+    Args:
+        population: The list of items to sample from
+        weights: The weight for each item
+        k: Number of items to sample
+
+    Returns:
+        A list of k sampled items
+    """
+    if len(population) <= k:
+        return population
+
+    # Convert weights to numpy array for efficient operations
+    weights = np.array(weights)
+
+    # Create a copy of the population and weights
+    remaining_population = population.copy()
+    remaining_weights = weights.copy()
+
+    selected = []
+
+    for _ in range(k):
+        # Normalize weights so they sum to 1
+        normalized_weights = remaining_weights / remaining_weights.sum()
+
+        # Randomly select one item based on weights
+        selected_idx = np.random.choice(len(remaining_population), p=normalized_weights)
+
+        # Add the selected item to our result
+        selected.append(remaining_population[selected_idx])
+
+        # Remove the selected item from the pool
+        remaining_population.pop(selected_idx)
+        remaining_weights = np.delete(remaining_weights, selected_idx)
+
+    return selected
+
 def load_context(set_interrupt=False):
     if set_interrupt:
         generation_interrupt.set()
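For reference, a quick standalone sketch of how the new sampler behaves. It assumes app.py is importable from the working directory; the model names and weights below are made up for illustration and are not part of the commit:

    from app import weighted_sample_without_replacement  # assumes app.py is on the import path

    # Hypothetical candidates and weights; a larger weight means a higher chance of being drawn.
    population = ["model-x", "model-y", "model-z"]
    weights = [20.0, 2.0, 0.5]

    pair = weighted_sample_without_replacement(population, weights, k=2)
    print(pair)  # e.g. ['model-x', 'model-y']; model-x carries ~89% of the total weight

Note that when len(population) <= k the function returns the whole population as-is, so with only two registered models the weights have no effect.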
@@ -62,7 +103,30 @@ def generate_model_summaries(example):
         return result
 
     try:
-
+        # Get current leaderboard data to determine model usage counts
+        leaderboard_data = load_leaderboard_data()
+
+        # Calculate weights using inverse weighting
+        # Weight = K / (games_played + C)
+        K = 100  # Scaling factor
+        C = 5    # Smoothing constant
+
+        weights = []
+        model_list = []
+
+        for model in model_names:
+            # Get games played for the model, default to 0 if not found
+            games_played = leaderboard_data["games_played"].get(model, 0)
+
+            # Calculate weight using inverse formula
+            weight = K / (games_played + C)
+
+            weights.append(weight)
+            model_list.append(model)
+
+        # Select two models using weighted sampling without replacement
+        selected_models = weighted_sample_without_replacement(model_list, weights, k=2)
+        m_a_name, m_b_name = selected_models
 
         result["model_a"] = m_a_name
         result["model_b"] = m_b_name
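For intuition on the new K / (games_played + C) weighting, a small worked example with hypothetical play counts (the model names and counts are invented; K and C match the diff above):

    K, C = 100, 5
    games_played = {"model-a": 0, "model-b": 45, "model-c": 195}

    weights = {m: K / (n + C) for m, n in games_played.items()}
    print(weights)  # {'model-a': 20.0, 'model-b': 2.0, 'model-c': 0.5}

An unplayed model therefore gets 40x the weight of one with 195 games, so under-exposed models are matched up much more often until their counts catch up.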
@@ -234,13 +298,8 @@ with gr.Blocks(theme=gr.themes.Default(
     gr.Markdown("""
     🏟️ This arena evaluates SLMs on document QA tasks with retrieved context. They should provide **grounded, comprehensive** answers or **properly decline** when information is insufficient.
 
-    1️⃣ **Review the query and context**
-
-    2️⃣ **Compare answers** generated by two different models working with the same query and context
-
-    3️⃣ **Vote for the better response** or select 'Tie/Neither' if appropriate
+    📝 Instruction: 1. **Review the query and context**. 2. **Compare answers** generated by two different models. 3. **Vote for the better response** or select 'Tie/Neither' if appropriate.
 
-    > **Note:** Highlights are abbreviated contexts based on ground truth (via GPT-4o). Full Context shows the actual text provided to the models.
     """)
 
     gr.HTML("<hr>")
|