aizip-dev committed
Commit 3f599cd · verified · Parent: ad2a067

Update app.py

Files changed (1): app.py (+66 -7)
app.py CHANGED
@@ -4,6 +4,7 @@ import pandas as pd
 import os
 import threading
 import time
+import numpy as np
 from utils.data_loader import get_random_example
 from utils.models import generate_summaries, model_names
 from utils.ui_helpers import toggle_context_display, update_feedback, get_context_html
@@ -20,6 +21,46 @@ feedback_options = {
     "Model B: Incomplete", "Model B: Hallucinate", "Model B: Irrelevant", "Model B: Incorrect refusal (if applicable)"]
 }
 
+def weighted_sample_without_replacement(population, weights, k=2):
+    """
+    Performs a weighted random sampling without replacement.
+
+    Args:
+        population: The list of items to sample from
+        weights: The weight for each item
+        k: Number of items to sample
+
+    Returns:
+        A list of k sampled items
+    """
+    if len(population) <= k:
+        return population
+
+    # Convert weights to a numpy array for efficient operations
+    weights = np.array(weights)
+
+    # Create copies of the population and weights so the originals are not mutated
+    remaining_population = population.copy()
+    remaining_weights = weights.copy()
+
+    selected = []
+
+    for _ in range(k):
+        # Normalize weights so they sum to 1
+        normalized_weights = remaining_weights / remaining_weights.sum()
+
+        # Randomly select one item based on weights
+        selected_idx = np.random.choice(len(remaining_population), p=normalized_weights)
+
+        # Add the selected item to the result
+        selected.append(remaining_population[selected_idx])
+
+        # Remove the selected item from the pool
+        remaining_population.pop(selected_idx)
+        remaining_weights = np.delete(remaining_weights, selected_idx)
+
+    return selected
+
 def load_context(set_interrupt=False):
     if set_interrupt:
         generation_interrupt.set()
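A quick standalone illustration of the weighting used above (not part of the commit; the model names and game counts are hypothetical): with K = 100 and C = 5, an unplayed model gets weight 20 while one with 200 recorded games gets about 0.49, so the first draw lands on the fresh model roughly 82% of the time in this three-model pool.

    import numpy as np
    from collections import Counter

    # Hypothetical game counts: model_c has played far more than the others.
    games_played = {"model_a": 0, "model_b": 20, "model_c": 200}
    K, C = 100, 5  # the same constants the commit uses
    weights = np.array([K / (games_played[m] + C) for m in games_played])  # 20.0, 4.0, ~0.49

    # Tally 10,000 single weighted draws for the first slot.
    counts = Counter(
        np.random.choice(list(games_played), p=weights / weights.sum())
        for _ in range(10_000)
    )
    print(counts)  # roughly 82% model_a, 16% model_b, 2% model_c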
 
@@ -62,7 +103,30 @@ def generate_model_summaries(example):
         return result
 
     try:
-        m_a_name, m_b_name = random.sample(model_names, 2)
+        # Get the current leaderboard data to determine model usage counts
+        leaderboard_data = load_leaderboard_data()
+
+        # Calculate weights using inverse weighting:
+        # weight = K / (games_played + C)
+        K = 100  # Scaling factor
+        C = 5  # Smoothing constant
+
+        weights = []
+        model_list = []
+
+        for model in model_names:
+            # Get games played for the model, defaulting to 0 if not found
+            games_played = leaderboard_data["games_played"].get(model, 0)
+
+            # Calculate the weight using the inverse formula
+            weight = K / (games_played + C)
+
+            weights.append(weight)
+            model_list.append(model)
+
+        # Select two models using weighted sampling without replacement
+        selected_models = weighted_sample_without_replacement(model_list, weights, k=2)
+        m_a_name, m_b_name = selected_models
 
         result["model_a"] = m_a_name
         result["model_b"] = m_b_name
@@ -234,13 +298,8 @@ with gr.Blocks(theme=gr.themes.Default(
     gr.Markdown("""
     🏟️ This arena evaluates SLMs on document QA tasks with retrieved context. They should provide **grounded, comprehensive** answers or **properly decline** when information is insufficient.
 
-    1️⃣ **Review the query and context** - ✨Highlighted text✨ contains key information for good answers
-
-    2️⃣ **Compare answers** generated by two different models working with the same query and context
-
-    3️⃣ **Vote for the better response** or select 'Tie/Neither' if appropriate
-
-    > **Note:** Highlights are abbreviated contexts based on ground truth (via GPT-4o). Full Context shows the actual text provided to the models.
+    📝 Instructions: 1. **Review the query and context**. 2. **Compare answers** generated by two different models. 3. **Vote for the better response** or select 'Tie/Neither' if appropriate.
 
     """)
 
     gr.HTML("<hr>")
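Taken together, the new selection step reduces to the following self-contained sketch (a simplified mirror of the committed logic, not the code itself; load_leaderboard_data is stubbed with hypothetical model names and counts):

    import numpy as np

    # Hypothetical stand-in for load_leaderboard_data()["games_played"]
    games_played = {"slm-alpha": 12, "slm-beta": 3, "slm-gamma": 47, "slm-delta": 0}
    K, C = 100, 5  # scaling factor and smoothing constant, as in the commit

    models = list(games_played)
    weights = np.array([K / (games_played[m] + C) for m in models])

    # Two weighted draws without replacement: pick one, drop it, renormalize, pick again.
    first = int(np.random.choice(len(models), p=weights / weights.sum()))
    m_a_name = models.pop(first)
    weights = np.delete(weights, first)
    m_b_name = models[np.random.choice(len(models), p=weights / weights.sum())]
    print(m_a_name, m_b_name)

Compared with the old uniform random.sample(model_names, 2), this biases pairings toward models with fewer recorded games, so coverage across the leaderboard evens out over time.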