import gradio as gr
import random
import pandas as pd
import os
import threading
import time
import numpy as np
from utils.data_loader import get_random_example
from utils.models import generate_summaries, model_names
from utils.ui_helpers import toggle_context_display, update_feedback, get_context_html
from utils.leaderboard import load_leaderboard_data, submit_vote_with_elo, generate_leaderboard_html
from utils.vote_logger import save_vote_details
from utils.shared import generation_interrupt
# Per-vote feedback checkbox choices, keyed by the winner selection
# ('left' = Model A wins, 'right' = Model B wins, 'tie', 'neither').
# These strings are shown in the UI and persisted via save_vote_details.
# Fix: "Corrent refusal" → "Correct refusal" (typo in the Model B tie label).
feedback_options = {
    "left": ["Model A: More complete", "Model A: More accurate", "Model A: More relevant", "Model A: Better written", "Model A: Better refusal (if applicable)"],
    "right": ["Model B: More complete", "Model B: More accurate", "Model B: More relevant", "Model B: Better written", "Model B: Better refusal (if applicable)"],
    "tie": ["Model A: Complete", "Model A: Accurate", "Model A: Relevant", "Model A: Well written", "Model A: Correct refusal (if applicable)",
            "Model B: Complete", "Model B: Accurate", "Model B: Relevant", "Model B: Well written", "Model B: Correct refusal (if applicable)"],
    "neither": ["Model A: Incomplete", "Model A: Hallucinate", "Model A: Irrelevant", "Model A: Incorrect refusal (if applicable)",
                "Model B: Incomplete", "Model B: Hallucinate", "Model B: Irrelevant", "Model B: Incorrect refusal (if applicable)"]
}
def weighted_sample_without_replacement(population, weights, k=2):
    """Draw k distinct items from *population* with probability proportional to weight.

    Items are drawn one at a time; after each draw the chosen item is removed
    and the remaining weights renormalized, so a zero-weight item is never
    selected while positively-weighted items remain.

    Args:
        population: Sequence of items to sample from.
        weights: Per-item sampling weight (non-negative; must not sum to 0).
        k: Number of items to draw.

    Returns:
        A new list of k sampled items, or all items (in input order) when the
        population has k or fewer elements.
    """
    # Degenerate case: no real choice to make. Return a copy so callers can
    # never mutate the input through the result (the previous version aliased
    # the caller's list here).
    if len(population) <= k:
        return list(population)

    # Work on copies: the selection loop consumes both sequences.
    remaining_population = list(population)
    remaining_weights = np.asarray(weights, dtype=float)

    selected = []
    for _ in range(k):
        # Renormalize so the surviving weights form a probability distribution.
        probabilities = remaining_weights / remaining_weights.sum()
        idx = np.random.choice(len(remaining_population), p=probabilities)
        # Move the drawn item from the pool into the result.
        selected.append(remaining_population.pop(idx))
        remaining_weights = np.delete(remaining_weights, idx)
    return selected
def load_context(set_interrupt=False):
    """Fetch a fresh random example and build the initial context-panel state.

    Args:
        set_interrupt: when True, signal any in-flight generation to stop
            (and briefly sleep so workers can notice) before loading.

    Returns:
        [example dict, question update, description update, context HTML
        update, toggle-button reset, show_full flag] — matching the
        demo.load / context-refresh output lists.
    """
    if set_interrupt:
        # Abort any generation still running for the previous example.
        generation_interrupt.set()
        time.sleep(0.2)
        generation_interrupt.clear()
    example = get_random_example()
    context_desc = example.get('processed_context_desc', '')
    if context_desc:
        # NOTE(review): the original literal was a broken multi-line string
        # (likely stripped HTML wrapping); reconstructed to match the plain
        # text used by update_ui_for_new_context — confirm intended markup.
        context_desc = f"The question and context are about: {context_desc}"
    show_full = False
    context_html = get_context_html(example, show_full=show_full)
    return [
        example,
        gr.update(value=example['question']),
        gr.update(value=context_desc, visible=bool(context_desc)),
        gr.update(value=context_html),
        gr.update(value="Show Full Context", elem_classes=["context-toggle-button"]),
        show_full
    ]
def load_leaderboard():
    """Render the current leaderboard standings as an HTML table string."""
    return generate_leaderboard_html(load_leaderboard_data())
def generate_model_summaries(example):
    """Pick two models (inverse-usage weighted) and generate both summaries.

    Returns a dict with keys model_a/model_b/summary_a/summary_b/completed.
    The dict is returned partially filled when generation is interrupted
    or an error occurs; 'completed' is True only when both summaries are
    non-empty.
    """
    result = {
        "model_a": "",
        "model_b": "",
        "summary_a": "",
        "summary_b": "",
        "completed": False,
    }
    if generation_interrupt.is_set():
        return result

    try:
        # Weight each model inversely to its play count so under-sampled
        # models are matched more often: weight = K / (games_played + C).
        games_played = load_leaderboard_data()["games_played"]
        K, C = 100, 5  # scaling factor / smoothing constant
        weights = [K / (games_played.get(model, 0) + C) for model in model_names]

        # Pass a fresh list: the sampler consumes its population argument.
        pair = weighted_sample_without_replacement(list(model_names), weights, k=2)
        result["model_a"], result["model_b"] = pair

        s_a, s_b = generate_summaries(example, pair[0], pair[1])
        # Only record output if we were not interrupted mid-generation.
        if not generation_interrupt.is_set():
            result["summary_a"] = s_a
            result["summary_b"] = s_b
            result["completed"] = bool(s_a and s_b)
    except Exception as e:
        print(f"Error in generation: {e}")
    return result
def process_generation_result(result):
    """Translate a generation-result dict into the 20 Gradio output values.

    On success the summaries are shown and voting is enabled; on an
    interrupted/failed generation the vote buttons stay disabled and a
    fallback message is shown if a summary key is missing entirely.
    """
    summary_a = result.get("summary_a", "")
    summary_b = result.get("summary_b", "")
    # Mirrors the original guard (short-circuits before touching summary
    # keys when 'completed' is falsy).
    ok = bool(result["completed"] and summary_a and summary_b)

    # Fallback text applies only when the key is absent, matching the
    # original dict.get defaults.
    display_a = summary_a if ok else result.get("summary_a", "Generation was interrupted or failed.")
    display_b = summary_b if ok else result.get("summary_b", "Generation was interrupted or failed.")

    def vote_btn(neither=False):
        # Fresh update dict per button; enabled only on a completed pair.
        classes = ["vote-button", "vote-button-neither"] if neither else ["vote-button"]
        return gr.update(interactive=ok, elem_classes=classes)

    return [
        result.get("model_a", ""),
        result.get("model_b", ""),
        summary_a,
        summary_b,
        None,                       # selected_winner reset
        [],                         # feedback_list reset
        False,                      # show_results_state
        load_leaderboard_data(),    # refreshed aggregate results
        gr.update(value=display_a),
        gr.update(value=display_b),
        vote_btn(),                 # vote A
        vote_btn(),                 # vote B
        vote_btn(),                 # vote tie
        vote_btn(neither=True),     # vote neither
        gr.update(choices=[], value=[], interactive=False, visible=False),  # feedback checkboxes
        gr.update(visible=False),   # feedback_section
        gr.update(interactive=False, visible=True),  # submit_button
        gr.update(visible=False),   # results_reveal_area
        gr.update(interactive=True),  # random_question_btn
        gr.update(elem_classes=[]),   # main_interface_area
    ]
def process_example(example):
    """Generate both model summaries for *example* and map them to UI updates."""
    return process_generation_result(generate_model_summaries(example))
def select_vote_improved(winner_choice):
    """Record the chosen winner, highlight its button, and open feedback.

    winner_choice is one of 'left' / 'right' / 'tie' / 'neither'.
    """
    feedback_choices = feedback_options.get(winner_choice, [])

    # Base CSS classes for each button; the picked one also gets "selected".
    classes = {
        "left": ["vote-button"],
        "right": ["vote-button"],
        "tie": ["vote-button"],
        "neither": ["vote-button", "vote-button-neither"],
    }
    if winner_choice in classes:
        classes[winner_choice].append("selected")

    return [
        winner_choice,
        gr.update(choices=feedback_choices, value=[], interactive=True, visible=True),
        gr.update(visible=True),          # feedback_section
        gr.update(interactive=True),      # submit_button
        gr.update(elem_classes=classes["left"]),
        gr.update(elem_classes=classes["right"]),
        gr.update(elem_classes=classes["tie"]),
        gr.update(elem_classes=classes["neither"]),
    ]
def handle_vote_submission(example, m_a, m_b, winner, feedback, summary_a, summary_b, current_results):
    """Persist the vote details and fold the result into the Elo standings.

    Returns the output list from submit_vote_with_elo, or an empty dict
    (Gradio's "no updates") if no winner was selected.
    """
    if winner is not None:
        save_vote_details(example, m_a, m_b, winner, feedback, summary_a, summary_b)
        return submit_vote_with_elo(m_a, m_b, winner, feedback, current_results)
    print("Warning: Submit called without a winner selected.")
    return {}
def show_loading_state():
    """Disable and reset the arena UI while a new question/summaries load."""

    def _loading_box():
        # Placeholder text for a summary display while generation runs.
        return gr.update(value="Loading new question and summaries...", interactive=False)

    def _disabled_vote(extra_classes=None):
        # Disabled vote button with its base styling restored.
        return gr.update(interactive=False, elem_classes=["vote-button"] + (extra_classes or []))

    return [
        _loading_box(),                             # summary_a_display
        _loading_box(),                             # summary_b_display
        _disabled_vote(),                           # vote_button_a
        _disabled_vote(),                           # vote_button_b
        _disabled_vote(),                           # vote_button_tie
        _disabled_vote(["vote-button-neither"]),    # vote_button_neither
        gr.update(visible=False),                   # feedback_section
        gr.update(interactive=False),               # submit_button
        gr.update(visible=False),                   # results_reveal_area
        gr.update(interactive=False),               # random_question_btn
        None,                                       # selected_winner reset
    ]
def handle_new_example_click():
    """Interrupt any running generation and return only the fresh example dict."""
    example, *_unused_ui_updates = load_context(set_interrupt=True)
    return example
def update_ui_for_new_context(example):
    """Refresh the question, description, and context panels for *example*.

    Returns updates for [query_display, context_description, context_display,
    context_toggle_btn, show_full_context] in that order.
    """
    context_desc = example.get('processed_context_desc', '')
    if context_desc:
        # Fix: the original literal was a broken multi-line f-string (syntax
        # error); reconstructed as a single-line description.
        context_desc = f"The question and context are about: {context_desc}"
    return [
        gr.update(value=example['question']),
        gr.update(value=context_desc, visible=bool(context_desc)),
        gr.update(value=get_context_html(example, False)),
        gr.update(value="Show Full Context", elem_classes=["context-toggle-button"]),
        False  # reset show_full_context
    ]
def cleanup_on_disconnect():
    """Stop any in-flight generation when the browser session disconnects."""
    # Plain literal: the previous f-string had no placeholders (ruff F541).
    print("Browser disconnected. Cleaning up resources...")
    generation_interrupt.set()
# --- Gradio UI definition and event wiring ---
# Fixes in this block: joined four gr.HTML string literals that were broken
# across lines (syntax errors from stripped inline HTML), reconstructed the
# stylesheet injection as a <style> tag (css_content was read but otherwise
# unused), corrected the user-facing typo "Insturction" -> "Instruction",
# and opened the CSS file with an explicit encoding.
with gr.Blocks(theme=gr.themes.Default(
    primary_hue=gr.themes.colors.orange,
    secondary_hue=gr.themes.colors.slate
)) as demo:
    # Load the app stylesheet and inject it into the page.
    css_path = os.path.join(os.getcwd(), 'static', 'styles.css')
    with open(css_path, 'r', encoding='utf-8') as f:
        css_content = f.read()
    # NOTE(review): the original inline markup was lost in extraction; a
    # <style> wrapper is the reconstructed intent — confirm against the app.
    gr.HTML(f"<style>{css_content}</style>")

    # NOTE(review): the client-side unload script body appears stripped from
    # the source; restore it if JS-side unload handling is required.
    unload_js = """
"""
    gr.HTML(unload_js)

    # Per-session state shared across event handlers.
    current_example = gr.State({})
    model_a_name = gr.State("")
    model_b_name = gr.State("")
    summary_a_text = gr.State("")
    summary_b_text = gr.State("")
    selected_winner = gr.State(None)   # 'left' | 'right' | 'tie' | 'neither'
    feedback_list = gr.State([])
    show_results_state = gr.State(False)
    results_agg = gr.State(load_leaderboard_data())
    show_full_context = gr.State(False)

    with gr.Tabs() as tabs:
        with gr.TabItem("Arena", id="arena-tab"):
            gr.Markdown("# Small Language Model RAG Summarization/Generation Arena")
            gr.Markdown("""
🏟️ This arena evaluates SLMs on document QA tasks with retrieved context. They should provide **grounded, comprehensive** answers or **properly decline** when information is insufficient.
📝 Instruction: 1. **Review the query and context**. 2. **Compare answers** generated by two different models. 3. **Vote for the better response** or select 'Tie/Neither' if appropriate.
""")
            gr.HTML("")  # NOTE(review): stripped inline HTML (likely a spacer/divider)

            with gr.Column(elem_id="main-interface-area") as main_interface_area:
                with gr.Row(elem_id="query-title-row"):
                    gr.Markdown("### 💬 Query - Question About Document Content", elem_classes="section-heading")
                with gr.Row(elem_id="query-container"):
                    with gr.Row(elem_classes="query-box-row"):
                        query_display = gr.Markdown(value="Loading question...", elem_classes="query-text", elem_id="query-section")
                        random_question_btn = gr.Button("🔄 Try a New Question", elem_classes="query-button")
                context_description = gr.Markdown("", elem_classes="context-description")
                gr.HTML("")  # NOTE(review): stripped inline HTML

                with gr.Row(elem_id="context-header-row"):
                    gr.Markdown("### 📋 Context - Retrieved Content from the Document", elem_classes="context-title")
                    context_toggle_btn = gr.Button("Show Full Context", elem_classes=["context-toggle-button"])
                context_display = gr.HTML(value="Loading context...", label="Context Chunks")

                gr.Markdown("---")
                gr.Markdown("### 🔍 Compare Models - Are these Grounded, Complete Answers or Correct Rejections?", elem_classes="section-heading")
                with gr.Row(elem_id="summary-containers"):
                    with gr.Column(scale=1):
                        with gr.Group(elem_classes=["summary-card", "summary-card-a"]):
                            summary_a_display = gr.Textbox(
                                label="Model A",
                                lines=10,
                                interactive=False,
                                show_copy_button=True,
                                autoscroll=False,
                                elem_id="summary-a-display"
                            )
                    with gr.Column(scale=1):
                        with gr.Group(elem_classes=["summary-card", "summary-card-b"]):
                            summary_b_display = gr.Textbox(
                                label="Model B",
                                lines=10,
                                interactive=False,
                                show_copy_button=True,
                                autoscroll=False,
                                elem_id="summary-b-display"
                            )
                gr.HTML("")  # NOTE(review): stripped inline HTML

                gr.Markdown("### 🏅 Cast Your Vote", elem_classes="section-heading")
                with gr.Row():
                    vote_button_a = gr.Button("⬅️ Summary A is Better", elem_classes=["vote-button"], interactive=False)
                    vote_button_tie = gr.Button("🤝 Tie / Equally Good", elem_classes=["vote-button"], interactive=False)
                    vote_button_b = gr.Button("➡️ Summary B is Better", elem_classes=["vote-button"], interactive=False)
                    vote_button_neither = gr.Button("❌ Neither is Good", elem_classes=["vote-button", "vote-button-neither"], interactive=False)

                # NOTE(review): indentation was stripped from the source;
                # nesting the submit button inside the feedback group is the
                # reconstructed layout — confirm against the deployed app.
                with gr.Group(elem_classes=["feedback-section"], visible=False) as feedback_section:
                    feedback_checkboxes = gr.CheckboxGroup(label="Feedback (optional)", choices=[], interactive=False)
                    submit_button = gr.Button("Submit Your Vote", variant="primary", interactive=False, elem_id="submit-button")

            # Post-vote reveal: hidden until a vote is submitted.
            with gr.Column(visible=False) as results_reveal_area:
                gr.Markdown("---")
                gr.Markdown("### ✅ Vote Submitted!", elem_classes="section-heading")
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Model A was:", elem_classes="section-heading")
                        model_a_reveal = gr.Markdown("", elem_classes="model-reveal model-a-reveal")
                    with gr.Column(scale=1):
                        gr.Markdown("### Model B was:", elem_classes="section-heading")
                        model_b_reveal = gr.Markdown("", elem_classes="model-reveal model-b-reveal")
                gr.HTML("")  # NOTE(review): stripped inline HTML
                with gr.Row(elem_classes=["control-buttons"]):
                    try_another_btn = gr.Button("🔄 Try Another Question", elem_id="try-another-btn")

        with gr.TabItem("Leaderboard", id="leaderboard-tab"):
            gr.Markdown("# RAG SLM Summarizer/Generator Leaderboard", elem_classes="orange-title")
            gr.Markdown("View performance statistics for all models ranked by Elo rating.")
            with gr.Group(elem_id="leaderboard-info"):
                gr.Markdown("""### About Elo Ratings
The Elo rating system provides a more accurate ranking than simple win rates:
- All models start at 1500 points
- Points are exchanged after each comparison based on the expected outcome
- Beating a stronger model earns more points than beating a weaker one
- The ± value shows the statistical confidence interval (95%)
""")
            results_table_display = gr.HTML(label="Model Performance")

    # Toggle between truncated and full context views.
    context_toggle_btn.click(
        fn=toggle_context_display,
        inputs=[current_example, show_full_context],
        outputs=[show_full_context, context_display, context_toggle_btn]
    )

    # Initial page load: fetch an example, then generate both summaries.
    demo.load(
        fn=load_context,
        inputs=[],
        outputs=[current_example, query_display, context_description, context_display,
                 context_toggle_btn, show_full_context]
    ).then(
        fn=process_example,
        inputs=[current_example],
        outputs=[model_a_name, model_b_name, summary_a_text, summary_b_text,
                 selected_winner, feedback_list, show_results_state, results_agg,
                 summary_a_display, summary_b_display, vote_button_a, vote_button_b,
                 vote_button_tie, vote_button_neither, feedback_checkboxes, feedback_section,
                 submit_button, results_reveal_area, random_question_btn, main_interface_area]
    )

    # Populate the leaderboard tab on initial load.
    demo.load(
        fn=load_leaderboard,
        inputs=[],
        outputs=[results_table_display]
    )

    # Both "new question" buttons share the same reset -> load -> generate chain.
    for btn in [random_question_btn, try_another_btn]:
        btn.click(
            fn=show_loading_state,
            inputs=[],
            outputs=[
                summary_a_display, summary_b_display,
                vote_button_a, vote_button_b, vote_button_tie, vote_button_neither,
                feedback_section, submit_button, results_reveal_area, random_question_btn,
                selected_winner  # reset vote state
            ]
        ).then(
            fn=handle_new_example_click,
            inputs=[],
            outputs=[current_example]
        ).then(
            fn=update_ui_for_new_context,
            inputs=[current_example],
            outputs=[query_display, context_description, context_display,
                     context_toggle_btn, show_full_context]
        ).then(
            fn=process_example,
            inputs=[current_example],
            outputs=[model_a_name, model_b_name, summary_a_text, summary_b_text,
                     selected_winner, feedback_list, show_results_state, results_agg,
                     summary_a_display, summary_b_display, vote_button_a, vote_button_b,
                     vote_button_tie, vote_button_neither, feedback_checkboxes, feedback_section,
                     submit_button, results_reveal_area, random_question_btn, main_interface_area]
        )

    # Vote buttons: `choice` is bound as a lambda default to avoid the
    # late-binding-closure pitfall inside the loop.
    for btn, choice in zip(
        [vote_button_a, vote_button_b, vote_button_tie, vote_button_neither],
        ['left', 'right', 'tie', 'neither']
    ):
        btn.click(
            fn=lambda choice=choice: select_vote_improved(choice),
            inputs=None,
            outputs=[selected_winner, feedback_checkboxes, feedback_section, submit_button,
                     vote_button_a, vote_button_b, vote_button_tie, vote_button_neither]
        )

    feedback_checkboxes.change(
        fn=update_feedback,
        inputs=[feedback_checkboxes],
        outputs=[feedback_list]
    )

    submit_button.click(
        fn=handle_vote_submission,
        inputs=[current_example, model_a_name, model_b_name, selected_winner, feedback_list, summary_a_text, summary_b_text, results_agg],
        outputs=[show_results_state, results_agg, vote_button_a, vote_button_b,
                 vote_button_tie, vote_button_neither, feedback_checkboxes,
                 feedback_section, submit_button, results_reveal_area,
                 random_question_btn, results_table_display, main_interface_area,
                 context_toggle_btn, model_a_reveal, model_b_reveal]
    )

    # Refresh the leaderboard whenever the user switches tabs.
    tabs.select(
        fn=load_leaderboard,
        inputs=[],
        outputs=[results_table_display],
        api_name="refresh_leaderboard"
    )

    # Signal background generation to stop when the browser disconnects.
    demo.unload(cleanup_on_disconnect)
if __name__ == "__main__":
demo.launch(debug=True)