import gradio as gr import random import pandas as pd import os import threading import time import numpy as np from utils.data_loader import get_random_example from utils.models import generate_summaries, model_names from utils.ui_helpers import toggle_context_display, update_feedback, get_context_html from utils.leaderboard import load_leaderboard_data, submit_vote_with_elo, generate_leaderboard_html from utils.vote_logger import save_vote_details from utils.shared import generation_interrupt feedback_options = { "left": ["Model A: More complete", "Model A: More accurate", "Model A: More relevant", "Model A: Better written", "Model A: Better refusal (if applicable)"], "right": ["Model B: More complete", "Model B: More accurate", "Model B: More relevant", "Model B: Better written", "Model B: Better refusal (if applicable)"], "tie": ["Model A: Complete", "Model A: Accurate", "Model A: Relevant", "Model A: Well written", "Model A: Correct refusal (if applicable)", "Model B: Complete", "Model B: Accurate", "Model B: Relevant", "Model B: Well written", "Model B: Corrent refusal (if applicable)"], "neither": ["Model A: Incomplete", "Model A: Hallucinate", "Model A: Irrelevant", "Model A: Incorrect refusal (if applicable)", "Model B: Incomplete", "Model B: Hallucinate", "Model B: Irrelevant", "Model B: Incorrect refusal (if applicable)"] } def weighted_sample_without_replacement(population, weights, k=2): """ Performs a weighted random sampling without replacement. Args: population: The list of items to sample from weights: The weight for each item k: Number of items to sample Returns: A list of k sampled items """ if len(population) <= k: return population # Convert weights to numpy array for efficient operations weights = np.array(weights) # Create a copy of the population and weights remaining_population = population.copy() remaining_weights = weights.copy() selected = [] for _ in range(k): # Normalize weights so they sum to 1 normalized_weights = remaining_weights / remaining_weights.sum() # Randomly select one item based on weights selected_idx = np.random.choice(len(remaining_population), p=normalized_weights) # Add the selected item to our result selected.append(remaining_population[selected_idx]) # Remove the selected item from the pool remaining_population.pop(selected_idx) remaining_weights = np.delete(remaining_weights, selected_idx) return selected def load_context(set_interrupt=False): if set_interrupt: generation_interrupt.set() time.sleep(0.2) generation_interrupt.clear() example = get_random_example() context_desc = example.get('processed_context_desc', '') if context_desc: context_desc = f"
The question and context are about: {context_desc}
" show_full = False context_html = get_context_html(example, show_full=show_full) return [ example, gr.update(value=example['question']), gr.update(value=context_desc, visible=bool(context_desc)), gr.update(value=context_html), gr.update(value="Show Full Context", elem_classes=["context-toggle-button"]), show_full ] def load_leaderboard(): results = load_leaderboard_data() leaderboard_html = generate_leaderboard_html(results) return leaderboard_html def generate_model_summaries(example): result = { "model_a": "", "model_b": "", "summary_a": "", "summary_b": "", "completed": False } if generation_interrupt.is_set(): return result try: # Get current leaderboard data to determine model usage counts leaderboard_data = load_leaderboard_data() # Calculate weights using inverse weighting # Weight = K / (games_played + C) K = 100 # Scaling factor C = 5 # Smoothing constant weights = [] model_list = [] for model in model_names: # Get games played for the model, default to 0 if not found games_played = leaderboard_data["games_played"].get(model, 0) # Calculate weight using inverse formula weight = K / (games_played + C) weights.append(weight) model_list.append(model) # Select two models using weighted sampling without replacement selected_models = weighted_sample_without_replacement(model_list, weights, k=2) m_a_name, m_b_name = selected_models result["model_a"] = m_a_name result["model_b"] = m_b_name s_a, s_b = generate_summaries(example, m_a_name, m_b_name) if not generation_interrupt.is_set(): result["summary_a"] = s_a result["summary_b"] = s_b result["completed"] = bool(s_a and s_b) except Exception as e: print(f"Error in generation: {e}") return result def process_generation_result(result): if not result["completed"] or not result["summary_a"] or not result["summary_b"]: return [ result.get("model_a", ""), result.get("model_b", ""), result.get("summary_a", ""), result.get("summary_b", ""), None, [], False, load_leaderboard_data(), gr.update(value=result.get("summary_a", "Generation was interrupted or failed.")), gr.update(value=result.get("summary_b", "Generation was interrupted or failed.")), gr.update(interactive=False, elem_classes=["vote-button"]), gr.update(interactive=False, elem_classes=["vote-button"]), gr.update(interactive=False, elem_classes=["vote-button"]), gr.update(interactive=False, elem_classes=["vote-button", "vote-button-neither"]), gr.update(choices=[], value=[], interactive=False, visible=False), gr.update(visible=False), gr.update(interactive=False, visible=True), gr.update(visible=False), gr.update(interactive=True), gr.update(elem_classes=[]) ] buttons_interactive = bool(result["summary_a"] and result["summary_b"]) agg_results = load_leaderboard_data() return [ result["model_a"], result["model_b"], result["summary_a"], result["summary_b"], None, [], False, agg_results, gr.update(value=result["summary_a"]), gr.update(value=result["summary_b"]), gr.update(interactive=buttons_interactive, elem_classes=["vote-button"]), gr.update(interactive=buttons_interactive, elem_classes=["vote-button"]), gr.update(interactive=buttons_interactive, elem_classes=["vote-button"]), gr.update(interactive=buttons_interactive, elem_classes=["vote-button", "vote-button-neither"]), gr.update(choices=[], value=[], interactive=False, visible=False), gr.update(visible=False), gr.update(interactive=False, visible=True), gr.update(visible=False), gr.update(interactive=True), gr.update(elem_classes=[]) ] def process_example(example): result = generate_model_summaries(example) return process_generation_result(result) def select_vote_improved(winner_choice): feedback_choices = feedback_options.get(winner_choice, []) btn_a_classes = ["vote-button"] btn_b_classes = ["vote-button"] btn_tie_classes = ["vote-button"] btn_neither_classes = ["vote-button", "vote-button-neither"] if winner_choice == 'left': btn_a_classes.append("selected") elif winner_choice == 'right': btn_b_classes.append("selected") elif winner_choice == 'tie': btn_tie_classes.append("selected") elif winner_choice == 'neither': btn_neither_classes.append("selected") return [ winner_choice, gr.update(choices=feedback_choices, value=[], interactive=True, visible=True), gr.update(visible=True), gr.update(interactive=True), gr.update(elem_classes=btn_a_classes), gr.update(elem_classes=btn_b_classes), gr.update(elem_classes=btn_tie_classes), gr.update(elem_classes=btn_neither_classes) ] def handle_vote_submission(example, m_a, m_b, winner, feedback, summary_a, summary_b, current_results): if winner is None: print("Warning: Submit called without a winner selected.") return {} save_vote_details(example, m_a, m_b, winner, feedback, summary_a, summary_b) return submit_vote_with_elo(m_a, m_b, winner, feedback, current_results) def show_loading_state(): """Show loading state while fetching new content and reset UI elements""" return [ gr.update(value="Loading new question and summaries...", interactive=False), gr.update(value="Loading new question and summaries...", interactive=False), gr.update(interactive=False, elem_classes=["vote-button"]), # Reset styling gr.update(interactive=False, elem_classes=["vote-button"]), gr.update(interactive=False, elem_classes=["vote-button"]), gr.update(interactive=False, elem_classes=["vote-button", "vote-button-neither"]), gr.update(visible=False), # feedback_section gr.update(interactive=False), # submit_button gr.update(visible=False), # results_reveal_area gr.update(interactive=False), # random_question_btn None # Reset selected_winner ] def handle_new_example_click(): return load_context(set_interrupt=True)[0] def update_ui_for_new_context(example): context_desc = example.get('processed_context_desc', '') if context_desc: context_desc = f"
The question and context are about: {context_desc}
" return [ gr.update(value=example['question']), gr.update(value=context_desc, visible=bool(context_desc)), gr.update(value=get_context_html(example, False)), gr.update(value="Show Full Context", elem_classes=["context-toggle-button"]), False ] def cleanup_on_disconnect(): print(f"Browser disconnected. Cleaning up resources...") generation_interrupt.set() with gr.Blocks(theme=gr.themes.Default( primary_hue=gr.themes.colors.orange, secondary_hue=gr.themes.colors.slate )) as demo: css_path = os.path.join(os.getcwd(), 'static', 'styles.css') with open(css_path, 'r') as f: css_content = f.read() gr.HTML(f"") unload_js = """ """ gr.HTML(unload_js) current_example = gr.State({}) model_a_name = gr.State("") model_b_name = gr.State("") summary_a_text = gr.State("") summary_b_text = gr.State("") selected_winner = gr.State(None) feedback_list = gr.State([]) show_results_state = gr.State(False) results_agg = gr.State(load_leaderboard_data()) show_full_context = gr.State(False) with gr.Tabs() as tabs: with gr.TabItem("Arena", id="arena-tab"): gr.Markdown("# Small Language Model RAG Summarization/Generation Arena") gr.Markdown(""" 🏟️ This arena evaluates SLMs on document QA tasks with retrieved context. They should provide **grounded, comprehensive** answers or **properly decline** when information is insufficient. 📝 Insturction: 1. **Review the query and context**. 2. **Compare answers** generated by two different models. 3. **Vote for the better response** or select 'Tie/Neither' if appropriate. """) gr.HTML("
") with gr.Column(elem_id="main-interface-area") as main_interface_area: with gr.Row(elem_id="query-title-row"): gr.Markdown("### 💬 Query - Question About Document Content", elem_classes="section-heading") with gr.Row(elem_id="query-container"): with gr.Row(elem_classes="query-box-row"): query_display = gr.Markdown(value="Loading question...", elem_classes="query-text", elem_id="query-section") random_question_btn = gr.Button("🔄 Try a New Question", elem_classes="query-button") context_description = gr.Markdown("", elem_classes="context-description") gr.HTML("
") with gr.Row(elem_id="context-header-row"): gr.Markdown("### 📋 Context - Retrieved Content from the Document", elem_classes="context-title") context_toggle_btn = gr.Button("Show Full Context", elem_classes=["context-toggle-button"]) context_display = gr.HTML(value="Loading context...", label="Context Chunks") gr.Markdown("---") gr.Markdown("### 🔍 Compare Models - Are these Grounded, Complete Answers or Correct Rejections?", elem_classes="section-heading") with gr.Row(elem_id="summary-containers"): with gr.Column(scale=1): with gr.Group(elem_classes=["summary-card", "summary-card-a"]): summary_a_display = gr.Textbox( label="Model A", lines=10, interactive=False, show_copy_button=True, autoscroll=False, elem_id="summary-a-display" ) with gr.Column(scale=1): with gr.Group(elem_classes=["summary-card", "summary-card-b"]): summary_b_display = gr.Textbox( label="Model B", lines=10, interactive=False, show_copy_button=True, autoscroll=False, elem_id="summary-b-display" ) gr.HTML("
") gr.Markdown("### 🏅 Cast Your Vote", elem_classes="section-heading") with gr.Row(): vote_button_a = gr.Button("⬅️ Summary A is Better", elem_classes=["vote-button"], interactive=False) vote_button_tie = gr.Button("🤝 Tie / Equally Good", elem_classes=["vote-button"], interactive=False) vote_button_b = gr.Button("➡️ Summary B is Better", elem_classes=["vote-button"], interactive=False) vote_button_neither = gr.Button("❌ Neither is Good", elem_classes=["vote-button", "vote-button-neither"], interactive=False) with gr.Group(elem_classes=["feedback-section"], visible=False) as feedback_section: feedback_checkboxes = gr.CheckboxGroup(label="Feedback (optional)", choices=[], interactive=False) submit_button = gr.Button("Submit Your Vote", variant="primary", interactive=False, elem_id="submit-button") with gr.Column(visible=False) as results_reveal_area: gr.Markdown("---") gr.Markdown("### ✅ Vote Submitted!", elem_classes="section-heading") with gr.Row(): with gr.Column(scale=1): gr.Markdown("### Model A was:", elem_classes="section-heading") model_a_reveal = gr.Markdown("", elem_classes="model-reveal model-a-reveal") with gr.Column(scale=1): gr.Markdown("### Model B was:", elem_classes="section-heading") model_b_reveal = gr.Markdown("", elem_classes="model-reveal model-b-reveal") gr.HTML("
") with gr.Row(elem_classes=["control-buttons"]): try_another_btn = gr.Button("🔄 Try Another Question", elem_id="try-another-btn") with gr.TabItem("Leaderboard", id="leaderboard-tab"): gr.Markdown("# RAG SLM Summarizer/Generator Leaderboard", elem_classes="orange-title") gr.Markdown("View performance statistics for all models ranked by Elo rating.") with gr.Group(elem_id="leaderboard-info"): gr.Markdown("""### About Elo Ratings The Elo rating system provides a more accurate ranking than simple win rates: - All models start at 1500 points - Points are exchanged after each comparison based on the expected outcome - Beating a stronger model earns more points than beating a weaker one - The ± value shows the statistical confidence interval (95%) """) results_table_display = gr.HTML(label="Model Performance") context_toggle_btn.click( fn=toggle_context_display, inputs=[current_example, show_full_context], outputs=[show_full_context, context_display, context_toggle_btn] ) demo.load( fn=load_context, inputs=[], outputs=[current_example, query_display, context_description, context_display, context_toggle_btn, show_full_context] ).then( fn=process_example, inputs=[current_example], outputs=[model_a_name, model_b_name, summary_a_text, summary_b_text, selected_winner, feedback_list, show_results_state, results_agg, summary_a_display, summary_b_display, vote_button_a, vote_button_b, vote_button_tie, vote_button_neither, feedback_checkboxes, feedback_section, submit_button, results_reveal_area, random_question_btn, main_interface_area] ) demo.load( fn=load_leaderboard, inputs=[], outputs=[results_table_display] ) for btn in [random_question_btn, try_another_btn]: btn.click( fn=show_loading_state, inputs=[], outputs=[ summary_a_display, summary_b_display, vote_button_a, vote_button_b, vote_button_tie, vote_button_neither, feedback_section, submit_button, results_reveal_area, random_question_btn, selected_winner # Add selected_winner to reset vote state ] ).then( fn=handle_new_example_click, inputs=[], outputs=[current_example] ).then( fn=update_ui_for_new_context, inputs=[current_example], outputs=[query_display, context_description, context_display, context_toggle_btn, show_full_context] ).then( fn=process_example, inputs=[current_example], outputs=[model_a_name, model_b_name, summary_a_text, summary_b_text, selected_winner, feedback_list, show_results_state, results_agg, summary_a_display, summary_b_display, vote_button_a, vote_button_b, vote_button_tie, vote_button_neither, feedback_checkboxes, feedback_section, submit_button, results_reveal_area, random_question_btn, main_interface_area] ) for btn, choice in zip( [vote_button_a, vote_button_b, vote_button_tie, vote_button_neither], ['left', 'right', 'tie', 'neither'] ): btn.click( fn=lambda choice=choice: select_vote_improved(choice), inputs=None, outputs=[selected_winner, feedback_checkboxes, feedback_section, submit_button, vote_button_a, vote_button_b, vote_button_tie, vote_button_neither] ) feedback_checkboxes.change( fn=update_feedback, inputs=[feedback_checkboxes], outputs=[feedback_list] ) submit_button.click( fn=handle_vote_submission, inputs=[current_example, model_a_name, model_b_name, selected_winner, feedback_list, summary_a_text, summary_b_text, results_agg], outputs=[show_results_state, results_agg, vote_button_a, vote_button_b, vote_button_tie, vote_button_neither, feedback_checkboxes, feedback_section, submit_button, results_reveal_area, random_question_btn, results_table_display, main_interface_area, context_toggle_btn, model_a_reveal, model_b_reveal] ) tabs.select( fn=load_leaderboard, inputs=[], outputs=[results_table_display], api_name="refresh_leaderboard" ) demo.unload(cleanup_on_disconnect) if __name__ == "__main__": demo.launch(debug=True)