"""Gradio app for the RAG Summarizer Arena.

Shows a question with its context, displays two anonymous model summaries side
by side, collects a vote plus optional feedback, and renders a leaderboard.
"""

import os
import random
import threading
from threading import Event

import gradio as gr
import pandas as pd

from utils.data_loader import get_random_example
from utils.models import generate_summaries, model_names
from utils.ui_helpers import toggle_context_display, update_feedback, get_context_html
from utils.leaderboard import load_leaderboard_data, submit_vote_with_elo, generate_leaderboard_html
from utils.vote_logger import save_vote_details

# Global interrupt mechanism for model generation
generation_interrupt = Event()

# Feedback options for different voting outcomes
feedback_options = {
    "left": [
        "Model A: More complete",
        "Model A: More accurate",
        "Model A: More relevant",
        "Model A: Better written",
        "Model A: Better refusal (if applicable)",
    ],
    "right": [
        "Model B: More complete",
        "Model B: More accurate",
        "Model B: More relevant",
        "Model B: Better written",
        "Model B: Better refusal (if applicable)",
    ],
    "tie": [
        "Both complete",
        "Both accurate",
        "Both well written",
        "Both handle refusal well (if applicable)",
    ],
    "neither": [
        "Both incomplete",
        "Both hallucinate",
        "Both irrelevant",
        "Both incorrectly refuse (if applicable)",
        "A is bad",
        "B is bad",
    ],
}

# Text shown in both summary boxes when generation yields no usable result.
_GENERATION_FAILED_MESSAGE = "Generation was interrupted or failed. Please try again."


def _format_context_description(example):
    """Return the context description text for ``example``.

    A non-empty ``processed_context_desc`` is wrapped in the introductory
    sentence; a missing or empty one is returned unchanged so callers can use
    its truthiness to decide visibility.
    """
    context_desc = example.get('processed_context_desc', '')
    if not context_desc:
        return context_desc
    return f"\n\nThe question and context are about: {context_desc}\n\n"


def load_context():
    """Load a new question and context (fast operation).

    Clears the generation interrupt flag, fetches a random example and returns
    a six-item list: the example, updates for the query, context description,
    context HTML and toggle button, and the reset "show full context" flag.
    """
    generation_interrupt.clear()
    example = get_random_example()

    context_desc = _format_context_description(example)
    show_full = False
    context_html = get_context_html(example, show_full=show_full)

    return [
        example,
        gr.update(value=example['question']),
        gr.update(value=context_desc, visible=bool(context_desc)),
        gr.update(value=context_html),
        gr.update(value="Show Full Context", elem_classes=["context-toggle-button"]),
        show_full,
    ]


def load_leaderboard():
    """Load the leaderboard data and return it rendered as HTML."""
    results = load_leaderboard_data()
    return generate_leaderboard_html(results)


def generate_model_summaries(example):
    """Run model inference for two randomly chosen models.

    Returns a dict with keys ``model_a``, ``model_b``, ``summary_a``,
    ``summary_b`` and ``completed``. ``completed`` is True only when generation
    finished without an exception and the interrupt flag was not set; otherwise
    the string fields stay empty.
    """
    result = {
        "model_a": "",
        "model_b": "",
        "summary_a": "",
        "summary_b": "",
        "completed": False,
    }

    if generation_interrupt.is_set():
        return result

    try:
        m_a_name, m_b_name = random.sample(model_names, 2)
        s_a, s_b = generate_summaries(example, m_a_name, m_b_name)

        # Discard the output if an interrupt arrived while generating.
        if not generation_interrupt.is_set():
            result["model_a"] = m_a_name
            result["model_b"] = m_b_name
            result["summary_a"] = s_a
            result["summary_b"] = s_b
            result["completed"] = True
    except Exception as e:
        # Boundary handler: report the failure and return the empty result so
        # the UI can show a retry message instead of crashing.
        print(f"Error in generation: {e}")

    return result


def _reset_voting_ui_updates():
    """Return fresh updates that reset the voting widgets after generation.

    Order: vote button A, B, tie, neither, feedback checkboxes, feedback
    section, submit button, results area, random-question button, main area.
    """
    return [
        gr.update(interactive=True, elem_classes=["vote-button"]),
        gr.update(interactive=True, elem_classes=["vote-button"]),
        gr.update(interactive=True, elem_classes=["vote-button"]),
        gr.update(interactive=True, elem_classes=["vote-button", "vote-button-neither"]),
        gr.update(choices=[], value=[], interactive=False, visible=False),
        gr.update(visible=False),
        gr.update(interactive=False, visible=True),
        gr.update(visible=False),
        gr.update(interactive=True),
        gr.update(elem_classes=[]),
    ]


def process_generation_result(result):
    """Convert a generation result dict into the 20 output values for the UI.

    When ``result["completed"]`` is false the model names and stored summaries
    are empty strings and both summary boxes show a retry message; otherwise
    they carry the generated values. The winner, feedback list and results
    flag are reset and the leaderboard data is reloaded in both cases.
    """
    if result["completed"]:
        model_a, model_b = result["model_a"], result["model_b"]
        stored_a, stored_b = result["summary_a"], result["summary_b"]
        shown_a, shown_b = stored_a, stored_b
    else:
        # Generation was interrupted or failed
        model_a = model_b = stored_a = stored_b = ""
        shown_a = shown_b = _GENERATION_FAILED_MESSAGE

    return [
        model_a,
        model_b,
        stored_a,
        stored_b,
        None,
        [],
        False,
        load_leaderboard_data(),
        gr.update(value=shown_a),
        gr.update(value=shown_b),
        *_reset_voting_ui_updates(),
    ]


def process_example(example):
    """Generate summaries for ``example`` and return the resulting UI outputs."""
    result = generate_model_summaries(example)
    return process_generation_result(result)


def select_vote_improved(winner_choice):
    """Update the UI based on vote selection.

    Returns the chosen winner, the matching feedback choices, updates that
    reveal the feedback section and enable the submit button, and CSS class
    updates for the four vote buttons with the chosen one marked "selected".
    """
    feedback_choices = feedback_options.get(winner_choice, [])

    button_classes = {
        'left': ["vote-button"],
        'right': ["vote-button"],
        'tie': ["vote-button"],
        'neither': ["vote-button", "vote-button-neither"],
    }
    if winner_choice in button_classes:
        button_classes[winner_choice].append("selected")

    return [
        winner_choice,
        gr.update(choices=feedback_choices, value=[], interactive=True, visible=True),
        gr.update(visible=True),
        gr.update(interactive=True),
        gr.update(elem_classes=button_classes['left']),
        gr.update(elem_classes=button_classes['right']),
        gr.update(elem_classes=button_classes['tie']),
        gr.update(elem_classes=button_classes['neither']),
    ]


def handle_vote_submission(example, m_a, m_b, winner, feedback, summary_a, summary_b, current_results):
    """Handle vote submission - logs details and updates leaderboard.

    Returns an empty dict when no winner is selected; otherwise returns
    whatever ``submit_vote_with_elo`` produces for the UI.
    """
    if winner is None:
        print("Warning: Submit called without a winner selected.")
        # NOTE(review): confirm Gradio treats an empty dict as "no updates"
        # for this multi-output event.
        return {}

    # Save detailed vote information
    save_vote_details(example, m_a, m_b, winner, feedback, summary_a, summary_b)

    # Update Elo ratings and get UI updates
    return submit_vote_with_elo(m_a, m_b, winner, feedback, current_results)


# Create Gradio interface
with gr.Blocks(theme=gr.themes.Default(
    primary_hue=gr.themes.colors.orange,
    secondary_hue=gr.themes.colors.slate
)) as demo:

    # Load CSS and inject it into the page; the elem_classes/elem_id values
    # used below only take effect once the stylesheet is present.
    css_path = os.path.join(os.getcwd(), 'static', 'styles.css')
    with open(css_path, 'r', encoding="utf-8") as f:
        css_content = f.read()
    gr.HTML(f"<style>{css_content}</style>")

    # State Variables
    current_example = gr.State({})
    model_a_name = gr.State("")
    model_b_name = gr.State("")
    summary_a_text = gr.State("")
    summary_b_text = gr.State("")
    selected_winner = gr.State(None)
    feedback_list = gr.State([])
    show_results_state = gr.State(False)
    results_agg = gr.State(load_leaderboard_data())
    show_full_context = gr.State(False)

    # Create Tabs
    with gr.Tabs() as tabs:
        # Main Arena Tab
        with gr.TabItem("Arena", id="arena-tab"):
            gr.Markdown("# RAG Summarizer Arena")
            gr.Markdown(
                "Compare summaries generated by different models based on the provided context and query. "
                "Select the better summary, or choose 'Tie' or 'Neither'. "
                "Your feedback helps evaluate model performance."
            )

            # Main container
            with gr.Column(elem_id="main-interface-area") as main_interface_area:
                # Query section
                with gr.Row(elem_id="query-title-row"):
                    gr.Markdown("### Query", elem_classes="section-heading")

                with gr.Row(elem_id="query-container"):
                    with gr.Row(elem_classes="query-box-row"):
                        query_display = gr.Markdown(value="Loading question...", elem_classes="query-text")
                        random_question_btn = gr.Button("🔄 Get Random Question", elem_classes="query-button")

                # Context description and display
                context_description = gr.Markdown("", elem_classes="context-description")

                with gr.Row(elem_id="context-header-row"):
                    gr.Markdown("### Context Provided", elem_classes="context-title")
                    context_toggle_btn = gr.Button("Show Full Context", elem_classes=["context-toggle-button"])

                context_display = gr.HTML(value="Loading context...", label="Context Chunks")

                gr.Markdown("---")
                gr.Markdown("### Compare Summaries", elem_classes="section-heading")

                # Model summaries
                with gr.Row():
                    with gr.Column(scale=1):
                        with gr.Group(elem_classes=["summary-card", "summary-card-a"]):
                            summary_a_display = gr.Textbox(
                                label="Model A", lines=10, interactive=False, show_copy_button=True
                            )
                    with gr.Column(scale=1):
                        with gr.Group(elem_classes=["summary-card", "summary-card-b"]):
                            summary_b_display = gr.Textbox(
                                label="Model B", lines=10, interactive=False, show_copy_button=True
                            )

                # Voting section
                gr.Markdown("### Cast Your Vote", elem_classes="section-heading")
                with gr.Row():
                    vote_button_a = gr.Button("⬅️ Summary A is Better", elem_classes=["vote-button"])
                    vote_button_tie = gr.Button("🤝 Tie / Equally Good", elem_classes=["vote-button"])
                    vote_button_b = gr.Button("➡️ Summary B is Better", elem_classes=["vote-button"])
                    vote_button_neither = gr.Button(
                        "❌ Neither is Adequate", elem_classes=["vote-button", "vote-button-neither"]
                    )

                # Feedback and Submit sections
                with gr.Group(elem_classes=["feedback-section"], visible=False) as feedback_section:
                    feedback_checkboxes = gr.CheckboxGroup(
                        label="Feedback (optional)", choices=[], interactive=False
                    )
                submit_button = gr.Button(
                    "Submit Vote", variant="primary", interactive=False, elem_id="submit-button"
                )

                # Results area
                with gr.Column(visible=False) as results_reveal_area:
                    gr.Markdown("---")
                    gr.Markdown("### ✅ Vote Submitted!", elem_classes="section-heading")

                    # Model reveal section
                    with gr.Row():
                        with gr.Column(scale=1):
                            gr.Markdown("### Model A was actually:", elem_classes="section-heading")
                            model_a_reveal = gr.Markdown("", elem_classes="model-reveal model-a-reveal")
                        with gr.Column(scale=1):
                            gr.Markdown("### Model B was actually:", elem_classes="section-heading")
                            model_b_reveal = gr.Markdown("", elem_classes="model-reveal model-b-reveal")

                    gr.HTML("\n\n")

                    # Try another button
                    with gr.Row(elem_classes=["control-buttons"]):
                        try_another_btn = gr.Button("🔄 Try Another Question", elem_id="try-another-btn")

        # Leaderboard Tab
        with gr.TabItem("Leaderboard", id="leaderboard-tab"):
            gr.Markdown("# Model Performance Leaderboard", elem_classes="orange-title")
            gr.Markdown("View performance statistics for all models ranked by Elo rating.")

            with gr.Group(elem_id="leaderboard-info"):
                gr.Markdown(
                    "### About Elo Ratings\n"
                    "\n"
                    "The Elo rating system provides a more accurate ranking than simple win rates:\n"
                    "- All models start at 1500 points\n"
                    "- Points are exchanged after each comparison based on the expected outcome\n"
                    "- Beating a stronger model earns more points than beating a weaker one\n"
                    "- The ± value shows the statistical confidence interval (95%)\n"
                )

            results_table_display = gr.HTML(label="Model Performance")

    # Generic function to handle starting a new example
    def handle_new_example_click():
        """Signal an interrupt and return a freshly loaded example."""
        generation_interrupt.set()  # Interrupt any ongoing generation
        # NOTE(review): load_context() clears the flag again right away, so a
        # generation already in flight only sees the interrupt if it checks
        # the flag between these two calls.
        return load_context()[0]

    def update_ui_for_new_context(example):
        """Return updates for the query/context widgets for ``example``.

        Uses the same description formatting as ``load_context`` so the first
        page load and later questions render identically.
        """
        context_desc = _format_context_description(example)
        return [
            gr.update(value=example['question']),
            gr.update(value=context_desc, visible=bool(context_desc)),
            gr.update(value=get_context_html(example, False)),
            gr.update(value="Show Full Context", elem_classes=["context-toggle-button"]),
            False,
        ]

    # Components updated by load_context / update_ui_for_new_context.
    context_outputs = [
        query_display, context_description, context_display,
        context_toggle_btn, show_full_context,
    ]

    # Components updated by process_example; the order must match the list
    # returned by process_generation_result.
    generation_outputs = [
        model_a_name, model_b_name, summary_a_text, summary_b_text,
        selected_winner, feedback_list, show_results_state, results_agg,
        summary_a_display, summary_b_display,
        vote_button_a, vote_button_b, vote_button_tie, vote_button_neither,
        feedback_checkboxes, feedback_section, submit_button,
        results_reveal_area, random_question_btn, main_interface_area,
    ]

    # Event handling
    # Toggle context display
    context_toggle_btn.click(
        fn=toggle_context_display,
        inputs=[current_example, show_full_context],
        outputs=[show_full_context, context_display, context_toggle_btn]
    )

    # Initial loading - context first, then summaries
    demo.load(
        fn=load_context,
        inputs=[],
        outputs=[current_example] + context_outputs
    ).then(
        fn=process_example,
        inputs=[current_example],
        outputs=generation_outputs
    )

    # Load leaderboard content on app start
    demo.load(
        fn=load_leaderboard,
        inputs=[],
        outputs=[results_table_display]
    )

    # Random Question and Try Another buttons with interruption
    for btn in [random_question_btn, try_another_btn]:
        btn.click(
            fn=handle_new_example_click,
            inputs=[],
            outputs=[current_example]
        ).then(
            fn=update_ui_for_new_context,
            inputs=[current_example],
            outputs=context_outputs
        ).then(
            fn=process_example,
            inputs=[current_example],
            outputs=generation_outputs
        )

    # Vote button handlers
    for btn, choice in zip(
        [vote_button_a, vote_button_b, vote_button_tie, vote_button_neither],
        ['left', 'right', 'tie', 'neither']
    ):
        btn.click(
            # Bind the loop variable as a default to avoid late binding.
            fn=lambda choice=choice: select_vote_improved(choice),
            inputs=None,
            outputs=[selected_winner, feedback_checkboxes, feedback_section, submit_button,
                     vote_button_a, vote_button_b, vote_button_tie, vote_button_neither]
        )

    # Update feedback when checkboxes change
    feedback_checkboxes.change(
        fn=update_feedback,
        inputs=[feedback_checkboxes],
        outputs=[feedback_list]
    )

    # Process vote submission and reveal results
    submit_button.click(
        fn=handle_vote_submission,
        inputs=[current_example, model_a_name, model_b_name, selected_winner,
                feedback_list, summary_a_text, summary_b_text, results_agg],
        outputs=[show_results_state, results_agg, vote_button_a, vote_button_b,
                 vote_button_tie, vote_button_neither, feedback_checkboxes,
                 feedback_section, submit_button, results_reveal_area,
                 random_question_btn, results_table_display, main_interface_area,
                 context_toggle_btn, model_a_reveal, model_b_reveal]
    )

    # Refresh leaderboard when switching to the leaderboard tab
    tabs.select(
        fn=load_leaderboard,
        inputs=[],
        outputs=[results_table_display],
        api_name="refresh_leaderboard"
    )

if __name__ == "__main__":
    demo.launch(debug=True)