aizip-dev committed
Commit c963ac8 · verified · 1 Parent(s): 076ecc1

add onboarding flow

Files changed (1)
  1. app.py +206 -97
app.py CHANGED
@@ -78,13 +78,24 @@ def load_context(set_interrupt=False):

     return [
         example,
-        gr.update(value=example['question']),
+        gr.update(value=example['question'], elem_classes="query-text"),  # Regular query styles
         gr.update(value=context_desc, visible=bool(context_desc)),
         gr.update(value=context_html),
-        gr.update(value="Show Full Context", elem_classes=["context-toggle-button"]),
+        gr.update(value="Show Full Context", elem_classes=["context-toggle-button"], visible=True),  # Ensure toggle is visible
         show_full
     ]

+def toggle_faq(expanded):
+    """Toggle FAQ visibility with proper arrow icons"""
+    new_state = not expanded
+    button_text = "▼ Why can't I upload a file or ask my own question?" if new_state else "▶ Why can't I upload a file or ask my own question?"
+    return new_state, gr.update(visible=new_state), gr.update(value=button_text)
+
+# Explicit function to hide the FAQ section completely
+def hide_faq_section():
+    """Completely hide the FAQ section and its content"""
+    return gr.update(visible=False), gr.update(visible=False)
+
 def load_leaderboard():
     results = load_leaderboard_data()
     leaderboard_html = generate_leaderboard_html(results)
@@ -250,10 +261,10 @@ def update_ui_for_new_context(example):
         context_desc = f"<div class='context-topic'><span class='topic-label'>The question and context are about:</span> {context_desc}</div>"

     return [
-        gr.update(value=example['question']),
+        gr.update(value=example['question'], elem_classes="query-text"),  # Regular query styles
         gr.update(value=context_desc, visible=bool(context_desc)),
         gr.update(value=get_context_html(example, False)),
-        gr.update(value="Show Full Context", elem_classes=["context-toggle-button"]),
+        gr.update(value="Show Full Context", elem_classes=["context-toggle-button"], visible=True),  # Ensure toggle is visible
         False
     ]

@@ -261,6 +272,24 @@ def cleanup_on_disconnect():
     print(f"Browser disconnected. Cleaning up resources...")
     generation_interrupt.set()

+# Helper functions for showing/hiding UI elements
+def initialize_empty_app():
+    return [
+        gr.update(visible=False),  # context_section
+        gr.update(visible=False),  # model_section
+        gr.update(visible=False),  # voting_section
+        gr.update(visible=False)   # submit_button
+    ]
+
+def show_all_after_loading():
+    return [
+        gr.update(visible=True),  # context_section
+        gr.update(visible=True),  # model_section
+        gr.update(visible=True),  # voting_section
+        gr.update(visible=True),  # submit_button
+        gr.update(value="🔄 Try a New Question", elem_classes=["query-button"])  # update button text
+    ]
+
 with gr.Blocks(theme=gr.themes.Default(
     primary_hue=gr.themes.colors.orange,
     secondary_hue=gr.themes.colors.slate
@@ -281,6 +310,7 @@ with gr.Blocks(theme=gr.themes.Default(
     """
     gr.HTML(unload_js)

+    # State variables
     current_example = gr.State({})
     model_a_name = gr.State("")
     model_b_name = gr.State("")
@@ -291,75 +321,96 @@ with gr.Blocks(theme=gr.themes.Default(
     show_results_state = gr.State(False)
     results_agg = gr.State(load_leaderboard_data())
     show_full_context = gr.State(False)
+    faq_expanded = gr.State(False)  # State for FAQ toggle

     with gr.Tabs() as tabs:
         with gr.TabItem("Arena", id="arena-tab"):
-            gr.Markdown("# Small Language Model RAG Summarization/Generation Arena")
+            gr.Markdown("# Small Language Model RAG Arena")
             gr.Markdown("""
-            🏟️ This arena evaluates SLMs on document QA tasks with retrieved context. They should provide **grounded, comprehensive** answers or **properly decline** when information is insufficient.
-
-            📝 Insturction: 1. **Review the query and context**. 2. **Compare answers** generated by two different models. 3. **Vote for the better response** or select 'Tie/Neither' if appropriate.
+            🏟️ This arena evaluates how well SLMs (under 5B) answer questions based on document contexts.

+            📝 Instructions:
+            - **Click the "Get a Question" button** to load a random question with context
+            - **Review the query and context** to understand the information provided to the models
+            - **Compare answers** generated by two different models on answer quality or appropriate refusal
+            - **Cast your vote** for the better response, or select 'Tie' if equally good or 'Neither' if both are inadequate
             """)
-
-            gr.HTML("<hr>")
-
+            gr.Markdown("---")
             with gr.Column(elem_id="main-interface-area") as main_interface_area:
                 with gr.Row(elem_id="query-title-row"):
                     gr.Markdown("### 💬 Query - Question About Document Content", elem_classes="section-heading")

                 with gr.Row(elem_id="query-container"):
                     with gr.Row(elem_classes="query-box-row"):
-                        query_display = gr.Markdown(value="Loading question...", elem_classes="query-text", elem_id="query-section")
-                        random_question_btn = gr.Button("🔄 Try a New Question", elem_classes="query-button")
+                        query_display = gr.Markdown(value="Click \"Get a Question\" to start", elem_classes=["query-text", "empty-query"], elem_id="query-section")
+                        random_question_btn = gr.Button("💡 Get a Question", elem_classes=["query-button", "initial-button"])
+
+                # Add the FAQ toggle and content here
+                with gr.Row(visible=True, elem_id="faq-container") as faq_container:
+                    faq_toggle_btn = gr.Button("▶ Why can't I upload a file or ask my own question?", elem_classes=["faq-toggle-button"])
+
+                # FAQ Content - initially hidden
+                with gr.Row(visible=False, elem_id="faq-content") as faq_content:
+                    gr.Markdown("""
+                    This arena tests how well different AI models summarize information using standardized questions and contexts. All models see the exact same inputs for fair comparison.
+
+                    We don't allow file uploads here as that would change what we're measuring. Instead, check our leaderboard to find top-performing models for your needs. We'll soon launch a separate playground where you can test models with your own files.
+                    """, elem_classes="faq-text")

                 context_description = gr.Markdown("", elem_classes="context-description")

-                gr.HTML("<hr>")
-
-                with gr.Row(elem_id="context-header-row"):
-                    gr.Markdown("### 📋 Context - Retrieved Content from the Document", elem_classes="context-title")
-                    context_toggle_btn = gr.Button("Show Full Context", elem_classes=["context-toggle-button"])
+                # Create a section container for all context-related elements - INITIALLY HIDDEN
+                with gr.Column(visible=False, elem_id="context-section") as context_section:
+                    context_divider = gr.HTML("<hr>", elem_id="context-divider")

-                context_display = gr.HTML(value="Loading context...", label="Context Chunks")
-
-                gr.Markdown("---")
-                gr.Markdown("### 🔍 Compare Models - Are these Grounded, Complete Answers or Correct Rejections?", elem_classes="section-heading")
-
-                with gr.Row(elem_id="summary-containers"):
-                    with gr.Column(scale=1):
-                        with gr.Group(elem_classes=["summary-card", "summary-card-a"]):
-                            summary_a_display = gr.Textbox(
-                                label="Model A",
-                                lines=10,
-                                interactive=False,
-                                show_copy_button=True,
-                                autoscroll=False,
-                                elem_id="summary-a-display"
-                            )
-                    with gr.Column(scale=1):
-                        with gr.Group(elem_classes=["summary-card", "summary-card-b"]):
-                            summary_b_display = gr.Textbox(
-                                label="Model B",
-                                lines=10,
-                                interactive=False,
-                                show_copy_button=True,
-                                autoscroll=False,
-                                elem_id="summary-b-display"
-                            )
-
-                gr.HTML("<hr>")
-
-                gr.Markdown("### 🏅 Cast Your Vote", elem_classes="section-heading")
-                with gr.Row():
-                    vote_button_a = gr.Button("⬅️ Summary A is Better", elem_classes=["vote-button"], interactive=False)
-                    vote_button_tie = gr.Button("🤝 Tie / Equally Good", elem_classes=["vote-button"], interactive=False)
-                    vote_button_b = gr.Button("➡️ Summary B is Better", elem_classes=["vote-button"], interactive=False)
-                    vote_button_neither = gr.Button("❌ Neither is Good", elem_classes=["vote-button", "vote-button-neither"], interactive=False)
+                    with gr.Row(elem_id="context-header-row"):
+                        gr.Markdown("### 📋 Context - Retrieved Content from the Document", elem_classes="context-title")
+                        context_toggle_btn = gr.Button("Show Full Context", elem_classes=["context-toggle-button"])
+
+                    context_display = gr.HTML(value="", label="Context Chunks")
+
+                # Model comparison section - initially hidden
+                with gr.Column(visible=False, elem_id="model-section") as model_section:
+                    gr.Markdown("---")
+                    gr.Markdown("### 🔍 Compare Models - Are these Grounded, Complete Answers or Correct Rejections?", elem_classes="section-heading")
+
+                    with gr.Row(elem_id="summary-containers"):
+                        with gr.Column(scale=1):
+                            with gr.Group(elem_classes=["summary-card", "summary-card-a"]):
+                                summary_a_display = gr.Textbox(
+                                    label="Model A",
+                                    lines=10,
+                                    interactive=False,
+                                    show_copy_button=True,
+                                    autoscroll=False,
+                                    elem_id="summary-a-display"
+                                )
+                        with gr.Column(scale=1):
+                            with gr.Group(elem_classes=["summary-card", "summary-card-b"]):
+                                summary_b_display = gr.Textbox(
+                                    label="Model B",
+                                    lines=10,
+                                    interactive=False,
+                                    show_copy_button=True,
+                                    autoscroll=False,
+                                    elem_id="summary-b-display"
+                                )
+
+                # Voting section - initially hidden
+                with gr.Column(visible=False, elem_id="voting-section") as voting_section:
+                    gr.HTML("<hr>")
+                    gr.Markdown("### 🏅 Cast Your Vote", elem_classes="section-heading")
+                    with gr.Row():
+                        vote_button_a = gr.Button("⬅️ Summary A is Better", elem_classes=["vote-button"], interactive=False)
+                        vote_button_tie = gr.Button("🤝 Tie / Equally Good", elem_classes=["vote-button"], interactive=False)
+                        vote_button_b = gr.Button("➡️ Summary B is Better", elem_classes=["vote-button"], interactive=False)
+                        vote_button_neither = gr.Button("❌ Neither is Good", elem_classes=["vote-button", "vote-button-neither"], interactive=False)

                 with gr.Group(elem_classes=["feedback-section"], visible=False) as feedback_section:
                     feedback_checkboxes = gr.CheckboxGroup(label="Feedback (optional)", choices=[], interactive=False)
-                    submit_button = gr.Button("Submit Your Vote", variant="primary", interactive=False, elem_id="submit-button")
+
+                # Submit button - initially hidden
+                submit_button = gr.Button("Submit Your Vote", variant="primary", interactive=False, elem_id="submit-button", visible=False)

         with gr.Column(visible=False) as results_reveal_area:
             gr.Markdown("---")
@@ -394,17 +445,73 @@ The Elo rating system provides a more accurate ranking than simple win rates:

             results_table_display = gr.HTML(label="Model Performance")

+    # FAQ toggle functionality with icon change
+    faq_toggle_btn.click(
+        fn=toggle_faq,
+        inputs=[faq_expanded],
+        outputs=[faq_expanded, faq_content, faq_toggle_btn]
+    )
+
+    # Context toggle functionality
     context_toggle_btn.click(
         fn=toggle_context_display,
         inputs=[current_example, show_full_context],
         outputs=[show_full_context, context_display, context_toggle_btn]
     )
-
+
+    # Initialize UI to empty state on load
     demo.load(
-        fn=load_context,
+        fn=initialize_empty_app,
+        inputs=[],
+        outputs=[
+            context_section,
+            model_section,
+            voting_section,
+            submit_button
+        ]
+    )
+
+    # Load leaderboard on start
+    demo.load(
+        fn=load_leaderboard,
+        inputs=[],
+        outputs=[results_table_display]
+    )
+
+    # Getting a new question
+    random_question_btn.click(
+        fn=show_loading_state,
         inputs=[],
-        outputs=[current_example, query_display, context_description, context_display,
+        outputs=[
+            summary_a_display, summary_b_display,
+            vote_button_a, vote_button_b, vote_button_tie, vote_button_neither,
+            feedback_section, submit_button, results_reveal_area, random_question_btn,
+            selected_winner
+        ]
+    ).then(
+        fn=handle_new_example_click,
+        inputs=[],
+        outputs=[current_example]
+    ).then(
+        fn=update_ui_for_new_context,
+        inputs=[current_example],
+        outputs=[query_display, context_description, context_display,
                  context_toggle_btn, show_full_context]
+    ).then(
+        # IMPORTANT: Explicitly hide FAQ here
+        fn=hide_faq_section,
+        inputs=[],
+        outputs=[faq_container, faq_content]
+    ).then(
+        fn=show_all_after_loading,
+        inputs=[],
+        outputs=[
+            context_section,
+            model_section,
+            voting_section,
+            submit_button,
+            random_question_btn
+        ]
     ).then(
         fn=process_example,
         inputs=[current_example],
@@ -415,45 +522,51 @@ The Elo rating system provides a more accurate ranking than simple win rates:
                  submit_button, results_reveal_area, random_question_btn, main_interface_area]
     )

-    demo.load(
-        fn=load_leaderboard,
+    # Try another question
+    try_another_btn.click(
+        fn=show_loading_state,
         inputs=[],
-        outputs=[results_table_display]
+        outputs=[
+            summary_a_display, summary_b_display,
+            vote_button_a, vote_button_b, vote_button_tie, vote_button_neither,
+            feedback_section, submit_button, results_reveal_area, random_question_btn,
+            selected_winner
+        ]
+    ).then(
+        fn=handle_new_example_click,
+        inputs=[],
+        outputs=[current_example]
+    ).then(
+        fn=update_ui_for_new_context,
+        inputs=[current_example],
+        outputs=[query_display, context_description, context_display,
+                 context_toggle_btn, show_full_context]
+    ).then(
+        # IMPORTANT: Explicitly hide FAQ here too
+        fn=hide_faq_section,
+        inputs=[],
+        outputs=[faq_container, faq_content]
+    ).then(
+        fn=show_all_after_loading,
+        inputs=[],
+        outputs=[
+            context_section,
+            model_section,
+            voting_section,
+            submit_button,
+            random_question_btn
+        ]
+    ).then(
+        fn=process_example,
+        inputs=[current_example],
+        outputs=[model_a_name, model_b_name, summary_a_text, summary_b_text,
+                 selected_winner, feedback_list, show_results_state, results_agg,
+                 summary_a_display, summary_b_display, vote_button_a, vote_button_b,
+                 vote_button_tie, vote_button_neither, feedback_checkboxes, feedback_section,
+                 submit_button, results_reveal_area, random_question_btn, main_interface_area]
     )

-    for btn in [random_question_btn, try_another_btn]:
-        btn.click(
-            fn=show_loading_state,
-            inputs=[],
-            outputs=[
-                summary_a_display, summary_b_display,
-                vote_button_a, vote_button_b, vote_button_tie, vote_button_neither,
-                feedback_section, submit_button, results_reveal_area, random_question_btn,
-                selected_winner  # Add selected_winner to reset vote state
-            ]
-        ).then(
-            fn=handle_new_example_click,
-            inputs=[],
-            outputs=[current_example]
-        ).then(
-            fn=load_leaderboard_data,  # Add this to refresh results_agg
-            inputs=[],
-            outputs=[results_agg]
-        ).then(
-            fn=update_ui_for_new_context,
-            inputs=[current_example],
-            outputs=[query_display, context_description, context_display,
-                     context_toggle_btn, show_full_context]
-        ).then(
-            fn=process_example,
-            inputs=[current_example],
-            outputs=[model_a_name, model_b_name, summary_a_text, summary_b_text,
-                     selected_winner, feedback_list, show_results_state, results_agg,
-                     summary_a_display, summary_b_display, vote_button_a, vote_button_b,
-                     vote_button_tie, vote_button_neither, feedback_checkboxes, feedback_section,
-                     submit_button, results_reveal_area, random_question_btn, main_interface_area]
-        )
-
+    # Vote button handling
     for btn, choice in zip(
         [vote_button_a, vote_button_b, vote_button_tie, vote_button_neither],
         ['left', 'right', 'tie', 'neither']
@@ -486,10 +599,6 @@ The Elo rating system provides a more accurate ranking than simple win rates:
         inputs=[],
        outputs=[results_table_display],
         api_name="refresh_leaderboard"
-    ).then(
-        fn=load_leaderboard_data,
-        inputs=[],
-        outputs=[results_agg]
     )

     demo.unload(cleanup_on_disconnect)
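
For readers unfamiliar with the pattern this commit relies on, the sketch below is a minimal, self-contained Gradio app (not the arena's app.py) showing the same onboarding idea: sections are created with visible=False, a single button click chains steps with .then(), and each step returns gr.update(...) objects that map one-to-one onto its outputs list. Component names, labels, and the FAQ text here are illustrative placeholders, not values from the repository.

# Minimal sketch of the onboarding pattern, with illustrative names only.
import gradio as gr

def toggle_faq(expanded):
    # Flip the stored state and swap the arrow on the toggle button.
    new_state = not expanded
    label = "▼ Why is this hidden?" if new_state else "▶ Why is this hidden?"
    return new_state, gr.update(visible=new_state), gr.update(value=label)

def load_question():
    # Stand-in for the real example loader used by the arena.
    return "What does the retrieved context say about topic X?"

def hide_faq():
    return gr.update(visible=False), gr.update(visible=False)

def reveal_sections():
    # One gr.update per component listed in `outputs`, in the same order.
    return gr.update(visible=True), gr.update(value="🔄 Try a New Question")

with gr.Blocks() as demo:
    faq_expanded = gr.State(False)

    question = gr.Markdown('Click "Get a Question" to start')
    get_btn = gr.Button("💡 Get a Question")
    faq_btn = gr.Button("▶ Why is this hidden?")
    with gr.Row(visible=False) as faq_content:
        gr.Markdown("Questions are standardized so every model sees the same input.")

    # Hidden until the first question is loaded, like the context/model/voting sections above.
    with gr.Column(visible=False) as answer_section:
        answer = gr.Textbox(label="Model answer", interactive=False)

    faq_btn.click(toggle_faq, inputs=[faq_expanded],
                  outputs=[faq_expanded, faq_content, faq_btn])

    # The onboarding chain: load a question, hide the FAQ, then reveal the rest.
    get_btn.click(load_question, inputs=[], outputs=[question]).then(
        hide_faq, inputs=[], outputs=[faq_btn, faq_content]
    ).then(
        reveal_sections, inputs=[], outputs=[answer_section, get_btn]
    )

if __name__ == "__main__":
    demo.launch()

Because each handler must return exactly one update per component in its outputs list, keeping dedicated helpers such as initialize_empty_app and show_all_after_loading (as the diff does) is a reasonable way to keep the show/hide logic in one place rather than scattering inline lambdas across the event chain.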