aizip-dev committed
Commit c963ac8 · verified · 1 Parent(s): 076ecc1

add onboarding flow

Files changed (1)
  1. app.py +206 -97
app.py CHANGED
@@ -78,13 +78,24 @@ def load_context(set_interrupt=False):

     return [
         example,
-        gr.update(value=example['question']),
+        gr.update(value=example['question'], elem_classes="query-text"),  # Regular query styles
         gr.update(value=context_desc, visible=bool(context_desc)),
         gr.update(value=context_html),
-        gr.update(value="Show Full Context", elem_classes=["context-toggle-button"]),
+        gr.update(value="Show Full Context", elem_classes=["context-toggle-button"], visible=True),  # Ensure toggle is visible
         show_full
     ]

+def toggle_faq(expanded):
+    """Toggle FAQ visibility with proper arrow icons"""
+    new_state = not expanded
+    button_text = "▼ Why can't I upload a file or ask my own question?" if new_state else "▶ Why can't I upload a file or ask my own question?"
+    return new_state, gr.update(visible=new_state), gr.update(value=button_text)
+
+# Explicit function to hide the FAQ section completely
+def hide_faq_section():
+    """Completely hide the FAQ section and its content"""
+    return gr.update(visible=False), gr.update(visible=False)
+
 def load_leaderboard():
     results = load_leaderboard_data()
     leaderboard_html = generate_leaderboard_html(results)
@@ -250,10 +261,10 @@ def update_ui_for_new_context(example):
         context_desc = f"<div class='context-topic'><span class='topic-label'>The question and context are about:</span> {context_desc}</div>"

     return [
-        gr.update(value=example['question']),
+        gr.update(value=example['question'], elem_classes="query-text"),  # Regular query styles
         gr.update(value=context_desc, visible=bool(context_desc)),
         gr.update(value=get_context_html(example, False)),
-        gr.update(value="Show Full Context", elem_classes=["context-toggle-button"]),
+        gr.update(value="Show Full Context", elem_classes=["context-toggle-button"], visible=True),  # Ensure toggle is visible
         False
     ]

@@ -261,6 +272,24 @@ def cleanup_on_disconnect():
     print(f"Browser disconnected. Cleaning up resources...")
     generation_interrupt.set()

+# Helper functions for showing/hiding UI elements
+def initialize_empty_app():
+    return [
+        gr.update(visible=False),  # context_section
+        gr.update(visible=False),  # model_section
+        gr.update(visible=False),  # voting_section
+        gr.update(visible=False)   # submit_button
+    ]
+
+def show_all_after_loading():
+    return [
+        gr.update(visible=True),  # context_section
+        gr.update(visible=True),  # model_section
+        gr.update(visible=True),  # voting_section
+        gr.update(visible=True),  # submit_button
+        gr.update(value="🔄 Try a New Question", elem_classes=["query-button"])  # update button text
+    ]
+
 with gr.Blocks(theme=gr.themes.Default(
     primary_hue=gr.themes.colors.orange,
     secondary_hue=gr.themes.colors.slate
@@ -281,6 +310,7 @@ with gr.Blocks(theme=gr.themes.Default(
     """
     gr.HTML(unload_js)

+    # State variables
     current_example = gr.State({})
     model_a_name = gr.State("")
     model_b_name = gr.State("")
@@ -291,75 +321,96 @@ with gr.Blocks(theme=gr.themes.Default(
     show_results_state = gr.State(False)
     results_agg = gr.State(load_leaderboard_data())
     show_full_context = gr.State(False)
+    faq_expanded = gr.State(False)  # State for FAQ toggle

     with gr.Tabs() as tabs:
         with gr.TabItem("Arena", id="arena-tab"):
-            gr.Markdown("# Small Language Model RAG Summarization/Generation Arena")
+            gr.Markdown("# Small Language Model RAG Arena")
             gr.Markdown("""
-            🏟️ This arena evaluates SLMs on document QA tasks with retrieved context. They should provide **grounded, comprehensive** answers or **properly decline** when information is insufficient.
-
-            📝 Insturction: 1. **Review the query and context**. 2. **Compare answers** generated by two different models. 3. **Vote for the better response** or select 'Tie/Neither' if appropriate.
+            🏟️ This arena evaluates how well SLMs (under 5B) answer questions based on document contexts.

+            📝 Instructions:
+            - **Click the "Get a Question" button** to load a random question with context
+            - **Review the query and context** to understand the information provided to the models
+            - **Compare answers** generated by two different models on answer quality or appropriate refusal
+            - **Cast your vote** for the better response, or select 'Tie' if equally good or 'Neither' if both are inadequate
             """)
-
-            gr.HTML("<hr>")
-
+            gr.Markdown("---")
             with gr.Column(elem_id="main-interface-area") as main_interface_area:
                 with gr.Row(elem_id="query-title-row"):
                     gr.Markdown("### 💬 Query - Question About Document Content", elem_classes="section-heading")

                 with gr.Row(elem_id="query-container"):
                     with gr.Row(elem_classes="query-box-row"):
-                        query_display = gr.Markdown(value="Loading question...", elem_classes="query-text", elem_id="query-section")
-                        random_question_btn = gr.Button("🔄 Try a New Question", elem_classes="query-button")
+                        query_display = gr.Markdown(value="Click \"Get a Question\" to start", elem_classes=["query-text", "empty-query"], elem_id="query-section")
+                        random_question_btn = gr.Button("💡 Get a Question", elem_classes=["query-button", "initial-button"])
+
+                # Add the FAQ toggle and content here
+                with gr.Row(visible=True, elem_id="faq-container") as faq_container:
+                    faq_toggle_btn = gr.Button("▶ Why can't I upload a file or ask my own question?", elem_classes=["faq-toggle-button"])
+
+                # FAQ Content - initially hidden
+                with gr.Row(visible=False, elem_id="faq-content") as faq_content:
+                    gr.Markdown("""
+                    This arena tests how well different AI models summarize information using standardized questions and contexts. All models see the exact same inputs for fair comparison.
+
+                    We don't allow file uploads here as that would change what we're measuring. Instead, check our leaderboard to find top-performing models for your needs. We'll soon launch a separate playground where you can test models with your own files.
+                    """, elem_classes="faq-text")

                 context_description = gr.Markdown("", elem_classes="context-description")

-                gr.HTML("<hr>")
-
-                with gr.Row(elem_id="context-header-row"):
-                    gr.Markdown("### 📋 Context - Retrieved Content from the Document", elem_classes="context-title")
-                    context_toggle_btn = gr.Button("Show Full Context", elem_classes=["context-toggle-button"])
+                # Create a section container for all context-related elements - INITIALLY HIDDEN
+                with gr.Column(visible=False, elem_id="context-section") as context_section:
+                    context_divider = gr.HTML("<hr>", elem_id="context-divider")

-                context_display = gr.HTML(value="Loading context...", label="Context Chunks")
-
-                gr.Markdown("---")
-                gr.Markdown("### 🔍 Compare Models - Are these Grounded, Complete Answers or Correct Rejections?", elem_classes="section-heading")
-
-                with gr.Row(elem_id="summary-containers"):
-                    with gr.Column(scale=1):
-                        with gr.Group(elem_classes=["summary-card", "summary-card-a"]):
-                            summary_a_display = gr.Textbox(
-                                label="Model A",
-                                lines=10,
-                                interactive=False,
-                                show_copy_button=True,
-                                autoscroll=False,
-                                elem_id="summary-a-display"
-                            )
-                    with gr.Column(scale=1):
-                        with gr.Group(elem_classes=["summary-card", "summary-card-b"]):
-                            summary_b_display = gr.Textbox(
-                                label="Model B",
-                                lines=10,
-                                interactive=False,
-                                show_copy_button=True,
-                                autoscroll=False,
-                                elem_id="summary-b-display"
-                            )
-
-                gr.HTML("<hr>")
-
-                gr.Markdown("### 🏅 Cast Your Vote", elem_classes="section-heading")
-                with gr.Row():
-                    vote_button_a = gr.Button("⬅️ Summary A is Better", elem_classes=["vote-button"], interactive=False)
-                    vote_button_tie = gr.Button("🤝 Tie / Equally Good", elem_classes=["vote-button"], interactive=False)
-                    vote_button_b = gr.Button("➡️ Summary B is Better", elem_classes=["vote-button"], interactive=False)
-                    vote_button_neither = gr.Button("❌ Neither is Good", elem_classes=["vote-button", "vote-button-neither"], interactive=False)
+                    with gr.Row(elem_id="context-header-row"):
+                        gr.Markdown("### 📋 Context - Retrieved Content from the Document", elem_classes="context-title")
+                        context_toggle_btn = gr.Button("Show Full Context", elem_classes=["context-toggle-button"])
+
+                    context_display = gr.HTML(value="", label="Context Chunks")
+
+                # Model comparison section - initially hidden
+                with gr.Column(visible=False, elem_id="model-section") as model_section:
+                    gr.Markdown("---")
+                    gr.Markdown("### 🔍 Compare Models - Are these Grounded, Complete Answers or Correct Rejections?", elem_classes="section-heading")
+
+                    with gr.Row(elem_id="summary-containers"):
+                        with gr.Column(scale=1):
+                            with gr.Group(elem_classes=["summary-card", "summary-card-a"]):
+                                summary_a_display = gr.Textbox(
+                                    label="Model A",
+                                    lines=10,
+                                    interactive=False,
+                                    show_copy_button=True,
+                                    autoscroll=False,
+                                    elem_id="summary-a-display"
+                                )
+                        with gr.Column(scale=1):
+                            with gr.Group(elem_classes=["summary-card", "summary-card-b"]):
+                                summary_b_display = gr.Textbox(
+                                    label="Model B",
+                                    lines=10,
+                                    interactive=False,
+                                    show_copy_button=True,
+                                    autoscroll=False,
+                                    elem_id="summary-b-display"
+                                )
+
+                # Voting section - initially hidden
+                with gr.Column(visible=False, elem_id="voting-section") as voting_section:
+                    gr.HTML("<hr>")
+                    gr.Markdown("### 🏅 Cast Your Vote", elem_classes="section-heading")
+                    with gr.Row():
+                        vote_button_a = gr.Button("⬅️ Summary A is Better", elem_classes=["vote-button"], interactive=False)
+                        vote_button_tie = gr.Button("🤝 Tie / Equally Good", elem_classes=["vote-button"], interactive=False)
+                        vote_button_b = gr.Button("➡️ Summary B is Better", elem_classes=["vote-button"], interactive=False)
+                        vote_button_neither = gr.Button("❌ Neither is Good", elem_classes=["vote-button", "vote-button-neither"], interactive=False)

                 with gr.Group(elem_classes=["feedback-section"], visible=False) as feedback_section:
                     feedback_checkboxes = gr.CheckboxGroup(label="Feedback (optional)", choices=[], interactive=False)
-                    submit_button = gr.Button("Submit Your Vote", variant="primary", interactive=False, elem_id="submit-button")
+
+                # Submit button - initially hidden
+                submit_button = gr.Button("Submit Your Vote", variant="primary", interactive=False, elem_id="submit-button", visible=False)

         with gr.Column(visible=False) as results_reveal_area:
             gr.Markdown("---")
@@ -394,17 +445,73 @@ The Elo rating system provides a more accurate ranking than simple win rates:

             results_table_display = gr.HTML(label="Model Performance")

+    # FAQ toggle functionality with icon change
+    faq_toggle_btn.click(
+        fn=toggle_faq,
+        inputs=[faq_expanded],
+        outputs=[faq_expanded, faq_content, faq_toggle_btn]
+    )
+
+    # Context toggle functionality
     context_toggle_btn.click(
         fn=toggle_context_display,
         inputs=[current_example, show_full_context],
         outputs=[show_full_context, context_display, context_toggle_btn]
     )
-
+
+    # Initialize UI to empty state on load
     demo.load(
-        fn=load_context,
+        fn=initialize_empty_app,
+        inputs=[],
+        outputs=[
+            context_section,
+            model_section,
+            voting_section,
+            submit_button
+        ]
+    )
+
+    # Load leaderboard on start
+    demo.load(
+        fn=load_leaderboard,
+        inputs=[],
+        outputs=[results_table_display]
+    )
+
+    # Getting a new question
+    random_question_btn.click(
+        fn=show_loading_state,
         inputs=[],
-        outputs=[current_example, query_display, context_description, context_display,
+        outputs=[
+            summary_a_display, summary_b_display,
+            vote_button_a, vote_button_b, vote_button_tie, vote_button_neither,
+            feedback_section, submit_button, results_reveal_area, random_question_btn,
+            selected_winner
+        ]
+    ).then(
+        fn=handle_new_example_click,
+        inputs=[],
+        outputs=[current_example]
+    ).then(
+        fn=update_ui_for_new_context,
+        inputs=[current_example],
+        outputs=[query_display, context_description, context_display,
                  context_toggle_btn, show_full_context]
+    ).then(
+        # IMPORTANT: Explicitly hide FAQ here
+        fn=hide_faq_section,
+        inputs=[],
+        outputs=[faq_container, faq_content]
+    ).then(
+        fn=show_all_after_loading,
+        inputs=[],
+        outputs=[
+            context_section,
+            model_section,
+            voting_section,
+            submit_button,
+            random_question_btn
+        ]
     ).then(
         fn=process_example,
         inputs=[current_example],
@@ -415,45 +522,51 @@ The Elo rating system provides a more accurate ranking than simple win rates:
                  submit_button, results_reveal_area, random_question_btn, main_interface_area]
     )

-    demo.load(
-        fn=load_leaderboard,
+    # Try another question
+    try_another_btn.click(
+        fn=show_loading_state,
         inputs=[],
-        outputs=[results_table_display]
+        outputs=[
+            summary_a_display, summary_b_display,
+            vote_button_a, vote_button_b, vote_button_tie, vote_button_neither,
+            feedback_section, submit_button, results_reveal_area, random_question_btn,
+            selected_winner
+        ]
+    ).then(
+        fn=handle_new_example_click,
+        inputs=[],
+        outputs=[current_example]
+    ).then(
+        fn=update_ui_for_new_context,
+        inputs=[current_example],
+        outputs=[query_display, context_description, context_display,
+                 context_toggle_btn, show_full_context]
+    ).then(
+        # IMPORTANT: Explicitly hide FAQ here too
+        fn=hide_faq_section,
+        inputs=[],
+        outputs=[faq_container, faq_content]
+    ).then(
+        fn=show_all_after_loading,
+        inputs=[],
+        outputs=[
+            context_section,
+            model_section,
+            voting_section,
+            submit_button,
+            random_question_btn
+        ]
+    ).then(
+        fn=process_example,
+        inputs=[current_example],
+        outputs=[model_a_name, model_b_name, summary_a_text, summary_b_text,
+                 selected_winner, feedback_list, show_results_state, results_agg,
+                 summary_a_display, summary_b_display, vote_button_a, vote_button_b,
+                 vote_button_tie, vote_button_neither, feedback_checkboxes, feedback_section,
+                 submit_button, results_reveal_area, random_question_btn, main_interface_area]
     )

-    for btn in [random_question_btn, try_another_btn]:
-        btn.click(
-            fn=show_loading_state,
-            inputs=[],
-            outputs=[
-                summary_a_display, summary_b_display,
-                vote_button_a, vote_button_b, vote_button_tie, vote_button_neither,
-                feedback_section, submit_button, results_reveal_area, random_question_btn,
-                selected_winner  # Add selected_winner to reset vote state
-            ]
-        ).then(
-            fn=handle_new_example_click,
-            inputs=[],
-            outputs=[current_example]
-        ).then(
-            fn=load_leaderboard_data,  # Add this to refresh results_agg
-            inputs=[],
-            outputs=[results_agg]
-        ).then(
-            fn=update_ui_for_new_context,
-            inputs=[current_example],
-            outputs=[query_display, context_description, context_display,
-                     context_toggle_btn, show_full_context]
-        ).then(
-            fn=process_example,
-            inputs=[current_example],
-            outputs=[model_a_name, model_b_name, summary_a_text, summary_b_text,
-                     selected_winner, feedback_list, show_results_state, results_agg,
-                     summary_a_display, summary_b_display, vote_button_a, vote_button_b,
-                     vote_button_tie, vote_button_neither, feedback_checkboxes, feedback_section,
-                     submit_button, results_reveal_area, random_question_btn, main_interface_area]
-        )
-
+    # Vote button handling
     for btn, choice in zip(
         [vote_button_a, vote_button_b, vote_button_tie, vote_button_neither],
         ['left', 'right', 'tie', 'neither']
@@ -486,10 +599,6 @@ The Elo rating system provides a more accurate ranking than simple win rates:
         inputs=[],
        outputs=[results_table_display],
         api_name="refresh_leaderboard"
-    ).then(
-        fn=load_leaderboard_data,
-        inputs=[],
-        outputs=[results_agg]
     )

     demo.unload(cleanup_on_disconnect)
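
For readers unfamiliar with the pattern this commit relies on, the sketch below is a minimal, self-contained Gradio app (not the arena's app.py) showing the same onboarding idea: sections are created with visible=False, a single button click chains steps with .then(), and each step returns gr.update(...) objects that map one-to-one onto its outputs list. Component names, labels, and the FAQ text here are illustrative placeholders, not values from the repository.

# Minimal sketch of the onboarding pattern, with illustrative names only.
import gradio as gr

def toggle_faq(expanded):
    # Flip the stored state and swap the arrow on the toggle button.
    new_state = not expanded
    label = "▼ Why is this hidden?" if new_state else "▶ Why is this hidden?"
    return new_state, gr.update(visible=new_state), gr.update(value=label)

def load_question():
    # Stand-in for the real example loader used by the arena.
    return "What does the retrieved context say about topic X?"

def hide_faq():
    return gr.update(visible=False), gr.update(visible=False)

def reveal_sections():
    # One gr.update per component listed in `outputs`, in the same order.
    return gr.update(visible=True), gr.update(value="🔄 Try a New Question")

with gr.Blocks() as demo:
    faq_expanded = gr.State(False)

    question = gr.Markdown('Click "Get a Question" to start')
    get_btn = gr.Button("💡 Get a Question")
    faq_btn = gr.Button("▶ Why is this hidden?")
    with gr.Row(visible=False) as faq_content:
        gr.Markdown("Questions are standardized so every model sees the same input.")

    # Hidden until the first question is loaded, like the context/model/voting sections above.
    with gr.Column(visible=False) as answer_section:
        answer = gr.Textbox(label="Model answer", interactive=False)

    faq_btn.click(toggle_faq, inputs=[faq_expanded],
                  outputs=[faq_expanded, faq_content, faq_btn])

    # The onboarding chain: load a question, hide the FAQ, then reveal the rest.
    get_btn.click(load_question, inputs=[], outputs=[question]).then(
        hide_faq, inputs=[], outputs=[faq_btn, faq_content]
    ).then(
        reveal_sections, inputs=[], outputs=[answer_section, get_btn]
    )

if __name__ == "__main__":
    demo.launch()

Because each handler must return exactly one update per component in its outputs list, keeping dedicated helpers such as initialize_empty_app and show_all_after_loading (as the diff does) is a reasonable way to keep the show/hide logic in one place rather than scattering inline lambdas across the event chain.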