Anas Awadalla committed on
Commit
402e797
·
1 Parent(s): 1ddd951

fix baselines for showdown-clicks

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +3 -69
src/streamlit_app.py CHANGED
@@ -53,7 +53,7 @@ BASELINES = {
53
  }
54
  }
55
 
56
- @st.cache_data(ttl=1200) # Cache for 20 minutes
57
  def fetch_leaderboard_data():
58
  """Fetch all grounding results from HuggingFace leaderboard by streaming JSON files."""
59
  api = HfApi()
@@ -433,33 +433,21 @@ def main():
433
 
434
  # Add metric selector for screenspot datasets
435
  selected_metric = 'overall' # Default metric
436
- if not ui_metrics_df.empty and 'screenspot' in selected_dataset.lower():
437
  # Metric selector dropdown
438
  if selected_dataset == 'screenspot-v2':
439
  metric_options = {
440
  'overall': 'Overall Average (Desktop + Web) / 2',
441
- 'desktop_avg': 'Desktop Average',
442
- 'web_avg': 'Web Average',
443
  'desktop_text': 'Desktop (Text)',
444
  'desktop_icon': 'Desktop (Icon)',
445
  'web_text': 'Web (Text)',
446
  'web_icon': 'Web (Icon)',
447
- 'text_avg': 'Text Average',
448
- 'icon_avg': 'Icon Average'
449
  }
450
- elif selected_dataset in ['screenspot-pro', 'showdown-clicks']:
451
  # For screenspot-pro and showdown-clicks, only show overall average
452
  metric_options = {
453
  'overall': 'Overall Average'
454
  }
455
- else:
456
- metric_options = {
457
- 'overall': 'Overall Average',
458
- 'desktop_avg': 'Desktop Average',
459
- 'web_avg': 'Web Average',
460
- 'text_avg': 'Text Average',
461
- 'icon_avg': 'Icon Average'
462
- }
463
 
464
  selected_metric = st.selectbox(
465
  "Select metric to visualize:",
@@ -478,60 +466,6 @@ def main():
478
  st.altair_chart(chart, use_container_width=True)
479
  else:
480
  st.warning(f"No data available for {metric_options[selected_metric]}")
481
-
482
- else:
483
- # For non-ScreenSpot datasets, show a simple bar chart
484
- # Prepare data list for chart with evaluated models and baselines (if any)
485
- chart_rows = []
486
-
487
- # Add evaluated models
488
- for _, row in filtered_df.iterrows():
489
- chart_rows.append({
490
- 'Model': row['model'],
491
- 'Score': row['overall_accuracy'],
492
- 'Type': 'Evaluated'
493
- })
494
-
495
- # Add baselines if defined for this dataset
496
- if selected_dataset in BASELINES:
497
- for baseline_name, baseline_metrics in BASELINES[selected_dataset].items():
498
- if 'overall' in baseline_metrics:
499
- chart_rows.append({
500
- 'Model': baseline_name,
501
- 'Score': baseline_metrics['overall'],
502
- 'Type': 'Baseline'
503
- })
504
-
505
- if chart_rows:
506
- chart_df = pd.DataFrame(chart_rows)
507
-
508
- # Create the bar chart similar to create_bar_chart
509
- chart = alt.Chart(chart_df).mark_bar().encode(
510
- x=alt.X('Model:N', sort=alt.EncodingSortField(field='Score', order='descending'),
511
- axis=alt.Axis(labelAngle=-45)),
512
- y=alt.Y('Score:Q', scale=alt.Scale(domain=[0, 100]),
513
- axis=alt.Axis(title='Score (%)')),
514
- color=alt.Color('Type:N',
515
- scale=alt.Scale(domain=['Evaluated', 'Baseline'],
516
- range=['#4ECDC4', '#FFA726'])),
517
- tooltip=['Model', 'Score', 'Type']
518
- ).properties(
519
- width=800,
520
- height=400
521
- )
522
-
523
- # Add value labels
524
- text = chart.mark_text(
525
- align='center',
526
- baseline='bottom',
527
- dy=-5
528
- ).encode(
529
- text=alt.Text('Score:Q', format='.1f')
530
- )
531
-
532
- st.altair_chart(chart + text, use_container_width=True)
533
- else:
534
- st.warning("No data available for the selected dataset.")
535
 
536
  if __name__ == "__main__":
537
  main()
 
53
  }
54
  }
55
 
56
+ @st.cache_data(ttl=300) # Cache for 5 minutes
57
  def fetch_leaderboard_data():
58
  """Fetch all grounding results from HuggingFace leaderboard by streaming JSON files."""
59
  api = HfApi()
 
433
 
434
  # Add metric selector for screenspot datasets
435
  selected_metric = 'overall' # Default metric
436
+ if not ui_metrics_df.empty:
437
  # Metric selector dropdown
438
  if selected_dataset == 'screenspot-v2':
439
  metric_options = {
440
  'overall': 'Overall Average (Desktop + Web) / 2',
 
 
441
  'desktop_text': 'Desktop (Text)',
442
  'desktop_icon': 'Desktop (Icon)',
443
  'web_text': 'Web (Text)',
444
  'web_icon': 'Web (Icon)',
 
 
445
  }
446
+ else:
447
  # For screenspot-pro and showdown-clicks, only show overall average
448
  metric_options = {
449
  'overall': 'Overall Average'
450
  }
 
 
 
 
 
 
 
 
451
 
452
  selected_metric = st.selectbox(
453
  "Select metric to visualize:",
 
466
  st.altair_chart(chart, use_container_width=True)
467
  else:
468
  st.warning(f"No data available for {metric_options[selected_metric]}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
469
 
470
  if __name__ == "__main__":
471
  main()