Anas Awadalla committed on
Commit
fc25316
·
1 Parent(s): a860139

add subset avg for pro baselines

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +54 -8
src/streamlit_app.py CHANGED
@@ -41,12 +41,34 @@ BASELINES = {
41
  "Qwen2.5-VL-7B-Instruct": {"desktop_text": 87.6, "desktop_icon": 65.7, "web_text": 90.2, "web_icon": 79.8, "overall": 81.9},
42
  },
43
  "screenspot-pro": {
44
- "Qwen2.5-VL-3B-Instruct": {"overall": 16.1},
45
- "Qwen2.5-VL-7B-Instruct": {"overall": 26.8},
46
- "Qwen2.5-VL-72B-Instruct": {"overall": 53.3},
47
- "UI-TARS-2B": {"overall": 27.7},
48
- "UI-TARS-7B": {"overall": 35.7},
49
- "UI-TARS-72B": {"overall": 38.1}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  },
51
  "showdown-clicks": {
52
  "UI-TARS-2B": {"overall": 59.8},
@@ -491,7 +513,18 @@ def main():
491
 
492
  # Dataset filter
493
  datasets = sorted(df['dataset'].unique())
 
 
 
 
 
494
  selected_dataset = st.sidebar.selectbox("Select Dataset", datasets)
 
 
 
 
 
 
495
  st.session_state['selected_dataset'] = selected_dataset
496
 
497
  # Filter data
@@ -511,7 +544,7 @@ def main():
511
  st.session_state['selected_models'] = []
512
 
513
  # Initialize selected models if not in session state
514
- if 'selected_models' not in st.session_state:
515
  st.session_state['selected_models'] = all_models
516
 
517
  # Multi-select widget for models
@@ -568,8 +601,14 @@ def main():
568
  'web_text': 'Web (Text)',
569
  'web_icon': 'Web (Icon)',
570
  }
 
 
 
 
 
 
571
  else:
572
- # For screenspot-pro and showdown-clicks, only show overall average
573
  metric_options = {
574
  'overall': 'Overall Average'
575
  }
@@ -623,7 +662,14 @@ def main():
623
  baseline_row['text_avg'] = (baseline_metrics['desktop_text'] + baseline_metrics['web_text']) / 2
624
  if 'desktop_icon' in baseline_metrics and 'web_icon' in baseline_metrics:
625
  baseline_row['icon_avg'] = (baseline_metrics['desktop_icon'] + baseline_metrics['web_icon']) / 2
 
 
 
 
 
 
626
  else:
 
627
  baseline_row['overall'] = baseline_metrics.get('overall', 0)
628
 
629
  baseline_rows.append(baseline_row)
 
41
  "Qwen2.5-VL-7B-Instruct": {"desktop_text": 87.6, "desktop_icon": 65.7, "web_text": 90.2, "web_icon": 79.8, "overall": 81.9},
42
  },
43
  "screenspot-pro": {
44
+ "Qwen2.5-VL-3B-Instruct": {
45
+ "overall": 16.1,
46
+ "text": 23.6,
47
+ "icon": 3.8
48
+ },
49
+ "Qwen2.5-VL-7B-Instruct": {
50
+ "overall": 26.8,
51
+ "text": 38.9,
52
+ "icon": 7.1
53
+ },
54
+ "Qwen2.5-VL-72B-Instruct": {
55
+ "overall": 53.3,
56
+ },
57
+ "UI-TARS-2B": {
58
+ "overall": 27.7,
59
+ "text": 39.6,
60
+ "icon": 8.4
61
+ },
62
+ "UI-TARS-7B": {
63
+ "overall": 35.7,
64
+ "text": 47.8,
65
+ "icon": 16.2
66
+ },
67
+ "UI-TARS-72B": {
68
+ "overall": 38.1,
69
+ "text": 50.9,
70
+ "icon": 17.6
71
+ }
72
  },
73
  "showdown-clicks": {
74
  "UI-TARS-2B": {"overall": 59.8},
 
513
 
514
  # Dataset filter
515
  datasets = sorted(df['dataset'].unique())
516
+
517
+ # Check if dataset has changed
518
+ if 'previous_dataset' not in st.session_state:
519
+ st.session_state['previous_dataset'] = None
520
+
521
  selected_dataset = st.sidebar.selectbox("Select Dataset", datasets)
522
+
523
+ # Reset selected models if dataset changed
524
+ if selected_dataset != st.session_state.get('previous_dataset'):
525
+ st.session_state['selected_models'] = None # This will trigger default selection
526
+ st.session_state['previous_dataset'] = selected_dataset
527
+
528
  st.session_state['selected_dataset'] = selected_dataset
529
 
530
  # Filter data
 
544
  st.session_state['selected_models'] = []
545
 
546
  # Initialize selected models if not in session state
547
+ if 'selected_models' not in st.session_state or st.session_state['selected_models'] is None:
548
  st.session_state['selected_models'] = all_models
549
 
550
  # Multi-select widget for models
 
601
  'web_text': 'Web (Text)',
602
  'web_icon': 'Web (Icon)',
603
  }
604
+ elif selected_dataset == 'screenspot-pro':
605
+ metric_options = {
606
+ 'overall': 'Overall Average',
607
+ 'text': 'Text',
608
+ 'icon': 'Icon'
609
+ }
610
  else:
611
+ # For showdown-clicks, only show overall average
612
  metric_options = {
613
  'overall': 'Overall Average'
614
  }
 
662
  baseline_row['text_avg'] = (baseline_metrics['desktop_text'] + baseline_metrics['web_text']) / 2
663
  if 'desktop_icon' in baseline_metrics and 'web_icon' in baseline_metrics:
664
  baseline_row['icon_avg'] = (baseline_metrics['desktop_icon'] + baseline_metrics['web_icon']) / 2
665
+ elif selected_dataset == 'screenspot-pro':
666
+ baseline_row.update({
667
+ 'overall': baseline_metrics.get('overall', 0),
668
+ 'text': baseline_metrics.get('text', 0),
669
+ 'icon': baseline_metrics.get('icon', 0)
670
+ })
671
  else:
672
+ # For other datasets (showdown-clicks, etc.)
673
  baseline_row['overall'] = baseline_metrics.get('overall', 0)
674
 
675
  baseline_rows.append(baseline_row)