Anas Awadalla committed on
Commit
d37faa6
·
1 Parent(s): c94fd08

some fixes

Browse files
Files changed (2) hide show
  1. README.md +21 -14
  2. src/streamlit_app.py +29 -79
README.md CHANGED
@@ -56,7 +56,7 @@ The app will open in your browser at `http://localhost:8501`
56
  - Individual UI type metrics: Desktop (Text), Desktop (Icon), Web (Text), Web (Icon)
57
  - Text and Icon averages across environments
58
  - Baseline model comparisons shown in orange
59
- - Models marked with * indicate the best checkpoint is not the final one
60
 
61
  4. **Explore Details**:
62
  - Expand "Model Details" to see training metadata
@@ -89,19 +89,26 @@ To minimize local storage requirements, the app:
89
 
90
  ## Baseline Models
91
 
92
- For ScreenSpot-v2, the following baselines are included:
93
- - Qwen2-VL-7B: 37.96%
94
- - UI-TARS-2B: 82.8%
95
- - UI-TARS-7B: 92.2%
96
- - UI-TARS-72B: 88.3%
97
-
98
- For ScreenSpot-Pro, the following baselines are included:
99
- - Qwen2.5-VL-3B-Instruct: 16.1%
100
- - Qwen2.5-VL-7B-Instruct: 26.8%
101
- - Qwen2.5-VL-72B-Instruct: 53.3%
102
- - UI-TARS-2B: 27.7%
103
- - UI-TARS-7B: 35.7%
104
- - UI-TARS-72B: 38.1%
 
 
 
 
 
 
 
105
 
106
  ## Checkpoint Handling
107
 
 
56
  - Individual UI type metrics: Desktop (Text), Desktop (Icon), Web (Text), Web (Icon)
57
  - Text and Icon averages across environments
58
  - Baseline model comparisons shown in orange
59
+ - Models marked with * indicate the best checkpoint is not the last one
60
 
61
  4. **Explore Details**:
62
  - Expand "Model Details" to see training metadata
 
89
 
90
  ## Baseline Models
91
 
92
+ The dashboard includes baseline performance from established models:
93
+
94
+ ### ScreenSpot-v2 Baselines
95
+ - **Qwen2-VL-7B**: 38.0% overall
96
+ - **UI-TARS-2B**: 82.8% overall
97
+ - **UI-TARS-7B**: 92.2% overall
98
+ - **UI-TARS-72B**: 88.3% overall
99
+
100
+ ### ScreenSpot-Pro Baselines
101
+ - **Qwen2.5-VL-3B-Instruct**: 16.1% overall
102
+ - **Qwen2.5-VL-7B-Instruct**: 26.8% overall
103
+ - **Qwen2.5-VL-72B-Instruct**: 53.3% overall
104
+ - **UI-TARS-2B**: 27.7% overall
105
+ - **UI-TARS-7B**: 35.7% overall
106
+ - **UI-TARS-72B**: 38.1% overall
107
+
108
+ ### ShowDown-Clicks Baselines
109
+ - **Qwen2.5-VL-72B-Instruct**: 24.8% overall
110
+ - **UI-TARS-72B-SFT**: 54.4% overall
111
+ - **Molmo-72B-0924**: 54.8% overall
112
 
113
  ## Checkpoint Handling
114
 
src/streamlit_app.py CHANGED
@@ -26,53 +26,30 @@ GROUNDING_PATH = "grounding"
26
  BASELINES = {
27
  "screenspot-v2": {
28
  "Qwen2-VL-7B": {
29
- "desktop_text": 52.01,
30
- "desktop_icon": 44.98,
31
- "web_text": 33.04,
32
- "web_icon": 21.84,
33
- "overall": 37.96
34
  },
35
  "UI-TARS-2B": {
36
- "desktop_text": 90.7,
37
- "desktop_icon": 68.6,
38
- "web_text": 87.2,
39
- "web_icon": 84.7,
40
- "overall": 82.8
41
  },
42
  "UI-TARS-7B": {
43
- "desktop_text": 95.4,
44
- "desktop_icon": 87.8,
45
- "web_text": 93.8,
46
- "web_icon": 91.6,
47
- "overall": 92.2
48
  },
49
  "UI-TARS-72B": {
50
- "desktop_text": 91.2,
51
- "desktop_icon": 87.8,
52
- "web_text": 87.7,
53
- "web_icon": 86.3,
54
- "overall": 88.3
55
  }
56
  },
57
  "screenspot-pro": {
58
- "Qwen2.5-VL-3B-Instruct": {
59
- "overall": 16.1
60
- },
61
- "Qwen2.5-VL-7B-Instruct": {
62
- "overall": 26.8
63
- },
64
- "Qwen2.5-VL-72B-Instruct": {
65
- "overall": 53.3
66
- },
67
- "UI-TARS-2B": {
68
- "overall": 27.7
69
- },
70
- "UI-TARS-7B": {
71
- "overall": 35.7
72
- },
73
- "UI-TARS-72B": {
74
- "overall": 38.1
75
- }
76
  }
77
  }
78
 
@@ -472,6 +449,11 @@ def main():
472
  'text_avg': 'Text Average',
473
  'icon_avg': 'Icon Average'
474
  }
 
 
 
 
 
475
  else:
476
  metric_options = {
477
  'overall': 'Overall Average',
@@ -499,7 +481,7 @@ def main():
499
  else:
500
  st.warning(f"No data available for {metric_options[selected_metric]}")
501
 
502
- # Show all metrics in an expandable section
503
  with st.expander("View All Metrics"):
504
  if selected_dataset == 'screenspot-v2':
505
  # First row: Overall, Desktop, Web averages
@@ -556,35 +538,8 @@ def main():
556
  if chart:
557
  st.altair_chart(chart, use_container_width=True)
558
  else:
559
- # For other screenspot datasets, show the standard layout
560
- col1, col2 = st.columns(2)
561
-
562
- with col1:
563
- # Overall Average
564
- chart = create_bar_chart(ui_metrics_df, 'overall', 'Overall Average')
565
- if chart:
566
- st.altair_chart(chart, use_container_width=True)
567
-
568
- # Desktop Average
569
- chart = create_bar_chart(ui_metrics_df, 'desktop_avg', 'Desktop Average')
570
- if chart:
571
- st.altair_chart(chart, use_container_width=True)
572
-
573
- # Text Average
574
- chart = create_bar_chart(ui_metrics_df, 'text_avg', 'Text Average (UI-Type)')
575
- if chart:
576
- st.altair_chart(chart, use_container_width=True)
577
-
578
- with col2:
579
- # Web Average
580
- chart = create_bar_chart(ui_metrics_df, 'web_avg', 'Web Average')
581
- if chart:
582
- st.altair_chart(chart, use_container_width=True)
583
-
584
- # Icon Average
585
- chart = create_bar_chart(ui_metrics_df, 'icon_avg', 'Icon Average (UI-Type)')
586
- if chart:
587
- st.altair_chart(chart, use_container_width=True)
588
 
589
  # Checkpoint progression visualization
590
  with st.expander("Checkpoint Progression Analysis"):
@@ -798,9 +753,9 @@ def main():
798
  else:
799
  st.info("No models with multiple checkpoints available for progression analysis")
800
 
801
- # Detailed breakdown
802
- if selected_dataset == 'screenspot-v2':
803
- with st.expander("Detailed UI Type Breakdown"):
804
  # Create a heatmap-style table
805
  detailed_metrics = []
806
  for _, row in ui_metrics_df.iterrows():
@@ -815,6 +770,9 @@ def main():
815
 
816
  if detailed_metrics:
817
  st.dataframe(pd.DataFrame(detailed_metrics), use_container_width=True)
 
 
 
818
 
819
  else:
820
  # For non-ScreenSpot datasets, show a simple bar chart
@@ -833,14 +791,6 @@ def main():
833
  )
834
 
835
  st.altair_chart(chart, use_container_width=True)
836
-
837
- # Model details table
838
- with st.expander("Model Details"):
839
- display_df = filtered_df[['model', 'overall_accuracy', 'total_samples', 'checkpoint_steps', 'training_loss', 'timestamp']].copy()
840
- display_df.columns = ['Model', 'Accuracy (%)', 'Samples', 'Checkpoint Steps', 'Training Loss', 'Timestamp']
841
- display_df['Accuracy (%)'] = display_df['Accuracy (%)'].apply(lambda x: f"{x:.2f}")
842
- display_df['Training Loss'] = display_df['Training Loss'].apply(lambda x: f"{x:.4f}" if pd.notna(x) else "N/A")
843
- st.dataframe(display_df, use_container_width=True)
844
 
845
  if __name__ == "__main__":
846
  main()
 
26
  BASELINES = {
27
  "screenspot-v2": {
28
  "Qwen2-VL-7B": {
29
+ "desktop_text": 52.01, "desktop_icon": 44.98, "web_text": 33.04, "web_icon": 21.84, "overall": 37.96
 
 
 
 
30
  },
31
  "UI-TARS-2B": {
32
+ "desktop_text": 90.7, "desktop_icon": 68.6, "web_text": 87.2, "web_icon": 84.7, "overall": 82.8
 
 
 
 
33
  },
34
  "UI-TARS-7B": {
35
+ "desktop_text": 95.4, "desktop_icon": 87.8, "web_text": 93.8, "web_icon": 91.6, "overall": 92.2
 
 
 
 
36
  },
37
  "UI-TARS-72B": {
38
+ "desktop_text": 91.2, "desktop_icon": 87.8, "web_text": 87.7, "web_icon": 86.3, "overall": 88.3
 
 
 
 
39
  }
40
  },
41
  "screenspot-pro": {
42
+ "Qwen2.5-VL-3B-Instruct": {"overall": 16.1},
43
+ "Qwen2.5-VL-7B-Instruct": {"overall": 26.8},
44
+ "Qwen2.5-VL-72B-Instruct": {"overall": 53.3},
45
+ "UI-TARS-2B": {"overall": 27.7},
46
+ "UI-TARS-7B": {"overall": 35.7},
47
+ "UI-TARS-72B": {"overall": 38.1}
48
+ },
49
+ "showdown-clicks": {
50
+ "Qwen2.5-VL-72B-Instruct": {"overall": 24.78},
51
+ "UI-TARS-72B-SFT": {"overall": 54.4},
52
+ "Molmo-72B-0924": {"overall": 54.76}
 
 
 
 
 
 
 
53
  }
54
  }
55
 
 
449
  'text_avg': 'Text Average',
450
  'icon_avg': 'Icon Average'
451
  }
452
+ elif selected_dataset in ['screenspot-pro', 'showdown-clicks']:
453
+ # For screenspot-pro and showdown-clicks, only show overall average
454
+ metric_options = {
455
+ 'overall': 'Overall Average'
456
+ }
457
  else:
458
  metric_options = {
459
  'overall': 'Overall Average',
 
481
  else:
482
  st.warning(f"No data available for {metric_options[selected_metric]}")
483
 
484
+ # Show all metrics in an expandable section - available for all datasets
485
  with st.expander("View All Metrics"):
486
  if selected_dataset == 'screenspot-v2':
487
  # First row: Overall, Desktop, Web averages
 
538
  if chart:
539
  st.altair_chart(chart, use_container_width=True)
540
  else:
541
+ # For screenspot-pro and showdown-clicks
542
+ st.info("No additional UI type metrics available for this dataset. Only overall accuracy is reported.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
543
 
544
  # Checkpoint progression visualization
545
  with st.expander("Checkpoint Progression Analysis"):
 
753
  else:
754
  st.info("No models with multiple checkpoints available for progression analysis")
755
 
756
+ # Detailed breakdown - show for all datasets
757
+ with st.expander("Detailed UI Type Breakdown"):
758
+ if selected_dataset == 'screenspot-v2':
759
  # Create a heatmap-style table
760
  detailed_metrics = []
761
  for _, row in ui_metrics_df.iterrows():
 
770
 
771
  if detailed_metrics:
772
  st.dataframe(pd.DataFrame(detailed_metrics), use_container_width=True)
773
+ else:
774
+ # For screenspot-pro and showdown-clicks
775
+ st.info("Detailed UI type breakdown is only available for ScreenSpot-v2 dataset.")
776
 
777
  else:
778
  # For non-ScreenSpot datasets, show a simple bar chart
 
791
  )
792
 
793
  st.altair_chart(chart, use_container_width=True)
 
 
 
 
 
 
 
 
794
 
795
  if __name__ == "__main__":
796
  main()