Anas Awadalla
committed on
Commit
·
d37faa6
1
Parent(s):
c94fd08
some fixes
Browse files
- README.md +21 -14
- src/streamlit_app.py +29 -79
README.md
CHANGED
@@ -56,7 +56,7 @@ The app will open in your browser at `http://localhost:8501`
|
|
56 |
- Individual UI type metrics: Desktop (Text), Desktop (Icon), Web (Text), Web (Icon)
|
57 |
- Text and Icon averages across environments
|
58 |
- Baseline model comparisons shown in orange
|
59 |
-
- Models marked with * indicate the best checkpoint is not the
|
60 |
|
61 |
4. **Explore Details**:
|
62 |
- Expand "Model Details" to see training metadata
|
@@ -89,19 +89,26 @@ To minimize local storage requirements, the app:
|
|
89 |
|
90 |
## Baseline Models
|
91 |
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
-
|
96 |
-
- UI-TARS-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
- Qwen2.5-VL-
|
102 |
-
-
|
103 |
-
-
|
104 |
-
- UI-TARS-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
|
106 |
## Checkpoint Handling
|
107 |
|
|
|
56 |
- Individual UI type metrics: Desktop (Text), Desktop (Icon), Web (Text), Web (Icon)
|
57 |
- Text and Icon averages across environments
|
58 |
- Baseline model comparisons shown in orange
|
59 |
+
- Models marked with * indicate the best checkpoint is not the last one
|
60 |
|
61 |
4. **Explore Details**:
|
62 |
- Expand "Model Details" to see training metadata
|
|
|
89 |
|
90 |
## Baseline Models
|
91 |
|
92 |
+
The dashboard includes baseline performance from established models:
|
93 |
+
|
94 |
+
### ScreenSpot-v2 Baselines
|
95 |
+
- **Qwen2-VL-7B**: 38.0% overall
|
96 |
+
- **UI-TARS-2B**: 82.8% overall
|
97 |
+
- **UI-TARS-7B**: 92.2% overall
|
98 |
+
- **UI-TARS-72B**: 88.3% overall
|
99 |
+
|
100 |
+
### ScreenSpot-Pro Baselines
|
101 |
+
- **Qwen2.5-VL-3B-Instruct**: 16.1% overall
|
102 |
+
- **Qwen2.5-VL-7B-Instruct**: 26.8% overall
|
103 |
+
- **Qwen2.5-VL-72B-Instruct**: 53.3% overall
|
104 |
+
- **UI-TARS-2B**: 27.7% overall
|
105 |
+
- **UI-TARS-7B**: 35.7% overall
|
106 |
+
- **UI-TARS-72B**: 38.1% overall
|
107 |
+
|
108 |
+
### ShowDown-Clicks Baselines
|
109 |
+
- **Qwen2.5-VL-72B-Instruct**: 24.8% overall
|
110 |
+
- **UI-TARS-72B-SFT**: 54.4% overall
|
111 |
+
- **Molmo-72B-0924**: 54.8% overall
|
112 |
|
113 |
## Checkpoint Handling
|
114 |
|
src/streamlit_app.py
CHANGED
@@ -26,53 +26,30 @@ GROUNDING_PATH = "grounding"
|
|
26 |
BASELINES = {
|
27 |
"screenspot-v2": {
|
28 |
"Qwen2-VL-7B": {
|
29 |
-
"desktop_text": 52.01,
|
30 |
-
"desktop_icon": 44.98,
|
31 |
-
"web_text": 33.04,
|
32 |
-
"web_icon": 21.84,
|
33 |
-
"overall": 37.96
|
34 |
},
|
35 |
"UI-TARS-2B": {
|
36 |
-
"desktop_text": 90.7,
|
37 |
-
"desktop_icon": 68.6,
|
38 |
-
"web_text": 87.2,
|
39 |
-
"web_icon": 84.7,
|
40 |
-
"overall": 82.8
|
41 |
},
|
42 |
"UI-TARS-7B": {
|
43 |
-
"desktop_text": 95.4,
|
44 |
-
"desktop_icon": 87.8,
|
45 |
-
"web_text": 93.8,
|
46 |
-
"web_icon": 91.6,
|
47 |
-
"overall": 92.2
|
48 |
},
|
49 |
"UI-TARS-72B": {
|
50 |
-
"desktop_text": 91.2,
|
51 |
-
"desktop_icon": 87.8,
|
52 |
-
"web_text": 87.7,
|
53 |
-
"web_icon": 86.3,
|
54 |
-
"overall": 88.3
|
55 |
}
|
56 |
},
|
57 |
"screenspot-pro": {
|
58 |
-
"Qwen2.5-VL-3B-Instruct": {
|
59 |
-
|
60 |
-
},
|
61 |
-
"
|
62 |
-
|
63 |
-
}
|
64 |
-
|
65 |
-
|
66 |
-
},
|
67 |
-
"UI-TARS-
|
68 |
-
|
69 |
-
},
|
70 |
-
"UI-TARS-7B": {
|
71 |
-
"overall": 35.7
|
72 |
-
},
|
73 |
-
"UI-TARS-72B": {
|
74 |
-
"overall": 38.1
|
75 |
-
}
|
76 |
}
|
77 |
}
|
78 |
|
@@ -472,6 +449,11 @@ def main():
|
|
472 |
'text_avg': 'Text Average',
|
473 |
'icon_avg': 'Icon Average'
|
474 |
}
|
|
|
|
|
|
|
|
|
|
|
475 |
else:
|
476 |
metric_options = {
|
477 |
'overall': 'Overall Average',
|
@@ -499,7 +481,7 @@ def main():
|
|
499 |
else:
|
500 |
st.warning(f"No data available for {metric_options[selected_metric]}")
|
501 |
|
502 |
-
# Show all metrics in an expandable section
|
503 |
with st.expander("View All Metrics"):
|
504 |
if selected_dataset == 'screenspot-v2':
|
505 |
# First row: Overall, Desktop, Web averages
|
@@ -556,35 +538,8 @@ def main():
|
|
556 |
if chart:
|
557 |
st.altair_chart(chart, use_container_width=True)
|
558 |
else:
|
559 |
-
# For
|
560 |
-
|
561 |
-
|
562 |
-
with col1:
|
563 |
-
# Overall Average
|
564 |
-
chart = create_bar_chart(ui_metrics_df, 'overall', 'Overall Average')
|
565 |
-
if chart:
|
566 |
-
st.altair_chart(chart, use_container_width=True)
|
567 |
-
|
568 |
-
# Desktop Average
|
569 |
-
chart = create_bar_chart(ui_metrics_df, 'desktop_avg', 'Desktop Average')
|
570 |
-
if chart:
|
571 |
-
st.altair_chart(chart, use_container_width=True)
|
572 |
-
|
573 |
-
# Text Average
|
574 |
-
chart = create_bar_chart(ui_metrics_df, 'text_avg', 'Text Average (UI-Type)')
|
575 |
-
if chart:
|
576 |
-
st.altair_chart(chart, use_container_width=True)
|
577 |
-
|
578 |
-
with col2:
|
579 |
-
# Web Average
|
580 |
-
chart = create_bar_chart(ui_metrics_df, 'web_avg', 'Web Average')
|
581 |
-
if chart:
|
582 |
-
st.altair_chart(chart, use_container_width=True)
|
583 |
-
|
584 |
-
# Icon Average
|
585 |
-
chart = create_bar_chart(ui_metrics_df, 'icon_avg', 'Icon Average (UI-Type)')
|
586 |
-
if chart:
|
587 |
-
st.altair_chart(chart, use_container_width=True)
|
588 |
|
589 |
# Checkpoint progression visualization
|
590 |
with st.expander("Checkpoint Progression Analysis"):
|
@@ -798,9 +753,9 @@ def main():
|
|
798 |
else:
|
799 |
st.info("No models with multiple checkpoints available for progression analysis")
|
800 |
|
801 |
-
# Detailed breakdown
|
802 |
-
|
803 |
-
|
804 |
# Create a heatmap-style table
|
805 |
detailed_metrics = []
|
806 |
for _, row in ui_metrics_df.iterrows():
|
@@ -815,6 +770,9 @@ def main():
|
|
815 |
|
816 |
if detailed_metrics:
|
817 |
st.dataframe(pd.DataFrame(detailed_metrics), use_container_width=True)
|
|
|
|
|
|
|
818 |
|
819 |
else:
|
820 |
# For non-ScreenSpot datasets, show a simple bar chart
|
@@ -833,14 +791,6 @@ def main():
|
|
833 |
)
|
834 |
|
835 |
st.altair_chart(chart, use_container_width=True)
|
836 |
-
|
837 |
-
# Model details table
|
838 |
-
with st.expander("Model Details"):
|
839 |
-
display_df = filtered_df[['model', 'overall_accuracy', 'total_samples', 'checkpoint_steps', 'training_loss', 'timestamp']].copy()
|
840 |
-
display_df.columns = ['Model', 'Accuracy (%)', 'Samples', 'Checkpoint Steps', 'Training Loss', 'Timestamp']
|
841 |
-
display_df['Accuracy (%)'] = display_df['Accuracy (%)'].apply(lambda x: f"{x:.2f}")
|
842 |
-
display_df['Training Loss'] = display_df['Training Loss'].apply(lambda x: f"{x:.4f}" if pd.notna(x) else "N/A")
|
843 |
-
st.dataframe(display_df, use_container_width=True)
|
844 |
|
845 |
if __name__ == "__main__":
|
846 |
main()
|
|
|
26 |
BASELINES = {
|
27 |
"screenspot-v2": {
|
28 |
"Qwen2-VL-7B": {
|
29 |
+
"desktop_text": 52.01, "desktop_icon": 44.98, "web_text": 33.04, "web_icon": 21.84, "overall": 37.96
|
|
|
|
|
|
|
|
|
30 |
},
|
31 |
"UI-TARS-2B": {
|
32 |
+
"desktop_text": 90.7, "desktop_icon": 68.6, "web_text": 87.2, "web_icon": 84.7, "overall": 82.8
|
|
|
|
|
|
|
|
|
33 |
},
|
34 |
"UI-TARS-7B": {
|
35 |
+
"desktop_text": 95.4, "desktop_icon": 87.8, "web_text": 93.8, "web_icon": 91.6, "overall": 92.2
|
|
|
|
|
|
|
|
|
36 |
},
|
37 |
"UI-TARS-72B": {
|
38 |
+
"desktop_text": 91.2, "desktop_icon": 87.8, "web_text": 87.7, "web_icon": 86.3, "overall": 88.3
|
|
|
|
|
|
|
|
|
39 |
}
|
40 |
},
|
41 |
"screenspot-pro": {
|
42 |
+
"Qwen2.5-VL-3B-Instruct": {"overall": 16.1},
|
43 |
+
"Qwen2.5-VL-7B-Instruct": {"overall": 26.8},
|
44 |
+
"Qwen2.5-VL-72B-Instruct": {"overall": 53.3},
|
45 |
+
"UI-TARS-2B": {"overall": 27.7},
|
46 |
+
"UI-TARS-7B": {"overall": 35.7},
|
47 |
+
"UI-TARS-72B": {"overall": 38.1}
|
48 |
+
},
|
49 |
+
"showdown-clicks": {
|
50 |
+
"Qwen2.5-VL-72B-Instruct": {"overall": 24.78},
|
51 |
+
"UI-TARS-72B-SFT": {"overall": 54.4},
|
52 |
+
"Molmo-72B-0924": {"overall": 54.76}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
}
|
54 |
}
|
55 |
|
|
|
449 |
'text_avg': 'Text Average',
|
450 |
'icon_avg': 'Icon Average'
|
451 |
}
|
452 |
+
elif selected_dataset in ['screenspot-pro', 'showdown-clicks']:
|
453 |
+
# For screenspot-pro and showdown-clicks, only show overall average
|
454 |
+
metric_options = {
|
455 |
+
'overall': 'Overall Average'
|
456 |
+
}
|
457 |
else:
|
458 |
metric_options = {
|
459 |
'overall': 'Overall Average',
|
|
|
481 |
else:
|
482 |
st.warning(f"No data available for {metric_options[selected_metric]}")
|
483 |
|
484 |
+
# Show all metrics in an expandable section - available for all datasets
|
485 |
with st.expander("View All Metrics"):
|
486 |
if selected_dataset == 'screenspot-v2':
|
487 |
# First row: Overall, Desktop, Web averages
|
|
|
538 |
if chart:
|
539 |
st.altair_chart(chart, use_container_width=True)
|
540 |
else:
|
541 |
+
# For screenspot-pro and showdown-clicks
|
542 |
+
st.info("No additional UI type metrics available for this dataset. Only overall accuracy is reported.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
543 |
|
544 |
# Checkpoint progression visualization
|
545 |
with st.expander("Checkpoint Progression Analysis"):
|
|
|
753 |
else:
|
754 |
st.info("No models with multiple checkpoints available for progression analysis")
|
755 |
|
756 |
+
# Detailed breakdown - show for all datasets
|
757 |
+
with st.expander("Detailed UI Type Breakdown"):
|
758 |
+
if selected_dataset == 'screenspot-v2':
|
759 |
# Create a heatmap-style table
|
760 |
detailed_metrics = []
|
761 |
for _, row in ui_metrics_df.iterrows():
|
|
|
770 |
|
771 |
if detailed_metrics:
|
772 |
st.dataframe(pd.DataFrame(detailed_metrics), use_container_width=True)
|
773 |
+
else:
|
774 |
+
# For screenspot-pro and showdown-clicks
|
775 |
+
st.info("Detailed UI type breakdown is only available for ScreenSpot-v2 dataset.")
|
776 |
|
777 |
else:
|
778 |
# For non-ScreenSpot datasets, show a simple bar chart
|
|
|
791 |
)
|
792 |
|
793 |
st.altair_chart(chart, use_container_width=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
794 |
|
795 |
if __name__ == "__main__":
|
796 |
main()
|