Anas Awadalla
committed on
Commit
·
fc25316
1
Parent(s):
a860139
add subset avg for pro baselines
Browse files
- src/streamlit_app.py +54 -8
src/streamlit_app.py
CHANGED
@@ -41,12 +41,34 @@ BASELINES = {
|
|
41 |
"Qwen2.5-VL-7B-Instruct": {"desktop_text": 87.6, "desktop_icon": 65.7, "web_text": 90.2, "web_icon": 79.8, "overall": 81.9},
|
42 |
},
|
43 |
"screenspot-pro": {
|
44 |
-
"Qwen2.5-VL-3B-Instruct": {
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
},
|
51 |
"showdown-clicks": {
|
52 |
"UI-TARS-2B": {"overall": 59.8},
|
@@ -491,7 +513,18 @@ def main():
|
|
491 |
|
492 |
# Dataset filter
|
493 |
datasets = sorted(df['dataset'].unique())
|
|
|
|
|
|
|
|
|
|
|
494 |
selected_dataset = st.sidebar.selectbox("Select Dataset", datasets)
|
|
|
|
|
|
|
|
|
|
|
|
|
495 |
st.session_state['selected_dataset'] = selected_dataset
|
496 |
|
497 |
# Filter data
|
@@ -511,7 +544,7 @@ def main():
|
|
511 |
st.session_state['selected_models'] = []
|
512 |
|
513 |
# Initialize selected models if not in session state
|
514 |
-
if 'selected_models' not in st.session_state:
|
515 |
st.session_state['selected_models'] = all_models
|
516 |
|
517 |
# Multi-select widget for models
|
@@ -568,8 +601,14 @@ def main():
|
|
568 |
'web_text': 'Web (Text)',
|
569 |
'web_icon': 'Web (Icon)',
|
570 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
571 |
else:
|
572 |
-
# For
|
573 |
metric_options = {
|
574 |
'overall': 'Overall Average'
|
575 |
}
|
@@ -623,7 +662,14 @@ def main():
|
|
623 |
baseline_row['text_avg'] = (baseline_metrics['desktop_text'] + baseline_metrics['web_text']) / 2
|
624 |
if 'desktop_icon' in baseline_metrics and 'web_icon' in baseline_metrics:
|
625 |
baseline_row['icon_avg'] = (baseline_metrics['desktop_icon'] + baseline_metrics['web_icon']) / 2
|
|
|
|
|
|
|
|
|
|
|
|
|
626 |
else:
|
|
|
627 |
baseline_row['overall'] = baseline_metrics.get('overall', 0)
|
628 |
|
629 |
baseline_rows.append(baseline_row)
|
|
|
41 |
"Qwen2.5-VL-7B-Instruct": {"desktop_text": 87.6, "desktop_icon": 65.7, "web_text": 90.2, "web_icon": 79.8, "overall": 81.9},
|
42 |
},
|
43 |
"screenspot-pro": {
|
44 |
+
"Qwen2.5-VL-3B-Instruct": {
|
45 |
+
"overall": 16.1,
|
46 |
+
"text": 23.6,
|
47 |
+
"icon": 3.8
|
48 |
+
},
|
49 |
+
"Qwen2.5-VL-7B-Instruct": {
|
50 |
+
"overall": 26.8,
|
51 |
+
"text": 38.9,
|
52 |
+
"icon": 7.1
|
53 |
+
},
|
54 |
+
"Qwen2.5-VL-72B-Instruct": {
|
55 |
+
"overall": 53.3,
|
56 |
+
},
|
57 |
+
"UI-TARS-2B": {
|
58 |
+
"overall": 27.7,
|
59 |
+
"text": 39.6,
|
60 |
+
"icon": 8.4
|
61 |
+
},
|
62 |
+
"UI-TARS-7B": {
|
63 |
+
"overall": 35.7,
|
64 |
+
"text": 47.8,
|
65 |
+
"icon": 16.2
|
66 |
+
},
|
67 |
+
"UI-TARS-72B": {
|
68 |
+
"overall": 38.1,
|
69 |
+
"text": 50.9,
|
70 |
+
"icon": 17.6
|
71 |
+
}
|
72 |
},
|
73 |
"showdown-clicks": {
|
74 |
"UI-TARS-2B": {"overall": 59.8},
|
|
|
513 |
|
514 |
# Dataset filter
|
515 |
datasets = sorted(df['dataset'].unique())
|
516 |
+
|
517 |
+
# Check if dataset has changed
|
518 |
+
if 'previous_dataset' not in st.session_state:
|
519 |
+
st.session_state['previous_dataset'] = None
|
520 |
+
|
521 |
selected_dataset = st.sidebar.selectbox("Select Dataset", datasets)
|
522 |
+
|
523 |
+
# Reset selected models if dataset changed
|
524 |
+
if selected_dataset != st.session_state.get('previous_dataset'):
|
525 |
+
st.session_state['selected_models'] = None # This will trigger default selection
|
526 |
+
st.session_state['previous_dataset'] = selected_dataset
|
527 |
+
|
528 |
st.session_state['selected_dataset'] = selected_dataset
|
529 |
|
530 |
# Filter data
|
|
|
544 |
st.session_state['selected_models'] = []
|
545 |
|
546 |
# Initialize selected models if not in session state
|
547 |
+
if 'selected_models' not in st.session_state or st.session_state['selected_models'] is None:
|
548 |
st.session_state['selected_models'] = all_models
|
549 |
|
550 |
# Multi-select widget for models
|
|
|
601 |
'web_text': 'Web (Text)',
|
602 |
'web_icon': 'Web (Icon)',
|
603 |
}
|
604 |
+
elif selected_dataset == 'screenspot-pro':
|
605 |
+
metric_options = {
|
606 |
+
'overall': 'Overall Average',
|
607 |
+
'text': 'Text',
|
608 |
+
'icon': 'Icon'
|
609 |
+
}
|
610 |
else:
|
611 |
+
# For showdown-clicks, only show overall average
|
612 |
metric_options = {
|
613 |
'overall': 'Overall Average'
|
614 |
}
|
|
|
662 |
baseline_row['text_avg'] = (baseline_metrics['desktop_text'] + baseline_metrics['web_text']) / 2
|
663 |
if 'desktop_icon' in baseline_metrics and 'web_icon' in baseline_metrics:
|
664 |
baseline_row['icon_avg'] = (baseline_metrics['desktop_icon'] + baseline_metrics['web_icon']) / 2
|
665 |
+
elif selected_dataset == 'screenspot-pro':
|
666 |
+
baseline_row.update({
|
667 |
+
'overall': baseline_metrics.get('overall', 0),
|
668 |
+
'text': baseline_metrics.get('text', 0),
|
669 |
+
'icon': baseline_metrics.get('icon', 0)
|
670 |
+
})
|
671 |
else:
|
672 |
+
# For other datasets (showdown-clicks, etc.)
|
673 |
baseline_row['overall'] = baseline_metrics.get('overall', 0)
|
674 |
|
675 |
baseline_rows.append(baseline_row)
|