Anas Awadalla
committed on
Commit
·
4f9fa17
1
Parent(s):
fc25316
add subset avg for pro baselines
Browse files- src/streamlit_app.py +133 -8
src/streamlit_app.py
CHANGED
@@ -366,11 +366,30 @@ def create_bar_chart(data: pd.DataFrame, metric: str, title: str):
|
|
366 |
for baseline_name, baseline_metrics in BASELINES[dataset].items():
|
367 |
metric_key = metric.replace('_avg', '').replace('avg', 'overall')
|
368 |
if metric_key in baseline_metrics:
|
369 |
-
|
370 |
-
|
371 |
-
|
372 |
-
|
373 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
374 |
|
375 |
if not chart_data:
|
376 |
return None
|
@@ -565,6 +584,75 @@ def main():
|
|
565 |
# If no models selected, show empty dataframe
|
566 |
filtered_df = pd.DataFrame()
|
567 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
568 |
# Main content
|
569 |
st.header(f"Results for {selected_dataset}")
|
570 |
|
@@ -589,6 +677,30 @@ def main():
|
|
589 |
# Parse UI type metrics
|
590 |
ui_metrics_df = parse_ui_type_metrics(filtered_df, selected_dataset)
|
591 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
592 |
# Add metric selector for screenspot datasets
|
593 |
selected_metric = 'overall' # Default metric
|
594 |
if not ui_metrics_df.empty:
|
@@ -634,9 +746,9 @@ def main():
|
|
634 |
# Display results table
|
635 |
st.subheader("📊 Results Table")
|
636 |
|
637 |
-
#
|
638 |
if not ui_metrics_df.empty:
|
639 |
-
table_df = ui_metrics_df
|
640 |
|
641 |
# Add baselines to the table if available
|
642 |
if selected_dataset in BASELINES:
|
@@ -672,7 +784,20 @@ def main():
|
|
672 |
# For other datasets (showdown-clicks, etc.)
|
673 |
baseline_row['overall'] = baseline_metrics.get('overall', 0)
|
674 |
|
675 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
676 |
|
677 |
# Append baselines to table
|
678 |
if baseline_rows:
|
|
|
366 |
for baseline_name, baseline_metrics in BASELINES[dataset].items():
|
367 |
metric_key = metric.replace('_avg', '').replace('avg', 'overall')
|
368 |
if metric_key in baseline_metrics:
|
369 |
+
baseline_value = baseline_metrics[metric_key]
|
370 |
+
|
371 |
+
# Check performance bounds if filter is enabled
|
372 |
+
should_include = True
|
373 |
+
if st.session_state.get('perf_filter_enabled', False):
|
374 |
+
filter_metric = st.session_state.get('perf_filter_metric', 'overall')
|
375 |
+
min_perf = st.session_state.get('perf_filter_min', 0.0)
|
376 |
+
max_perf = st.session_state.get('perf_filter_max', 100.0)
|
377 |
+
|
378 |
+
# Only filter if we're filtering by the same metric being displayed
|
379 |
+
if filter_metric == metric and (baseline_value < min_perf or baseline_value > max_perf):
|
380 |
+
should_include = False
|
381 |
+
# Or if filtering by a different metric, check that metric's value
|
382 |
+
elif filter_metric != metric and filter_metric in baseline_metrics:
|
383 |
+
filter_value = baseline_metrics[filter_metric]
|
384 |
+
if filter_value < min_perf or filter_value > max_perf:
|
385 |
+
should_include = False
|
386 |
+
|
387 |
+
if should_include:
|
388 |
+
chart_data.append({
|
389 |
+
'Model': baseline_name,
|
390 |
+
'Score': baseline_value,
|
391 |
+
'Type': 'Baseline'
|
392 |
+
})
|
393 |
|
394 |
if not chart_data:
|
395 |
return None
|
|
|
584 |
# If no models selected, show empty dataframe
|
585 |
filtered_df = pd.DataFrame()
|
586 |
|
587 |
+
# Performance bounds filter
|
588 |
+
st.sidebar.divider()
|
589 |
+
st.sidebar.subheader("Performance Filters")
|
590 |
+
|
591 |
+
# Enable/disable performance filtering
|
592 |
+
enable_perf_filter = st.sidebar.checkbox("Enable performance bounds", value=False)
|
593 |
+
|
594 |
+
if enable_perf_filter:
|
595 |
+
# Get the metric to filter on
|
596 |
+
filter_metric_help = "Filter models based on their performance in the selected metric"
|
597 |
+
|
598 |
+
# Determine available metrics for filtering
|
599 |
+
if selected_dataset == 'screenspot-v2':
|
600 |
+
filter_metrics = ['overall', 'desktop_text', 'desktop_icon', 'web_text', 'web_icon']
|
601 |
+
filter_metric_names = {
|
602 |
+
'overall': 'Overall Average',
|
603 |
+
'desktop_text': 'Desktop (Text)',
|
604 |
+
'desktop_icon': 'Desktop (Icon)',
|
605 |
+
'web_text': 'Web (Text)',
|
606 |
+
'web_icon': 'Web (Icon)'
|
607 |
+
}
|
608 |
+
elif selected_dataset == 'screenspot-pro':
|
609 |
+
filter_metrics = ['overall', 'text', 'icon']
|
610 |
+
filter_metric_names = {
|
611 |
+
'overall': 'Overall Average',
|
612 |
+
'text': 'Text',
|
613 |
+
'icon': 'Icon'
|
614 |
+
}
|
615 |
+
else:
|
616 |
+
filter_metrics = ['overall']
|
617 |
+
filter_metric_names = {'overall': 'Overall Average'}
|
618 |
+
|
619 |
+
# Metric selector for filtering
|
620 |
+
filter_metric = st.sidebar.selectbox(
|
621 |
+
"Filter by metric:",
|
622 |
+
options=filter_metrics,
|
623 |
+
format_func=lambda x: filter_metric_names[x],
|
624 |
+
help=filter_metric_help
|
625 |
+
)
|
626 |
+
|
627 |
+
# Performance bounds inputs
|
628 |
+
col1, col2 = st.sidebar.columns(2)
|
629 |
+
with col1:
|
630 |
+
min_perf = st.number_input(
|
631 |
+
"Min %",
|
632 |
+
min_value=0.0,
|
633 |
+
max_value=100.0,
|
634 |
+
value=0.0,
|
635 |
+
step=5.0,
|
636 |
+
help="Minimum performance threshold"
|
637 |
+
)
|
638 |
+
with col2:
|
639 |
+
max_perf = st.number_input(
|
640 |
+
"Max %",
|
641 |
+
min_value=0.0,
|
642 |
+
max_value=100.0,
|
643 |
+
value=100.0,
|
644 |
+
step=5.0,
|
645 |
+
help="Maximum performance threshold"
|
646 |
+
)
|
647 |
+
|
648 |
+
# Store filter settings in session state
|
649 |
+
st.session_state['perf_filter_enabled'] = True
|
650 |
+
st.session_state['perf_filter_metric'] = filter_metric
|
651 |
+
st.session_state['perf_filter_min'] = min_perf
|
652 |
+
st.session_state['perf_filter_max'] = max_perf
|
653 |
+
else:
|
654 |
+
st.session_state['perf_filter_enabled'] = False
|
655 |
+
|
656 |
# Main content
|
657 |
st.header(f"Results for {selected_dataset}")
|
658 |
|
|
|
677 |
# Parse UI type metrics
|
678 |
ui_metrics_df = parse_ui_type_metrics(filtered_df, selected_dataset)
|
679 |
|
680 |
+
# Apply performance bounds filter if enabled
|
681 |
+
if st.session_state.get('perf_filter_enabled', False) and not ui_metrics_df.empty:
|
682 |
+
filter_metric = st.session_state.get('perf_filter_metric', 'overall')
|
683 |
+
min_perf = st.session_state.get('perf_filter_min', 0.0)
|
684 |
+
max_perf = st.session_state.get('perf_filter_max', 100.0)
|
685 |
+
|
686 |
+
# Check if the filter metric exists in the dataframe
|
687 |
+
if filter_metric in ui_metrics_df.columns:
|
688 |
+
# Filter models based on performance bounds
|
689 |
+
ui_metrics_df = ui_metrics_df[
|
690 |
+
(ui_metrics_df[filter_metric] >= min_perf) &
|
691 |
+
(ui_metrics_df[filter_metric] <= max_perf)
|
692 |
+
]
|
693 |
+
|
694 |
+
# Update selected models to only include those within bounds
|
695 |
+
models_in_bounds = ui_metrics_df['model'].tolist()
|
696 |
+
filtered_models = [m for m in selected_models if m in models_in_bounds]
|
697 |
+
|
698 |
+
# Show info about filtered models
|
699 |
+
total_models = len(selected_models)
|
700 |
+
shown_models = len(filtered_models)
|
701 |
+
if shown_models < total_models:
|
702 |
+
st.info(f"Showing {shown_models} of {total_models} selected models within performance bounds ({min_perf:.1f}% - {max_perf:.1f}% {filter_metric})")
|
703 |
+
|
704 |
# Add metric selector for screenspot datasets
|
705 |
selected_metric = 'overall' # Default metric
|
706 |
if not ui_metrics_df.empty:
|
|
|
746 |
# Display results table
|
747 |
st.subheader("📊 Results Table")
|
748 |
|
749 |
+
# Use the already filtered ui_metrics_df which respects performance bounds
|
750 |
if not ui_metrics_df.empty:
|
751 |
+
table_df = ui_metrics_df.copy()
|
752 |
|
753 |
# Add baselines to the table if available
|
754 |
if selected_dataset in BASELINES:
|
|
|
784 |
# For other datasets (showdown-clicks, etc.)
|
785 |
baseline_row['overall'] = baseline_metrics.get('overall', 0)
|
786 |
|
787 |
+
# Apply performance filter to baselines if enabled
|
788 |
+
should_include_baseline = True
|
789 |
+
if st.session_state.get('perf_filter_enabled', False):
|
790 |
+
filter_metric = st.session_state.get('perf_filter_metric', 'overall')
|
791 |
+
min_perf = st.session_state.get('perf_filter_min', 0.0)
|
792 |
+
max_perf = st.session_state.get('perf_filter_max', 100.0)
|
793 |
+
|
794 |
+
if filter_metric in baseline_row:
|
795 |
+
metric_value = baseline_row[filter_metric]
|
796 |
+
if metric_value < min_perf or metric_value > max_perf:
|
797 |
+
should_include_baseline = False
|
798 |
+
|
799 |
+
if should_include_baseline:
|
800 |
+
baseline_rows.append(baseline_row)
|
801 |
|
802 |
# Append baselines to table
|
803 |
if baseline_rows:
|