Anas Awadalla
committed on
Commit
·
402e797
1
Parent(s):
1ddd951
fix baselines for showdown-clicks
Browse files
- src/streamlit_app.py +3 -69
src/streamlit_app.py
CHANGED
@@ -53,7 +53,7 @@ BASELINES = {
|
|
53 |
}
|
54 |
}
|
55 |
|
56 |
-
@st.cache_data(ttl=
|
57 |
def fetch_leaderboard_data():
|
58 |
"""Fetch all grounding results from HuggingFace leaderboard by streaming JSON files."""
|
59 |
api = HfApi()
|
@@ -433,33 +433,21 @@ def main():
|
|
433 |
|
434 |
# Add metric selector for screenspot datasets
|
435 |
selected_metric = 'overall' # Default metric
|
436 |
-
if not ui_metrics_df.empty
|
437 |
# Metric selector dropdown
|
438 |
if selected_dataset == 'screenspot-v2':
|
439 |
metric_options = {
|
440 |
'overall': 'Overall Average (Desktop + Web) / 2',
|
441 |
-
'desktop_avg': 'Desktop Average',
|
442 |
-
'web_avg': 'Web Average',
|
443 |
'desktop_text': 'Desktop (Text)',
|
444 |
'desktop_icon': 'Desktop (Icon)',
|
445 |
'web_text': 'Web (Text)',
|
446 |
'web_icon': 'Web (Icon)',
|
447 |
-
'text_avg': 'Text Average',
|
448 |
-
'icon_avg': 'Icon Average'
|
449 |
}
|
450 |
-
|
451 |
# For screenspot-pro and showdown-clicks, only show overall average
|
452 |
metric_options = {
|
453 |
'overall': 'Overall Average'
|
454 |
}
|
455 |
-
else:
|
456 |
-
metric_options = {
|
457 |
-
'overall': 'Overall Average',
|
458 |
-
'desktop_avg': 'Desktop Average',
|
459 |
-
'web_avg': 'Web Average',
|
460 |
-
'text_avg': 'Text Average',
|
461 |
-
'icon_avg': 'Icon Average'
|
462 |
-
}
|
463 |
|
464 |
selected_metric = st.selectbox(
|
465 |
"Select metric to visualize:",
|
@@ -478,60 +466,6 @@ def main():
|
|
478 |
st.altair_chart(chart, use_container_width=True)
|
479 |
else:
|
480 |
st.warning(f"No data available for {metric_options[selected_metric]}")
|
481 |
-
|
482 |
-
else:
|
483 |
-
# For non-ScreenSpot datasets, show a simple bar chart
|
484 |
-
# Prepare data list for chart with evaluated models and baselines (if any)
|
485 |
-
chart_rows = []
|
486 |
-
|
487 |
-
# Add evaluated models
|
488 |
-
for _, row in filtered_df.iterrows():
|
489 |
-
chart_rows.append({
|
490 |
-
'Model': row['model'],
|
491 |
-
'Score': row['overall_accuracy'],
|
492 |
-
'Type': 'Evaluated'
|
493 |
-
})
|
494 |
-
|
495 |
-
# Add baselines if defined for this dataset
|
496 |
-
if selected_dataset in BASELINES:
|
497 |
-
for baseline_name, baseline_metrics in BASELINES[selected_dataset].items():
|
498 |
-
if 'overall' in baseline_metrics:
|
499 |
-
chart_rows.append({
|
500 |
-
'Model': baseline_name,
|
501 |
-
'Score': baseline_metrics['overall'],
|
502 |
-
'Type': 'Baseline'
|
503 |
-
})
|
504 |
-
|
505 |
-
if chart_rows:
|
506 |
-
chart_df = pd.DataFrame(chart_rows)
|
507 |
-
|
508 |
-
# Create the bar chart similar to create_bar_chart
|
509 |
-
chart = alt.Chart(chart_df).mark_bar().encode(
|
510 |
-
x=alt.X('Model:N', sort=alt.EncodingSortField(field='Score', order='descending'),
|
511 |
-
axis=alt.Axis(labelAngle=-45)),
|
512 |
-
y=alt.Y('Score:Q', scale=alt.Scale(domain=[0, 100]),
|
513 |
-
axis=alt.Axis(title='Score (%)')),
|
514 |
-
color=alt.Color('Type:N',
|
515 |
-
scale=alt.Scale(domain=['Evaluated', 'Baseline'],
|
516 |
-
range=['#4ECDC4', '#FFA726'])),
|
517 |
-
tooltip=['Model', 'Score', 'Type']
|
518 |
-
).properties(
|
519 |
-
width=800,
|
520 |
-
height=400
|
521 |
-
)
|
522 |
-
|
523 |
-
# Add value labels
|
524 |
-
text = chart.mark_text(
|
525 |
-
align='center',
|
526 |
-
baseline='bottom',
|
527 |
-
dy=-5
|
528 |
-
).encode(
|
529 |
-
text=alt.Text('Score:Q', format='.1f')
|
530 |
-
)
|
531 |
-
|
532 |
-
st.altair_chart(chart + text, use_container_width=True)
|
533 |
-
else:
|
534 |
-
st.warning("No data available for the selected dataset.")
|
535 |
|
536 |
if __name__ == "__main__":
|
537 |
main()
|
|
|
53 |
}
|
54 |
}
|
55 |
|
56 |
+
@st.cache_data(ttl=300) # Cache for 5 minutes
|
57 |
def fetch_leaderboard_data():
|
58 |
"""Fetch all grounding results from HuggingFace leaderboard by streaming JSON files."""
|
59 |
api = HfApi()
|
|
|
433 |
|
434 |
# Add metric selector for screenspot datasets
|
435 |
selected_metric = 'overall' # Default metric
|
436 |
+
if not ui_metrics_df.empty:
|
437 |
# Metric selector dropdown
|
438 |
if selected_dataset == 'screenspot-v2':
|
439 |
metric_options = {
|
440 |
'overall': 'Overall Average (Desktop + Web) / 2',
|
|
|
|
|
441 |
'desktop_text': 'Desktop (Text)',
|
442 |
'desktop_icon': 'Desktop (Icon)',
|
443 |
'web_text': 'Web (Text)',
|
444 |
'web_icon': 'Web (Icon)',
|
|
|
|
|
445 |
}
|
446 |
+
else:
|
447 |
# For screenspot-pro and showdown-clicks, only show overall average
|
448 |
metric_options = {
|
449 |
'overall': 'Overall Average'
|
450 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
451 |
|
452 |
selected_metric = st.selectbox(
|
453 |
"Select metric to visualize:",
|
|
|
466 |
st.altair_chart(chart, use_container_width=True)
|
467 |
else:
|
468 |
st.warning(f"No data available for {metric_options[selected_metric]}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
469 |
|
470 |
if __name__ == "__main__":
|
471 |
main()
|