Anas Awadalla
committed on
Commit
·
1ddd951
1
Parent(s):
6ebe143
fix baselines for showdown-clicks
Browse files - src/streamlit_app.py +54 -15
src/streamlit_app.py
CHANGED
@@ -53,7 +53,7 @@ BASELINES = {
|
|
53 |
}
|
54 |
}
|
55 |
|
56 |
-
@st.cache_data(ttl=
|
57 |
def fetch_leaderboard_data():
|
58 |
"""Fetch all grounding results from HuggingFace leaderboard by streaming JSON files."""
|
59 |
api = HfApi()
|
@@ -366,7 +366,8 @@ def create_bar_chart(data: pd.DataFrame, metric: str, title: str):
|
|
366 |
range=['#4ECDC4', '#FFA726'])),
|
367 |
tooltip=['Model', 'Score', 'Type']
|
368 |
).properties(
|
369 |
-
|
|
|
370 |
height=400
|
371 |
)
|
372 |
|
@@ -480,19 +481,57 @@ def main():
|
|
480 |
|
481 |
else:
|
482 |
# For non-ScreenSpot datasets, show a simple bar chart
|
483 |
-
|
484 |
-
|
485 |
-
|
486 |
-
|
487 |
-
|
488 |
-
|
489 |
-
|
490 |
-
|
491 |
-
|
492 |
-
|
493 |
-
|
494 |
-
|
495 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
496 |
|
497 |
if __name__ == "__main__":
|
498 |
main()
|
|
|
53 |
}
|
54 |
}
|
55 |
|
56 |
+
@st.cache_data(ttl=1200) # Cache for 20 minutes
|
57 |
def fetch_leaderboard_data():
|
58 |
"""Fetch all grounding results from HuggingFace leaderboard by streaming JSON files."""
|
59 |
api = HfApi()
|
|
|
366 |
range=['#4ECDC4', '#FFA726'])),
|
367 |
tooltip=['Model', 'Score', 'Type']
|
368 |
).properties(
|
369 |
+
title=title,
|
370 |
+
width=500,
|
371 |
height=400
|
372 |
)
|
373 |
|
|
|
481 |
|
482 |
else:
|
483 |
# For non-ScreenSpot datasets, show a simple bar chart
|
484 |
+
# Prepare data list for chart with evaluated models and baselines (if any)
|
485 |
+
chart_rows = []
|
486 |
+
|
487 |
+
# Add evaluated models
|
488 |
+
for _, row in filtered_df.iterrows():
|
489 |
+
chart_rows.append({
|
490 |
+
'Model': row['model'],
|
491 |
+
'Score': row['overall_accuracy'],
|
492 |
+
'Type': 'Evaluated'
|
493 |
+
})
|
494 |
+
|
495 |
+
# Add baselines if defined for this dataset
|
496 |
+
if selected_dataset in BASELINES:
|
497 |
+
for baseline_name, baseline_metrics in BASELINES[selected_dataset].items():
|
498 |
+
if 'overall' in baseline_metrics:
|
499 |
+
chart_rows.append({
|
500 |
+
'Model': baseline_name,
|
501 |
+
'Score': baseline_metrics['overall'],
|
502 |
+
'Type': 'Baseline'
|
503 |
+
})
|
504 |
+
|
505 |
+
if chart_rows:
|
506 |
+
chart_df = pd.DataFrame(chart_rows)
|
507 |
+
|
508 |
+
# Create the bar chart similar to create_bar_chart
|
509 |
+
chart = alt.Chart(chart_df).mark_bar().encode(
|
510 |
+
x=alt.X('Model:N', sort=alt.EncodingSortField(field='Score', order='descending'),
|
511 |
+
axis=alt.Axis(labelAngle=-45)),
|
512 |
+
y=alt.Y('Score:Q', scale=alt.Scale(domain=[0, 100]),
|
513 |
+
axis=alt.Axis(title='Score (%)')),
|
514 |
+
color=alt.Color('Type:N',
|
515 |
+
scale=alt.Scale(domain=['Evaluated', 'Baseline'],
|
516 |
+
range=['#4ECDC4', '#FFA726'])),
|
517 |
+
tooltip=['Model', 'Score', 'Type']
|
518 |
+
).properties(
|
519 |
+
width=800,
|
520 |
+
height=400
|
521 |
+
)
|
522 |
+
|
523 |
+
# Add value labels
|
524 |
+
text = chart.mark_text(
|
525 |
+
align='center',
|
526 |
+
baseline='bottom',
|
527 |
+
dy=-5
|
528 |
+
).encode(
|
529 |
+
text=alt.Text('Score:Q', format='.1f')
|
530 |
+
)
|
531 |
+
|
532 |
+
st.altair_chart(chart + text, use_container_width=True)
|
533 |
+
else:
|
534 |
+
st.warning("No data available for the selected dataset.")
|
535 |
|
536 |
if __name__ == "__main__":
|
537 |
main()
|