Anas Awadalla committed on
Commit
587e0bc
·
1 Parent(s): b47cdd1
Files changed (3) hide show
  1. README.md +70 -4
  2. requirements.txt +5 -3
  3. src/streamlit_app.py +351 -37
README.md CHANGED
@@ -11,9 +11,75 @@ pinned: false
11
  short_description: Streamlit template space
12
  ---
13
 
14
- # Welcome to Streamlit!
15
 
16
- Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
17
 
18
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
19
- forums](https://discuss.streamlit.io).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  short_description: Streamlit template space
12
  ---
13
 
14
+ # Grounding Benchmark Leaderboard Viewer
15
 
16
+ A Streamlit application for visualizing model performance on grounding benchmarks.
17
 
18
+ ## Features
19
+
20
+ - **Real-time Data**: Fetches results directly from the HuggingFace leaderboard repository
21
+ - **Interactive Visualizations**: Bar charts comparing model performance across different metrics
22
+ - **Baseline Comparisons**: Shows baseline models (Qwen2-VL, UI-TARS) alongside evaluated models
23
+ - **UI Type Breakdown**: For ScreenSpot datasets, shows performance by:
24
+ - Desktop vs Web
25
+ - Text vs Icon elements
26
+ - Overall averages
27
+ - **Model Details**: View training loss, checkpoint steps, and evaluation timestamps
28
+ - **Raw Data Access**: Inspect the complete evaluation results JSON
29
+
30
+ ## Installation
31
+
32
+ 1. Clone or download this directory
33
+ 2. Install dependencies:
34
+ ```bash
35
+ pip install -r requirements.txt
36
+ ```
37
+
38
+ ## Running the App
39
+
40
+ ```bash
41
+ streamlit run src/streamlit_app.py
42
+ ```
43
+
44
+ The app will open in your browser at `http://localhost:8501`
45
+
46
+ ## Usage
47
+
48
+ 1. **Select Dataset**: Use the sidebar to choose which benchmark dataset to view (e.g., screenspot-v2, screenspot-pro)
49
+
50
+ 2. **Filter Models**: Optionally filter to view a specific model or all models
51
+
52
+ 3. **View Charts**: The main page displays:
53
+ - Overall metrics (number of models, best accuracy, total samples)
54
+ - Bar charts comparing performance across different UI types
55
+ - Baseline model comparisons (shown in orange)
56
+
57
+ 4. **Explore Details**:
58
+ - Expand "Model Details" to see training metadata
59
+ - Expand "Detailed UI Type Breakdown" for a comprehensive table
60
+ - Expand "Raw Data" to inspect the complete JSON results
61
+
62
+ ## Data Source
63
+
64
+ The app fetches data from the HuggingFace dataset repository:
65
+ - Repository: `mlfoundations-cua-dev/leaderboard`
66
+ - Path: `grounding/[dataset_name]/[model_results].json`
67
+
68
+ ## Supported Datasets
69
+
70
+ - **ScreenSpot-v2**: Web and desktop UI element grounding
71
+ - **ScreenSpot-Pro**: Professional UI grounding benchmark
72
+ - **ShowdownClicks**: Click prediction benchmark
73
+ - And more as they are added to the leaderboard
74
+
75
+ ## Baseline Models
76
+
77
+ For ScreenSpot-v2, the following baselines are included:
78
+ - Qwen2-VL-7B
79
+ - UI-TARS-2B
80
+ - UI-TARS-7B
81
+ - UI-TARS-72B
82
+
83
+ ## Caching
84
+
85
+ Results are cached for 5 minutes to improve performance. The cache automatically refreshes to show new evaluation results.
requirements.txt CHANGED
@@ -1,3 +1,5 @@
1
- altair
2
- pandas
3
- streamlit
 
 
 
1
+ streamlit>=1.28.0
2
+ pandas>=1.5.0
3
+ altair>=5.0.0
4
+ huggingface-hub>=0.19.0
5
+ numpy>=1.24.0
src/streamlit_app.py CHANGED
@@ -1,40 +1,354 @@
 
 
1
  import altair as alt
 
 
 
 
 
2
  import numpy as np
3
- import pandas as pd
4
- import streamlit as st
5
 
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
  import altair as alt
4
+ from huggingface_hub import HfApi, hf_hub_download
5
+ import json
6
+ from pathlib import Path
7
+ import os
8
+ from typing import Dict, List, Optional
9
  import numpy as np
 
 
10
 
11
# Page config
# Sets the browser-tab title/icon and switches the page to the wide layout.
st.set_page_config(
    page_title="Grounding Benchmark Leaderboard",
    page_icon="🎯",
    layout="wide"
)

# Constants
# HuggingFace dataset repository that fetch_leaderboard_data() lists and
# downloads result files from.
REPO_ID = "mlfoundations-cua-dev/leaderboard"
# Only ".json" files whose repo path starts with this folder are loaded.
GROUNDING_PATH = "grounding"

# Baselines for different datasets
# Layout: dataset name -> baseline model name -> {metric key: score}.
# create_bar_chart() looks a dataset up here (via the 'selected_dataset'
# session-state entry) and plots the scores next to evaluated models on a
# 0-100 "Score (%)" axis.
# NOTE(review): the figures are hard-coded and their source is not shown in
# this file -- confirm them against the published benchmark tables.
BASELINES = {
    "screenspot-v2": {
        "Qwen2-VL-7B": {
            "desktop_text": 52.01,
            "desktop_icon": 44.98,
            "web_text": 33.04,
            "web_icon": 21.84,
            "overall": 37.96
        },
        "UI-TARS-2B": {
            "desktop_text": 90.7,
            "desktop_icon": 68.6,
            "web_text": 87.2,
            "web_icon": 84.7,
            "overall": 82.8
        },
        "UI-TARS-7B": {
            "desktop_text": 95.4,
            "desktop_icon": 87.8,
            "web_text": 93.8,
            "web_icon": 91.6,
            "overall": 92.2
        },
        "UI-TARS-72B": {
            "desktop_text": 91.2,
            "desktop_icon": 87.8,
            "web_text": 87.7,
            "web_icon": 86.3,
            "overall": 88.3
        }
    }
}
55
+
56
@st.cache_data(ttl=300)  # Cache for 5 minutes
def fetch_leaderboard_data():
    """Fetch all grounding results from the HuggingFace leaderboard.

    Lists the files of the ``REPO_ID`` dataset repository, downloads every
    ``.json`` file located under ``GROUNDING_PATH/`` and flattens each one
    into a single record.

    Returns:
        pd.DataFrame: One row per successfully parsed result file with the
        columns ``dataset``, ``model``, ``model_path``, ``overall_accuracy``
        (percentage), ``total_samples``, ``timestamp``, ``checkpoint_steps``,
        ``training_loss``, ``ui_type_results``, ``dataset_type_results`` and
        ``raw_data``. The frame is empty when the repository listing fails or
        no file could be parsed.

    Errors are never raised to the caller: a file that cannot be downloaded
    or parsed is reported with ``st.warning`` and skipped, and any other
    failure is reported with ``st.error``.
    """
    api = HfApi()

    try:
        # List all files in the repo and keep the JSON results under grounding/
        files = api.list_repo_files(repo_id=REPO_ID, repo_type="dataset")
        prefix = f"{GROUNDING_PATH}/"
        grounding_files = [
            f for f in files if f.startswith(prefix) and f.endswith(".json")
        ]

        results = []
        for file_path in grounding_files:
            try:
                # Download (or reuse the locally cached copy of) the file
                local_path = hf_hub_download(
                    repo_id=REPO_ID,
                    filename=file_path,
                    repo_type="dataset"
                )

                # JSON is UTF-8; do not depend on the platform's locale default.
                with open(local_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                # `or {}` (instead of a .get default) also covers keys that are
                # present but null, which would otherwise break the lookups below.
                metadata = data.get("metadata") or {}
                metrics = data.get("metrics") or {}
                detailed_results = data.get("detailed_results") or {}

                # Path layout: grounding/<dataset>/<results file>.json
                path_parts = file_path.split('/')
                dataset_name = path_parts[1] if len(path_parts) > 1 else "unknown"

                # Prefer the checkpoint's last path component as the model
                # name; fall back to the result file name.
                model_checkpoint = metadata.get("model_checkpoint") or ""
                model_name = model_checkpoint.split('/')[-1]
                if not model_name and len(path_parts) > 2:
                    model_name = path_parts[2].replace("results_", "").replace(".json", "")

                results.append({
                    "dataset": dataset_name,
                    "model": model_name,
                    "model_path": model_checkpoint,
                    # Stored as a fraction in the file; shown as a percentage
                    "overall_accuracy": (metrics.get("accuracy") or 0) * 100,
                    "total_samples": metrics.get("total") or 0,
                    "timestamp": metadata.get("evaluation_timestamp", ""),
                    "checkpoint_steps": metadata.get("checkpoint_steps"),
                    "training_loss": metadata.get("training_loss"),
                    # Always dicts, so downstream code can call .get() on them
                    "ui_type_results": detailed_results.get("by_ui_type") or {},
                    "dataset_type_results": detailed_results.get("by_dataset_type") or {},
                    "raw_data": data
                })

            except Exception as e:
                # Best effort: one broken file must not hide the others.
                st.warning(f"Error loading {file_path}: {str(e)}")
                continue

        return pd.DataFrame(results)

    except Exception as e:
        st.error(f"Error fetching leaderboard data: {str(e)}")
        return pd.DataFrame()
120
+
121
def parse_ui_type_metrics(df: pd.DataFrame, dataset_filter: str) -> pd.DataFrame:
    """Parse UI type metrics from the results dataframe.

    Args:
        df: Results frame with (at least) the columns ``dataset``, ``model``,
            ``ui_type_results`` and ``overall_accuracy``. ``ui_type_results``
            is expected to map ``desktop_text`` / ``desktop_icon`` /
            ``web_text`` / ``web_icon`` to ``{"correct": int, "total": int}``.
        dataset_filter: Dataset name; only rows of this dataset are used.

    Returns:
        One row per matching model with the per-bucket accuracies
        (``desktop_text``, ``desktop_icon``, ``web_text``, ``web_icon``), the
        group averages (``desktop_avg``, ``web_avg``, ``text_avg``,
        ``icon_avg``) and ``overall``, all in percent. The frame is empty when
        ``dataset_filter`` does not contain ``"screenspot"`` or no row matches.

    A bucket without samples is reported as ``0.0`` but is left out of the
    group averages, so a missing bucket no longer halves the average of the
    bucket that does have data.
    """
    # Only ScreenSpot datasets carry the desktop/web x text/icon breakdown.
    if 'screenspot' not in dataset_filter.lower():
        return pd.DataFrame([])

    def _bucket_accuracy(ui_results, key):
        """Return ``(accuracy_percent, has_samples)`` for one UI-type bucket."""
        bucket = ui_results.get(key) or {}
        total = bucket.get('total') or 0
        if total <= 0:
            return 0.0, False
        return (bucket.get('correct') or 0) / total * 100, True

    def _mean_of_present(*buckets):
        """Average the accuracies of the buckets that actually have samples."""
        present = [accuracy for accuracy, has_samples in buckets if has_samples]
        return sum(present) / len(present) if present else 0.0

    metrics_list = []
    for _, row in df.iterrows():
        if row['dataset'] != dataset_filter:
            continue

        ui_results = row['ui_type_results']
        if not isinstance(ui_results, dict):
            # Missing breakdown (None/NaN): treat as "no buckets".
            ui_results = {}

        desktop_text = _bucket_accuracy(ui_results, 'desktop_text')
        desktop_icon = _bucket_accuracy(ui_results, 'desktop_icon')
        web_text = _bucket_accuracy(ui_results, 'web_text')
        web_icon = _bucket_accuracy(ui_results, 'web_icon')

        metrics_list.append({
            'model': row['model'],
            'desktop_text': desktop_text[0],
            'desktop_icon': desktop_icon[0],
            'web_text': web_text[0],
            'web_icon': web_icon[0],
            'desktop_avg': _mean_of_present(desktop_text, desktop_icon),
            'web_avg': _mean_of_present(web_text, web_icon),
            'text_avg': _mean_of_present(desktop_text, web_text),
            'icon_avg': _mean_of_present(desktop_icon, web_icon),
            'overall': row['overall_accuracy']
        })

    return pd.DataFrame(metrics_list)
160
+
161
# Component metrics that make up each averaged metric; used to derive a
# baseline's average from its per-bucket scores.
_BASELINE_AVG_COMPONENTS = {
    'desktop_avg': ('desktop_text', 'desktop_icon'),
    'web_avg': ('web_text', 'web_icon'),
    'text_avg': ('desktop_text', 'web_text'),
    'icon_avg': ('desktop_icon', 'web_icon'),
}


def _baseline_score(baseline_metrics, metric):
    """Return a baseline's score for ``metric``, or ``None`` if unavailable.

    A metric stored directly in ``baseline_metrics`` is returned as is
    (``'avg'`` is accepted as an alias of ``'overall'``). An ``*_avg`` metric
    is the unweighted mean of its component scores and is only produced when
    every component is present.
    """
    if metric == 'avg':
        metric = 'overall'
    if metric in baseline_metrics:
        return baseline_metrics[metric]

    components = _BASELINE_AVG_COMPONENTS.get(metric)
    if not components or any(key not in baseline_metrics for key in components):
        return None
    return sum(baseline_metrics[key] for key in components) / len(components)


def create_bar_chart(data: pd.DataFrame, metric: str, title: str):
    """Create a bar chart for a specific metric.

    Args:
        data: Frame with a ``model`` column and, optionally, a ``metric``
            column holding scores in percent.
        metric: Column of ``data`` to plot (e.g. ``'overall'``,
            ``'desktop_avg'``).
        title: Chart title.

    Returns:
        A layered Altair chart (bars plus value labels), or ``None`` when
        there is nothing to plot.

    Evaluated models with a score of 0 or less are omitted. Baselines are
    taken from ``BASELINES`` for the dataset stored under
    ``st.session_state['selected_dataset']``; for ``*_avg`` metrics the
    baseline score is derived from its component metrics, so baselines appear
    on every chart and not only on the overall one.
    """
    # Prepare data for the chart
    chart_data = []

    # Add model results
    for _, row in data.iterrows():
        if metric in row and row[metric] > 0:
            chart_data.append({
                'Model': row['model'],
                'Score': row[metric],
                'Type': 'Evaluated'
            })

    # Add baselines if available
    dataset = st.session_state.get('selected_dataset', '')
    for baseline_name, baseline_metrics in BASELINES.get(dataset, {}).items():
        score = _baseline_score(baseline_metrics, metric)
        if score is not None:
            chart_data.append({
                'Model': baseline_name,
                'Score': score,
                'Type': 'Baseline'
            })

    if not chart_data:
        return None

    df_chart = pd.DataFrame(chart_data)

    # Create the bar chart, best score first
    chart = alt.Chart(df_chart).mark_bar().encode(
        x=alt.X('Model:N',
                sort=alt.EncodingSortField(field='Score', order='descending'),
                axis=alt.Axis(labelAngle=-45)),
        y=alt.Y('Score:Q',
                scale=alt.Scale(domain=[0, 100]),
                axis=alt.Axis(title='Score (%)')),
        color=alt.Color('Type:N',
                        scale=alt.Scale(domain=['Evaluated', 'Baseline'],
                                        range=['#4ECDC4', '#FFA726'])),
        tooltip=['Model', 'Score', 'Type']
    ).properties(
        title=title,
        width=400,
        height=300
    )

    # Add value labels just above each bar
    text = chart.mark_text(
        align='center',
        baseline='bottom',
        dy=-5
    ).encode(
        text=alt.Text('Score:Q', format='.1f')
    )

    return chart + text
220
+
221
def main():
    """Render the leaderboard page.

    Loads the cached leaderboard data, offers dataset/model filters in the
    sidebar, shows the headline metrics, then either the per-UI-type charts
    (ScreenSpot datasets) or a single accuracy chart (any other dataset),
    followed by the "Model Details" and "Raw Data" expanders.
    """
    st.title("🎯 Grounding Benchmark Leaderboard")
    st.markdown("Visualization of model performance on grounding benchmarks")

    # Load everything up front; nothing else can be rendered without data.
    with st.spinner("Loading leaderboard data..."):
        df = fetch_leaderboard_data()

    if df.empty:
        st.warning("No data available in the leaderboard.")
        return

    # --- Sidebar: dataset and (optional) model selection -------------------
    st.sidebar.header("Filters")

    dataset_options = sorted(df['dataset'].unique())
    selected_dataset = st.sidebar.selectbox("Select Dataset", dataset_options)
    # create_bar_chart() reads this entry to pick the matching baselines.
    st.session_state['selected_dataset'] = selected_dataset

    filtered_df = df[df['dataset'] == selected_dataset]

    model_options = ['All'] + sorted(filtered_df['model'].unique())
    selected_model = st.sidebar.selectbox("Select Model", model_options)

    if selected_model != 'All':
        filtered_df = filtered_df[filtered_df['model'] == selected_model]

    # --- Headline metrics --------------------------------------------------
    st.header(f"Results for {selected_dataset}")

    count_col, best_col, samples_col = st.columns(3)
    with count_col:
        st.metric("Models Evaluated", len(filtered_df))
    with best_col:
        if not filtered_df.empty:
            accuracies = filtered_df['overall_accuracy']
            best_acc = accuracies.max()
            # First model that reaches the best accuracy.
            best_model = filtered_df[accuracies == best_acc]['model'].iloc[0]
            st.metric("Best Overall Accuracy", f"{best_acc:.1f}%", help=f"Model: {best_model}")
    with samples_col:
        total_samples = filtered_df['total_samples'].sum()
        st.metric("Total Samples Evaluated", f"{total_samples:,}")

    # --- Charts ------------------------------------------------------------
    ui_metrics_df = parse_ui_type_metrics(filtered_df, selected_dataset)
    is_screenspot = 'screenspot' in selected_dataset.lower()

    if not ui_metrics_df.empty and is_screenspot:
        st.subheader("Performance by UI Type")

        # (metric column, chart title) pairs, per grid column, in render order.
        left_specs = (
            ('overall', 'Overall Average'),
            ('desktop_avg', 'Desktop Average'),
            ('text_avg', 'Text Average (UI-Type)'),
        )
        right_specs = (
            ('web_avg', 'Web Average'),
            ('icon_avg', 'Icon Average (UI-Type)'),
        )

        left_col, right_col = st.columns(2)
        for grid_col, specs in ((left_col, left_specs), (right_col, right_specs)):
            with grid_col:
                for metric_name, chart_title in specs:
                    chart = create_bar_chart(ui_metrics_df, metric_name, chart_title)
                    if chart:
                        st.altair_chart(chart, use_container_width=True)

        # Per-bucket numbers as a formatted table
        with st.expander("Detailed UI Type Breakdown"):
            detailed_metrics = [
                {
                    'Model': row['model'],
                    'Desktop Text': f"{row['desktop_text']:.1f}%",
                    'Desktop Icon': f"{row['desktop_icon']:.1f}%",
                    'Web Text': f"{row['web_text']:.1f}%",
                    'Web Icon': f"{row['web_icon']:.1f}%",
                    'Overall': f"{row['overall']:.1f}%"
                }
                for _, row in ui_metrics_df.iterrows()
            ]

            if detailed_metrics:
                st.dataframe(pd.DataFrame(detailed_metrics), use_container_width=True)

    else:
        # Datasets without a UI-type breakdown get one plain accuracy chart.
        st.subheader("Model Performance")

        chart_data = filtered_df[['model', 'overall_accuracy']].rename(
            columns={'model': 'Model', 'overall_accuracy': 'Accuracy'}
        )

        bars = alt.Chart(chart_data).mark_bar().encode(
            x=alt.X('Model:N', sort='-y', axis=alt.Axis(labelAngle=-45)),
            y=alt.Y('Accuracy:Q', scale=alt.Scale(domain=[0, 100])),
            tooltip=['Model', 'Accuracy']
        )
        chart = bars.properties(width=800, height=400)

        st.altair_chart(chart, use_container_width=True)

    # --- Model details table -----------------------------------------------
    with st.expander("Model Details"):
        detail_columns = {
            'model': 'Model',
            'overall_accuracy': 'Accuracy (%)',
            'total_samples': 'Samples',
            'checkpoint_steps': 'Checkpoint Steps',
            'training_loss': 'Training Loss',
            'timestamp': 'Timestamp',
        }
        display_df = filtered_df[list(detail_columns)].rename(columns=detail_columns)

        def _format_loss(value):
            # Missing losses are shown as a placeholder instead of NaN/None.
            if pd.notna(value):
                return f"{value:.4f}"
            return "N/A"

        display_df['Accuracy (%)'] = display_df['Accuracy (%)'].apply("{:.2f}".format)
        display_df['Training Loss'] = display_df['Training Loss'].apply(_format_loss)
        st.dataframe(display_df, use_container_width=True)

    # --- Raw data viewer ---------------------------------------------------
    with st.expander("Raw Data"):
        # The raw JSON is only shown when exactly one result is selected.
        if selected_model == 'All' or len(filtered_df) != 1:
            st.info("Select a specific model to view raw data")
        else:
            st.json(filtered_df.iloc[0]['raw_data'])


if __name__ == "__main__":
    main()