Anas Awadalla
committed on
Commit
·
2dbb46e
1
Parent(s):
2a7516c
try streaming
Browse files
- README.md +13 -5
- src/streamlit_app.py +64 -20
README.md
CHANGED
@@ -17,7 +17,7 @@ A Streamlit application for visualizing model performance on grounding benchmark
|
|
17 |
|
18 |
## Features
|
19 |
|
20 |
-
- **Real-time Data**:
|
21 |
- **Interactive Visualizations**: Bar charts comparing model performance across different metrics
|
22 |
- **Baseline Comparisons**: Shows baseline models (Qwen2-VL, UI-TARS) alongside evaluated models
|
23 |
- **UI Type Breakdown**: For ScreenSpot datasets, shows performance by:
|
@@ -25,7 +25,7 @@ A Streamlit application for visualizing model performance on grounding benchmark
|
|
25 |
- Text vs Icon elements
|
26 |
- Overall averages
|
27 |
- **Model Details**: View training loss, checkpoint steps, and evaluation timestamps
|
28 |
-
- **
|
29 |
|
30 |
## Installation
|
31 |
|
@@ -57,14 +57,22 @@ The app will open in your browser at `http://localhost:8501`
|
|
57 |
4. **Explore Details**:
|
58 |
- Expand "Model Details" to see training metadata
|
59 |
- Expand "Detailed UI Type Breakdown" for a comprehensive table
|
60 |
-
- Expand "
|
61 |
|
62 |
## Data Source
|
63 |
|
64 |
-
The app
|
65 |
- Repository: `mlfoundations-cua-dev/leaderboard`
|
66 |
- Path: `grounding/[dataset_name]/[model_results].json`
|
67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
## Supported Datasets
|
69 |
|
70 |
- **ScreenSpot-v2**: Web and desktop UI element grounding
|
@@ -82,4 +90,4 @@ For ScreenSpot-v2, the following baselines are included:
|
|
82 |
|
83 |
## Caching
|
84 |
|
85 |
-
Results are cached for 5 minutes to improve performance. The cache automatically refreshes to show new evaluation results.
|
|
|
17 |
|
18 |
## Features
|
19 |
|
20 |
+
- **Real-time Data**: Streams results directly from the HuggingFace leaderboard repository without local storage
|
21 |
- **Interactive Visualizations**: Bar charts comparing model performance across different metrics
|
22 |
- **Baseline Comparisons**: Shows baseline models (Qwen2-VL, UI-TARS) alongside evaluated models
|
23 |
- **UI Type Breakdown**: For ScreenSpot datasets, shows performance by:
|
|
|
25 |
- Text vs Icon elements
|
26 |
- Overall averages
|
27 |
- **Model Details**: View training loss, checkpoint steps, and evaluation timestamps
|
28 |
+
- **Sample Results**: Inspect the first 5 evaluation samples for each model
|
29 |
|
30 |
## Installation
|
31 |
|
|
|
57 |
4. **Explore Details**:
|
58 |
- Expand "Model Details" to see training metadata
|
59 |
- Expand "Detailed UI Type Breakdown" for a comprehensive table
|
60 |
+
- Expand "Sample Results" to see the first 5 evaluation samples
|
61 |
|
62 |
## Data Source
|
63 |
|
64 |
+
The app streams data directly from the HuggingFace dataset repository:
|
65 |
- Repository: `mlfoundations-cua-dev/leaderboard`
|
66 |
- Path: `grounding/[dataset_name]/[model_results].json`
|
67 |
|
68 |
+
## Streaming Approach
|
69 |
+
|
70 |
+
To minimize local storage requirements, the app:
|
71 |
+
- Streams JSON files directly from HuggingFace Hub
|
72 |
+
- Extracts only the necessary data for visualization
|
73 |
+
- Discards the full JSON after processing
|
74 |
+
- Caches the extracted data in memory for 5 minutes
|
75 |
+
|
76 |
## Supported Datasets
|
77 |
|
78 |
- **ScreenSpot-v2**: Web and desktop UI element grounding
|
|
|
90 |
|
91 |
## Caching
|
92 |
|
93 |
+
Results are cached in memory for 5 minutes to improve performance. The cache automatically refreshes to show new evaluation results.
|
src/streamlit_app.py
CHANGED
@@ -5,7 +5,7 @@ os.environ["HF_HOME"] = "src/data_cache"
|
|
5 |
import streamlit as st
|
6 |
import pandas as pd
|
7 |
import altair as alt
|
8 |
-
from huggingface_hub import HfApi,
|
9 |
import json
|
10 |
from pathlib import Path
|
11 |
from typing import Dict, List, Optional
|
@@ -58,8 +58,9 @@ BASELINES = {
|
|
58 |
|
59 |
@st.cache_data(ttl=300) # Cache for 5 minutes
|
60 |
def fetch_leaderboard_data():
|
61 |
-
"""Fetch all grounding results from HuggingFace leaderboard."""
|
62 |
api = HfApi()
|
|
|
63 |
|
64 |
try:
|
65 |
# List all files in the grounding directory
|
@@ -67,19 +68,26 @@ def fetch_leaderboard_data():
|
|
67 |
grounding_files = [f for f in files if f.startswith(f"{GROUNDING_PATH}/") and f.endswith(".json")]
|
68 |
|
69 |
results = []
|
70 |
-
|
|
|
|
|
|
|
|
|
|
|
71 |
try:
|
72 |
-
#
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
repo_type="dataset"
|
77 |
-
)
|
78 |
|
79 |
-
|
|
|
|
|
|
|
|
|
80 |
data = json.load(f)
|
81 |
|
82 |
-
# Extract
|
83 |
metadata = data.get("metadata", {})
|
84 |
metrics = data.get("metrics", {})
|
85 |
detailed_results = data.get("detailed_results", {})
|
@@ -89,18 +97,30 @@ def fetch_leaderboard_data():
|
|
89 |
dataset_name = path_parts[1] if len(path_parts) > 1 else "unknown"
|
90 |
|
91 |
# Get model name from metadata or path
|
92 |
-
|
|
|
|
|
|
|
93 |
if not model_name and len(path_parts) > 2:
|
94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
95 |
|
96 |
# Extract UI type results if available
|
97 |
ui_type_results = detailed_results.get("by_ui_type", {})
|
98 |
dataset_type_results = detailed_results.get("by_dataset_type", {})
|
99 |
|
100 |
-
|
|
|
101 |
"dataset": dataset_name,
|
102 |
"model": model_name,
|
103 |
-
"model_path":
|
104 |
"overall_accuracy": metrics.get("accuracy", 0) * 100, # Convert to percentage
|
105 |
"total_samples": metrics.get("total", 0),
|
106 |
"timestamp": metadata.get("evaluation_timestamp", ""),
|
@@ -108,13 +128,23 @@ def fetch_leaderboard_data():
|
|
108 |
"training_loss": metadata.get("training_loss"),
|
109 |
"ui_type_results": ui_type_results,
|
110 |
"dataset_type_results": dataset_type_results,
|
111 |
-
|
112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
113 |
|
114 |
except Exception as e:
|
115 |
st.warning(f"Error loading {file_path}: {str(e)}")
|
116 |
continue
|
117 |
|
|
|
|
|
|
|
|
|
118 |
return pd.DataFrame(results)
|
119 |
|
120 |
except Exception as e:
|
@@ -347,11 +377,25 @@ def main():
|
|
347 |
st.dataframe(display_df, use_container_width=True)
|
348 |
|
349 |
# Raw data viewer
|
350 |
-
with st.expander("
|
351 |
if selected_model != 'All' and len(filtered_df) == 1:
|
352 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
353 |
else:
|
354 |
-
st.info("Select a specific model to view
|
355 |
|
356 |
if __name__ == "__main__":
|
357 |
main()
|
|
|
5 |
import streamlit as st
|
6 |
import pandas as pd
|
7 |
import altair as alt
|
8 |
+
from huggingface_hub import HfApi, HfFileSystem
|
9 |
import json
|
10 |
from pathlib import Path
|
11 |
from typing import Dict, List, Optional
|
|
|
58 |
|
59 |
@st.cache_data(ttl=300) # Cache for 5 minutes
|
60 |
def fetch_leaderboard_data():
|
61 |
+
"""Fetch all grounding results from HuggingFace leaderboard by streaming JSON files."""
|
62 |
api = HfApi()
|
63 |
+
fs = HfFileSystem()
|
64 |
|
65 |
try:
|
66 |
# List all files in the grounding directory
|
|
|
68 |
grounding_files = [f for f in files if f.startswith(f"{GROUNDING_PATH}/") and f.endswith(".json")]
|
69 |
|
70 |
results = []
|
71 |
+
|
72 |
+
# Create progress bar for loading
|
73 |
+
progress_bar = st.progress(0)
|
74 |
+
status_text = st.empty()
|
75 |
+
|
76 |
+
for idx, file_path in enumerate(grounding_files):
|
77 |
try:
|
78 |
+
# Update progress
|
79 |
+
progress = (idx + 1) / len(grounding_files)
|
80 |
+
progress_bar.progress(progress)
|
81 |
+
status_text.text(f"Loading {idx + 1}/{len(grounding_files)} files...")
|
|
|
|
|
82 |
|
83 |
+
# Stream the JSON file content directly from HuggingFace
|
84 |
+
file_url = f"datasets/{REPO_ID}/{file_path}"
|
85 |
+
|
86 |
+
# Read the file content directly without downloading
|
87 |
+
with fs.open(file_url, 'r') as f:
|
88 |
data = json.load(f)
|
89 |
|
90 |
+
# Extract only the necessary information
|
91 |
metadata = data.get("metadata", {})
|
92 |
metrics = data.get("metrics", {})
|
93 |
detailed_results = data.get("detailed_results", {})
|
|
|
97 |
dataset_name = path_parts[1] if len(path_parts) > 1 else "unknown"
|
98 |
|
99 |
# Get model name from metadata or path
|
100 |
+
model_checkpoint = metadata.get("model_checkpoint", "")
|
101 |
+
model_name = model_checkpoint.split('/')[-1]
|
102 |
+
|
103 |
+
# Handle checkpoint names
|
104 |
if not model_name and len(path_parts) > 2:
|
105 |
+
# Check if it's a checkpoint subdirectory structure
|
106 |
+
if len(path_parts) > 3 and path_parts[2] != path_parts[3]:
|
107 |
+
# Format: grounding/dataset/base_model/checkpoint.json
|
108 |
+
base_model = path_parts[2]
|
109 |
+
checkpoint_file = path_parts[3].replace(".json", "")
|
110 |
+
model_name = f"{base_model}/{checkpoint_file}"
|
111 |
+
else:
|
112 |
+
# Regular format: grounding/dataset/results_modelname.json
|
113 |
+
model_name = path_parts[2].replace("results_", "").replace(".json", "")
|
114 |
|
115 |
# Extract UI type results if available
|
116 |
ui_type_results = detailed_results.get("by_ui_type", {})
|
117 |
dataset_type_results = detailed_results.get("by_dataset_type", {})
|
118 |
|
119 |
+
# Create a compact result entry (only keep what we need for visualization)
|
120 |
+
result_entry = {
|
121 |
"dataset": dataset_name,
|
122 |
"model": model_name,
|
123 |
+
"model_path": model_checkpoint,
|
124 |
"overall_accuracy": metrics.get("accuracy", 0) * 100, # Convert to percentage
|
125 |
"total_samples": metrics.get("total", 0),
|
126 |
"timestamp": metadata.get("evaluation_timestamp", ""),
|
|
|
128 |
"training_loss": metadata.get("training_loss"),
|
129 |
"ui_type_results": ui_type_results,
|
130 |
"dataset_type_results": dataset_type_results,
|
131 |
+
# Store minimal sample results for inspection
|
132 |
+
"sample_results_summary": {
|
133 |
+
"total_samples": len(data.get("sample_results", [])),
|
134 |
+
"first_5_samples": data.get("sample_results", [])[:5]
|
135 |
+
}
|
136 |
+
}
|
137 |
+
|
138 |
+
results.append(result_entry)
|
139 |
|
140 |
except Exception as e:
|
141 |
st.warning(f"Error loading {file_path}: {str(e)}")
|
142 |
continue
|
143 |
|
144 |
+
# Clear progress indicators
|
145 |
+
progress_bar.empty()
|
146 |
+
status_text.empty()
|
147 |
+
|
148 |
return pd.DataFrame(results)
|
149 |
|
150 |
except Exception as e:
|
|
|
377 |
st.dataframe(display_df, use_container_width=True)
|
378 |
|
379 |
# Raw data viewer
|
380 |
+
with st.expander("Sample Results"):
|
381 |
if selected_model != 'All' and len(filtered_df) == 1:
|
382 |
+
summary = filtered_df.iloc[0]['sample_results_summary']
|
383 |
+
st.write(f"**Total evaluation samples:** {summary['total_samples']}")
|
384 |
+
st.write("**First 5 sample results:**")
|
385 |
+
for i, sample in enumerate(summary['first_5_samples'], 1):
|
386 |
+
st.write(f"\n**Sample {i}:**")
|
387 |
+
col1, col2 = st.columns([1, 2])
|
388 |
+
with col1:
|
389 |
+
st.write(f"- **Correct:** {'✅' if sample.get('is_correct') else '❌'}")
|
390 |
+
st.write(f"- **Image:** {sample.get('img_filename', 'N/A')}")
|
391 |
+
with col2:
|
392 |
+
st.write(f"- **Instruction:** {sample.get('instruction', 'N/A')}")
|
393 |
+
if sample.get('predicted_click'):
|
394 |
+
st.write(f"- **Predicted Click:** {sample['predicted_click']}")
|
395 |
+
if sample.get('error_msg'):
|
396 |
+
st.write(f"- **Error:** {sample['error_msg']}")
|
397 |
else:
|
398 |
+
st.info("Select a specific model to view sample results")
|
399 |
|
400 |
if __name__ == "__main__":
|
401 |
main()
|