Anas Awadalla committed on
Commit
2dbb46e
·
1 Parent(s): 2a7516c

try streaming

Browse files
Files changed (2) hide show
  1. README.md +13 -5
  2. src/streamlit_app.py +64 -20
README.md CHANGED
@@ -17,7 +17,7 @@ A Streamlit application for visualizing model performance on grounding benchmark
17
 
18
  ## Features
19
 
20
- - **Real-time Data**: Fetches results directly from the HuggingFace leaderboard repository
21
  - **Interactive Visualizations**: Bar charts comparing model performance across different metrics
22
  - **Baseline Comparisons**: Shows baseline models (Qwen2-VL, UI-TARS) alongside evaluated models
23
  - **UI Type Breakdown**: For ScreenSpot datasets, shows performance by:
@@ -25,7 +25,7 @@ A Streamlit application for visualizing model performance on grounding benchmark
25
  - Text vs Icon elements
26
  - Overall averages
27
  - **Model Details**: View training loss, checkpoint steps, and evaluation timestamps
28
- - **Raw Data Access**: Inspect the complete evaluation results JSON
29
 
30
  ## Installation
31
 
@@ -57,14 +57,22 @@ The app will open in your browser at `http://localhost:8501`
57
  4. **Explore Details**:
58
  - Expand "Model Details" to see training metadata
59
  - Expand "Detailed UI Type Breakdown" for a comprehensive table
60
- - Expand "Raw Data" to inspect the complete JSON results
61
 
62
  ## Data Source
63
 
64
- The app fetches data from the HuggingFace dataset repository:
65
  - Repository: `mlfoundations-cua-dev/leaderboard`
66
  - Path: `grounding/[dataset_name]/[model_results].json`
67
 
 
 
 
 
 
 
 
 
68
  ## Supported Datasets
69
 
70
  - **ScreenSpot-v2**: Web and desktop UI element grounding
@@ -82,4 +90,4 @@ For ScreenSpot-v2, the following baselines are included:
82
 
83
  ## Caching
84
 
85
- Results are cached for 5 minutes to improve performance. The cache automatically refreshes to show new evaluation results.
 
17
 
18
  ## Features
19
 
20
+ - **Real-time Data**: Streams results directly from the HuggingFace leaderboard repository without local storage
21
  - **Interactive Visualizations**: Bar charts comparing model performance across different metrics
22
  - **Baseline Comparisons**: Shows baseline models (Qwen2-VL, UI-TARS) alongside evaluated models
23
  - **UI Type Breakdown**: For ScreenSpot datasets, shows performance by:
 
25
  - Text vs Icon elements
26
  - Overall averages
27
  - **Model Details**: View training loss, checkpoint steps, and evaluation timestamps
28
+ - **Sample Results**: Inspect the first 5 evaluation samples for each model
29
 
30
  ## Installation
31
 
 
57
  4. **Explore Details**:
58
  - Expand "Model Details" to see training metadata
59
  - Expand "Detailed UI Type Breakdown" for a comprehensive table
60
+ - Expand "Sample Results" to see the first 5 evaluation samples
61
 
62
  ## Data Source
63
 
64
+ The app streams data directly from the HuggingFace dataset repository:
65
  - Repository: `mlfoundations-cua-dev/leaderboard`
66
  - Path: `grounding/[dataset_name]/[model_results].json`
67
 
68
+ ## Streaming Approach
69
+
70
+ To minimize local storage requirements, the app:
71
+ - Streams JSON files directly from HuggingFace Hub
72
+ - Extracts only the necessary data for visualization
73
+ - Discards the full JSON after processing
74
+ - Caches the extracted data in memory for 5 minutes
75
+
76
  ## Supported Datasets
77
 
78
  - **ScreenSpot-v2**: Web and desktop UI element grounding
 
90
 
91
  ## Caching
92
 
93
+ Results are cached in memory for 5 minutes to improve performance. The cache automatically refreshes to show new evaluation results.
src/streamlit_app.py CHANGED
@@ -5,7 +5,7 @@ os.environ["HF_HOME"] = "src/data_cache"
5
  import streamlit as st
6
  import pandas as pd
7
  import altair as alt
8
- from huggingface_hub import HfApi, hf_hub_download
9
  import json
10
  from pathlib import Path
11
  from typing import Dict, List, Optional
@@ -58,8 +58,9 @@ BASELINES = {
58
 
59
  @st.cache_data(ttl=300) # Cache for 5 minutes
60
  def fetch_leaderboard_data():
61
- """Fetch all grounding results from HuggingFace leaderboard."""
62
  api = HfApi()
 
63
 
64
  try:
65
  # List all files in the grounding directory
@@ -67,19 +68,26 @@ def fetch_leaderboard_data():
67
  grounding_files = [f for f in files if f.startswith(f"{GROUNDING_PATH}/") and f.endswith(".json")]
68
 
69
  results = []
70
- for file_path in grounding_files:
 
 
 
 
 
71
  try:
72
- # Download and parse each JSON file
73
- local_path = hf_hub_download(
74
- repo_id=REPO_ID,
75
- filename=file_path,
76
- repo_type="dataset"
77
- )
78
 
79
- with open(local_path, 'r') as f:
 
 
 
 
80
  data = json.load(f)
81
 
82
- # Extract key information
83
  metadata = data.get("metadata", {})
84
  metrics = data.get("metrics", {})
85
  detailed_results = data.get("detailed_results", {})
@@ -89,18 +97,30 @@ def fetch_leaderboard_data():
89
  dataset_name = path_parts[1] if len(path_parts) > 1 else "unknown"
90
 
91
  # Get model name from metadata or path
92
- model_name = metadata.get("model_checkpoint", "").split('/')[-1]
 
 
 
93
  if not model_name and len(path_parts) > 2:
94
- model_name = path_parts[2].replace("results_", "").replace(".json", "")
 
 
 
 
 
 
 
 
95
 
96
  # Extract UI type results if available
97
  ui_type_results = detailed_results.get("by_ui_type", {})
98
  dataset_type_results = detailed_results.get("by_dataset_type", {})
99
 
100
- results.append({
 
101
  "dataset": dataset_name,
102
  "model": model_name,
103
- "model_path": metadata.get("model_checkpoint", ""),
104
  "overall_accuracy": metrics.get("accuracy", 0) * 100, # Convert to percentage
105
  "total_samples": metrics.get("total", 0),
106
  "timestamp": metadata.get("evaluation_timestamp", ""),
@@ -108,13 +128,23 @@ def fetch_leaderboard_data():
108
  "training_loss": metadata.get("training_loss"),
109
  "ui_type_results": ui_type_results,
110
  "dataset_type_results": dataset_type_results,
111
- "raw_data": data
112
- })
 
 
 
 
 
 
113
 
114
  except Exception as e:
115
  st.warning(f"Error loading {file_path}: {str(e)}")
116
  continue
117
 
 
 
 
 
118
  return pd.DataFrame(results)
119
 
120
  except Exception as e:
@@ -347,11 +377,25 @@ def main():
347
  st.dataframe(display_df, use_container_width=True)
348
 
349
  # Raw data viewer
350
- with st.expander("Raw Data"):
351
  if selected_model != 'All' and len(filtered_df) == 1:
352
- st.json(filtered_df.iloc[0]['raw_data'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
353
  else:
354
- st.info("Select a specific model to view raw data")
355
 
356
  if __name__ == "__main__":
357
  main()
 
5
  import streamlit as st
6
  import pandas as pd
7
  import altair as alt
8
+ from huggingface_hub import HfApi, HfFileSystem
9
  import json
10
  from pathlib import Path
11
  from typing import Dict, List, Optional
 
58
 
59
  @st.cache_data(ttl=300) # Cache for 5 minutes
60
  def fetch_leaderboard_data():
61
+ """Fetch all grounding results from HuggingFace leaderboard by streaming JSON files."""
62
  api = HfApi()
63
+ fs = HfFileSystem()
64
 
65
  try:
66
  # List all files in the grounding directory
 
68
  grounding_files = [f for f in files if f.startswith(f"{GROUNDING_PATH}/") and f.endswith(".json")]
69
 
70
  results = []
71
+
72
+ # Create progress bar for loading
73
+ progress_bar = st.progress(0)
74
+ status_text = st.empty()
75
+
76
+ for idx, file_path in enumerate(grounding_files):
77
  try:
78
+ # Update progress
79
+ progress = (idx + 1) / len(grounding_files)
80
+ progress_bar.progress(progress)
81
+ status_text.text(f"Loading {idx + 1}/{len(grounding_files)} files...")
 
 
82
 
83
+ # Stream the JSON file content directly from HuggingFace
84
+ file_url = f"datasets/{REPO_ID}/{file_path}"
85
+
86
+ # Read the file content directly without downloading
87
+ with fs.open(file_url, 'r') as f:
88
  data = json.load(f)
89
 
90
+ # Extract only the necessary information
91
  metadata = data.get("metadata", {})
92
  metrics = data.get("metrics", {})
93
  detailed_results = data.get("detailed_results", {})
 
97
  dataset_name = path_parts[1] if len(path_parts) > 1 else "unknown"
98
 
99
  # Get model name from metadata or path
100
+ model_checkpoint = metadata.get("model_checkpoint", "")
101
+ model_name = model_checkpoint.split('/')[-1]
102
+
103
+ # Handle checkpoint names
104
  if not model_name and len(path_parts) > 2:
105
+ # Check if it's a checkpoint subdirectory structure
106
+ if len(path_parts) > 3 and path_parts[2] != path_parts[3]:
107
+ # Format: grounding/dataset/base_model/checkpoint.json
108
+ base_model = path_parts[2]
109
+ checkpoint_file = path_parts[3].replace(".json", "")
110
+ model_name = f"{base_model}/{checkpoint_file}"
111
+ else:
112
+ # Regular format: grounding/dataset/results_modelname.json
113
+ model_name = path_parts[2].replace("results_", "").replace(".json", "")
114
 
115
  # Extract UI type results if available
116
  ui_type_results = detailed_results.get("by_ui_type", {})
117
  dataset_type_results = detailed_results.get("by_dataset_type", {})
118
 
119
+ # Create a compact result entry (only keep what we need for visualization)
120
+ result_entry = {
121
  "dataset": dataset_name,
122
  "model": model_name,
123
+ "model_path": model_checkpoint,
124
  "overall_accuracy": metrics.get("accuracy", 0) * 100, # Convert to percentage
125
  "total_samples": metrics.get("total", 0),
126
  "timestamp": metadata.get("evaluation_timestamp", ""),
 
128
  "training_loss": metadata.get("training_loss"),
129
  "ui_type_results": ui_type_results,
130
  "dataset_type_results": dataset_type_results,
131
+ # Store minimal sample results for inspection
132
+ "sample_results_summary": {
133
+ "total_samples": len(data.get("sample_results", [])),
134
+ "first_5_samples": data.get("sample_results", [])[:5]
135
+ }
136
+ }
137
+
138
+ results.append(result_entry)
139
 
140
  except Exception as e:
141
  st.warning(f"Error loading {file_path}: {str(e)}")
142
  continue
143
 
144
+ # Clear progress indicators
145
+ progress_bar.empty()
146
+ status_text.empty()
147
+
148
  return pd.DataFrame(results)
149
 
150
  except Exception as e:
 
377
  st.dataframe(display_df, use_container_width=True)
378
 
379
  # Raw data viewer
380
+ with st.expander("Sample Results"):
381
  if selected_model != 'All' and len(filtered_df) == 1:
382
+ summary = filtered_df.iloc[0]['sample_results_summary']
383
+ st.write(f"**Total evaluation samples:** {summary['total_samples']}")
384
+ st.write("**First 5 sample results:**")
385
+ for i, sample in enumerate(summary['first_5_samples'], 1):
386
+ st.write(f"\n**Sample {i}:**")
387
+ col1, col2 = st.columns([1, 2])
388
+ with col1:
389
+ st.write(f"- **Correct:** {'✅' if sample.get('is_correct') else '❌'}")
390
+ st.write(f"- **Image:** {sample.get('img_filename', 'N/A')}")
391
+ with col2:
392
+ st.write(f"- **Instruction:** {sample.get('instruction', 'N/A')}")
393
+ if sample.get('predicted_click'):
394
+ st.write(f"- **Predicted Click:** {sample['predicted_click']}")
395
+ if sample.get('error_msg'):
396
+ st.write(f"- **Error:** {sample['error_msg']}")
397
  else:
398
+ st.info("Select a specific model to view sample results")
399
 
400
  if __name__ == "__main__":
401
  main()