Spaces:

mlfoundations-cua-dev
/

leaderboard-viewer

Running

App Files Files Community

Anas Awadalla committed on Jul 24

Commit

2dbb46e

1 Parent(s): 2a7516c

try streaming

Browse files

Files changed (2) hide show

README.md +13 -5
src/streamlit_app.py +64 -20

README.md CHANGED Viewed

@@ -17,7 +17,7 @@ A Streamlit application for visualizing model performance on grounding benchmark
 ## Features
-- **Real-time Data**: Fetches results directly from the HuggingFace leaderboard repository
 - **Interactive Visualizations**: Bar charts comparing model performance across different metrics
 - **Baseline Comparisons**: Shows baseline models (Qwen2-VL, UI-TARS) alongside evaluated models
 - **UI Type Breakdown**: For ScreenSpot datasets, shows performance by:
@@ -25,7 +25,7 @@ A Streamlit application for visualizing model performance on grounding benchmark
   - Text vs Icon elements
   - Overall averages
 - **Model Details**: View training loss, checkpoint steps, and evaluation timestamps
-- **Raw Data Access**: Inspect the complete evaluation results JSON
 ## Installation
@@ -57,14 +57,22 @@ The app will open in your browser at `http://localhost:8501`
 4. **Explore Details**:
    - Expand "Model Details" to see training metadata
    - Expand "Detailed UI Type Breakdown" for a comprehensive table
-   - Expand "Raw Data" to inspect the complete JSON results
 ## Data Source
-The app fetches data from the HuggingFace dataset repository:
 - Repository: `mlfoundations-cua-dev/leaderboard`
 - Path: `grounding/[dataset_name]/[model_results].json`
 ## Supported Datasets
 - **ScreenSpot-v2**: Web and desktop UI element grounding
@@ -82,4 +90,4 @@ For ScreenSpot-v2, the following baselines are included:
 ## Caching
-Results are cached for 5 minutes to improve performance. The cache automatically refreshes to show new evaluation results.

 ## Features
+- **Real-time Data**: Streams results directly from the HuggingFace leaderboard repository without local storage
 - **Interactive Visualizations**: Bar charts comparing model performance across different metrics
 - **Baseline Comparisons**: Shows baseline models (Qwen2-VL, UI-TARS) alongside evaluated models
 - **UI Type Breakdown**: For ScreenSpot datasets, shows performance by:
   - Text vs Icon elements
   - Overall averages
 - **Model Details**: View training loss, checkpoint steps, and evaluation timestamps
+- **Sample Results**: Inspect the first 5 evaluation samples for each model
 ## Installation
 4. **Explore Details**:
    - Expand "Model Details" to see training metadata
    - Expand "Detailed UI Type Breakdown" for a comprehensive table
+   - Expand "Sample Results" to see the first 5 evaluation samples
 ## Data Source
+The app streams data directly from the HuggingFace dataset repository:
 - Repository: `mlfoundations-cua-dev/leaderboard`
 - Path: `grounding/[dataset_name]/[model_results].json`
+## Streaming Approach
+To minimize local storage requirements, the app:
+- Streams JSON files directly from HuggingFace Hub
+- Extracts only the necessary data for visualization
+- Discards the full JSON after processing
+- Caches the extracted data in memory for 5 minutes
 ## Supported Datasets
 - **ScreenSpot-v2**: Web and desktop UI element grounding
 ## Caching
+Results are cached in memory for 5 minutes to improve performance. The cache automatically refreshes to show new evaluation results.

src/streamlit_app.py CHANGED Viewed

@@ -5,7 +5,7 @@ os.environ["HF_HOME"] = "src/data_cache"
 import streamlit as st
 import pandas as pd
 import altair as alt
-from huggingface_hub import HfApi, hf_hub_download
 import json
 from pathlib import Path
 from typing import Dict, List, Optional
@@ -58,8 +58,9 @@ BASELINES = {
 @st.cache_data(ttl=300)  # Cache for 5 minutes
 def fetch_leaderboard_data():
-    """Fetch all grounding results from HuggingFace leaderboard."""
     api = HfApi()
     try:
         # List all files in the grounding directory
@@ -67,19 +68,26 @@ def fetch_leaderboard_data():
         grounding_files = [f for f in files if f.startswith(f"{GROUNDING_PATH}/") and f.endswith(".json")]
         results = []
-        for file_path in grounding_files:
             try:
-                # Download and parse each JSON file
-                local_path = hf_hub_download(
-                    repo_id=REPO_ID,
-                    filename=file_path,
-                    repo_type="dataset"
-                )
-                with open(local_path, 'r') as f:
                     data = json.load(f)
-                # Extract key information
                 metadata = data.get("metadata", {})
                 metrics = data.get("metrics", {})
                 detailed_results = data.get("detailed_results", {})
@@ -89,18 +97,30 @@ def fetch_leaderboard_data():
                 dataset_name = path_parts[1] if len(path_parts) > 1 else "unknown"
                 # Get model name from metadata or path
-                model_name = metadata.get("model_checkpoint", "").split('/')[-1]
                 if not model_name and len(path_parts) > 2:
-                    model_name = path_parts[2].replace("results_", "").replace(".json", "")
                 # Extract UI type results if available
                 ui_type_results = detailed_results.get("by_ui_type", {})
                 dataset_type_results = detailed_results.get("by_dataset_type", {})
-                results.append({
                     "dataset": dataset_name,
                     "model": model_name,
-                    "model_path": metadata.get("model_checkpoint", ""),
                     "overall_accuracy": metrics.get("accuracy", 0) * 100,  # Convert to percentage
                     "total_samples": metrics.get("total", 0),
                     "timestamp": metadata.get("evaluation_timestamp", ""),
@@ -108,13 +128,23 @@ def fetch_leaderboard_data():
                     "training_loss": metadata.get("training_loss"),
                     "ui_type_results": ui_type_results,
                     "dataset_type_results": dataset_type_results,
-                    "raw_data": data
-                })
             except Exception as e:
                 st.warning(f"Error loading {file_path}: {str(e)}")
                 continue
         return pd.DataFrame(results)
     except Exception as e:
@@ -347,11 +377,25 @@ def main():
         st.dataframe(display_df, use_container_width=True)
     # Raw data viewer
-    with st.expander("Raw Data"):
         if selected_model != 'All' and len(filtered_df) == 1:
-            st.json(filtered_df.iloc[0]['raw_data'])
         else:
-            st.info("Select a specific model to view raw data")
 if __name__ == "__main__":
     main()

 import streamlit as st
 import pandas as pd
 import altair as alt
+from huggingface_hub import HfApi, HfFileSystem
 import json
 from pathlib import Path
 from typing import Dict, List, Optional
 @st.cache_data(ttl=300)  # Cache for 5 minutes
 def fetch_leaderboard_data():
+    """Fetch all grounding results from HuggingFace leaderboard by streaming JSON files."""
     api = HfApi()
+    fs = HfFileSystem()
     try:
         # List all files in the grounding directory
         grounding_files = [f for f in files if f.startswith(f"{GROUNDING_PATH}/") and f.endswith(".json")]
         results = []
+        # Create progress bar for loading
+        progress_bar = st.progress(0)
+        status_text = st.empty()
+        for idx, file_path in enumerate(grounding_files):
             try:
+                # Update progress
+                progress = (idx + 1) / len(grounding_files)
+                progress_bar.progress(progress)
+                status_text.text(f"Loading {idx + 1}/{len(grounding_files)} files...")
+                # Stream the JSON file content directly from HuggingFace
+                file_url = f"datasets/{REPO_ID}/{file_path}"
+                # Read the file content directly without downloading
+                with fs.open(file_url, 'r') as f:
                     data = json.load(f)
+                # Extract only the necessary information
                 metadata = data.get("metadata", {})
                 metrics = data.get("metrics", {})
                 detailed_results = data.get("detailed_results", {})
                 dataset_name = path_parts[1] if len(path_parts) > 1 else "unknown"
                 # Get model name from metadata or path
+                model_checkpoint = metadata.get("model_checkpoint", "")
+                model_name = model_checkpoint.split('/')[-1]
+                # Handle checkpoint names
                 if not model_name and len(path_parts) > 2:
+                    # Check if it's a checkpoint subdirectory structure
+                    if len(path_parts) > 3 and path_parts[2] != path_parts[3]:
+                        # Format: grounding/dataset/base_model/checkpoint.json
+                        base_model = path_parts[2]
+                        checkpoint_file = path_parts[3].replace(".json", "")
+                        model_name = f"{base_model}/{checkpoint_file}"
+                    else:
+                        # Regular format: grounding/dataset/results_modelname.json
+                        model_name = path_parts[2].replace("results_", "").replace(".json", "")
                 # Extract UI type results if available
                 ui_type_results = detailed_results.get("by_ui_type", {})
                 dataset_type_results = detailed_results.get("by_dataset_type", {})
+                # Create a compact result entry (only keep what we need for visualization)
+                result_entry = {
                     "dataset": dataset_name,
                     "model": model_name,
+                    "model_path": model_checkpoint,
                     "overall_accuracy": metrics.get("accuracy", 0) * 100,  # Convert to percentage
                     "total_samples": metrics.get("total", 0),
                     "timestamp": metadata.get("evaluation_timestamp", ""),
                     "training_loss": metadata.get("training_loss"),
                     "ui_type_results": ui_type_results,
                     "dataset_type_results": dataset_type_results,
+                    # Store minimal sample results for inspection
+                    "sample_results_summary": {
+                        "total_samples": len(data.get("sample_results", [])),
+                        "first_5_samples": data.get("sample_results", [])[:5]
+                    }
+                }
+                results.append(result_entry)
             except Exception as e:
                 st.warning(f"Error loading {file_path}: {str(e)}")
                 continue
+        # Clear progress indicators
+        progress_bar.empty()
+        status_text.empty()
         return pd.DataFrame(results)
     except Exception as e:
         st.dataframe(display_df, use_container_width=True)
     # Raw data viewer
+    with st.expander("Sample Results"):
         if selected_model != 'All' and len(filtered_df) == 1:
+            summary = filtered_df.iloc[0]['sample_results_summary']
+            st.write(f"**Total evaluation samples:** {summary['total_samples']}")
+            st.write("**First 5 sample results:**")
+            for i, sample in enumerate(summary['first_5_samples'], 1):
+                st.write(f"\n**Sample {i}:**")
+                col1, col2 = st.columns([1, 2])
+                with col1:
+                    st.write(f"- **Correct:** {'✅' if sample.get('is_correct') else '❌'}")
+                    st.write(f"- **Image:** {sample.get('img_filename', 'N/A')}")
+                with col2:
+                    st.write(f"- **Instruction:** {sample.get('instruction', 'N/A')}")
+                    if sample.get('predicted_click'):
+                        st.write(f"- **Predicted Click:** {sample['predicted_click']}")
+                    if sample.get('error_msg'):
+                        st.write(f"- **Error:** {sample['error_msg']}")
         else:
+            st.info("Select a specific model to view sample results")
 if __name__ == "__main__":
     main()