Spaces:

mlfoundations-cua-dev
/

leaderboard-viewer

Running

App Files Community

Anas Awadalla committed on Jul 24

Commit

d37faa6

1 Parent(s): c94fd08

some fixes

Browse files

Files changed (2) hide show

README.md +21 -14
src/streamlit_app.py +29 -79

README.md CHANGED Viewed

@@ -56,7 +56,7 @@ The app will open in your browser at `http://localhost:8501`
      - Individual UI type metrics: Desktop (Text), Desktop (Icon), Web (Text), Web (Icon)
      - Text and Icon averages across environments
    - Baseline model comparisons shown in orange
-   - Models marked with * indicate the best checkpoint is not the final one
 4. **Explore Details**:
    - Expand "Model Details" to see training metadata
@@ -89,19 +89,26 @@ To minimize local storage requirements, the app:
 ## Baseline Models
-For ScreenSpot-v2, the following baselines are included:
-- Qwen2-VL-7B: 37.96%
-- UI-TARS-2B: 82.8%
-- UI-TARS-7B: 92.2%
-- UI-TARS-72B: 88.3%
-For ScreenSpot-Pro, the following baselines are included:
-- Qwen2.5-VL-3B-Instruct: 16.1%
-- Qwen2.5-VL-7B-Instruct: 26.8%
-- Qwen2.5-VL-72B-Instruct: 53.3%
-- UI-TARS-2B: 27.7%
-- UI-TARS-7B: 35.7%
-- UI-TARS-72B: 38.1%
 ## Checkpoint Handling

      - Individual UI type metrics: Desktop (Text), Desktop (Icon), Web (Text), Web (Icon)
      - Text and Icon averages across environments
    - Baseline model comparisons shown in orange
+   - Models marked with * indicate the best checkpoint is not the last one
 4. **Explore Details**:
    - Expand "Model Details" to see training metadata
 ## Baseline Models
+The dashboard includes baseline performance from established models:
+### ScreenSpot-v2 Baselines
+- **Qwen2-VL-7B**: 38.0% overall
+- **UI-TARS-2B**: 82.8% overall
+- **UI-TARS-7B**: 92.2% overall
+- **UI-TARS-72B**: 88.3% overall
+### ScreenSpot-Pro Baselines
+- **Qwen2.5-VL-3B-Instruct**: 16.1% overall
+- **Qwen2.5-VL-7B-Instruct**: 26.8% overall
+- **Qwen2.5-VL-72B-Instruct**: 53.3% overall
+- **UI-TARS-2B**: 27.7% overall
+- **UI-TARS-7B**: 35.7% overall
+- **UI-TARS-72B**: 38.1% overall
+### ShowDown-Clicks Baselines
+- **Qwen2.5-VL-72B-Instruct**: 24.8% overall
+- **UI-TARS-72B-SFT**: 54.4% overall
+- **Molmo-72B-0924**: 54.8% overall
 ## Checkpoint Handling

src/streamlit_app.py CHANGED Viewed

@@ -26,53 +26,30 @@ GROUNDING_PATH = "grounding"
 BASELINES = {
     "screenspot-v2": {
         "Qwen2-VL-7B": {
-            "desktop_text": 52.01,
-            "desktop_icon": 44.98,
-            "web_text": 33.04,
-            "web_icon": 21.84,
-            "overall": 37.96
         },
         "UI-TARS-2B": {
-            "desktop_text": 90.7,
-            "desktop_icon": 68.6,
-            "web_text": 87.2,
-            "web_icon": 84.7,
-            "overall": 82.8
         },
         "UI-TARS-7B": {
-            "desktop_text": 95.4,
-            "desktop_icon": 87.8,
-            "web_text": 93.8,
-            "web_icon": 91.6,
-            "overall": 92.2
         },
         "UI-TARS-72B": {
-            "desktop_text": 91.2,
-            "desktop_icon": 87.8,
-            "web_text": 87.7,
-            "web_icon": 86.3,
-            "overall": 88.3
         }
     },
     "screenspot-pro": {
-        "Qwen2.5-VL-3B-Instruct": {
-            "overall": 16.1
-        },
-        "Qwen2.5-VL-7B-Instruct": {
-            "overall": 26.8
-        },
-        "Qwen2.5-VL-72B-Instruct": {
-            "overall": 53.3
-        },
-        "UI-TARS-2B": {
-            "overall": 27.7
-        },
-        "UI-TARS-7B": {
-            "overall": 35.7
-        },
-        "UI-TARS-72B": {
-            "overall": 38.1
-        }
     }
 }
@@ -472,6 +449,11 @@ def main():
                 'text_avg': 'Text Average',
                 'icon_avg': 'Icon Average'
             }
         else:
             metric_options = {
                 'overall': 'Overall Average',
@@ -499,7 +481,7 @@ def main():
         else:
             st.warning(f"No data available for {metric_options[selected_metric]}")
-        # Show all metrics in an expandable section
         with st.expander("View All Metrics"):
             if selected_dataset == 'screenspot-v2':
                 # First row: Overall, Desktop, Web averages
@@ -556,35 +538,8 @@ def main():
                     if chart:
                         st.altair_chart(chart, use_container_width=True)
             else:
-                # For other screenspot datasets, show the standard layout
-                col1, col2 = st.columns(2)
-                with col1:
-                    # Overall Average
-                    chart = create_bar_chart(ui_metrics_df, 'overall', 'Overall Average')
-                    if chart:
-                        st.altair_chart(chart, use_container_width=True)
-                    # Desktop Average
-                    chart = create_bar_chart(ui_metrics_df, 'desktop_avg', 'Desktop Average')
-                    if chart:
-                        st.altair_chart(chart, use_container_width=True)
-                    # Text Average
-                    chart = create_bar_chart(ui_metrics_df, 'text_avg', 'Text Average (UI-Type)')
-                    if chart:
-                        st.altair_chart(chart, use_container_width=True)
-                with col2:
-                    # Web Average
-                    chart = create_bar_chart(ui_metrics_df, 'web_avg', 'Web Average')
-                    if chart:
-                        st.altair_chart(chart, use_container_width=True)
-                    # Icon Average
-                    chart = create_bar_chart(ui_metrics_df, 'icon_avg', 'Icon Average (UI-Type)')
-                    if chart:
-                        st.altair_chart(chart, use_container_width=True)
         # Checkpoint progression visualization
         with st.expander("Checkpoint Progression Analysis"):
@@ -798,9 +753,9 @@ def main():
             else:
                 st.info("No models with multiple checkpoints available for progression analysis")
-        # Detailed breakdown
-        if selected_dataset == 'screenspot-v2':
-            with st.expander("Detailed UI Type Breakdown"):
                 # Create a heatmap-style table
                 detailed_metrics = []
                 for _, row in ui_metrics_df.iterrows():
@@ -815,6 +770,9 @@ def main():
                 if detailed_metrics:
                     st.dataframe(pd.DataFrame(detailed_metrics), use_container_width=True)
     else:
         # For non-ScreenSpot datasets, show a simple bar chart
@@ -833,14 +791,6 @@ def main():
         )
         st.altair_chart(chart, use_container_width=True)
-    # Model details table
-    with st.expander("Model Details"):
-        display_df = filtered_df[['model', 'overall_accuracy', 'total_samples', 'checkpoint_steps', 'training_loss', 'timestamp']].copy()
-        display_df.columns = ['Model', 'Accuracy (%)', 'Samples', 'Checkpoint Steps', 'Training Loss', 'Timestamp']
-        display_df['Accuracy (%)'] = display_df['Accuracy (%)'].apply(lambda x: f"{x:.2f}")
-        display_df['Training Loss'] = display_df['Training Loss'].apply(lambda x: f"{x:.4f}" if pd.notna(x) else "N/A")
-        st.dataframe(display_df, use_container_width=True)
 if __name__ == "__main__":
     main()

 BASELINES = {
     "screenspot-v2": {
         "Qwen2-VL-7B": {
+            "desktop_text": 52.01, "desktop_icon": 44.98, "web_text": 33.04, "web_icon": 21.84, "overall": 37.96
         },
         "UI-TARS-2B": {
+            "desktop_text": 90.7, "desktop_icon": 68.6, "web_text": 87.2, "web_icon": 84.7, "overall": 82.8
         },
         "UI-TARS-7B": {
+            "desktop_text": 95.4, "desktop_icon": 87.8, "web_text": 93.8, "web_icon": 91.6, "overall": 92.2
         },
         "UI-TARS-72B": {
+            "desktop_text": 91.2, "desktop_icon": 87.8, "web_text": 87.7, "web_icon": 86.3, "overall": 88.3
         }
     },
     "screenspot-pro": {
+        "Qwen2.5-VL-3B-Instruct": {"overall": 16.1},
+        "Qwen2.5-VL-7B-Instruct": {"overall": 26.8},
+        "Qwen2.5-VL-72B-Instruct": {"overall": 53.3},
+        "UI-TARS-2B": {"overall": 27.7},
+        "UI-TARS-7B": {"overall": 35.7},
+        "UI-TARS-72B": {"overall": 38.1}
+    },
+    "showdown-clicks": {
+        "Qwen2.5-VL-72B-Instruct": {"overall": 24.78},
+        "UI-TARS-72B-SFT": {"overall": 54.4},
+        "Molmo-72B-0924": {"overall": 54.76}
     }
 }
                 'text_avg': 'Text Average',
                 'icon_avg': 'Icon Average'
             }
+        elif selected_dataset in ['screenspot-pro', 'showdown-clicks']:
+            # For screenspot-pro and showdown-clicks, only show overall average
+            metric_options = {
+                'overall': 'Overall Average'
+            }
         else:
             metric_options = {
                 'overall': 'Overall Average',
         else:
             st.warning(f"No data available for {metric_options[selected_metric]}")
+        # Show all metrics in an expandable section - available for all datasets
         with st.expander("View All Metrics"):
             if selected_dataset == 'screenspot-v2':
                 # First row: Overall, Desktop, Web averages
                     if chart:
                         st.altair_chart(chart, use_container_width=True)
             else:
+                # For screenspot-pro and showdown-clicks
+                st.info("No additional UI type metrics available for this dataset. Only overall accuracy is reported.")
         # Checkpoint progression visualization
         with st.expander("Checkpoint Progression Analysis"):
             else:
                 st.info("No models with multiple checkpoints available for progression analysis")
+        # Detailed breakdown - show for all datasets
+        with st.expander("Detailed UI Type Breakdown"):
+            if selected_dataset == 'screenspot-v2':
                 # Create a heatmap-style table
                 detailed_metrics = []
                 for _, row in ui_metrics_df.iterrows():
                 if detailed_metrics:
                     st.dataframe(pd.DataFrame(detailed_metrics), use_container_width=True)
+            else:
+                # For screenspot-pro and showdown-clicks
+                st.info("Detailed UI type breakdown is only available for ScreenSpot-v2 dataset.")
     else:
         # For non-ScreenSpot datasets, show a simple bar chart
         )
         st.altair_chart(chart, use_container_width=True)
 if __name__ == "__main__":
     main()