Spaces:

mlfoundations-cua-dev
/

leaderboard-viewer

Running

App Files Community

Anas Awadalla committed on Jul 24

Commit

4db9f63

1 Parent(s): 79cb6e1

some fixes

Browse files

Files changed (2) hide show

README.md +0 -2
src/streamlit_app.py +325 -165

README.md CHANGED Viewed

@@ -26,7 +26,6 @@ A Streamlit application for visualizing model performance on grounding benchmark
   - For other datasets: Desktop vs Web and Text vs Icon performance
 - **Checkpoint Progression Analysis**: Visualize how metrics evolve during training
 - **Model Details**: View training loss, checkpoint steps, and evaluation timestamps
-- **Sample Results**: Inspect the first 5 evaluation samples for each model
 ## Installation
@@ -62,7 +61,6 @@ The app will open in your browser at `http://localhost:8501`
 4. **Explore Details**:
    - Expand "Model Details" to see training metadata
    - Expand "Detailed UI Type Breakdown" for a comprehensive table
-   - Expand "Sample Results" to see the first 5 evaluation samples
    - Expand "Checkpoint Progression Analysis" to:
      - View accuracy progression over training steps
      - See the relationship between training loss and accuracy

   - For other datasets: Desktop vs Web and Text vs Icon performance
 - **Checkpoint Progression Analysis**: Visualize how metrics evolve during training
 - **Model Details**: View training loss, checkpoint steps, and evaluation timestamps
 ## Installation
 4. **Explore Details**:
    - Expand "Model Details" to see training metadata
    - Expand "Detailed UI Type Breakdown" for a comprehensive table
    - Expand "Checkpoint Progression Analysis" to:
      - View accuracy progression over training steps
      - See the relationship between training loss and accuracy

src/streamlit_app.py CHANGED Viewed

@@ -167,12 +167,7 @@ def fetch_leaderboard_data():
                     "checkpoint_steps": metadata.get("checkpoint_steps"),
                     "training_loss": metadata.get("training_loss"),
                     "ui_type_results": ui_type_results,
-                    "dataset_type_results": dataset_type_results,
-                    # Store minimal sample results for inspection
-                    "sample_results_summary": {
-                        "total_samples": len(data.get("sample_results", [])),
-                        "first_5_samples": data.get("sample_results", [])[:5]
-                    }
                 }
                 results.append(result_entry)
@@ -242,16 +237,31 @@ def parse_ui_type_metrics(df: pd.DataFrame, dataset_filter: str) -> pd.DataFrame
             continue
         model = row['model']
-        ui_results = row['ui_type_results']
         # For ScreenSpot datasets, we have desktop/web and text/icon
         if 'screenspot' in dataset_filter.lower():
-            # Calculate individual metrics
             desktop_text = ui_results.get('desktop_text', {}).get('correct', 0) / max(ui_results.get('desktop_text', {}).get('total', 1), 1) * 100
             desktop_icon = ui_results.get('desktop_icon', {}).get('correct', 0) / max(ui_results.get('desktop_icon', {}).get('total', 1), 1) * 100
             web_text = ui_results.get('web_text', {}).get('correct', 0) / max(ui_results.get('web_text', {}).get('total', 1), 1) * 100
             web_icon = ui_results.get('web_icon', {}).get('correct', 0) / max(ui_results.get('web_icon', {}).get('total', 1), 1) * 100
             # Calculate averages
             desktop_avg = (desktop_text + desktop_icon) / 2 if (desktop_text > 0 or desktop_icon > 0) else 0
             web_avg = (web_text + web_icon) / 2 if (web_text > 0 or web_icon > 0) else 0
@@ -260,7 +270,7 @@ def parse_ui_type_metrics(df: pd.DataFrame, dataset_filter: str) -> pd.DataFrame
             # For screenspot-v2, calculate the overall as average of desktop and web
             if dataset_filter == 'screenspot-v2':
-                overall = (desktop_avg + web_avg) / 2 if (desktop_avg > 0 or web_avg > 0) else 0
             else:
                 overall = row['overall_accuracy']
@@ -278,6 +288,14 @@ def parse_ui_type_metrics(df: pd.DataFrame, dataset_filter: str) -> pd.DataFrame
                 'is_best_not_last': row.get('is_best_not_last', False),
                 'all_checkpoints': row.get('all_checkpoints', [])
             })
     return pd.DataFrame(metrics_list)
@@ -326,8 +344,8 @@ def create_bar_chart(data: pd.DataFrame, metric: str, title: str):
         tooltip=['Model', 'Score', 'Type']
     ).properties(
         title=title,
-        width=400,
-        height=300
     )
     # Add value labels
@@ -374,6 +392,38 @@ def main():
     # Main content
     st.header(f"Results for {selected_dataset}")
     # Overall metrics
     col1, col2, col3 = st.columns(3)
     with col1:
@@ -390,98 +440,137 @@ def main():
     # Parse UI type metrics
     ui_metrics_df = parse_ui_type_metrics(filtered_df, selected_dataset)
     if not ui_metrics_df.empty and 'screenspot' in selected_dataset.lower():
         st.subheader("Performance by UI Type")
         # Add note about asterisks
         if any(ui_metrics_df['is_best_not_last']):
             st.info("* indicates the best checkpoint is not the last checkpoint")
-        # Create charts in a grid
-        if selected_dataset == 'screenspot-v2':
-            # First row: Overall, Desktop, Web averages
-            col1, col2, col3 = st.columns(3)
-            with col1:
-                chart = create_bar_chart(ui_metrics_df, 'overall', 'Overall Average (Desktop + Web) / 2')
-                if chart:
-                    st.altair_chart(chart, use_container_width=True)
-            with col2:
-                chart = create_bar_chart(ui_metrics_df, 'desktop_avg', 'Desktop Average')
-                if chart:
-                    st.altair_chart(chart, use_container_width=True)
-            with col3:
-                chart = create_bar_chart(ui_metrics_df, 'web_avg', 'Web Average')
-                if chart:
-                    st.altair_chart(chart, use_container_width=True)
-            # Second row: Individual UI type metrics
-            col1, col2, col3, col4 = st.columns(4)
-            with col1:
-                chart = create_bar_chart(ui_metrics_df, 'desktop_text', 'Desktop (Text)')
-                if chart:
-                    st.altair_chart(chart, use_container_width=True)
-            with col2:
-                chart = create_bar_chart(ui_metrics_df, 'desktop_icon', 'Desktop (Icon)')
-                if chart:
-                    st.altair_chart(chart, use_container_width=True)
-            with col3:
-                chart = create_bar_chart(ui_metrics_df, 'web_text', 'Web (Text)')
-                if chart:
-                    st.altair_chart(chart, use_container_width=True)
-            with col4:
-                chart = create_bar_chart(ui_metrics_df, 'web_icon', 'Web (Icon)')
-                if chart:
-                    st.altair_chart(chart, use_container_width=True)
-            # Third row: Text vs Icon averages
-            col1, col2 = st.columns(2)
-            with col1:
-                chart = create_bar_chart(ui_metrics_df, 'text_avg', 'Text Average (Desktop + Web)')
-                if chart:
-                    st.altair_chart(chart, use_container_width=True)
-            with col2:
-                chart = create_bar_chart(ui_metrics_df, 'icon_avg', 'Icon Average (Desktop + Web)')
-                if chart:
-                    st.altair_chart(chart, use_container_width=True)
         else:
-            # For other screenspot datasets, show the standard layout
-            col1, col2 = st.columns(2)
-            with col1:
-                # Overall Average
-                chart = create_bar_chart(ui_metrics_df, 'overall', 'Overall Average')
-                if chart:
-                    st.altair_chart(chart, use_container_width=True)
-                # Desktop Average
-                chart = create_bar_chart(ui_metrics_df, 'desktop_avg', 'Desktop Average')
-                if chart:
-                    st.altair_chart(chart, use_container_width=True)
-                # Text Average
-                chart = create_bar_chart(ui_metrics_df, 'text_avg', 'Text Average (UI-Type)')
-                if chart:
-                    st.altair_chart(chart, use_container_width=True)
-            with col2:
-                # Web Average
-                chart = create_bar_chart(ui_metrics_df, 'web_avg', 'Web Average')
-                if chart:
-                    st.altair_chart(chart, use_container_width=True)
-                # Icon Average
-                chart = create_bar_chart(ui_metrics_df, 'icon_avg', 'Icon Average (UI-Type)')
-                if chart:
-                    st.altair_chart(chart, use_container_width=True)
         # Checkpoint progression visualization
         with st.expander("Checkpoint Progression Analysis"):
@@ -504,23 +593,46 @@ def main():
                 # Prepare data for visualization
                 checkpoint_metrics = []
                 for _, cp in checkpoint_df.iterrows():
-                    ui_results = cp['ui_type_results']
-                    # Calculate metrics
                     desktop_text = ui_results.get('desktop_text', {}).get('correct', 0) / max(ui_results.get('desktop_text', {}).get('total', 1), 1) * 100
                     desktop_icon = ui_results.get('desktop_icon', {}).get('correct', 0) / max(ui_results.get('desktop_icon', {}).get('total', 1), 1) * 100
                     web_text = ui_results.get('web_text', {}).get('correct', 0) / max(ui_results.get('web_text', {}).get('total', 1), 1) * 100
                     web_icon = ui_results.get('web_icon', {}).get('correct', 0) / max(ui_results.get('web_icon', {}).get('total', 1), 1) * 100
                     desktop_avg = (desktop_text + desktop_icon) / 2
                     web_avg = (web_text + web_icon) / 2
                     overall = (desktop_avg + web_avg) / 2 if selected_dataset == 'screenspot-v2' else cp['overall_accuracy']
                     checkpoint_metrics.append({
                         'steps': cp['checkpoint_steps'] or 0,
                         'overall': overall,
-                        'desktop': desktop_avg,
-                        'web': web_avg,
                         'loss': cp['training_loss'],
                         'neg_log_loss': -np.log(cp['training_loss']) if cp['training_loss'] and cp['training_loss'] > 0 else None
                     })
@@ -533,74 +645,143 @@ def main():
                 with col1:
                     st.write("**Accuracy over Training Steps**")
-                    # Melt data for multi-line chart
-                    melted = metrics_df[['steps', 'overall', 'desktop', 'web']].melt(
-                        id_vars=['steps'],
-                        var_name='Metric',
-                        value_name='Accuracy'
-                    )
-                    chart = alt.Chart(melted).mark_line(point=True).encode(
-                        x=alt.X('steps:Q', title='Training Steps'),
-                        y=alt.Y('Accuracy:Q', scale=alt.Scale(domain=[0, 100]), title='Accuracy (%)'),
-                        color=alt.Color('Metric:N', scale=alt.Scale(
-                            domain=['overall', 'desktop', 'web'],
-                            range=['#4ECDC4', '#45B7D1', '#96CEB4']
-                        )),
-                        tooltip=['steps', 'Metric', 'Accuracy']
-                    ).properties(
-                        width=400,
-                        height=300,
-                        title='Accuracy Progression During Training'
-                    )
-                    st.altair_chart(chart, use_container_width=True)
                 with col2:
-                    st.write("**Accuracy vs. Training Loss**")
                     if metrics_df['neg_log_loss'].notna().any():
                         scatter_data = metrics_df[metrics_df['neg_log_loss'].notna()]
                         chart = alt.Chart(scatter_data).mark_circle(size=100).encode(
                             x=alt.X('neg_log_loss:Q', title='-log(Training Loss)'),
-                            y=alt.Y('overall:Q', scale=alt.Scale(domain=[0, 100]), title='Overall Accuracy (%)'),
                             color=alt.Color('steps:Q', scale=alt.Scale(scheme='viridis'), title='Training Steps'),
-                            tooltip=['steps', 'loss', 'overall']
                         ).properties(
-                            width=400,
-                            height=300,
-                            title='Accuracy vs. -log(Training Loss)'
                         )
                         st.altair_chart(chart, use_container_width=True)
                     else:
                         st.info("No training loss data available for this model")
-                # Show checkpoint details table
                 st.write("**Checkpoint Details**")
-                display_metrics = metrics_df[['steps', 'overall', 'desktop', 'web', 'loss']].copy()
-                display_metrics.columns = ['Steps', 'Overall %', 'Desktop %', 'Web %', 'Training Loss']
-                display_metrics[['Overall %', 'Desktop %', 'Web %']] = display_metrics[['Overall %', 'Desktop %', 'Web %']].round(2)
                 display_metrics['Training Loss'] = display_metrics['Training Loss'].apply(lambda x: f"{x:.4f}" if pd.notna(x) else "N/A")
                 st.dataframe(display_metrics, use_container_width=True)
             else:
                 st.info("No models with multiple checkpoints available for progression analysis")
         # Detailed breakdown
-        with st.expander("Detailed UI Type Breakdown"):
-            # Create a heatmap-style table
-            detailed_metrics = []
-            for _, row in ui_metrics_df.iterrows():
-                detailed_metrics.append({
-                    'Model': row['model'],
-                    'Desktop Text': f"{row['desktop_text']:.1f}%",
-                    'Desktop Icon': f"{row['desktop_icon']:.1f}%",
-                    'Web Text': f"{row['web_text']:.1f}%",
-                    'Web Icon': f"{row['web_icon']:.1f}%",
-                    'Overall': f"{row['overall']:.1f}%"
-                })
-            if detailed_metrics:
-                st.dataframe(pd.DataFrame(detailed_metrics), use_container_width=True)
     else:
         # For non-ScreenSpot datasets, show a simple bar chart
@@ -627,27 +808,6 @@ def main():
         display_df['Accuracy (%)'] = display_df['Accuracy (%)'].apply(lambda x: f"{x:.2f}")
         display_df['Training Loss'] = display_df['Training Loss'].apply(lambda x: f"{x:.4f}" if pd.notna(x) else "N/A")
         st.dataframe(display_df, use_container_width=True)
-    # Raw data viewer
-    with st.expander("Sample Results"):
-        if selected_model != 'All' and len(filtered_df) == 1:
-            summary = filtered_df.iloc[0]['sample_results_summary']
-            st.write(f"**Total evaluation samples:** {summary['total_samples']}")
-            st.write("**First 5 sample results:**")
-            for i, sample in enumerate(summary['first_5_samples'], 1):
-                st.write(f"\n**Sample {i}:**")
-                col1, col2 = st.columns([1, 2])
-                with col1:
-                    st.write(f"- **Correct:** {'✅' if sample.get('is_correct') else '❌'}")
-                    st.write(f"- **Image:** {sample.get('img_filename', 'N/A')}")
-                with col2:
-                    st.write(f"- **Instruction:** {sample.get('instruction', 'N/A')}")
-                    if sample.get('predicted_click'):
-                        st.write(f"- **Predicted Click:** {sample['predicted_click']}")
-                    if sample.get('error_msg'):
-                        st.write(f"- **Error:** {sample['error_msg']}")
-        else:
-            st.info("Select a specific model to view sample results")
 if __name__ == "__main__":
     main()

                     "checkpoint_steps": metadata.get("checkpoint_steps"),
                     "training_loss": metadata.get("training_loss"),
                     "ui_type_results": ui_type_results,
+                    "dataset_type_results": dataset_type_results
                 }
                 results.append(result_entry)
             continue
         model = row['model']
+        ui_results = row.get('ui_type_results', {})
+        dataset_type_results = row.get('dataset_type_results', {})
         # For ScreenSpot datasets, we have desktop/web and text/icon
         if 'screenspot' in dataset_filter.lower():
+            # First try to get from ui_type_results
             desktop_text = ui_results.get('desktop_text', {}).get('correct', 0) / max(ui_results.get('desktop_text', {}).get('total', 1), 1) * 100
             desktop_icon = ui_results.get('desktop_icon', {}).get('correct', 0) / max(ui_results.get('desktop_icon', {}).get('total', 1), 1) * 100
             web_text = ui_results.get('web_text', {}).get('correct', 0) / max(ui_results.get('web_text', {}).get('total', 1), 1) * 100
             web_icon = ui_results.get('web_icon', {}).get('correct', 0) / max(ui_results.get('web_icon', {}).get('total', 1), 1) * 100
+            # If all zeros, try to get from dataset_type_results
+            if desktop_text == 0 and desktop_icon == 0 and web_text == 0 and web_icon == 0:
+                # Check if data is nested under dataset types (e.g., 'screenspot-v2')
+                for dataset_key in dataset_type_results:
+                    if 'screenspot' in dataset_key.lower():
+                        dataset_data = dataset_type_results[dataset_key]
+                        if 'by_ui_type' in dataset_data:
+                            ui_data = dataset_data['by_ui_type']
+                            desktop_text = ui_data.get('desktop_text', {}).get('correct', 0) / max(ui_data.get('desktop_text', {}).get('total', 1), 1) * 100
+                            desktop_icon = ui_data.get('desktop_icon', {}).get('correct', 0) / max(ui_data.get('desktop_icon', {}).get('total', 1), 1) * 100
+                            web_text = ui_data.get('web_text', {}).get('correct', 0) / max(ui_data.get('web_text', {}).get('total', 1), 1) * 100
+                            web_icon = ui_data.get('web_icon', {}).get('correct', 0) / max(ui_data.get('web_icon', {}).get('total', 1), 1) * 100
+                            break
             # Calculate averages
             desktop_avg = (desktop_text + desktop_icon) / 2 if (desktop_text > 0 or desktop_icon > 0) else 0
             web_avg = (web_text + web_icon) / 2 if (web_text > 0 or web_icon > 0) else 0
             # For screenspot-v2, calculate the overall as average of desktop and web
             if dataset_filter == 'screenspot-v2':
+                overall = (desktop_avg + web_avg) / 2 if (desktop_avg > 0 or web_avg > 0) else row['overall_accuracy']
             else:
                 overall = row['overall_accuracy']
                 'is_best_not_last': row.get('is_best_not_last', False),
                 'all_checkpoints': row.get('all_checkpoints', [])
             })
+        else:
+            # For non-screenspot datasets, just pass through overall accuracy
+            metrics_list.append({
+                'model': model,
+                'overall': row['overall_accuracy'],
+                'is_best_not_last': row.get('is_best_not_last', False),
+                'all_checkpoints': row.get('all_checkpoints', [])
+            })
     return pd.DataFrame(metrics_list)
         tooltip=['Model', 'Score', 'Type']
     ).properties(
         title=title,
+        width=500,  # Increased from 400
+        height=400  # Increased from 300
     )
     # Add value labels
     # Main content
     st.header(f"Results for {selected_dataset}")
+    # Debug information (can be removed later)
+    with st.expander("Debug Information"):
+        st.write(f"Total rows in filtered_df: {len(filtered_df)}")
+        st.write(f"Total rows in ui_metrics_df: {len(ui_metrics_df)}")
+        if not filtered_df.empty:
+            st.write("Sample data from filtered_df:")
+            st.write(filtered_df[['model', 'base_model', 'is_checkpoint', 'overall_accuracy']].head())
+            # Show UI type results structure
+            st.write("\nUI Type Results Structure:")
+            for idx, row in filtered_df.head(2).iterrows():
+                st.write(f"\nModel: {row['model']}")
+                ui_results = row.get('ui_type_results', {})
+                if ui_results:
+                    st.write("UI Type Keys:", list(ui_results.keys()))
+                    # Show a sample of the structure
+                    for key in list(ui_results.keys())[:2]:
+                        st.write(f"  {key}: {ui_results[key]}")
+                else:
+                    st.write("  No UI type results found")
+                # Also check dataset_type_results
+                dataset_type_results = row.get('dataset_type_results', {})
+                if dataset_type_results:
+                    st.write("Dataset Type Results Keys:", list(dataset_type_results.keys()))
+                    for key in list(dataset_type_results.keys())[:2]:
+                        st.write(f"  {key}: {dataset_type_results[key]}")
+        if not ui_metrics_df.empty:
+            st.write("\nSample data from ui_metrics_df:")
+            st.write(ui_metrics_df[['model', 'overall', 'desktop_avg', 'web_avg']].head())
     # Overall metrics
     col1, col2, col3 = st.columns(3)
     with col1:
     # Parse UI type metrics
     ui_metrics_df = parse_ui_type_metrics(filtered_df, selected_dataset)
+    # Add metric selector for screenspot datasets
+    selected_metric = 'overall'  # Default metric
     if not ui_metrics_df.empty and 'screenspot' in selected_dataset.lower():
         st.subheader("Performance by UI Type")
+        # Metric selector dropdown
+        if selected_dataset == 'screenspot-v2':
+            metric_options = {
+                'overall': 'Overall Average (Desktop + Web) / 2',
+                'desktop_avg': 'Desktop Average',
+                'web_avg': 'Web Average',
+                'desktop_text': 'Desktop (Text)',
+                'desktop_icon': 'Desktop (Icon)',
+                'web_text': 'Web (Text)',
+                'web_icon': 'Web (Icon)',
+                'text_avg': 'Text Average',
+                'icon_avg': 'Icon Average'
+            }
+        else:
+            metric_options = {
+                'overall': 'Overall Average',
+                'desktop_avg': 'Desktop Average',
+                'web_avg': 'Web Average',
+                'text_avg': 'Text Average',
+                'icon_avg': 'Icon Average'
+            }
+        selected_metric = st.selectbox(
+            "Select metric to visualize:",
+            options=list(metric_options.keys()),
+            format_func=lambda x: metric_options[x],
+            key="metric_selector"
+        )
         # Add note about asterisks
         if any(ui_metrics_df['is_best_not_last']):
             st.info("* indicates the best checkpoint is not the last checkpoint")
+        # Create single chart for selected metric
+        chart = create_bar_chart(ui_metrics_df, selected_metric, metric_options[selected_metric])
+        if chart:
+            st.altair_chart(chart, use_container_width=True)
         else:
+            st.warning(f"No data available for {metric_options[selected_metric]}")
+        # Show all metrics in an expandable section
+        with st.expander("View All Metrics"):
+            if selected_dataset == 'screenspot-v2':
+                # First row: Overall, Desktop, Web averages
+                col1, col2, col3 = st.columns(3)
+                with col1:
+                    chart = create_bar_chart(ui_metrics_df, 'overall', 'Overall Average (Desktop + Web) / 2')
+                    if chart:
+                        st.altair_chart(chart, use_container_width=True)
+                with col2:
+                    chart = create_bar_chart(ui_metrics_df, 'desktop_avg', 'Desktop Average')
+                    if chart:
+                        st.altair_chart(chart, use_container_width=True)
+                with col3:
+                    chart = create_bar_chart(ui_metrics_df, 'web_avg', 'Web Average')
+                    if chart:
+                        st.altair_chart(chart, use_container_width=True)
+                # Second row: Individual UI type metrics
+                col1, col2, col3, col4 = st.columns(4)
+                with col1:
+                    chart = create_bar_chart(ui_metrics_df, 'desktop_text', 'Desktop (Text)')
+                    if chart:
+                        st.altair_chart(chart, use_container_width=True)
+                with col2:
+                    chart = create_bar_chart(ui_metrics_df, 'desktop_icon', 'Desktop (Icon)')
+                    if chart:
+                        st.altair_chart(chart, use_container_width=True)
+                with col3:
+                    chart = create_bar_chart(ui_metrics_df, 'web_text', 'Web (Text)')
+                    if chart:
+                        st.altair_chart(chart, use_container_width=True)
+                with col4:
+                    chart = create_bar_chart(ui_metrics_df, 'web_icon', 'Web (Icon)')
+                    if chart:
+                        st.altair_chart(chart, use_container_width=True)
+                # Third row: Text vs Icon averages
+                col1, col2 = st.columns(2)
+                with col1:
+                    chart = create_bar_chart(ui_metrics_df, 'text_avg', 'Text Average (Desktop + Web)')
+                    if chart:
+                        st.altair_chart(chart, use_container_width=True)
+                with col2:
+                    chart = create_bar_chart(ui_metrics_df, 'icon_avg', 'Icon Average (Desktop + Web)')
+                    if chart:
+                        st.altair_chart(chart, use_container_width=True)
+            else:
+                # For other screenspot datasets, show the standard layout
+                col1, col2 = st.columns(2)
+                with col1:
+                    # Overall Average
+                    chart = create_bar_chart(ui_metrics_df, 'overall', 'Overall Average')
+                    if chart:
+                        st.altair_chart(chart, use_container_width=True)
+                    # Desktop Average
+                    chart = create_bar_chart(ui_metrics_df, 'desktop_avg', 'Desktop Average')
+                    if chart:
+                        st.altair_chart(chart, use_container_width=True)
+                    # Text Average
+                    chart = create_bar_chart(ui_metrics_df, 'text_avg', 'Text Average (UI-Type)')
+                    if chart:
+                        st.altair_chart(chart, use_container_width=True)
+                with col2:
+                    # Web Average
+                    chart = create_bar_chart(ui_metrics_df, 'web_avg', 'Web Average')
+                    if chart:
+                        st.altair_chart(chart, use_container_width=True)
+                    # Icon Average
+                    chart = create_bar_chart(ui_metrics_df, 'icon_avg', 'Icon Average (UI-Type)')
+                    if chart:
+                        st.altair_chart(chart, use_container_width=True)
         # Checkpoint progression visualization
         with st.expander("Checkpoint Progression Analysis"):
                 # Prepare data for visualization
                 checkpoint_metrics = []
                 for _, cp in checkpoint_df.iterrows():
+                    ui_results = cp.get('ui_type_results', {})
+                    dataset_type_results = cp.get('dataset_type_results', {})
+                    # First try to get from ui_type_results
                     desktop_text = ui_results.get('desktop_text', {}).get('correct', 0) / max(ui_results.get('desktop_text', {}).get('total', 1), 1) * 100
                     desktop_icon = ui_results.get('desktop_icon', {}).get('correct', 0) / max(ui_results.get('desktop_icon', {}).get('total', 1), 1) * 100
                     web_text = ui_results.get('web_text', {}).get('correct', 0) / max(ui_results.get('web_text', {}).get('total', 1), 1) * 100
                     web_icon = ui_results.get('web_icon', {}).get('correct', 0) / max(ui_results.get('web_icon', {}).get('total', 1), 1) * 100
+                    # If all zeros, try to get from dataset_type_results
+                    if desktop_text == 0 and desktop_icon == 0 and web_text == 0 and web_icon == 0:
+                        # Check if data is nested under dataset types
+                        for dataset_key in dataset_type_results:
+                            if 'screenspot' in dataset_key.lower():
+                                dataset_data = dataset_type_results[dataset_key]
+                                if 'by_ui_type' in dataset_data:
+                                    ui_data = dataset_data['by_ui_type']
+                                    desktop_text = ui_data.get('desktop_text', {}).get('correct', 0) / max(ui_data.get('desktop_text', {}).get('total', 1), 1) * 100
+                                    desktop_icon = ui_data.get('desktop_icon', {}).get('correct', 0) / max(ui_data.get('desktop_icon', {}).get('total', 1), 1) * 100
+                                    web_text = ui_data.get('web_text', {}).get('correct', 0) / max(ui_data.get('web_text', {}).get('total', 1), 1) * 100
+                                    web_icon = ui_data.get('web_icon', {}).get('correct', 0) / max(ui_data.get('web_icon', {}).get('total', 1), 1) * 100
+                                    break
                     desktop_avg = (desktop_text + desktop_icon) / 2
                     web_avg = (web_text + web_icon) / 2
+                    text_avg = (desktop_text + web_text) / 2
+                    icon_avg = (desktop_icon + web_icon) / 2
                     overall = (desktop_avg + web_avg) / 2 if selected_dataset == 'screenspot-v2' else cp['overall_accuracy']
                     checkpoint_metrics.append({
                         'steps': cp['checkpoint_steps'] or 0,
                         'overall': overall,
+                        'desktop_avg': desktop_avg,
+                        'web_avg': web_avg,
+                        'desktop_text': desktop_text,
+                        'desktop_icon': desktop_icon,
+                        'web_text': web_text,
+                        'web_icon': web_icon,
+                        'text_avg': text_avg,
+                        'icon_avg': icon_avg,
                         'loss': cp['training_loss'],
                         'neg_log_loss': -np.log(cp['training_loss']) if cp['training_loss'] and cp['training_loss'] > 0 else None
                     })
                 with col1:
                     st.write("**Accuracy over Training Steps**")
+                    # Determine which metrics to show based on selected metric
+                    if selected_metric == 'overall':
+                        # Show overall, desktop, and web averages
+                        metrics_to_show = ['overall', 'desktop_avg', 'web_avg']
+                        metric_labels = ['Overall', 'Desktop Avg', 'Web Avg']
+                        colors = ['#4ECDC4', '#45B7D1', '#96CEB4']
+                    elif 'desktop' in selected_metric:
+                        # Show all desktop metrics
+                        metrics_to_show = ['desktop_avg', 'desktop_text', 'desktop_icon']
+                        metric_labels = ['Desktop Avg', 'Desktop Text', 'Desktop Icon']
+                        colors = ['#45B7D1', '#FFA726', '#FF6B6B']
+                    elif 'web' in selected_metric:
+                        # Show all web metrics
+                        metrics_to_show = ['web_avg', 'web_text', 'web_icon']
+                        metric_labels = ['Web Avg', 'Web Text', 'Web Icon']
+                        colors = ['#96CEB4', '#9C27B0', '#E91E63']
+                    elif 'text' in selected_metric:
+                        # Show text metrics across environments
+                        metrics_to_show = ['text_avg', 'desktop_text', 'web_text']
+                        metric_labels = ['Text Avg', 'Desktop Text', 'Web Text']
+                        colors = ['#FF9800', '#FFA726', '#FFB74D']
+                    elif 'icon' in selected_metric:
+                        # Show icon metrics across environments
+                        metrics_to_show = ['icon_avg', 'desktop_icon', 'web_icon']
+                        metric_labels = ['Icon Avg', 'Desktop Icon', 'Web Icon']
+                        colors = ['#3F51B5', '#5C6BC0', '#7986CB']
+                    else:
+                        # Default: just show the selected metric
+                        metrics_to_show = [selected_metric]
+                        metric_labels = [metric_options.get(selected_metric, selected_metric)]
+                        colors = ['#4ECDC4']
+                    # Create multi-line chart data
+                    chart_data = []
+                    for i, (metric, label) in enumerate(zip(metrics_to_show, metric_labels)):
+                        for _, row in metrics_df.iterrows():
+                            if metric in row:
+                                chart_data.append({
+                                    'steps': row['steps'],
+                                    'value': row[metric],
+                                    'metric': label,
+                                    'color_idx': i
+                                })
+                    if chart_data:
+                        chart_df = pd.DataFrame(chart_data)
+                        # Create multi-line chart with distinct colors
+                        chart = alt.Chart(chart_df).mark_line(point=True, strokeWidth=2).encode(
+                            x=alt.X('steps:Q', title='Training Steps'),
+                            y=alt.Y('value:Q', scale=alt.Scale(domain=[0, 100]), title='Accuracy (%)'),
+                            color=alt.Color('metric:N',
+                                          scale=alt.Scale(domain=metric_labels, range=colors),
+                                          legend=alt.Legend(title="Metric")),
+                            tooltip=['steps:Q', 'metric:N', alt.Tooltip('value:Q', format='.1f', title='Accuracy')]
+                        ).properties(
+                            width=500,
+                            height=400,
+                            title='Accuracy Progression During Training'
+                        )
+                        st.altair_chart(chart, use_container_width=True)
+                    else:
+                        st.warning("No data available for the selected metrics")
                 with col2:
+                    st.write(f"**{metric_options[selected_metric]} vs. Training Loss**")
                     if metrics_df['neg_log_loss'].notna().any():
                         scatter_data = metrics_df[metrics_df['neg_log_loss'].notna()]
                         chart = alt.Chart(scatter_data).mark_circle(size=100).encode(
                             x=alt.X('neg_log_loss:Q', title='-log(Training Loss)'),
+                            y=alt.Y(f'{selected_metric}:Q', scale=alt.Scale(domain=[0, 100]), title=f'{metric_options[selected_metric]} (%)'),
                             color=alt.Color('steps:Q', scale=alt.Scale(scheme='viridis'), title='Training Steps'),
+                            tooltip=['steps', 'loss', selected_metric]
                         ).properties(
+                            width=500,  # Increased from 400
+                            height=400,  # Increased from 300
+                            title=f'{metric_options[selected_metric]} vs. -log(Training Loss)'
                         )
                         st.altair_chart(chart, use_container_width=True)
                     else:
                         st.info("No training loss data available for this model")
+                # Show checkpoint details table with selected metric
                 st.write("**Checkpoint Details**")
+                # Determine columns to display based on selected metric category
+                if selected_metric == 'overall':
+                    display_cols = ['steps', 'overall', 'desktop_avg', 'web_avg', 'loss']
+                    col_labels = ['Steps', 'Overall %', 'Desktop Avg %', 'Web Avg %', 'Training Loss']
+                elif 'desktop' in selected_metric:
+                    display_cols = ['steps', 'desktop_avg', 'desktop_text', 'desktop_icon', 'loss']
+                    col_labels = ['Steps', 'Desktop Avg %', 'Desktop Text %', 'Desktop Icon %', 'Training Loss']
+                elif 'web' in selected_metric:
+                    display_cols = ['steps', 'web_avg', 'web_text', 'web_icon', 'loss']
+                    col_labels = ['Steps', 'Web Avg %', 'Web Text %', 'Web Icon %', 'Training Loss']
+                elif 'text' in selected_metric:
+                    display_cols = ['steps', 'text_avg', 'desktop_text', 'web_text', 'loss']
+                    col_labels = ['Steps', 'Text Avg %', 'Desktop Text %', 'Web Text %', 'Training Loss']
+                elif 'icon' in selected_metric:
+                    display_cols = ['steps', 'icon_avg', 'desktop_icon', 'web_icon', 'loss']
+                    col_labels = ['Steps', 'Icon Avg %', 'Desktop Icon %', 'Web Icon %', 'Training Loss']
+                else:
+                    display_cols = ['steps', selected_metric, 'loss']
+                    col_labels = ['Steps', f'{metric_options[selected_metric]} %', 'Training Loss']
+                display_metrics = metrics_df[display_cols].copy()
+                display_metrics.columns = col_labels
+                # Format percentage columns
+                for col in col_labels:
+                    if '%' in col and col != 'Training Loss':
+                        display_metrics[col] = display_metrics[col].round(2)
                 display_metrics['Training Loss'] = display_metrics['Training Loss'].apply(lambda x: f"{x:.4f}" if pd.notna(x) else "N/A")
                 st.dataframe(display_metrics, use_container_width=True)
             else:
                 st.info("No models with multiple checkpoints available for progression analysis")
         # Detailed breakdown
+        if selected_dataset == 'screenspot-v2':
+            with st.expander("Detailed UI Type Breakdown"):
+                # Create a heatmap-style table
+                detailed_metrics = []
+                for _, row in ui_metrics_df.iterrows():
+                    detailed_metrics.append({
+                        'Model': row['model'],
+                        'Desktop Text': f"{row['desktop_text']:.1f}%",
+                        'Desktop Icon': f"{row['desktop_icon']:.1f}%",
+                        'Web Text': f"{row['web_text']:.1f}%",
+                        'Web Icon': f"{row['web_icon']:.1f}%",
+                        'Overall': f"{row['overall']:.1f}%"
+                    })
+                if detailed_metrics:
+                    st.dataframe(pd.DataFrame(detailed_metrics), use_container_width=True)
     else:
         # For non-ScreenSpot datasets, show a simple bar chart
         display_df['Accuracy (%)'] = display_df['Accuracy (%)'].apply(lambda x: f"{x:.2f}")
         display_df['Training Loss'] = display_df['Training Loss'].apply(lambda x: f"{x:.4f}" if pd.notna(x) else "N/A")
         st.dataframe(display_df, use_container_width=True)
 # Script entry point: call main() only when this file is executed as the
 # top-level script (its __name__ is "__main__"), not when it is imported.
 if __name__ == "__main__":
     main()