Spaces:

mlfoundations-cua-dev
/

leaderboard-viewer

Running

App Files Files Community

Anas Awadalla committed 29 days ago

Commit

a860139

1 Parent(s): 98d976c

multi select models

Browse files

Files changed (1) hide show

src/streamlit_app.py +133 -0

src/streamlit_app.py CHANGED Viewed

@@ -384,6 +384,89 @@ def create_bar_chart(data: pd.DataFrame, metric: str, title: str):
     return chart + text
 def main():
     st.title("🎯 Grounding Benchmark Leaderboard")
     st.markdown("Visualization of model performance on grounding benchmarks")
@@ -508,6 +591,56 @@ def main():
             st.altair_chart(chart, use_container_width=True)
         else:
             st.warning(f"No data available for {metric_options[selected_metric]}")
 if __name__ == "__main__":
     main()

     return chart + text
+def create_results_table(data: pd.DataFrame, dataset: str):
+    """Create a formatted results table with best scores highlighted."""
+    if data.empty:
+        return None
+    # Copy data to avoid modifying original
+    table_data = data.copy()
+    # Remove columns we don't want to display
+    columns_to_drop = ['is_best_not_last', 'all_checkpoints']
+    table_data = table_data.drop(columns=[col for col in columns_to_drop if col in table_data.columns])
+    # Sort by overall score in descending order
+    if 'overall' in table_data.columns:
+        table_data = table_data.sort_values('overall', ascending=False)
+    # Determine which columns to show based on dataset
+    if dataset == 'screenspot-v2':
+        # Show all breakdown columns
+        column_order = ['model', 'desktop_text', 'desktop_icon', 'web_text', 'web_icon',
+                       'desktop_avg', 'web_avg', 'text_avg', 'icon_avg', 'overall']
+        column_names = {
+            'model': 'Model',
+            'desktop_text': 'Desktop Text',
+            'desktop_icon': 'Desktop Icon',
+            'web_text': 'Web Text',
+            'web_icon': 'Web Icon',
+            'desktop_avg': 'Desktop Avg',
+            'web_avg': 'Web Avg',
+            'text_avg': 'Text Avg',
+            'icon_avg': 'Icon Avg',
+            'overall': 'Overall'
+        }
+    elif 'text' in table_data.columns and 'icon' in table_data.columns:
+        # Show text/icon breakdown
+        column_order = ['model', 'text', 'icon', 'overall']
+        column_names = {
+            'model': 'Model',
+            'text': 'Text',
+            'icon': 'Icon',
+            'overall': 'Overall'
+        }
+    else:
+        # Show only overall
+        column_order = ['model', 'overall']
+        column_names = {
+            'model': 'Model',
+            'overall': 'Overall'
+        }
+    # Filter and reorder columns
+    available_columns = [col for col in column_order if col in table_data.columns]
+    table_data = table_data[available_columns]
+    # Rename columns for display
+    table_data = table_data.rename(columns=column_names)
+    # Round numeric columns to 1 decimal place
+    numeric_columns = [col for col in table_data.columns if col != 'Model']
+    for col in numeric_columns:
+        if col in table_data.columns:
+            table_data[col] = table_data[col].round(1)
+    # Apply styling to highlight best scores
+    def highlight_best(s):
+        """Highlight the best score in each column."""
+        if s.name == 'Model':
+            return [''] * len(s)
+        # Find the maximum value
+        max_val = s.max()
+        # Return style for each cell
+        return ['font-weight: bold; color: #2E7D32' if v == max_val else '' for v in s]
+    # Style the dataframe
+    styled_table = table_data.style.apply(highlight_best)
+    # Format numbers to show 1 decimal place
+    format_dict = {col: '{:.1f}' for col in numeric_columns if col in table_data.columns}
+    styled_table = styled_table.format(format_dict)
+    return styled_table
 def main():
     st.title("🎯 Grounding Benchmark Leaderboard")
     st.markdown("Visualization of model performance on grounding benchmarks")
             st.altair_chart(chart, use_container_width=True)
         else:
             st.warning(f"No data available for {metric_options[selected_metric]}")
+    # Display results table
+    st.subheader("📊 Results Table")
+    # Filter ui_metrics_df to only include selected models
+    if not ui_metrics_df.empty:
+        table_df = ui_metrics_df[ui_metrics_df['model'].isin(selected_models)].copy()
+        # Add baselines to the table if available
+        if selected_dataset in BASELINES:
+            baseline_rows = []
+            for baseline_name, baseline_metrics in BASELINES[selected_dataset].items():
+                baseline_row = {'model': f"{baseline_name} (baseline)"}
+                # Map baseline metrics to table columns
+                if selected_dataset == 'screenspot-v2':
+                    baseline_row.update({
+                        'desktop_text': baseline_metrics.get('desktop_text', 0),
+                        'desktop_icon': baseline_metrics.get('desktop_icon', 0),
+                        'web_text': baseline_metrics.get('web_text', 0),
+                        'web_icon': baseline_metrics.get('web_icon', 0),
+                        'overall': baseline_metrics.get('overall', 0)
+                    })
+                    # Calculate averages if not provided
+                    if 'desktop_text' in baseline_metrics and 'desktop_icon' in baseline_metrics:
+                        baseline_row['desktop_avg'] = (baseline_metrics['desktop_text'] + baseline_metrics['desktop_icon']) / 2
+                    if 'web_text' in baseline_metrics and 'web_icon' in baseline_metrics:
+                        baseline_row['web_avg'] = (baseline_metrics['web_text'] + baseline_metrics['web_icon']) / 2
+                    if 'desktop_text' in baseline_metrics and 'web_text' in baseline_metrics:
+                        baseline_row['text_avg'] = (baseline_metrics['desktop_text'] + baseline_metrics['web_text']) / 2
+                    if 'desktop_icon' in baseline_metrics and 'web_icon' in baseline_metrics:
+                        baseline_row['icon_avg'] = (baseline_metrics['desktop_icon'] + baseline_metrics['web_icon']) / 2
+                else:
+                    baseline_row['overall'] = baseline_metrics.get('overall', 0)
+                baseline_rows.append(baseline_row)
+            # Append baselines to table
+            if baseline_rows:
+                baseline_df = pd.DataFrame(baseline_rows)
+                table_df = pd.concat([table_df, baseline_df], ignore_index=True)
+        # Create and display the styled table
+        styled_table = create_results_table(table_df, selected_dataset)
+        if styled_table is not None:
+            st.dataframe(styled_table, use_container_width=True, hide_index=True)
+        else:
+            st.info("No data available for the selected models.")
+    else:
+        st.info("No detailed metrics available for this dataset.")
 if __name__ == "__main__":
     main()