Spaces:

mlfoundations-cua-dev
/

leaderboard-viewer

Running

App Files Community

Anas Awadalla committed on Jul 24

Commit

4f35c65

1 Parent(s): 4db9f63

some fixes

Browse files

Files changed (1) hide show

src/streamlit_app.py +124 -59

src/streamlit_app.py CHANGED Viewed

@@ -153,6 +153,7 @@ def fetch_leaderboard_data():
                 # Extract UI type results if available
                 ui_type_results = detailed_results.get("by_ui_type", {})
                 dataset_type_results = detailed_results.get("by_dataset_type", {})
                 # Create a compact result entry (only keep what we need for visualization)
                 result_entry = {
@@ -167,7 +168,8 @@ def fetch_leaderboard_data():
                     "checkpoint_steps": metadata.get("checkpoint_steps"),
                     "training_loss": metadata.get("training_loss"),
                     "ui_type_results": ui_type_results,
-                    "dataset_type_results": dataset_type_results
                 }
                 results.append(result_entry)
@@ -239,55 +241,99 @@ def parse_ui_type_metrics(df: pd.DataFrame, dataset_filter: str) -> pd.DataFrame
         model = row['model']
         ui_results = row.get('ui_type_results', {})
         dataset_type_results = row.get('dataset_type_results', {})
-        # For ScreenSpot datasets, we have desktop/web and text/icon
         if 'screenspot' in dataset_filter.lower():
-            # First try to get from ui_type_results
-            desktop_text = ui_results.get('desktop_text', {}).get('correct', 0) / max(ui_results.get('desktop_text', {}).get('total', 1), 1) * 100
-            desktop_icon = ui_results.get('desktop_icon', {}).get('correct', 0) / max(ui_results.get('desktop_icon', {}).get('total', 1), 1) * 100
-            web_text = ui_results.get('web_text', {}).get('correct', 0) / max(ui_results.get('web_text', {}).get('total', 1), 1) * 100
-            web_icon = ui_results.get('web_icon', {}).get('correct', 0) / max(ui_results.get('web_icon', {}).get('total', 1), 1) * 100
-            # If all zeros, try to get from dataset_type_results
-            if desktop_text == 0 and desktop_icon == 0 and web_text == 0 and web_icon == 0:
-                # Check if data is nested under dataset types (e.g., 'screenspot-v2')
                 for dataset_key in dataset_type_results:
                     if 'screenspot' in dataset_key.lower():
                         dataset_data = dataset_type_results[dataset_key]
                         if 'by_ui_type' in dataset_data:
                             ui_data = dataset_data['by_ui_type']
-                            desktop_text = ui_data.get('desktop_text', {}).get('correct', 0) / max(ui_data.get('desktop_text', {}).get('total', 1), 1) * 100
-                            desktop_icon = ui_data.get('desktop_icon', {}).get('correct', 0) / max(ui_data.get('desktop_icon', {}).get('total', 1), 1) * 100
-                            web_text = ui_data.get('web_text', {}).get('correct', 0) / max(ui_data.get('web_text', {}).get('total', 1), 1) * 100
-                            web_icon = ui_data.get('web_icon', {}).get('correct', 0) / max(ui_data.get('web_icon', {}).get('total', 1), 1) * 100
                             break
-            # Calculate averages
-            desktop_avg = (desktop_text + desktop_icon) / 2 if (desktop_text > 0 or desktop_icon > 0) else 0
-            web_avg = (web_text + web_icon) / 2 if (web_text > 0 or web_icon > 0) else 0
-            text_avg = (desktop_text + web_text) / 2 if (desktop_text > 0 or web_text > 0) else 0
-            icon_avg = (desktop_icon + web_icon) / 2 if (desktop_icon > 0 or web_icon > 0) else 0
-            # For screenspot-v2, calculate the overall as average of desktop and web
-            if dataset_filter == 'screenspot-v2':
-                overall = (desktop_avg + web_avg) / 2 if (desktop_avg > 0 or web_avg > 0) else row['overall_accuracy']
-            else:
-                overall = row['overall_accuracy']
-            metrics_list.append({
-                'model': model,
-                'desktop_text': desktop_text,
-                'desktop_icon': desktop_icon,
-                'web_text': web_text,
-                'web_icon': web_icon,
-                'desktop_avg': desktop_avg,
-                'web_avg': web_avg,
-                'text_avg': text_avg,
-                'icon_avg': icon_avg,
-                'overall': overall,
-                'is_best_not_last': row.get('is_best_not_last', False),
-                'all_checkpoints': row.get('all_checkpoints', [])
-            })
         else:
             # For non-screenspot datasets, just pass through overall accuracy
             metrics_list.append({
@@ -595,26 +641,45 @@ def main():
                 for _, cp in checkpoint_df.iterrows():
                     ui_results = cp.get('ui_type_results', {})
                     dataset_type_results = cp.get('dataset_type_results', {})
-                    # First try to get from ui_type_results
-                    desktop_text = ui_results.get('desktop_text', {}).get('correct', 0) / max(ui_results.get('desktop_text', {}).get('total', 1), 1) * 100
-                    desktop_icon = ui_results.get('desktop_icon', {}).get('correct', 0) / max(ui_results.get('desktop_icon', {}).get('total', 1), 1) * 100
-                    web_text = ui_results.get('web_text', {}).get('correct', 0) / max(ui_results.get('web_text', {}).get('total', 1), 1) * 100
-                    web_icon = ui_results.get('web_icon', {}).get('correct', 0) / max(ui_results.get('web_icon', {}).get('total', 1), 1) * 100
-                    # If all zeros, try to get from dataset_type_results
-                    if desktop_text == 0 and desktop_icon == 0 and web_text == 0 and web_icon == 0:
-                        # Check if data is nested under dataset types
-                        for dataset_key in dataset_type_results:
-                            if 'screenspot' in dataset_key.lower():
-                                dataset_data = dataset_type_results[dataset_key]
-                                if 'by_ui_type' in dataset_data:
-                                    ui_data = dataset_data['by_ui_type']
-                                    desktop_text = ui_data.get('desktop_text', {}).get('correct', 0) / max(ui_data.get('desktop_text', {}).get('total', 1), 1) * 100
-                                    desktop_icon = ui_data.get('desktop_icon', {}).get('correct', 0) / max(ui_data.get('desktop_icon', {}).get('total', 1), 1) * 100
-                                    web_text = ui_data.get('web_text', {}).get('correct', 0) / max(ui_data.get('web_text', {}).get('total', 1), 1) * 100
-                                    web_icon = ui_data.get('web_icon', {}).get('correct', 0) / max(ui_data.get('web_icon', {}).get('total', 1), 1) * 100
-                                    break
                     desktop_avg = (desktop_text + desktop_icon) / 2
                     web_avg = (web_text + web_icon) / 2

                 # Extract UI type results if available
                 ui_type_results = detailed_results.get("by_ui_type", {})
                 dataset_type_results = detailed_results.get("by_dataset_type", {})
+                results_by_file = detailed_results.get("by_file", {})
                 # Create a compact result entry (only keep what we need for visualization)
                 result_entry = {
                     "checkpoint_steps": metadata.get("checkpoint_steps"),
                     "training_loss": metadata.get("training_loss"),
                     "ui_type_results": ui_type_results,
+                    "dataset_type_results": dataset_type_results,
+                    "results_by_file": results_by_file
                 }
                 results.append(result_entry)
         model = row['model']
         ui_results = row.get('ui_type_results', {})
         dataset_type_results = row.get('dataset_type_results', {})
+        results_by_file = row.get('results_by_file', {})
+        # For ScreenSpot datasets
         if 'screenspot' in dataset_filter.lower():
+            # Check if we have desktop/web breakdown in results_by_file
+            desktop_file = None
+            web_file = None
+            for filename, file_results in results_by_file.items():
+                if 'desktop' in filename.lower():
+                    desktop_file = file_results
+                elif 'web' in filename.lower():
+                    web_file = file_results
+            if desktop_file and web_file:
+                # We have desktop/web breakdown
+                desktop_text = desktop_file.get('by_ui_type', {}).get('text', {}).get('correct', 0) / max(desktop_file.get('by_ui_type', {}).get('text', {}).get('total', 1), 1) * 100
+                desktop_icon = desktop_file.get('by_ui_type', {}).get('icon', {}).get('correct', 0) / max(desktop_file.get('by_ui_type', {}).get('icon', {}).get('total', 1), 1) * 100
+                web_text = web_file.get('by_ui_type', {}).get('text', {}).get('correct', 0) / max(web_file.get('by_ui_type', {}).get('text', {}).get('total', 1), 1) * 100
+                web_icon = web_file.get('by_ui_type', {}).get('icon', {}).get('correct', 0) / max(web_file.get('by_ui_type', {}).get('icon', {}).get('total', 1), 1) * 100
+                # Calculate averages
+                desktop_avg = (desktop_text + desktop_icon) / 2 if (desktop_text > 0 or desktop_icon > 0) else 0
+                web_avg = (web_text + web_icon) / 2 if (web_text > 0 or web_icon > 0) else 0
+                text_avg = (desktop_text + web_text) / 2 if (desktop_text > 0 or web_text > 0) else 0
+                icon_avg = (desktop_icon + web_icon) / 2 if (desktop_icon > 0 or web_icon > 0) else 0
+                # For screenspot-v2, calculate the overall as average of desktop and web
+                if dataset_filter == 'screenspot-v2':
+                    overall = (desktop_avg + web_avg) / 2 if (desktop_avg > 0 or web_avg > 0) else row['overall_accuracy']
+                else:
+                    overall = row['overall_accuracy']
+                metrics_list.append({
+                    'model': model,
+                    'desktop_text': desktop_text,
+                    'desktop_icon': desktop_icon,
+                    'web_text': web_text,
+                    'web_icon': web_icon,
+                    'desktop_avg': desktop_avg,
+                    'web_avg': web_avg,
+                    'text_avg': text_avg,
+                    'icon_avg': icon_avg,
+                    'overall': overall,
+                    'is_best_not_last': row.get('is_best_not_last', False),
+                    'all_checkpoints': row.get('all_checkpoints', [])
+                })
+            elif 'text' in ui_results and 'icon' in ui_results:
+                # Simple text/icon structure without desktop/web breakdown
+                text_acc = (ui_results.get('text', {}).get('correct', 0) / max(ui_results.get('text', {}).get('total', 1), 1)) * 100
+                icon_acc = (ui_results.get('icon', {}).get('correct', 0) / max(ui_results.get('icon', {}).get('total', 1), 1)) * 100
+                metrics_list.append({
+                    'model': model,
+                    'text': text_acc,
+                    'icon': icon_acc,
+                    'overall': row['overall_accuracy'],
+                    'is_best_not_last': row.get('is_best_not_last', False),
+                    'all_checkpoints': row.get('all_checkpoints', [])
+                })
+            else:
+                # Try to get from dataset_type_results if available
+                found_data = False
                 for dataset_key in dataset_type_results:
                     if 'screenspot' in dataset_key.lower():
                         dataset_data = dataset_type_results[dataset_key]
                         if 'by_ui_type' in dataset_data:
                             ui_data = dataset_data['by_ui_type']
+                            text_data = ui_data.get('text', {})
+                            icon_data = ui_data.get('icon', {})
+                            text_acc = (text_data.get('correct', 0) / max(text_data.get('total', 1), 1)) * 100
+                            icon_acc = (icon_data.get('correct', 0) / max(icon_data.get('total', 1), 1)) * 100
+                            metrics_list.append({
+                                'model': model,
+                                'text': text_acc,
+                                'icon': icon_acc,
+                                'overall': row['overall_accuracy'],
+                                'is_best_not_last': row.get('is_best_not_last', False),
+                                'all_checkpoints': row.get('all_checkpoints', [])
+                            })
+                            found_data = True
                             break
+                if not found_data:
+                    # No UI type data available, just use overall
+                    metrics_list.append({
+                        'model': model,
+                        'overall': row['overall_accuracy'],
+                        'is_best_not_last': row.get('is_best_not_last', False),
+                        'all_checkpoints': row.get('all_checkpoints', [])
+                    })
         else:
             # For non-screenspot datasets, just pass through overall accuracy
             metrics_list.append({
                 for _, cp in checkpoint_df.iterrows():
                     ui_results = cp.get('ui_type_results', {})
                     dataset_type_results = cp.get('dataset_type_results', {})
+                    results_by_file = cp.get('results_by_file', {})
+                    # Check if we have desktop/web breakdown in results_by_file
+                    desktop_file = None
+                    web_file = None
+                    for filename, file_results in results_by_file.items():
+                        if 'desktop' in filename.lower():
+                            desktop_file = file_results
+                        elif 'web' in filename.lower():
+                            web_file = file_results
+                    if desktop_file and web_file:
+                        # We have desktop/web breakdown
+                        desktop_text = desktop_file.get('by_ui_type', {}).get('text', {}).get('correct', 0) / max(desktop_file.get('by_ui_type', {}).get('text', {}).get('total', 1), 1) * 100
+                        desktop_icon = desktop_file.get('by_ui_type', {}).get('icon', {}).get('correct', 0) / max(desktop_file.get('by_ui_type', {}).get('icon', {}).get('total', 1), 1) * 100
+                        web_text = web_file.get('by_ui_type', {}).get('text', {}).get('correct', 0) / max(web_file.get('by_ui_type', {}).get('text', {}).get('total', 1), 1) * 100
+                        web_icon = web_file.get('by_ui_type', {}).get('icon', {}).get('correct', 0) / max(web_file.get('by_ui_type', {}).get('icon', {}).get('total', 1), 1) * 100
+                    else:
+                        # Fallback to simple UI type results
+                        desktop_text = ui_results.get('desktop_text', {}).get('correct', 0) / max(ui_results.get('desktop_text', {}).get('total', 1), 1) * 100
+                        desktop_icon = ui_results.get('desktop_icon', {}).get('correct', 0) / max(ui_results.get('desktop_icon', {}).get('total', 1), 1) * 100
+                        web_text = ui_results.get('web_text', {}).get('correct', 0) / max(ui_results.get('web_text', {}).get('total', 1), 1) * 100
+                        web_icon = ui_results.get('web_icon', {}).get('correct', 0) / max(ui_results.get('web_icon', {}).get('total', 1), 1) * 100
+                        # If still all zeros, try dataset_type_results
+                        if desktop_text == 0 and desktop_icon == 0 and web_text == 0 and web_icon == 0:
+                            for dataset_key in dataset_type_results:
+                                if 'screenspot' in dataset_key.lower():
+                                    dataset_data = dataset_type_results[dataset_key]
+                                    if 'by_ui_type' in dataset_data:
+                                        ui_data = dataset_data['by_ui_type']
+                                        # For simple text/icon without desktop/web
+                                        text_val = ui_data.get('text', {}).get('correct', 0) / max(ui_data.get('text', {}).get('total', 1), 1) * 100
+                                        icon_val = ui_data.get('icon', {}).get('correct', 0) / max(ui_data.get('icon', {}).get('total', 1), 1) * 100
+                                        # Assign same values to desktop and web as we don't have the breakdown
+                                        desktop_text = web_text = text_val
+                                        desktop_icon = web_icon = icon_val
+                                        break
                     desktop_avg = (desktop_text + desktop_icon) / 2
                     web_avg = (web_text + web_icon) / 2