Anas Awadalla committed on
Commit
4f35c65
·
1 Parent(s): 4db9f63

some fixes

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +124 -59
src/streamlit_app.py CHANGED
@@ -153,6 +153,7 @@ def fetch_leaderboard_data():
153
  # Extract UI type results if available
154
  ui_type_results = detailed_results.get("by_ui_type", {})
155
  dataset_type_results = detailed_results.get("by_dataset_type", {})
 
156
 
157
  # Create a compact result entry (only keep what we need for visualization)
158
  result_entry = {
@@ -167,7 +168,8 @@ def fetch_leaderboard_data():
167
  "checkpoint_steps": metadata.get("checkpoint_steps"),
168
  "training_loss": metadata.get("training_loss"),
169
  "ui_type_results": ui_type_results,
170
- "dataset_type_results": dataset_type_results
 
171
  }
172
 
173
  results.append(result_entry)
@@ -239,55 +241,99 @@ def parse_ui_type_metrics(df: pd.DataFrame, dataset_filter: str) -> pd.DataFrame
239
  model = row['model']
240
  ui_results = row.get('ui_type_results', {})
241
  dataset_type_results = row.get('dataset_type_results', {})
 
242
 
243
- # For ScreenSpot datasets, we have desktop/web and text/icon
244
  if 'screenspot' in dataset_filter.lower():
245
- # First try to get from ui_type_results
246
- desktop_text = ui_results.get('desktop_text', {}).get('correct', 0) / max(ui_results.get('desktop_text', {}).get('total', 1), 1) * 100
247
- desktop_icon = ui_results.get('desktop_icon', {}).get('correct', 0) / max(ui_results.get('desktop_icon', {}).get('total', 1), 1) * 100
248
- web_text = ui_results.get('web_text', {}).get('correct', 0) / max(ui_results.get('web_text', {}).get('total', 1), 1) * 100
249
- web_icon = ui_results.get('web_icon', {}).get('correct', 0) / max(ui_results.get('web_icon', {}).get('total', 1), 1) * 100
250
 
251
- # If all zeros, try to get from dataset_type_results
252
- if desktop_text == 0 and desktop_icon == 0 and web_text == 0 and web_icon == 0:
253
- # Check if data is nested under dataset types (e.g., 'screenspot-v2')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
254
  for dataset_key in dataset_type_results:
255
  if 'screenspot' in dataset_key.lower():
256
  dataset_data = dataset_type_results[dataset_key]
257
  if 'by_ui_type' in dataset_data:
258
  ui_data = dataset_data['by_ui_type']
259
- desktop_text = ui_data.get('desktop_text', {}).get('correct', 0) / max(ui_data.get('desktop_text', {}).get('total', 1), 1) * 100
260
- desktop_icon = ui_data.get('desktop_icon', {}).get('correct', 0) / max(ui_data.get('desktop_icon', {}).get('total', 1), 1) * 100
261
- web_text = ui_data.get('web_text', {}).get('correct', 0) / max(ui_data.get('web_text', {}).get('total', 1), 1) * 100
262
- web_icon = ui_data.get('web_icon', {}).get('correct', 0) / max(ui_data.get('web_icon', {}).get('total', 1), 1) * 100
 
 
 
 
 
 
 
 
 
 
 
263
  break
264
-
265
- # Calculate averages
266
- desktop_avg = (desktop_text + desktop_icon) / 2 if (desktop_text > 0 or desktop_icon > 0) else 0
267
- web_avg = (web_text + web_icon) / 2 if (web_text > 0 or web_icon > 0) else 0
268
- text_avg = (desktop_text + web_text) / 2 if (desktop_text > 0 or web_text > 0) else 0
269
- icon_avg = (desktop_icon + web_icon) / 2 if (desktop_icon > 0 or web_icon > 0) else 0
270
-
271
- # For screenspot-v2, calculate the overall as average of desktop and web
272
- if dataset_filter == 'screenspot-v2':
273
- overall = (desktop_avg + web_avg) / 2 if (desktop_avg > 0 or web_avg > 0) else row['overall_accuracy']
274
- else:
275
- overall = row['overall_accuracy']
276
-
277
- metrics_list.append({
278
- 'model': model,
279
- 'desktop_text': desktop_text,
280
- 'desktop_icon': desktop_icon,
281
- 'web_text': web_text,
282
- 'web_icon': web_icon,
283
- 'desktop_avg': desktop_avg,
284
- 'web_avg': web_avg,
285
- 'text_avg': text_avg,
286
- 'icon_avg': icon_avg,
287
- 'overall': overall,
288
- 'is_best_not_last': row.get('is_best_not_last', False),
289
- 'all_checkpoints': row.get('all_checkpoints', [])
290
- })
291
  else:
292
  # For non-screenspot datasets, just pass through overall accuracy
293
  metrics_list.append({
@@ -595,26 +641,45 @@ def main():
595
  for _, cp in checkpoint_df.iterrows():
596
  ui_results = cp.get('ui_type_results', {})
597
  dataset_type_results = cp.get('dataset_type_results', {})
 
 
 
 
 
598
 
599
- # First try to get from ui_type_results
600
- desktop_text = ui_results.get('desktop_text', {}).get('correct', 0) / max(ui_results.get('desktop_text', {}).get('total', 1), 1) * 100
601
- desktop_icon = ui_results.get('desktop_icon', {}).get('correct', 0) / max(ui_results.get('desktop_icon', {}).get('total', 1), 1) * 100
602
- web_text = ui_results.get('web_text', {}).get('correct', 0) / max(ui_results.get('web_text', {}).get('total', 1), 1) * 100
603
- web_icon = ui_results.get('web_icon', {}).get('correct', 0) / max(ui_results.get('web_icon', {}).get('total', 1), 1) * 100
604
 
605
- # If all zeros, try to get from dataset_type_results
606
- if desktop_text == 0 and desktop_icon == 0 and web_text == 0 and web_icon == 0:
607
- # Check if data is nested under dataset types
608
- for dataset_key in dataset_type_results:
609
- if 'screenspot' in dataset_key.lower():
610
- dataset_data = dataset_type_results[dataset_key]
611
- if 'by_ui_type' in dataset_data:
612
- ui_data = dataset_data['by_ui_type']
613
- desktop_text = ui_data.get('desktop_text', {}).get('correct', 0) / max(ui_data.get('desktop_text', {}).get('total', 1), 1) * 100
614
- desktop_icon = ui_data.get('desktop_icon', {}).get('correct', 0) / max(ui_data.get('desktop_icon', {}).get('total', 1), 1) * 100
615
- web_text = ui_data.get('web_text', {}).get('correct', 0) / max(ui_data.get('web_text', {}).get('total', 1), 1) * 100
616
- web_icon = ui_data.get('web_icon', {}).get('correct', 0) / max(ui_data.get('web_icon', {}).get('total', 1), 1) * 100
617
- break
 
 
 
 
 
 
 
 
 
 
 
 
 
 
618
 
619
  desktop_avg = (desktop_text + desktop_icon) / 2
620
  web_avg = (web_text + web_icon) / 2
 
153
  # Extract UI type results if available
154
  ui_type_results = detailed_results.get("by_ui_type", {})
155
  dataset_type_results = detailed_results.get("by_dataset_type", {})
156
+ results_by_file = detailed_results.get("by_file", {})
157
 
158
  # Create a compact result entry (only keep what we need for visualization)
159
  result_entry = {
 
168
  "checkpoint_steps": metadata.get("checkpoint_steps"),
169
  "training_loss": metadata.get("training_loss"),
170
  "ui_type_results": ui_type_results,
171
+ "dataset_type_results": dataset_type_results,
172
+ "results_by_file": results_by_file
173
  }
174
 
175
  results.append(result_entry)
 
241
  model = row['model']
242
  ui_results = row.get('ui_type_results', {})
243
  dataset_type_results = row.get('dataset_type_results', {})
244
+ results_by_file = row.get('results_by_file', {})
245
 
246
+ # For ScreenSpot datasets
247
  if 'screenspot' in dataset_filter.lower():
248
+ # Check if we have desktop/web breakdown in results_by_file
249
+ desktop_file = None
250
+ web_file = None
 
 
251
 
252
+ for filename, file_results in results_by_file.items():
253
+ if 'desktop' in filename.lower():
254
+ desktop_file = file_results
255
+ elif 'web' in filename.lower():
256
+ web_file = file_results
257
+
258
+ if desktop_file and web_file:
259
+ # We have desktop/web breakdown
260
+ desktop_text = desktop_file.get('by_ui_type', {}).get('text', {}).get('correct', 0) / max(desktop_file.get('by_ui_type', {}).get('text', {}).get('total', 1), 1) * 100
261
+ desktop_icon = desktop_file.get('by_ui_type', {}).get('icon', {}).get('correct', 0) / max(desktop_file.get('by_ui_type', {}).get('icon', {}).get('total', 1), 1) * 100
262
+ web_text = web_file.get('by_ui_type', {}).get('text', {}).get('correct', 0) / max(web_file.get('by_ui_type', {}).get('text', {}).get('total', 1), 1) * 100
263
+ web_icon = web_file.get('by_ui_type', {}).get('icon', {}).get('correct', 0) / max(web_file.get('by_ui_type', {}).get('icon', {}).get('total', 1), 1) * 100
264
+
265
+ # Calculate averages
266
+ desktop_avg = (desktop_text + desktop_icon) / 2 if (desktop_text > 0 or desktop_icon > 0) else 0
267
+ web_avg = (web_text + web_icon) / 2 if (web_text > 0 or web_icon > 0) else 0
268
+ text_avg = (desktop_text + web_text) / 2 if (desktop_text > 0 or web_text > 0) else 0
269
+ icon_avg = (desktop_icon + web_icon) / 2 if (desktop_icon > 0 or web_icon > 0) else 0
270
+
271
+ # For screenspot-v2, calculate the overall as average of desktop and web
272
+ if dataset_filter == 'screenspot-v2':
273
+ overall = (desktop_avg + web_avg) / 2 if (desktop_avg > 0 or web_avg > 0) else row['overall_accuracy']
274
+ else:
275
+ overall = row['overall_accuracy']
276
+
277
+ metrics_list.append({
278
+ 'model': model,
279
+ 'desktop_text': desktop_text,
280
+ 'desktop_icon': desktop_icon,
281
+ 'web_text': web_text,
282
+ 'web_icon': web_icon,
283
+ 'desktop_avg': desktop_avg,
284
+ 'web_avg': web_avg,
285
+ 'text_avg': text_avg,
286
+ 'icon_avg': icon_avg,
287
+ 'overall': overall,
288
+ 'is_best_not_last': row.get('is_best_not_last', False),
289
+ 'all_checkpoints': row.get('all_checkpoints', [])
290
+ })
291
+ elif 'text' in ui_results and 'icon' in ui_results:
292
+ # Simple text/icon structure without desktop/web breakdown
293
+ text_acc = (ui_results.get('text', {}).get('correct', 0) / max(ui_results.get('text', {}).get('total', 1), 1)) * 100
294
+ icon_acc = (ui_results.get('icon', {}).get('correct', 0) / max(ui_results.get('icon', {}).get('total', 1), 1)) * 100
295
+
296
+ metrics_list.append({
297
+ 'model': model,
298
+ 'text': text_acc,
299
+ 'icon': icon_acc,
300
+ 'overall': row['overall_accuracy'],
301
+ 'is_best_not_last': row.get('is_best_not_last', False),
302
+ 'all_checkpoints': row.get('all_checkpoints', [])
303
+ })
304
+ else:
305
+ # Try to get from dataset_type_results if available
306
+ found_data = False
307
  for dataset_key in dataset_type_results:
308
  if 'screenspot' in dataset_key.lower():
309
  dataset_data = dataset_type_results[dataset_key]
310
  if 'by_ui_type' in dataset_data:
311
  ui_data = dataset_data['by_ui_type']
312
+ text_data = ui_data.get('text', {})
313
+ icon_data = ui_data.get('icon', {})
314
+
315
+ text_acc = (text_data.get('correct', 0) / max(text_data.get('total', 1), 1)) * 100
316
+ icon_acc = (icon_data.get('correct', 0) / max(icon_data.get('total', 1), 1)) * 100
317
+
318
+ metrics_list.append({
319
+ 'model': model,
320
+ 'text': text_acc,
321
+ 'icon': icon_acc,
322
+ 'overall': row['overall_accuracy'],
323
+ 'is_best_not_last': row.get('is_best_not_last', False),
324
+ 'all_checkpoints': row.get('all_checkpoints', [])
325
+ })
326
+ found_data = True
327
  break
328
+
329
+ if not found_data:
330
+ # No UI type data available, just use overall
331
+ metrics_list.append({
332
+ 'model': model,
333
+ 'overall': row['overall_accuracy'],
334
+ 'is_best_not_last': row.get('is_best_not_last', False),
335
+ 'all_checkpoints': row.get('all_checkpoints', [])
336
+ })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
337
  else:
338
  # For non-screenspot datasets, just pass through overall accuracy
339
  metrics_list.append({
 
641
  for _, cp in checkpoint_df.iterrows():
642
  ui_results = cp.get('ui_type_results', {})
643
  dataset_type_results = cp.get('dataset_type_results', {})
644
+ results_by_file = cp.get('results_by_file', {})
645
+
646
+ # Check if we have desktop/web breakdown in results_by_file
647
+ desktop_file = None
648
+ web_file = None
649
 
650
+ for filename, file_results in results_by_file.items():
651
+ if 'desktop' in filename.lower():
652
+ desktop_file = file_results
653
+ elif 'web' in filename.lower():
654
+ web_file = file_results
655
 
656
+ if desktop_file and web_file:
657
+ # We have desktop/web breakdown
658
+ desktop_text = desktop_file.get('by_ui_type', {}).get('text', {}).get('correct', 0) / max(desktop_file.get('by_ui_type', {}).get('text', {}).get('total', 1), 1) * 100
659
+ desktop_icon = desktop_file.get('by_ui_type', {}).get('icon', {}).get('correct', 0) / max(desktop_file.get('by_ui_type', {}).get('icon', {}).get('total', 1), 1) * 100
660
+ web_text = web_file.get('by_ui_type', {}).get('text', {}).get('correct', 0) / max(web_file.get('by_ui_type', {}).get('text', {}).get('total', 1), 1) * 100
661
+ web_icon = web_file.get('by_ui_type', {}).get('icon', {}).get('correct', 0) / max(web_file.get('by_ui_type', {}).get('icon', {}).get('total', 1), 1) * 100
662
+ else:
663
+ # Fallback to simple UI type results
664
+ desktop_text = ui_results.get('desktop_text', {}).get('correct', 0) / max(ui_results.get('desktop_text', {}).get('total', 1), 1) * 100
665
+ desktop_icon = ui_results.get('desktop_icon', {}).get('correct', 0) / max(ui_results.get('desktop_icon', {}).get('total', 1), 1) * 100
666
+ web_text = ui_results.get('web_text', {}).get('correct', 0) / max(ui_results.get('web_text', {}).get('total', 1), 1) * 100
667
+ web_icon = ui_results.get('web_icon', {}).get('correct', 0) / max(ui_results.get('web_icon', {}).get('total', 1), 1) * 100
668
+
669
+ # If still all zeros, try dataset_type_results
670
+ if desktop_text == 0 and desktop_icon == 0 and web_text == 0 and web_icon == 0:
671
+ for dataset_key in dataset_type_results:
672
+ if 'screenspot' in dataset_key.lower():
673
+ dataset_data = dataset_type_results[dataset_key]
674
+ if 'by_ui_type' in dataset_data:
675
+ ui_data = dataset_data['by_ui_type']
676
+ # For simple text/icon without desktop/web
677
+ text_val = ui_data.get('text', {}).get('correct', 0) / max(ui_data.get('text', {}).get('total', 1), 1) * 100
678
+ icon_val = ui_data.get('icon', {}).get('correct', 0) / max(ui_data.get('icon', {}).get('total', 1), 1) * 100
679
+ # Assign same values to desktop and web as we don't have the breakdown
680
+ desktop_text = web_text = text_val
681
+ desktop_icon = web_icon = icon_val
682
+ break
683
 
684
  desktop_avg = (desktop_text + desktop_icon) / 2
685
  web_avg = (web_text + web_icon) / 2