Anas Awadalla committed on
Commit
c148460
·
1 Parent(s): 41dce85

fix caching of elements

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +202 -214
src/streamlit_app.py CHANGED
@@ -54,7 +54,7 @@ BASELINES = {
54
  }
55
 
56
  @st.cache_data(ttl=300) # Cache for 5 minutes
57
- def fetch_leaderboard_data_cached():
58
  """Fetch all grounding results from HuggingFace leaderboard by streaming JSON files."""
59
  api = HfApi()
60
  fs = HfFileSystem()
@@ -66,8 +66,17 @@ def fetch_leaderboard_data_cached():
66
 
67
  results = []
68
 
 
 
 
 
69
  for idx, file_path in enumerate(grounding_files):
70
  try:
 
 
 
 
 
71
  # Stream the JSON file content directly from HuggingFace
72
  file_url = f"datasets/{REPO_ID}/{file_path}"
73
 
@@ -146,6 +155,10 @@ def fetch_leaderboard_data_cached():
146
  st.warning(f"Error loading {file_path}: {str(e)}")
147
  continue
148
 
 
 
 
 
149
  # Create DataFrame
150
  df = pd.DataFrame(results)
151
 
@@ -194,10 +207,6 @@ def fetch_leaderboard_data_cached():
194
  st.error(f"Error fetching leaderboard data: {str(e)}")
195
  return pd.DataFrame()
196
 
197
- def fetch_leaderboard_data():
198
- """Wrapper function to fetch leaderboard data with progress indicators."""
199
- return fetch_leaderboard_data_cached()
200
-
201
  def parse_ui_type_metrics(df: pd.DataFrame, dataset_filter: str) -> pd.DataFrame:
202
  """Parse UI type metrics from the results dataframe."""
203
  metrics_list = []
@@ -377,23 +386,9 @@ def main():
377
  st.title("🎯 Grounding Benchmark Leaderboard")
378
  st.markdown("Visualization of model performance on grounding benchmarks")
379
 
380
- # Initialize placeholders for dynamic content
381
- progress_placeholder = st.empty()
382
- header_placeholder = st.empty()
383
- metrics_placeholder = st.empty()
384
- metric_selector_placeholder = st.empty()
385
- info_placeholder = st.empty()
386
- main_chart_placeholder = st.empty()
387
- expandable_placeholder = st.empty()
388
- checkpoint_placeholder = st.empty()
389
-
390
  # Fetch data
391
- with progress_placeholder.container():
392
- with st.spinner("Loading leaderboard data..."):
393
- df = fetch_leaderboard_data()
394
-
395
- # Clear progress placeholder after loading
396
- progress_placeholder.empty()
397
 
398
  if df.empty:
399
  st.warning("No data available in the leaderboard.")
@@ -416,24 +411,29 @@ def main():
416
 
417
  if selected_model != 'All':
418
  filtered_df = filtered_df[filtered_df['model'] == selected_model]
419
-
 
 
 
 
 
 
 
420
  # Main content
421
- with header_placeholder.container():
422
- st.header(f"Results for {selected_dataset}")
423
 
424
  # Overall metrics
425
- with metrics_placeholder.container():
426
- col1, col2, col3 = st.columns(3)
427
- with col1:
428
- st.metric("Models Evaluated", len(filtered_df))
429
- with col2:
430
- if not filtered_df.empty:
431
- best_acc = filtered_df['overall_accuracy'].max()
432
- best_model = filtered_df[filtered_df['overall_accuracy'] == best_acc]['model'].iloc[0]
433
- st.metric("Best Overall Accuracy", f"{best_acc:.1f}%", help=f"Model: {best_model}")
434
- with col3:
435
- total_samples = filtered_df['total_samples'].sum()
436
- st.metric("Total Samples Evaluated", f"{total_samples:,}")
437
 
438
  # Parse UI type metrics
439
  ui_metrics_df = parse_ui_type_metrics(filtered_df, selected_dataset)
@@ -442,177 +442,172 @@ def main():
442
  selected_metric = 'overall' # Default metric
443
  if not ui_metrics_df.empty and 'screenspot' in selected_dataset.lower():
444
  # Metric selector dropdown
445
- with metric_selector_placeholder.container():
446
- if selected_dataset == 'screenspot-v2':
447
- metric_options = {
448
- 'overall': 'Overall Average (Desktop + Web) / 2',
449
- 'desktop_avg': 'Desktop Average',
450
- 'web_avg': 'Web Average',
451
- 'desktop_text': 'Desktop (Text)',
452
- 'desktop_icon': 'Desktop (Icon)',
453
- 'web_text': 'Web (Text)',
454
- 'web_icon': 'Web (Icon)',
455
- 'text_avg': 'Text Average',
456
- 'icon_avg': 'Icon Average'
457
- }
458
- elif selected_dataset in ['screenspot-pro', 'showdown-clicks']:
459
- # For screenspot-pro and showdown-clicks, only show overall average
460
- metric_options = {
461
- 'overall': 'Overall Average'
462
- }
463
- else:
464
- metric_options = {
465
- 'overall': 'Overall Average',
466
- 'desktop_avg': 'Desktop Average',
467
- 'web_avg': 'Web Average',
468
- 'text_avg': 'Text Average',
469
- 'icon_avg': 'Icon Average'
470
- }
471
-
472
- selected_metric = st.selectbox(
473
- "Select metric to visualize:",
474
- options=list(metric_options.keys()),
475
- format_func=lambda x: metric_options[x],
476
- key="metric_selector"
477
- )
478
 
479
  # Add note about asterisks
480
- with info_placeholder.container():
481
- if any(ui_metrics_df['is_best_not_last']):
482
- st.info("* indicates the best checkpoint is not the last checkpoint")
483
 
484
  # Create single chart for selected metric
485
- with main_chart_placeholder.container():
486
- chart = create_bar_chart(ui_metrics_df, selected_metric, metric_options[selected_metric])
487
- if chart:
488
- st.altair_chart(chart, use_container_width=True)
489
- else:
490
- st.warning(f"No data available for {metric_options[selected_metric]}")
491
 
492
  # Show all metrics in an expandable section - available for all datasets
493
- with expandable_placeholder.container():
494
- with st.expander("View All Metrics"):
495
- if selected_dataset == 'screenspot-v2':
496
- # First row: Overall, Desktop, Web averages
497
- col1, col2, col3 = st.columns(3)
498
-
499
- with col1:
500
- chart = create_bar_chart(ui_metrics_df, 'overall', 'Overall Average (Desktop + Web) / 2')
501
- if chart:
502
- st.altair_chart(chart, use_container_width=True)
503
-
504
- with col2:
505
- chart = create_bar_chart(ui_metrics_df, 'desktop_avg', 'Desktop Average')
506
- if chart:
507
- st.altair_chart(chart, use_container_width=True)
508
-
509
- with col3:
510
- chart = create_bar_chart(ui_metrics_df, 'web_avg', 'Web Average')
511
- if chart:
512
- st.altair_chart(chart, use_container_width=True)
513
-
514
- # Second row: Individual UI type metrics
515
- col1, col2, col3, col4 = st.columns(4)
516
-
517
- with col1:
518
- chart = create_bar_chart(ui_metrics_df, 'desktop_text', 'Desktop (Text)')
519
- if chart:
520
- st.altair_chart(chart, use_container_width=True)
521
-
522
- with col2:
523
- chart = create_bar_chart(ui_metrics_df, 'desktop_icon', 'Desktop (Icon)')
524
- if chart:
525
- st.altair_chart(chart, use_container_width=True)
526
-
527
- with col3:
528
- chart = create_bar_chart(ui_metrics_df, 'web_text', 'Web (Text)')
529
- if chart:
530
- st.altair_chart(chart, use_container_width=True)
531
-
532
- with col4:
533
- chart = create_bar_chart(ui_metrics_df, 'web_icon', 'Web (Icon)')
534
- if chart:
535
- st.altair_chart(chart, use_container_width=True)
536
-
537
- # Third row: Text vs Icon averages
538
- col1, col2 = st.columns(2)
539
-
540
- with col1:
541
- chart = create_bar_chart(ui_metrics_df, 'text_avg', 'Text Average (Desktop + Web)')
542
- if chart:
543
- st.altair_chart(chart, use_container_width=True)
544
-
545
- with col2:
546
- chart = create_bar_chart(ui_metrics_df, 'icon_avg', 'Icon Average (Desktop + Web)')
547
- if chart:
548
- st.altair_chart(chart, use_container_width=True)
549
- else:
550
- # For screenspot-pro and showdown-clicks
551
- st.info("No additional UI type metrics available for this dataset. Only overall accuracy is reported.")
552
 
553
  # Checkpoint progression visualization
554
- with checkpoint_placeholder.container():
555
- with st.expander("Checkpoint Progression Analysis"):
556
- # Select a model with checkpoints
557
- models_with_checkpoints = ui_metrics_df[ui_metrics_df['all_checkpoints'].apply(lambda x: len(x) > 1)]
558
-
559
- if not models_with_checkpoints.empty:
560
- selected_checkpoint_model = st.selectbox(
561
- "Select a model to view checkpoint progression:",
562
- models_with_checkpoints['model'].str.replace('*', '').unique()
563
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
564
 
565
- # Get checkpoint data for selected model
566
- model_row = models_with_checkpoints[models_with_checkpoints['model'].str.replace('*', '') == selected_checkpoint_model].iloc[0]
567
- checkpoint_data = model_row['all_checkpoints']
568
 
569
- # Create DataFrame from checkpoint data
570
- checkpoint_df = pd.DataFrame(checkpoint_data)
 
 
 
571
 
572
- # Prepare data for visualization
573
- checkpoint_metrics = []
574
- for _, cp in checkpoint_df.iterrows():
575
- ui_results = cp.get('ui_type_results', {})
576
- dataset_type_results = cp.get('dataset_type_results', {})
577
- results_by_file = cp.get('results_by_file', {})
578
-
579
- # Check if we have desktop/web breakdown in results_by_file
580
- desktop_file = None
581
- web_file = None
582
-
583
- for filename, file_results in results_by_file.items():
584
- if 'desktop' in filename.lower():
585
- desktop_file = file_results
586
- elif 'web' in filename.lower():
587
- web_file = file_results
588
 
589
- if desktop_file and web_file:
590
- # We have desktop/web breakdown
591
- desktop_text = desktop_file.get('by_ui_type', {}).get('text', {}).get('correct', 0) / max(desktop_file.get('by_ui_type', {}).get('text', {}).get('total', 1), 1) * 100
592
- desktop_icon = desktop_file.get('by_ui_type', {}).get('icon', {}).get('correct', 0) / max(desktop_file.get('by_ui_type', {}).get('icon', {}).get('total', 1), 1) * 100
593
- web_text = web_file.get('by_ui_type', {}).get('text', {}).get('correct', 0) / max(web_file.get('by_ui_type', {}).get('text', {}).get('total', 1), 1) * 100
594
- web_icon = web_file.get('by_ui_type', {}).get('icon', {}).get('correct', 0) / max(web_file.get('by_ui_type', {}).get('icon', {}).get('total', 1), 1) * 100
595
- else:
596
- # Fallback to simple UI type results
597
- desktop_text = ui_results.get('desktop_text', {}).get('correct', 0) / max(ui_results.get('desktop_text', {}).get('total', 1), 1) * 100
598
- desktop_icon = ui_results.get('desktop_icon', {}).get('correct', 0) / max(ui_results.get('desktop_icon', {}).get('total', 1), 1) * 100
599
- web_text = ui_results.get('web_text', {}).get('correct', 0) / max(ui_results.get('web_text', {}).get('total', 1), 1) * 100
600
- web_icon = ui_results.get('web_icon', {}).get('correct', 0) / max(ui_results.get('web_icon', {}).get('total', 1), 1) * 100
601
-
602
- # If still all zeros, try dataset_type_results
603
- if desktop_text == 0 and desktop_icon == 0 and web_text == 0 and web_icon == 0:
604
- for dataset_key in dataset_type_results:
605
- if 'screenspot' in dataset_key.lower():
606
- dataset_data = dataset_type_results[dataset_key]
607
- if 'by_ui_type' in dataset_data:
608
- ui_data = dataset_data['by_ui_type']
609
- # For simple text/icon without desktop/web
610
- text_val = ui_data.get('text', {}).get('correct', 0) / max(ui_data.get('text', {}).get('total', 1), 1) * 100
611
- icon_val = ui_data.get('icon', {}).get('correct', 0) / max(ui_data.get('icon', {}).get('total', 1), 1) * 100
612
- # Assign same values to desktop and web as we don't have the breakdown
613
- desktop_text = web_text = text_val
614
- desktop_icon = web_icon = icon_val
615
- break
616
 
617
  desktop_avg = (desktop_text + desktop_icon) / 2
618
  web_avg = (web_text + web_icon) / 2
@@ -765,26 +760,19 @@ def main():
765
 
766
  else:
767
  # For non-ScreenSpot datasets, show a simple bar chart
768
- # Clear unused placeholders
769
- metric_selector_placeholder.empty()
770
- info_placeholder.empty()
771
- expandable_placeholder.empty()
772
- checkpoint_placeholder.empty()
773
 
774
- with main_chart_placeholder.container():
775
- chart_data = filtered_df[['model', 'overall_accuracy']].copy()
776
- chart_data.columns = ['Model', 'Accuracy']
777
-
778
- chart = alt.Chart(chart_data).mark_bar().encode(
779
- x=alt.X('Model:N', sort='-y', axis=alt.Axis(labelAngle=-45)),
780
- y=alt.Y('Accuracy:Q', scale=alt.Scale(domain=[0, 100])),
781
- tooltip=['Model', 'Accuracy']
782
- ).properties(
783
- width=800,
784
- height=400
785
- )
786
-
787
- st.altair_chart(chart, use_container_width=True)
788
 
789
  if __name__ == "__main__":
790
  main()
 
54
  }
55
 
56
  @st.cache_data(ttl=300) # Cache for 5 minutes
57
+ def fetch_leaderboard_data():
58
  """Fetch all grounding results from HuggingFace leaderboard by streaming JSON files."""
59
  api = HfApi()
60
  fs = HfFileSystem()
 
66
 
67
  results = []
68
 
69
+ # Create progress bar for loading
70
+ progress_bar = st.progress(0)
71
+ status_text = st.empty()
72
+
73
  for idx, file_path in enumerate(grounding_files):
74
  try:
75
+ # Update progress
76
+ progress = (idx + 1) / len(grounding_files)
77
+ progress_bar.progress(progress)
78
+ status_text.text(f"Loading {idx + 1}/{len(grounding_files)} files...")
79
+
80
  # Stream the JSON file content directly from HuggingFace
81
  file_url = f"datasets/{REPO_ID}/{file_path}"
82
 
 
155
  st.warning(f"Error loading {file_path}: {str(e)}")
156
  continue
157
 
158
+ # Clear progress indicators
159
+ progress_bar.empty()
160
+ status_text.empty()
161
+
162
  # Create DataFrame
163
  df = pd.DataFrame(results)
164
 
 
207
  st.error(f"Error fetching leaderboard data: {str(e)}")
208
  return pd.DataFrame()
209
 
 
 
 
 
210
  def parse_ui_type_metrics(df: pd.DataFrame, dataset_filter: str) -> pd.DataFrame:
211
  """Parse UI type metrics from the results dataframe."""
212
  metrics_list = []
 
386
  st.title("🎯 Grounding Benchmark Leaderboard")
387
  st.markdown("Visualization of model performance on grounding benchmarks")
388
 
 
 
 
 
 
 
 
 
 
 
389
  # Fetch data
390
+ with st.spinner("Loading leaderboard data..."):
391
+ df = fetch_leaderboard_data()
 
 
 
 
392
 
393
  if df.empty:
394
  st.warning("No data available in the leaderboard.")
 
411
 
412
  if selected_model != 'All':
413
  filtered_df = filtered_df[filtered_df['model'] == selected_model]
414
+
415
+ # Create placeholders for components that update when dataset or metric changes
416
+ header_placeholder = st.empty()
417
+ metrics_placeholder = st.empty()
418
+ chart_placeholder = st.empty()
419
+ view_metrics_expander_placeholder = st.empty()
420
+ progression_expander_placeholder = st.empty()
421
+
422
  # Main content
423
+ header_placeholder.header(f"Results for {selected_dataset}")
 
424
 
425
  # Overall metrics
426
+ col1, col2, col3 = metrics_placeholder.columns(3)
427
+ with col1:
428
+ st.metric("Models Evaluated", len(filtered_df))
429
+ with col2:
430
+ if not filtered_df.empty:
431
+ best_acc = filtered_df['overall_accuracy'].max()
432
+ best_model = filtered_df[filtered_df['overall_accuracy'] == best_acc]['model'].iloc[0]
433
+ st.metric("Best Overall Accuracy", f"{best_acc:.1f}%", help=f"Model: {best_model}")
434
+ with col3:
435
+ total_samples = filtered_df['total_samples'].sum()
436
+ st.metric("Total Samples Evaluated", f"{total_samples:,}")
 
437
 
438
  # Parse UI type metrics
439
  ui_metrics_df = parse_ui_type_metrics(filtered_df, selected_dataset)
 
442
  selected_metric = 'overall' # Default metric
443
  if not ui_metrics_df.empty and 'screenspot' in selected_dataset.lower():
444
  # Metric selector dropdown
445
+ if selected_dataset == 'screenspot-v2':
446
+ metric_options = {
447
+ 'overall': 'Overall Average (Desktop + Web) / 2',
448
+ 'desktop_avg': 'Desktop Average',
449
+ 'web_avg': 'Web Average',
450
+ 'desktop_text': 'Desktop (Text)',
451
+ 'desktop_icon': 'Desktop (Icon)',
452
+ 'web_text': 'Web (Text)',
453
+ 'web_icon': 'Web (Icon)',
454
+ 'text_avg': 'Text Average',
455
+ 'icon_avg': 'Icon Average'
456
+ }
457
+ elif selected_dataset in ['screenspot-pro', 'showdown-clicks']:
458
+ # For screenspot-pro and showdown-clicks, only show overall average
459
+ metric_options = {
460
+ 'overall': 'Overall Average'
461
+ }
462
+ else:
463
+ metric_options = {
464
+ 'overall': 'Overall Average',
465
+ 'desktop_avg': 'Desktop Average',
466
+ 'web_avg': 'Web Average',
467
+ 'text_avg': 'Text Average',
468
+ 'icon_avg': 'Icon Average'
469
+ }
470
+
471
+ selected_metric = st.selectbox(
472
+ "Select metric to visualize:",
473
+ options=list(metric_options.keys()),
474
+ format_func=lambda x: metric_options[x],
475
+ key="metric_selector"
476
+ )
 
477
 
478
  # Add note about asterisks
479
+ if any(ui_metrics_df['is_best_not_last']):
480
+ st.info("* indicates the best checkpoint is not the last checkpoint")
 
481
 
482
  # Create single chart for selected metric
483
+ chart = create_bar_chart(ui_metrics_df, selected_metric, metric_options[selected_metric])
484
+ if chart:
485
+ chart_placeholder.altair_chart(chart, use_container_width=True)
486
+ else:
487
+ st.warning(f"No data available for {metric_options[selected_metric]}")
 
488
 
489
  # Show all metrics in an expandable section - available for all datasets
490
+ with view_metrics_expander_placeholder.expander("View All Metrics"):
491
+ if selected_dataset == 'screenspot-v2':
492
+ # First row: Overall, Desktop, Web averages
493
+ col1, col2, col3 = st.columns(3)
494
+
495
+ with col1:
496
+ chart = create_bar_chart(ui_metrics_df, 'overall', 'Overall Average (Desktop + Web) / 2')
497
+ if chart:
498
+ st.altair_chart(chart, use_container_width=True)
499
+
500
+ with col2:
501
+ chart = create_bar_chart(ui_metrics_df, 'desktop_avg', 'Desktop Average')
502
+ if chart:
503
+ st.altair_chart(chart, use_container_width=True)
504
+
505
+ with col3:
506
+ chart = create_bar_chart(ui_metrics_df, 'web_avg', 'Web Average')
507
+ if chart:
508
+ st.altair_chart(chart, use_container_width=True)
509
+
510
+ # Second row: Individual UI type metrics
511
+ col1, col2, col3, col4 = st.columns(4)
512
+
513
+ with col1:
514
+ chart = create_bar_chart(ui_metrics_df, 'desktop_text', 'Desktop (Text)')
515
+ if chart:
516
+ st.altair_chart(chart, use_container_width=True)
517
+
518
+ with col2:
519
+ chart = create_bar_chart(ui_metrics_df, 'desktop_icon', 'Desktop (Icon)')
520
+ if chart:
521
+ st.altair_chart(chart, use_container_width=True)
522
+
523
+ with col3:
524
+ chart = create_bar_chart(ui_metrics_df, 'web_text', 'Web (Text)')
525
+ if chart:
526
+ st.altair_chart(chart, use_container_width=True)
527
+
528
+ with col4:
529
+ chart = create_bar_chart(ui_metrics_df, 'web_icon', 'Web (Icon)')
530
+ if chart:
531
+ st.altair_chart(chart, use_container_width=True)
532
+
533
+ # Third row: Text vs Icon averages
534
+ col1, col2 = st.columns(2)
535
+
536
+ with col1:
537
+ chart = create_bar_chart(ui_metrics_df, 'text_avg', 'Text Average (Desktop + Web)')
538
+ if chart:
539
+ st.altair_chart(chart, use_container_width=True)
540
+
541
+ with col2:
542
+ chart = create_bar_chart(ui_metrics_df, 'icon_avg', 'Icon Average (Desktop + Web)')
543
+ if chart:
544
+ st.altair_chart(chart, use_container_width=True)
545
+ else:
546
+ # For screenspot-pro and showdown-clicks
547
+ st.info("No additional UI type metrics available for this dataset. Only overall accuracy is reported.")
 
548
 
549
  # Checkpoint progression visualization
550
+ with progression_expander_placeholder.expander("Checkpoint Progression Analysis"):
551
+ # Select a model with checkpoints
552
+ models_with_checkpoints = ui_metrics_df[ui_metrics_df['all_checkpoints'].apply(lambda x: len(x) > 1)]
553
+
554
+ if not models_with_checkpoints.empty:
555
+ selected_checkpoint_model = st.selectbox(
556
+ "Select a model to view checkpoint progression:",
557
+ models_with_checkpoints['model'].str.replace('*', '').unique()
558
+ )
559
+
560
+ # Get checkpoint data for selected model
561
+ model_row = models_with_checkpoints[models_with_checkpoints['model'].str.replace('*', '') == selected_checkpoint_model].iloc[0]
562
+ checkpoint_data = model_row['all_checkpoints']
563
+
564
+ # Create DataFrame from checkpoint data
565
+ checkpoint_df = pd.DataFrame(checkpoint_data)
566
+
567
+ # Prepare data for visualization
568
+ checkpoint_metrics = []
569
+ for _, cp in checkpoint_df.iterrows():
570
+ ui_results = cp.get('ui_type_results', {})
571
+ dataset_type_results = cp.get('dataset_type_results', {})
572
+ results_by_file = cp.get('results_by_file', {})
573
 
574
+ # Check if we have desktop/web breakdown in results_by_file
575
+ desktop_file = None
576
+ web_file = None
577
 
578
+ for filename, file_results in results_by_file.items():
579
+ if 'desktop' in filename.lower():
580
+ desktop_file = file_results
581
+ elif 'web' in filename.lower():
582
+ web_file = file_results
583
 
584
+ if desktop_file and web_file:
585
+ # We have desktop/web breakdown
586
+ desktop_text = desktop_file.get('by_ui_type', {}).get('text', {}).get('correct', 0) / max(desktop_file.get('by_ui_type', {}).get('text', {}).get('total', 1), 1) * 100
587
+ desktop_icon = desktop_file.get('by_ui_type', {}).get('icon', {}).get('correct', 0) / max(desktop_file.get('by_ui_type', {}).get('icon', {}).get('total', 1), 1) * 100
588
+ web_text = web_file.get('by_ui_type', {}).get('text', {}).get('correct', 0) / max(web_file.get('by_ui_type', {}).get('text', {}).get('total', 1), 1) * 100
589
+ web_icon = web_file.get('by_ui_type', {}).get('icon', {}).get('correct', 0) / max(web_file.get('by_ui_type', {}).get('icon', {}).get('total', 1), 1) * 100
590
+ else:
591
+ # Fallback to simple UI type results
592
+ desktop_text = ui_results.get('desktop_text', {}).get('correct', 0) / max(ui_results.get('desktop_text', {}).get('total', 1), 1) * 100
593
+ desktop_icon = ui_results.get('desktop_icon', {}).get('correct', 0) / max(ui_results.get('desktop_icon', {}).get('total', 1), 1) * 100
594
+ web_text = ui_results.get('web_text', {}).get('correct', 0) / max(ui_results.get('web_text', {}).get('total', 1), 1) * 100
595
+ web_icon = ui_results.get('web_icon', {}).get('correct', 0) / max(ui_results.get('web_icon', {}).get('total', 1), 1) * 100
 
 
 
 
596
 
597
+ # If still all zeros, try dataset_type_results
598
+ if desktop_text == 0 and desktop_icon == 0 and web_text == 0 and web_icon == 0:
599
+ for dataset_key in dataset_type_results:
600
+ if 'screenspot' in dataset_key.lower():
601
+ dataset_data = dataset_type_results[dataset_key]
602
+ if 'by_ui_type' in dataset_data:
603
+ ui_data = dataset_data['by_ui_type']
604
+ # For simple text/icon without desktop/web
605
+ text_val = ui_data.get('text', {}).get('correct', 0) / max(ui_data.get('text', {}).get('total', 1), 1) * 100
606
+ icon_val = ui_data.get('icon', {}).get('correct', 0) / max(ui_data.get('icon', {}).get('total', 1), 1) * 100
607
+ # Assign same values to desktop and web as we don't have the breakdown
608
+ desktop_text = web_text = text_val
609
+ desktop_icon = web_icon = icon_val
610
+ break
 
 
 
 
 
 
 
 
 
 
 
 
 
611
 
612
  desktop_avg = (desktop_text + desktop_icon) / 2
613
  web_avg = (web_text + web_icon) / 2
 
760
 
761
  else:
762
  # For non-ScreenSpot datasets, show a simple bar chart
763
+ chart_data = filtered_df[['model', 'overall_accuracy']].copy()
764
+ chart_data.columns = ['Model', 'Accuracy']
 
 
 
765
 
766
+ chart = alt.Chart(chart_data).mark_bar().encode(
767
+ x=alt.X('Model:N', sort='-y', axis=alt.Axis(labelAngle=-45)),
768
+ y=alt.Y('Accuracy:Q', scale=alt.Scale(domain=[0, 100])),
769
+ tooltip=['Model', 'Accuracy']
770
+ ).properties(
771
+ width=800,
772
+ height=400
773
+ )
774
+
775
+ chart_placeholder.altair_chart(chart, use_container_width=True)
 
 
 
 
776
 
777
  if __name__ == "__main__":
778
  main()