Anas Awadalla committed on
Commit
a860139
·
1 Parent(s): 98d976c

multi select models

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +133 -0
src/streamlit_app.py CHANGED
@@ -384,6 +384,89 @@ def create_bar_chart(data: pd.DataFrame, metric: str, title: str):
384
 
385
  return chart + text
386
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
387
  def main():
388
  st.title("🎯 Grounding Benchmark Leaderboard")
389
  st.markdown("Visualization of model performance on grounding benchmarks")
@@ -508,6 +591,56 @@ def main():
508
  st.altair_chart(chart, use_container_width=True)
509
  else:
510
  st.warning(f"No data available for {metric_options[selected_metric]}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
511
 
512
  if __name__ == "__main__":
513
  main()
 
384
 
385
  return chart + text
386
 
387
def create_results_table(data: pd.DataFrame, dataset: str):
    """Build a styled results table with the best score in each column highlighted.

    Args:
        data: Metrics frame with a ``model`` column and score columns such as
            ``overall``. The helper columns ``is_best_not_last`` and
            ``all_checkpoints`` are dropped if present. The input is not
            modified.
        dataset: Dataset key. ``'screenspot-v2'`` selects the full
            desktop/web/text/icon breakdown; for any other value the layout
            is chosen from the columns present in ``data``.

    Returns:
        A pandas ``Styler`` over the display table (rows sorted by ``overall``
        descending when that column exists, scores rounded to one decimal),
        or ``None`` when ``data`` is empty.
    """
    if data.empty:
        return None

    # ``drop`` returns a new frame, so the caller's data is left untouched.
    hidden_columns = ['is_best_not_last', 'all_checkpoints']
    table_data = data.drop(columns=[col for col in hidden_columns if col in data.columns])

    # Sort by overall score in descending order
    if 'overall' in table_data.columns:
        table_data = table_data.sort_values('overall', ascending=False)

    # Determine which columns to show based on dataset
    if dataset == 'screenspot-v2':
        # Show all breakdown columns
        column_names = {
            'model': 'Model',
            'desktop_text': 'Desktop Text',
            'desktop_icon': 'Desktop Icon',
            'web_text': 'Web Text',
            'web_icon': 'Web Icon',
            'desktop_avg': 'Desktop Avg',
            'web_avg': 'Web Avg',
            'text_avg': 'Text Avg',
            'icon_avg': 'Icon Avg',
            'overall': 'Overall',
        }
    elif 'text' in table_data.columns and 'icon' in table_data.columns:
        # Show text/icon breakdown
        column_names = {
            'model': 'Model',
            'text': 'Text',
            'icon': 'Icon',
            'overall': 'Overall',
        }
    else:
        # Show only overall
        column_names = {
            'model': 'Model',
            'overall': 'Overall',
        }

    # The mapping's insertion order doubles as the display order; keep only
    # the columns that actually exist in the data.
    available_columns = [col for col in column_names if col in table_data.columns]

    # Styler.apply needs a unique index, and filtered/sorted input may carry
    # arbitrary labels, so renumber the rows in their sorted order.
    table_data = (
        table_data[available_columns]
        .rename(columns=column_names)
        .reset_index(drop=True)
    )

    # Round every score column to 1 decimal place in a single call.
    numeric_columns = [col for col in table_data.columns if col != 'Model']
    table_data = table_data.round({col: 1 for col in numeric_columns})

    def highlight_best(s):
        """Return per-cell CSS that bolds the maximum value of a score column."""
        if s.name == 'Model':
            return [''] * len(s)

        # Compared on the rounded values, so ties after rounding are all marked.
        max_val = s.max()
        return ['font-weight: bold; color: #2E7D32' if v == max_val else '' for v in s]

    styled_table = table_data.style.apply(highlight_best)

    # Show 1 decimal place, and render missing metrics as a dash instead of
    # the literal text "nan".
    format_dict = {col: '{:.1f}' for col in numeric_columns}
    return styled_table.format(format_dict, na_rep='-')
469
+
470
  def main():
471
  st.title("🎯 Grounding Benchmark Leaderboard")
472
  st.markdown("Visualization of model performance on grounding benchmarks")
 
591
  st.altair_chart(chart, use_container_width=True)
592
  else:
593
  st.warning(f"No data available for {metric_options[selected_metric]}")
594
+
595
+ # Display results table
596
+ st.subheader("📊 Results Table")
597
+
598
+ # Filter ui_metrics_df to only include selected models
599
+ if not ui_metrics_df.empty:
600
+ table_df = ui_metrics_df[ui_metrics_df['model'].isin(selected_models)].copy()
601
+
602
+ # Add baselines to the table if available
603
+ if selected_dataset in BASELINES:
604
+ baseline_rows = []
605
+ for baseline_name, baseline_metrics in BASELINES[selected_dataset].items():
606
+ baseline_row = {'model': f"{baseline_name} (baseline)"}
607
+
608
+ # Map baseline metrics to table columns
609
+ if selected_dataset == 'screenspot-v2':
610
+ baseline_row.update({
611
+ 'desktop_text': baseline_metrics.get('desktop_text', 0),
612
+ 'desktop_icon': baseline_metrics.get('desktop_icon', 0),
613
+ 'web_text': baseline_metrics.get('web_text', 0),
614
+ 'web_icon': baseline_metrics.get('web_icon', 0),
615
+ 'overall': baseline_metrics.get('overall', 0)
616
+ })
617
+ # Calculate averages if not provided
618
+ if 'desktop_text' in baseline_metrics and 'desktop_icon' in baseline_metrics:
619
+ baseline_row['desktop_avg'] = (baseline_metrics['desktop_text'] + baseline_metrics['desktop_icon']) / 2
620
+ if 'web_text' in baseline_metrics and 'web_icon' in baseline_metrics:
621
+ baseline_row['web_avg'] = (baseline_metrics['web_text'] + baseline_metrics['web_icon']) / 2
622
+ if 'desktop_text' in baseline_metrics and 'web_text' in baseline_metrics:
623
+ baseline_row['text_avg'] = (baseline_metrics['desktop_text'] + baseline_metrics['web_text']) / 2
624
+ if 'desktop_icon' in baseline_metrics and 'web_icon' in baseline_metrics:
625
+ baseline_row['icon_avg'] = (baseline_metrics['desktop_icon'] + baseline_metrics['web_icon']) / 2
626
+ else:
627
+ baseline_row['overall'] = baseline_metrics.get('overall', 0)
628
+
629
+ baseline_rows.append(baseline_row)
630
+
631
+ # Append baselines to table
632
+ if baseline_rows:
633
+ baseline_df = pd.DataFrame(baseline_rows)
634
+ table_df = pd.concat([table_df, baseline_df], ignore_index=True)
635
+
636
+ # Create and display the styled table
637
+ styled_table = create_results_table(table_df, selected_dataset)
638
+ if styled_table is not None:
639
+ st.dataframe(styled_table, use_container_width=True, hide_index=True)
640
+ else:
641
+ st.info("No data available for the selected models.")
642
+ else:
643
+ st.info("No detailed metrics available for this dataset.")
644
 
645
  if __name__ == "__main__":
646
  main()