Anas Awadalla
committed on
Commit
·
a860139
1
Parent(s):
98d976c
multi select models
Browse files- src/streamlit_app.py +133 -0
src/streamlit_app.py
CHANGED
@@ -384,6 +384,89 @@ def create_bar_chart(data: pd.DataFrame, metric: str, title: str):
|
|
384 |
|
385 |
return chart + text
|
386 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
387 |
def main():
|
388 |
st.title("π― Grounding Benchmark Leaderboard")
|
389 |
st.markdown("Visualization of model performance on grounding benchmarks")
|
@@ -508,6 +591,56 @@ def main():
|
|
508 |
st.altair_chart(chart, use_container_width=True)
|
509 |
else:
|
510 |
st.warning(f"No data available for {metric_options[selected_metric]}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
511 |
|
512 |
if __name__ == "__main__":
|
513 |
main()
|
|
|
384 |
|
385 |
return chart + text
|
386 |
|
387 |
+
def create_results_table(data: pd.DataFrame, dataset: str):
|
388 |
+
"""Create a formatted results table with best scores highlighted."""
|
389 |
+
if data.empty:
|
390 |
+
return None
|
391 |
+
|
392 |
+
# Copy data to avoid modifying original
|
393 |
+
table_data = data.copy()
|
394 |
+
|
395 |
+
# Remove columns we don't want to display
|
396 |
+
columns_to_drop = ['is_best_not_last', 'all_checkpoints']
|
397 |
+
table_data = table_data.drop(columns=[col for col in columns_to_drop if col in table_data.columns])
|
398 |
+
|
399 |
+
# Sort by overall score in descending order
|
400 |
+
if 'overall' in table_data.columns:
|
401 |
+
table_data = table_data.sort_values('overall', ascending=False)
|
402 |
+
|
403 |
+
# Determine which columns to show based on dataset
|
404 |
+
if dataset == 'screenspot-v2':
|
405 |
+
# Show all breakdown columns
|
406 |
+
column_order = ['model', 'desktop_text', 'desktop_icon', 'web_text', 'web_icon',
|
407 |
+
'desktop_avg', 'web_avg', 'text_avg', 'icon_avg', 'overall']
|
408 |
+
column_names = {
|
409 |
+
'model': 'Model',
|
410 |
+
'desktop_text': 'Desktop Text',
|
411 |
+
'desktop_icon': 'Desktop Icon',
|
412 |
+
'web_text': 'Web Text',
|
413 |
+
'web_icon': 'Web Icon',
|
414 |
+
'desktop_avg': 'Desktop Avg',
|
415 |
+
'web_avg': 'Web Avg',
|
416 |
+
'text_avg': 'Text Avg',
|
417 |
+
'icon_avg': 'Icon Avg',
|
418 |
+
'overall': 'Overall'
|
419 |
+
}
|
420 |
+
elif 'text' in table_data.columns and 'icon' in table_data.columns:
|
421 |
+
# Show text/icon breakdown
|
422 |
+
column_order = ['model', 'text', 'icon', 'overall']
|
423 |
+
column_names = {
|
424 |
+
'model': 'Model',
|
425 |
+
'text': 'Text',
|
426 |
+
'icon': 'Icon',
|
427 |
+
'overall': 'Overall'
|
428 |
+
}
|
429 |
+
else:
|
430 |
+
# Show only overall
|
431 |
+
column_order = ['model', 'overall']
|
432 |
+
column_names = {
|
433 |
+
'model': 'Model',
|
434 |
+
'overall': 'Overall'
|
435 |
+
}
|
436 |
+
|
437 |
+
# Filter and reorder columns
|
438 |
+
available_columns = [col for col in column_order if col in table_data.columns]
|
439 |
+
table_data = table_data[available_columns]
|
440 |
+
|
441 |
+
# Rename columns for display
|
442 |
+
table_data = table_data.rename(columns=column_names)
|
443 |
+
|
444 |
+
# Round numeric columns to 1 decimal place
|
445 |
+
numeric_columns = [col for col in table_data.columns if col != 'Model']
|
446 |
+
for col in numeric_columns:
|
447 |
+
if col in table_data.columns:
|
448 |
+
table_data[col] = table_data[col].round(1)
|
449 |
+
|
450 |
+
# Apply styling to highlight best scores
|
451 |
+
def highlight_best(s):
|
452 |
+
"""Highlight the best score in each column."""
|
453 |
+
if s.name == 'Model':
|
454 |
+
return [''] * len(s)
|
455 |
+
|
456 |
+
# Find the maximum value
|
457 |
+
max_val = s.max()
|
458 |
+
# Return style for each cell
|
459 |
+
return ['font-weight: bold; color: #2E7D32' if v == max_val else '' for v in s]
|
460 |
+
|
461 |
+
# Style the dataframe
|
462 |
+
styled_table = table_data.style.apply(highlight_best)
|
463 |
+
|
464 |
+
# Format numbers to show 1 decimal place
|
465 |
+
format_dict = {col: '{:.1f}' for col in numeric_columns if col in table_data.columns}
|
466 |
+
styled_table = styled_table.format(format_dict)
|
467 |
+
|
468 |
+
return styled_table
|
469 |
+
|
470 |
def main():
|
471 |
st.title("π― Grounding Benchmark Leaderboard")
|
472 |
st.markdown("Visualization of model performance on grounding benchmarks")
|
|
|
591 |
st.altair_chart(chart, use_container_width=True)
|
592 |
else:
|
593 |
st.warning(f"No data available for {metric_options[selected_metric]}")
|
594 |
+
|
595 |
+
# Display results table
|
596 |
+
st.subheader("π Results Table")
|
597 |
+
|
598 |
+
# Filter ui_metrics_df to only include selected models
|
599 |
+
if not ui_metrics_df.empty:
|
600 |
+
table_df = ui_metrics_df[ui_metrics_df['model'].isin(selected_models)].copy()
|
601 |
+
|
602 |
+
# Add baselines to the table if available
|
603 |
+
if selected_dataset in BASELINES:
|
604 |
+
baseline_rows = []
|
605 |
+
for baseline_name, baseline_metrics in BASELINES[selected_dataset].items():
|
606 |
+
baseline_row = {'model': f"{baseline_name} (baseline)"}
|
607 |
+
|
608 |
+
# Map baseline metrics to table columns
|
609 |
+
if selected_dataset == 'screenspot-v2':
|
610 |
+
baseline_row.update({
|
611 |
+
'desktop_text': baseline_metrics.get('desktop_text', 0),
|
612 |
+
'desktop_icon': baseline_metrics.get('desktop_icon', 0),
|
613 |
+
'web_text': baseline_metrics.get('web_text', 0),
|
614 |
+
'web_icon': baseline_metrics.get('web_icon', 0),
|
615 |
+
'overall': baseline_metrics.get('overall', 0)
|
616 |
+
})
|
617 |
+
# Calculate averages if not provided
|
618 |
+
if 'desktop_text' in baseline_metrics and 'desktop_icon' in baseline_metrics:
|
619 |
+
baseline_row['desktop_avg'] = (baseline_metrics['desktop_text'] + baseline_metrics['desktop_icon']) / 2
|
620 |
+
if 'web_text' in baseline_metrics and 'web_icon' in baseline_metrics:
|
621 |
+
baseline_row['web_avg'] = (baseline_metrics['web_text'] + baseline_metrics['web_icon']) / 2
|
622 |
+
if 'desktop_text' in baseline_metrics and 'web_text' in baseline_metrics:
|
623 |
+
baseline_row['text_avg'] = (baseline_metrics['desktop_text'] + baseline_metrics['web_text']) / 2
|
624 |
+
if 'desktop_icon' in baseline_metrics and 'web_icon' in baseline_metrics:
|
625 |
+
baseline_row['icon_avg'] = (baseline_metrics['desktop_icon'] + baseline_metrics['web_icon']) / 2
|
626 |
+
else:
|
627 |
+
baseline_row['overall'] = baseline_metrics.get('overall', 0)
|
628 |
+
|
629 |
+
baseline_rows.append(baseline_row)
|
630 |
+
|
631 |
+
# Append baselines to table
|
632 |
+
if baseline_rows:
|
633 |
+
baseline_df = pd.DataFrame(baseline_rows)
|
634 |
+
table_df = pd.concat([table_df, baseline_df], ignore_index=True)
|
635 |
+
|
636 |
+
# Create and display the styled table
|
637 |
+
styled_table = create_results_table(table_df, selected_dataset)
|
638 |
+
if styled_table is not None:
|
639 |
+
st.dataframe(styled_table, use_container_width=True, hide_index=True)
|
640 |
+
else:
|
641 |
+
st.info("No data available for the selected models.")
|
642 |
+
else:
|
643 |
+
st.info("No detailed metrics available for this dataset.")
|
644 |
|
645 |
if __name__ == "__main__":
|
646 |
main()
|