Improvements in performance chart, hopefully fixed online missing data
app.py (changed)
@@ -702,47 +702,175 @@ def show_model_performance(df):
     # Model comparison
     st.subheader("Model Comparison")

+    # Benchmark selection for radar chart (always visible)
+    st.subheader("📊 Benchmark & Model Selection")
+
+    col1, col2 = st.columns([2, 1])
+
+    with col1:
+        available_benchmarks = list(df_display.columns)
+        default_benchmarks = available_benchmarks[:min(8, len(available_benchmarks))]  # Default to first 8 or all if fewer
+
+        selected_benchmarks_for_radar = st.multiselect(
+            "Select benchmarks for radar chart",
+            available_benchmarks,
+            default=default_benchmarks,
+            format_func=clean_benchmark_name,
+            help="Choose which benchmarks to display in the radar chart"
+        )
+
+    with col2:
+        complete_data_only = st.checkbox(
+            "Complete data only",
+            value=True,
+            help="Show only models that have data for ALL selected benchmarks"
+        )
+
+    # Filter available models based on benchmark selection and complete data requirement
+    if complete_data_only and selected_benchmarks_for_radar:
+        # Only show models that have data for all selected benchmarks
+        models_with_complete_data = []
+        for model in df_display.index:
+            has_all_data = True
+            for benchmark in selected_benchmarks_for_radar:
+                if pd.isna(df_display.loc[model, benchmark]):
+                    has_all_data = False
+                    break
+            if has_all_data:
+                models_with_complete_data.append(model)
+
+        available_models_for_selection = models_with_complete_data
+        models_info = f"({len(available_models_for_selection)} models with complete data)"
+    else:
+        available_models_for_selection = df_display.index.tolist()
+        models_info = f"({len(available_models_for_selection)} models total)"
+
+    # Model selection with filtered list
+    if available_models_for_selection:
+        # Get top performers from available models for default selection
+        available_model_avg_scores = df_display.loc[available_models_for_selection].mean(axis=1, skipna=True).sort_values(ascending=False)
+        default_selection = available_model_avg_scores.head(3).index.tolist()
+    else:
+        default_selection = []
+
     selected_models = st.multiselect(
-        "Select models to compare",
-        default=
+        f"Select models to compare {models_info}",
+        available_models_for_selection,
+        default=default_selection,
+        help="Models are filtered based on benchmark selection and complete data setting above"
     )

     if selected_models:
         comparison_data = df_display.loc[selected_models].T
         comparison_data.index = [clean_benchmark_name(idx) for idx in comparison_data.index]

-        # Radar
+        # Performance Radar Chart
+        st.subheader("📊 Performance Radar Chart")
+
+        if not selected_benchmarks_for_radar:
+            st.info("Please select at least one benchmark above for the radar chart.")
+        elif len(selected_models) == 0:
+            st.info("Please select models above to see the radar chart comparison.")
+        elif len(selected_models) > 10:
+            st.warning(f"Too many models selected ({len(selected_models)}). Please select 10 or fewer models for the radar chart.")
+            st.info("💡 **Tip**: Use the search box above to filter models, then select a smaller subset for comparison.")
+        else:
+            # Show radar chart for 1-10 models
             fig = go.Figure()

+            # Use only selected benchmarks
+            clean_benchmark_names = [clean_benchmark_name(b) for b in selected_benchmarks_for_radar]
+
+            # Define colors for different models
+            colors_list = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
+                           '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
+
+            for i, model in enumerate(selected_models):
+                # Get model data for selected benchmarks only
+                model_scores = []
+                for benchmark in selected_benchmarks_for_radar:
+                    score = df_display.loc[model, benchmark]
+                    # Convert to float, use 0.0 for any remaining NaN values
+                    model_scores.append(0.0 if pd.isna(score) else float(score))

-                # Close the radar chart
+                # Close the radar chart by adding the first value at the end
+                radar_values = model_scores + [model_scores[0]]
+                radar_benchmarks = clean_benchmark_names + [clean_benchmark_names[0]]
+
+                # Create model name for legend (remove path prefix if present)
+                model_display_name = model.split('/')[-1] if '/' in model else model
+
+                # Use color from list, cycling if needed
+                model_color = colors_list[i % len(colors_list)]

                 fig.add_trace(go.Scatterpolar(
-                    r=
-                    theta=
+                    r=radar_values,
+                    theta=radar_benchmarks,
                     fill='toself',
-                    name=
+                    name=model_display_name,
+                    line_color=model_color,
+                    hovertemplate='<b>%{theta}</b><br>Score: %{r:.3f}<extra></extra>'
                 ))

+            # Calculate dynamic range for better visualization
+            all_values = []
+            for model in selected_models:
+                for benchmark in selected_benchmarks_for_radar:
+                    score = df_display.loc[model, benchmark]
+                    if not pd.isna(score):
+                        all_values.append(score)
+
+            if all_values:
+                min_val = min(all_values)
+                max_val = max(all_values)
+                # Add some padding
+                range_padding = (max_val - min_val) * 0.1
+                radar_min = max(0, min_val - range_padding)
+                radar_max = min(1, max_val + range_padding)
+            else:
+                radar_min, radar_max = 0, 1
+
+            # Adjust chart size based on number of models
+            chart_height = 600 if len(selected_models) <= 3 else 700
+
             fig.update_layout(
                 polar=dict(
                     radialaxis=dict(
                         visible=True,
-                        range=[
+                        range=[radar_min, radar_max],
+                        tickformat='.2f'
                     )),
                 showlegend=True,
-                title="Model Performance Radar Chart"
+                title=f"Model Performance Radar Chart ({len(selected_benchmarks_for_radar)} benchmarks, {len(selected_models)} models)",
+                width=700,
+                height=chart_height
             )

             st.plotly_chart(fig, use_container_width=True)
+
+            # Add explanation about missing values (only if not using complete data only)
+            if not complete_data_only:
+                missing_info = []
+                for model in selected_models:
+                    missing_benchmarks = []
+                    for benchmark in selected_benchmarks_for_radar:
+                        if pd.isna(df_display.loc[model, benchmark]):
+                            missing_benchmarks.append(clean_benchmark_name(benchmark))
+                    if missing_benchmarks:
+                        missing_info.append(f"• {model.split('/')[-1]}: {', '.join(missing_benchmarks)}")
+
+                if missing_info:
+                    with st.expander("ℹ️ Missing Data Information"):
+                        st.write("Missing values are shown as 0 in the radar chart:")
+                        for info in missing_info:
+                            st.write(info)
+            else:
+                # When complete data only is enabled, all selected models should have complete data
+                st.info("✅ All selected models have complete data for the chosen benchmarks.")
+
+            # Performance tips for large selections
+            if len(selected_models) > 5:
+                st.info(f"💡 **Viewing {len(selected_models)} models**: For better readability, consider selecting fewer models or use the detailed comparison table below.")

     # Detailed comparison table
     st.subheader("Detailed Comparison")
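The nested loop that builds models_with_complete_data is the piece that addresses the "missing data" problem in the commit title: models lacking a score for any selected benchmark are dropped before the radar chart is drawn, so missing values no longer appear as misleading zeros. Below is a minimal sketch of that completeness check, with a small made-up frame standing in for the real df_display (the column names and scores are illustrative only, not from the app's data):

import pandas as pd

# Illustrative stand-in for df_display: a models-by-benchmarks frame where NaN
# marks a missing score (names and values here are invented for the example).
df_display = pd.DataFrame(
    {"benchmark_a": [0.71, None, 0.65], "benchmark_b": [0.82, 0.44, None]},
    index=["org/model-a", "org/model-b", "org/model-c"],
)
selected_benchmarks_for_radar = ["benchmark_a", "benchmark_b"]

# Vectorized equivalent of the diff's nested loop: keep only models that have a
# value for every selected benchmark.
complete_mask = df_display[selected_benchmarks_for_radar].notna().all(axis=1)
models_with_complete_data = df_display.index[complete_mask].tolist()

print(models_with_complete_data)  # ['org/model-a']

With "Complete data only" unchecked, the app instead keeps every model and plots missing scores as 0, which is why the expander in the new code lists exactly which benchmark values were substituted.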
|