jmercat committed
Commit ec23e9b Β· 1 Parent(s): 5c0c5a9

Improvements in the performance chart; hopefully fixed missing online data

Files changed (1)
  1. app.py +145 -17
app.py CHANGED
@@ -702,47 +702,175 @@ def show_model_performance(df):
     # Model comparison
     st.subheader("Model Comparison")

+    # Benchmark selection for radar chart (always visible)
+    st.subheader("πŸ“Š Benchmark & Model Selection")
+
+    col1, col2 = st.columns([2, 1])
+
+    with col1:
+        available_benchmarks = list(df_display.columns)
+        default_benchmarks = available_benchmarks[:min(8, len(available_benchmarks))]  # Default to first 8 or all if fewer
+
+        selected_benchmarks_for_radar = st.multiselect(
+            "Select benchmarks for radar chart",
+            available_benchmarks,
+            default=default_benchmarks,
+            format_func=clean_benchmark_name,
+            help="Choose which benchmarks to display in the radar chart"
+        )
+
+    with col2:
+        complete_data_only = st.checkbox(
+            "Complete data only",
+            value=True,
+            help="Show only models that have data for ALL selected benchmarks"
+        )
+
+    # Filter available models based on benchmark selection and complete data requirement
+    if complete_data_only and selected_benchmarks_for_radar:
+        # Only show models that have data for all selected benchmarks
+        models_with_complete_data = []
+        for model in df_display.index:
+            has_all_data = True
+            for benchmark in selected_benchmarks_for_radar:
+                if pd.isna(df_display.loc[model, benchmark]):
+                    has_all_data = False
+                    break
+            if has_all_data:
+                models_with_complete_data.append(model)
+
+        available_models_for_selection = models_with_complete_data
+        models_info = f"({len(available_models_for_selection)} models with complete data)"
+    else:
+        available_models_for_selection = df_display.index.tolist()
+        models_info = f"({len(available_models_for_selection)} models total)"
+
+    # Model selection with filtered list
+    if available_models_for_selection:
+        # Get top performers from available models for default selection
+        available_model_avg_scores = df_display.loc[available_models_for_selection].mean(axis=1, skipna=True).sort_values(ascending=False)
+        default_selection = available_model_avg_scores.head(3).index.tolist()
+    else:
+        default_selection = []
+
     selected_models = st.multiselect(
-        "Select models to compare",
-        df_display.index.tolist(),
-        default=model_avg_scores.head(3).index.tolist()
+        f"Select models to compare {models_info}",
+        available_models_for_selection,
+        default=default_selection,
+        help="Models are filtered based on benchmark selection and complete data setting above"
     )

     if selected_models:
         comparison_data = df_display.loc[selected_models].T
         comparison_data.index = [clean_benchmark_name(idx) for idx in comparison_data.index]

-        # Radar chart
-        if len(selected_models) <= 5:  # Only for manageable number of models
+        # Performance Radar Chart
+        st.subheader("πŸ“Š Performance Radar Chart")
+
+        if not selected_benchmarks_for_radar:
+            st.info("Please select at least one benchmark above for the radar chart.")
+        elif len(selected_models) == 0:
+            st.info("Please select models above to see the radar chart comparison.")
+        elif len(selected_models) > 10:
+            st.warning(f"Too many models selected ({len(selected_models)}). Please select 10 or fewer models for the radar chart.")
+            st.info("πŸ’‘ **Tip**: Use the search box above to filter models, then select a smaller subset for comparison.")
+        else:
+            # Show radar chart for 1-10 models
             fig = go.Figure()

-            for model in selected_models:
-                model_data = df_display.loc[model].dropna()
-                benchmarks = [clean_benchmark_name(b) for b in model_data.index]
-                values = model_data.values.tolist()
+            # Use only selected benchmarks
+            clean_benchmark_names = [clean_benchmark_name(b) for b in selected_benchmarks_for_radar]
+
+            # Define colors for different models
+            colors_list = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
+                           '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
+
+            for i, model in enumerate(selected_models):
+                # Get model data for selected benchmarks only
+                model_scores = []
+                for benchmark in selected_benchmarks_for_radar:
+                    score = df_display.loc[model, benchmark]
+                    # Convert to float, use 0.0 for any remaining NaN values
+                    model_scores.append(0.0 if pd.isna(score) else float(score))

-                # Close the radar chart
-                values += values[:1]
-                benchmarks += benchmarks[:1]
+                # Close the radar chart by adding the first value at the end
+                radar_values = model_scores + [model_scores[0]]
+                radar_benchmarks = clean_benchmark_names + [clean_benchmark_names[0]]
+
+                # Create model name for legend (remove path prefix if present)
+                model_display_name = model.split('/')[-1] if '/' in model else model
+
+                # Use color from list, cycling if needed
+                model_color = colors_list[i % len(colors_list)]

                 fig.add_trace(go.Scatterpolar(
-                    r=values,
-                    theta=benchmarks,
+                    r=radar_values,
+                    theta=radar_benchmarks,
                     fill='toself',
-                    name=model.split('/')[-1]
+                    name=model_display_name,
+                    line_color=model_color,
+                    hovertemplate='<b>%{theta}</b><br>Score: %{r:.3f}<extra></extra>'
                 ))

+            # Calculate dynamic range for better visualization
+            all_values = []
+            for model in selected_models:
+                for benchmark in selected_benchmarks_for_radar:
+                    score = df_display.loc[model, benchmark]
+                    if not pd.isna(score):
+                        all_values.append(score)
+
+            if all_values:
+                min_val = min(all_values)
+                max_val = max(all_values)
+                # Add some padding
+                range_padding = (max_val - min_val) * 0.1
+                radar_min = max(0, min_val - range_padding)
+                radar_max = min(1, max_val + range_padding)
+            else:
+                radar_min, radar_max = 0, 1
+
+            # Adjust chart size based on number of models
+            chart_height = 600 if len(selected_models) <= 3 else 700
+
             fig.update_layout(
                 polar=dict(
                     radialaxis=dict(
                         visible=True,
-                        range=[0, 1]
+                        range=[radar_min, radar_max],
+                        tickformat='.2f'
                     )),
                 showlegend=True,
-                title="Model Performance Radar Chart"
+                title=f"Model Performance Radar Chart ({len(selected_benchmarks_for_radar)} benchmarks, {len(selected_models)} models)",
+                width=700,
+                height=chart_height
             )

             st.plotly_chart(fig, use_container_width=True)
+
+            # Add explanation about missing values (only if not using complete data only)
+            if not complete_data_only:
+                missing_info = []
+                for model in selected_models:
+                    missing_benchmarks = []
+                    for benchmark in selected_benchmarks_for_radar:
+                        if pd.isna(df_display.loc[model, benchmark]):
+                            missing_benchmarks.append(clean_benchmark_name(benchmark))
+                    if missing_benchmarks:
+                        missing_info.append(f"β€’ {model.split('/')[-1]}: {', '.join(missing_benchmarks)}")
+
+                if missing_info:
+                    with st.expander("ℹ️ Missing Data Information"):
+                        st.write("Missing values are shown as 0 in the radar chart:")
+                        for info in missing_info:
+                            st.write(info)
+            else:
+                # When complete data only is enabled, all selected models should have complete data
+                st.info("βœ… All selected models have complete data for the chosen benchmarks.")
+
+            # Performance tips for large selections
+            if len(selected_models) > 5:
+                st.info(f"πŸ’‘ **Viewing {len(selected_models)} models**: For better readability, consider selecting fewer models or use the detailed comparison table below.")

     # Detailed comparison table
     st.subheader("Detailed Comparison")