jmercat committed
Commit 217c20b · 1 Parent(s): b1b519f

Fixed model ranking to estimate missing ranks
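In short: for a model that is missing a benchmark, the new code predicts its rank as a weighted average of its ranks on correlated benchmarks, using the absolute rank correlations (above a threshold) as weights. A minimal sketch of that estimate with toy numbers (illustrative only, not from the app's data):

    # A model is unranked on benchmark X, but ranks 4th and 6th on two
    # benchmarks whose rank correlations with X are 0.8 and 0.4.
    import numpy as np
    ranks = np.array([4, 6])
    corrs = np.array([0.8, 0.4])
    weights = corrs / corrs.sum()                         # normalize, as in estimate_missing_ranks()
    estimated_rank = np.average(ranks, weights=weights)   # (0.8*4 + 0.4*6) / 1.2 ~= 4.67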

Files changed (1):
  1. app.py +236 -15
app.py CHANGED
@@ -334,6 +334,133 @@ def filter_target_benchmarks(df):
 
     return df[available_benchmarks].copy()
 
+def estimate_missing_ranks(df, method='spearman', min_corr=0.3, min_benchmarks=3):
+    """
+    Estimate missing benchmark ranks using rank correlation-based imputation.
+
+    Args:
+        df: DataFrame with models as rows and benchmarks as columns
+        method: Rank correlation method ('spearman' or 'kendall')
+        min_corr: Minimum correlation threshold to use for prediction
+        min_benchmarks: Minimum number of benchmarks needed for prediction
+
+    Returns:
+        DataFrame with estimated ranks filled in
+    """
+    # Convert scores to ranks (higher score = better rank = lower rank number)
+    df_ranks = df.rank(method='min', ascending=False, na_option='keep')
+    df_ranks_imputed = df_ranks.copy()
+
+    # Compute rank correlation matrix
+    if method == 'spearman':
+        rank_corr_matrix = df_ranks.corr(method='spearman')
+    elif method == 'kendall':
+        rank_corr_matrix = df_ranks.corr(method='kendall')
+    else:
+        rank_corr_matrix = df_ranks.corr(method='pearson')  # fallback
+
+    # For each model and benchmark combination with missing data
+    for model_idx in df.index:
+        for benchmark in df.columns:
+            if pd.isna(df_ranks.loc[model_idx, benchmark]):
+                # Find benchmarks this model has ranks for
+                available_benchmarks = df_ranks.columns[df_ranks.loc[model_idx].notna()].tolist()
+
+                if len(available_benchmarks) >= min_benchmarks:
+                    # Get rank correlations between target benchmark and available benchmarks
+                    correlations = []
+                    ranks = []
+
+                    for avail_bench in available_benchmarks:
+                        corr_val = rank_corr_matrix.loc[benchmark, avail_bench]
+                        if not pd.isna(corr_val) and abs(corr_val) >= min_corr:
+                            correlations.append(abs(corr_val))  # Use absolute correlation as weight
+                            ranks.append(df_ranks.loc[model_idx, avail_bench])
+
+                    if len(correlations) > 0:
+                        # Weighted average of ranks using correlations as weights
+                        correlations = np.array(correlations)
+                        ranks = np.array(ranks)
+
+                        # Normalize weights
+                        weights = correlations / correlations.sum()
+                        estimated_rank = np.average(ranks, weights=weights)
+
+                        df_ranks_imputed.loc[model_idx, benchmark] = estimated_rank
+
+    return df_ranks_imputed
+
+def create_consensus_ranking(df, method='spearman', use_rank_imputation=True):
+    """
+    Create a consensus ranking using rank correlation-based estimation.
+
+    Returns:
+        tuple: (ranking_df, rank_matrix, metadata)
+    """
+    if use_rank_imputation:
+        # Estimate missing ranks
+        df_ranks = estimate_missing_ranks(df, method)
+
+        # Calculate consensus rank for each model (median rank across all benchmarks)
+        consensus_ranks = df_ranks.median(axis=1, skipna=True)
+
+        # Calculate coverage and estimation statistics
+        original_coverage = df.notna().sum(axis=1)
+        imputed_coverage = df_ranks.notna().sum(axis=1)
+        estimated_count = imputed_coverage - original_coverage
+
+        # Create ranking dataframe
+        ranking_data = []
+        for model in df.index:
+            ranking_data.append({
+                'Model': model.split('/')[-1] if '/' in model else model,
+                'Full_Model_Name': model,
+                'Consensus_Rank': float(consensus_ranks[model]),
+                'Original_Benchmarks': int(original_coverage[model]),
+                'Total_Benchmarks': int(imputed_coverage[model]),
+                'Estimated_Ranks': int(estimated_count[model]),
+                'Coverage_Pct': float(original_coverage[model] / len(df.columns) * 100)
+            })
+
+        ranking_df = pd.DataFrame(ranking_data).sort_values('Consensus_Rank', ascending=True)  # Lower rank = better
+
+        metadata = {
+            'method': method,
+            'imputation_used': True,
+            'total_estimates': int(estimated_count.sum()),
+            'models_with_estimates': int((estimated_count > 0).sum()),
+            'ranking_method': 'consensus_rank'
+        }
+
+    else:
+        # Simple ranking based on available data only
+        df_ranks = df.rank(method='min', ascending=False, na_option='keep')
+        median_ranks = df_ranks.median(axis=1, skipna=True)
+
+        ranking_data = []
+        for model in df.index:
+            ranking_data.append({
+                'Model': model.split('/')[-1] if '/' in model else model,
+                'Full_Model_Name': model,
+                'Consensus_Rank': float(median_ranks[model]),
+                'Original_Benchmarks': int(df.notna().sum(axis=1)[model]),
+                'Total_Benchmarks': int(df.notna().sum(axis=1)[model]),
+                'Estimated_Ranks': 0,
+                'Coverage_Pct': float(df.notna().sum(axis=1)[model] / len(df.columns) * 100)
+            })
+
+        ranking_df = pd.DataFrame(ranking_data).sort_values('Consensus_Rank', ascending=True)
+
+        metadata = {
+            'method': 'none',
+            'imputation_used': False,
+            'total_estimates': 0,
+            'models_with_estimates': 0,
+            'ranking_method': 'median_rank'
+        }
+
+    return ranking_df, df_ranks, metadata
+
 def main():
     """Main application."""
     st.markdown('<h1 class="main-header">OpenThoughts Evalchemy Benchmark Explorer</h1>',
@@ -707,25 +834,119 @@ def show_model_performance(df)
     # Performance ranking
     st.subheader("Model Rankings")

-    # Calculate average performance (excluding NaN)
-    model_avg_scores = df_display.mean(axis=1, skipna=True).sort_values(ascending=False)
+    # Ranking method controls
+    col1, col2, col3 = st.columns(3)
+
+    with col1:
+        use_rank_imputation = st.checkbox(
+            "Use rank-based estimation",
+            value=True,
+            help="Estimate missing rankings using rank correlations between benchmarks. More fair than simple averaging."
+        )

-    # Top performers
+    with col2:
+        if use_rank_imputation:
+            rank_method = st.selectbox(
+                "Rank correlation method",
+                ["spearman", "kendall"],
+                help="Spearman: More sensitive to monotonic relationships\nKendall: More robust to outliers"
+            )
+        else:
+            rank_method = "none"
+
+    with col3:
+        if use_rank_imputation:
+            min_corr = st.slider(
+                "Min correlation threshold",
+                min_value=0.1,
+                max_value=0.8,
+                value=0.3,
+                step=0.1,
+                help="Minimum rank correlation required to use a benchmark for prediction"
+            )
+        else:
+            min_corr = 0.3
+
+    # Generate rankings
+    ranking_df, rank_matrix, metadata = create_consensus_ranking(
+        df_display,
+        method=rank_method,
+        use_rank_imputation=use_rank_imputation
+    )
+
+    # Display ranking information
     col1, col2 = st.columns(2)

     with col1:
-        st.markdown("**🏆 Top 10 Models (by average score)**")
-        for i, (model, score) in enumerate(model_avg_scores.head(10).items()):
-            st.write(f"{i+1}. {model.split('/')[-1]}: {score:.3f}")
+        st.markdown("**🏆 Top 15 Models**")
+
+        if metadata['imputation_used']:
+            st.caption(f"🔬 Using {metadata['method']} rank correlations with {metadata['total_estimates']} estimated ranks")
+        else:
+            st.caption("📊 Using median rank of available rankings")
+
+        rank_num = 0
+        for i, row in ranking_df.head(15).iterrows():
+            rank_num += 1
+            estimated_info = f" (+{row['Estimated_Ranks']} est.)" if row['Estimated_Ranks'] > 0 else ""
+            coverage_info = f"{row['Coverage_Pct']:.0f}%"
+
+            if metadata['imputation_used']:
+                st.write(f"{rank_num}. **{row['Model']}** (median rank: {row['Consensus_Rank']:.1f})")
+                st.caption(f" 📊 {row['Original_Benchmarks']}/{row['Total_Benchmarks']} benchmarks{estimated_info}")
+            else:
+                st.write(f"{rank_num}. **{row['Model']}** (median rank: {row['Consensus_Rank']:.1f})")
+                st.caption(f" 📊 {row['Original_Benchmarks']} benchmarks ({coverage_info} coverage)")

     with col2:
-        st.markdown("**📊 Performance Distribution**")
-        fig = px.histogram(model_avg_scores,
-                           nbins=20,
-                           title="Distribution of Average Model Scores")
+        st.markdown("**📊 Ranking Distribution**")
+
+        # Create histogram of consensus ranks
+        fig = px.histogram(
+            ranking_df,
+            x='Consensus_Rank',
+            nbins=20,
+            title="Distribution of Consensus Rankings",
+            labels={'Consensus_Rank': 'Average Rank (lower is better)', 'count': 'Number of Models'}
+        )
+        fig.update_layout(height=400)
         st.plotly_chart(fig, use_container_width=True)

-    # Model comparison
+    # Show ranking methodology explanation
+    if metadata['imputation_used']:
+        with st.expander("ℹ️ How Rank-Based Estimation Works"):
+            st.write(f"""
+            **Method**: {metadata['method'].title()} rank correlation
+
+            **Process**:
+            1. Convert benchmark scores to ranks (1st, 2nd, 3rd, etc.)
+            2. Calculate rank correlations between all benchmark pairs
+            3. For missing data: predict rank using weighted average of available ranks
+            4. Weights based on rank correlation strength (min threshold: {min_corr})
+            5. Final consensus rank = median rank across all benchmarks
+
+            **Upsides**:
+            - Eliminates bias from models tested only on easier/harder benchmarks
+            - Uses the correlation structure to make informed predictions
+            - Focuses on relative ranking rather than absolute scores
+            - More robust to outliers and scale differences
+            - Median consensus rank is less affected by extreme outlier rankings
+
+            **Statistics**:
+            - Total rank estimates made: {metadata['total_estimates']:,}
+            - Models with estimated ranks: {metadata['models_with_estimates']}
+            """)
+    else:
+        with st.expander("ℹ️ Simple Ranking Method"):
+            st.write("""
+            **Method**: Median rank of available rankings
+
+            **Limitation**: Models tested on fewer or easier benchmarks may appear artificially better.
+
+            **Recommendation**: Enable rank-based estimation for fairer comparisons.
+            """)
+
+    # Model comparison section
     st.subheader("Model Comparison")

     # Benchmark selection for radar chart (always visible)
@@ -771,11 +992,11 @@ def show_model_performance(df)
     available_models_for_selection = df_display.index.tolist()
     models_info = f"({len(available_models_for_selection)} models total)"

-    # Model selection with filtered list
+    # Model selection with filtered list - use top ranked models as default
     if available_models_for_selection:
-        # Get top performers from available models for default selection
-        available_model_avg_scores = df_display.loc[available_models_for_selection].mean(axis=1, skipna=True).sort_values(ascending=False)
-        default_selection = available_model_avg_scores.head(3).index.tolist()
+        # Get top performers from ranking
+        top_models_from_ranking = ranking_df['Full_Model_Name'].head(5).tolist()
+        default_selection = [m for m in top_models_from_ranking if m in available_models_for_selection][:3]
     else:
         default_selection = []

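For reference, a minimal sketch of how the new helpers could be exercised outside the Streamlit UI, assuming pandas/numpy are available and create_consensus_ranking is imported from app.py; the benchmark and model names below are made up for illustration:

    import numpy as np
    import pandas as pd
    # from app import create_consensus_ranking  # hypothetical import path

    # Toy score matrix: models as rows, benchmarks as columns, with some gaps
    scores = pd.DataFrame(
        {
            "bench_a": [0.90, 0.72, 0.41],
            "bench_b": [0.85, 0.66, 0.38],
            "bench_c": [0.88, np.nan, 0.35],
            "bench_d": [0.80, 0.60, np.nan],
        },
        index=["org/model-x", "org/model-y", "org/model-z"],
    )

    ranking_df, rank_matrix, metadata = create_consensus_ranking(
        scores, method="spearman", use_rank_imputation=True
    )
    print(ranking_df[["Model", "Consensus_Rank", "Original_Benchmarks", "Estimated_Ranks"]])

A lower Consensus_Rank is better; Estimated_Ranks counts how many of a model's ranks were imputed rather than observed.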