Fixed model ranking to estimate missing ranks
app.py
@@ -334,6 +334,133 @@ def filter_target_benchmarks(df):
 
     return df[available_benchmarks].copy()
 
+def estimate_missing_ranks(df, method='spearman', min_corr=0.3, min_benchmarks=3):
+    """
+    Estimate missing benchmark ranks using rank correlation-based imputation.
+
+    Args:
+        df: DataFrame with models as rows and benchmarks as columns
+        method: Rank correlation method ('spearman' or 'kendall')
+        min_corr: Minimum correlation threshold to use for prediction
+        min_benchmarks: Minimum number of benchmarks needed for prediction
+
+    Returns:
+        DataFrame with estimated ranks filled in
+    """
+    # Convert scores to ranks (higher score = better rank = lower rank number)
+    df_ranks = df.rank(method='min', ascending=False, na_option='keep')
+    df_ranks_imputed = df_ranks.copy()
+
+    # Compute rank correlation matrix
+    if method == 'spearman':
+        rank_corr_matrix = df_ranks.corr(method='spearman')
+    elif method == 'kendall':
+        rank_corr_matrix = df_ranks.corr(method='kendall')
+    else:
+        rank_corr_matrix = df_ranks.corr(method='pearson')  # fallback
+
+    # For each model and benchmark combination with missing data
+    for model_idx in df.index:
+        for benchmark in df.columns:
+            if pd.isna(df_ranks.loc[model_idx, benchmark]):
+                # Find benchmarks this model has ranks for
+                available_benchmarks = df_ranks.columns[df_ranks.loc[model_idx].notna()].tolist()
+
+                if len(available_benchmarks) >= min_benchmarks:
+                    # Get rank correlations between target benchmark and available benchmarks
+                    correlations = []
+                    ranks = []
+
+                    for avail_bench in available_benchmarks:
+                        corr_val = rank_corr_matrix.loc[benchmark, avail_bench]
+                        if not pd.isna(corr_val) and abs(corr_val) >= min_corr:
+                            correlations.append(abs(corr_val))  # Use absolute correlation as weight
+                            ranks.append(df_ranks.loc[model_idx, avail_bench])
+
+                    if len(correlations) > 0:
+                        # Weighted average of ranks using correlations as weights
+                        correlations = np.array(correlations)
+                        ranks = np.array(ranks)
+
+                        # Normalize weights
+                        weights = correlations / correlations.sum()
+                        estimated_rank = np.average(ranks, weights=weights)
+
+                        df_ranks_imputed.loc[model_idx, benchmark] = estimated_rank
+
+    return df_ranks_imputed
+
+def create_consensus_ranking(df, method='spearman', use_rank_imputation=True, min_corr=0.3):
+    """
+    Create a consensus ranking using rank correlation-based estimation.
+
+    Returns:
+        tuple: (ranking_df, rank_matrix, metadata)
+    """
+    if use_rank_imputation:
+        # Estimate missing ranks
+        df_ranks = estimate_missing_ranks(df, method, min_corr=min_corr)
+
+        # Calculate consensus rank for each model (median rank across all benchmarks)
+        consensus_ranks = df_ranks.median(axis=1, skipna=True)
+
+        # Calculate coverage and estimation statistics
+        original_coverage = df.notna().sum(axis=1)
+        imputed_coverage = df_ranks.notna().sum(axis=1)
+        estimated_count = imputed_coverage - original_coverage
+
+        # Create ranking dataframe
+        ranking_data = []
+        for model in df.index:
+            ranking_data.append({
+                'Model': model.split('/')[-1] if '/' in model else model,
+                'Full_Model_Name': model,
+                'Consensus_Rank': float(consensus_ranks[model]),
+                'Original_Benchmarks': int(original_coverage[model]),
+                'Total_Benchmarks': int(imputed_coverage[model]),
+                'Estimated_Ranks': int(estimated_count[model]),
+                'Coverage_Pct': float(original_coverage[model] / len(df.columns) * 100)
+            })
+
+        ranking_df = pd.DataFrame(ranking_data).sort_values('Consensus_Rank', ascending=True)  # Lower rank = better
+
+        metadata = {
+            'method': method,
+            'imputation_used': True,
+            'total_estimates': int(estimated_count.sum()),
+            'models_with_estimates': int((estimated_count > 0).sum()),
+            'ranking_method': 'consensus_rank'
+        }
+
+    else:
+        # Simple ranking based on available data only
+        df_ranks = df.rank(method='min', ascending=False, na_option='keep')
+        median_ranks = df_ranks.median(axis=1, skipna=True)
+
+        ranking_data = []
+        for model in df.index:
+            ranking_data.append({
+                'Model': model.split('/')[-1] if '/' in model else model,
+                'Full_Model_Name': model,
+                'Consensus_Rank': float(median_ranks[model]),
+                'Original_Benchmarks': int(df.notna().sum(axis=1)[model]),
+                'Total_Benchmarks': int(df.notna().sum(axis=1)[model]),
+                'Estimated_Ranks': 0,
+                'Coverage_Pct': float(df.notna().sum(axis=1)[model] / len(df.columns) * 100)
+            })
+
+        ranking_df = pd.DataFrame(ranking_data).sort_values('Consensus_Rank', ascending=True)
+
+        metadata = {
+            'method': 'none',
+            'imputation_used': False,
+            'total_estimates': 0,
+            'models_with_estimates': 0,
+            'ranking_method': 'median_rank'
+        }
+
+    return ranking_df, df_ranks, metadata
+
 def main():
     """Main application."""
     st.markdown('<h1 class="main-header">OpenThoughts Evalchemy Benchmark Explorer</h1>',
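For readers who want to sanity-check the imputation logic above, here is a minimal, self-contained sketch of the same math on a toy score table. It is not part of the commit; the model and benchmark names are invented, and the `min_corr` threshold is omitted for brevity. `bench_c` is missing for `model_x`, and its rank is estimated as a correlation-weighted average of the ranks `model_x` holds elsewhere:

```python
import numpy as np
import pandas as pd

scores = pd.DataFrame(
    {
        "bench_a": [0.90, 0.70, 0.50, 0.30],
        "bench_b": [0.85, 0.65, 0.55, 0.25],
        "bench_c": [0.80, np.nan, 0.40, 0.20],
    },
    index=["model_w", "model_x", "model_y", "model_z"],
)

# Step 1: scores -> ranks, exactly as in estimate_missing_ranks (1 = best)
ranks = scores.rank(method="min", ascending=False, na_option="keep")

# Step 2: Spearman rank correlations between benchmark pairs
corr = ranks.corr(method="spearman")

# Steps 3-4: weight model_x's known ranks by |correlation| with the missing benchmark
known = ranks.loc["model_x"].dropna()
weights = corr.loc["bench_c", known.index].abs()
estimated = np.average(known, weights=weights / weights.sum())
print(estimated)  # 2.0 -- model_x ranks 2nd on both benchmarks it was tested on
```

Because the toy benchmarks are perfectly rank-correlated, the weights come out equal and the estimate is just the mean of the known ranks; with weaker correlations, the better-correlated benchmarks dominate the estimate.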
@@ -707,25 +834,119 @@ def show_model_performance(df):
     # Performance ranking
     st.subheader("Model Rankings")
 
-    #
-
+    # Ranking method controls
+    col1, col2, col3 = st.columns(3)
+
+    with col1:
+        use_rank_imputation = st.checkbox(
+            "Use rank-based estimation",
+            value=True,
+            help="Estimate missing rankings using rank correlations between benchmarks. Fairer than simple averaging."
+        )
 
-
+    with col2:
+        if use_rank_imputation:
+            rank_method = st.selectbox(
+                "Rank correlation method",
+                ["spearman", "kendall"],
+                help="Spearman: More sensitive to monotonic relationships\nKendall: More robust to outliers"
+            )
+        else:
+            rank_method = "none"
+
+    with col3:
+        if use_rank_imputation:
+            min_corr = st.slider(
+                "Min correlation threshold",
+                min_value=0.1,
+                max_value=0.8,
+                value=0.3,
+                step=0.1,
+                help="Minimum rank correlation required to use a benchmark for prediction"
+            )
+        else:
+            min_corr = 0.3
+
+    # Generate rankings
+    ranking_df, rank_matrix, metadata = create_consensus_ranking(
+        df_display,
+        method=rank_method,
+        use_rank_imputation=use_rank_imputation, min_corr=min_corr
+    )
+
+    # Display ranking information
     col1, col2 = st.columns(2)
 
     with col1:
-        st.markdown("**🏆 Top
-
-
+        st.markdown("**🏆 Top 15 Models**")
+
+        if metadata['imputation_used']:
+            st.caption(f"🔬 Using {metadata['method']} rank correlations with {metadata['total_estimates']} estimated ranks")
+        else:
+            st.caption("📊 Using median rank of available rankings")
+
+        rank_num = 0
+        for i, row in ranking_df.head(15).iterrows():
+            rank_num += 1
+            estimated_info = f" (+{row['Estimated_Ranks']} est.)" if row['Estimated_Ranks'] > 0 else ""
+            coverage_info = f"{row['Coverage_Pct']:.0f}%"
+
+            if metadata['imputation_used']:
+                st.write(f"{rank_num}. **{row['Model']}** (median rank: {row['Consensus_Rank']:.1f})")
+                st.caption(f"   📊 {row['Original_Benchmarks']}/{row['Total_Benchmarks']} benchmarks{estimated_info}")
+            else:
+                st.write(f"{rank_num}. **{row['Model']}** (median rank: {row['Consensus_Rank']:.1f})")
+                st.caption(f"   📊 {row['Original_Benchmarks']} benchmarks ({coverage_info} coverage)")
 
     with col2:
-        st.markdown("**📊
-
-
-
+        st.markdown("**📊 Ranking Distribution**")
+
+        # Create histogram of consensus ranks
+        fig = px.histogram(
+            ranking_df,
+            x='Consensus_Rank',
+            nbins=20,
+            title="Distribution of Consensus Rankings",
+            labels={'Consensus_Rank': 'Median rank (lower is better)', 'count': 'Number of Models'}
+        )
+        fig.update_layout(height=400)
         st.plotly_chart(fig, use_container_width=True)
 
-    #
+    # Show ranking methodology explanation
+    if metadata['imputation_used']:
+        with st.expander("ℹ️ How Rank-Based Estimation Works"):
+            st.write(f"""
+            **Method**: {metadata['method'].title()} rank correlation
+
+            **Process**:
+            1. Convert benchmark scores to ranks (1st, 2nd, 3rd, etc.)
+            2. Calculate rank correlations between all benchmark pairs
+            3. For missing data: predict rank using a weighted average of available ranks
+            4. Weights based on rank correlation strength (min threshold: {min_corr})
+            5. Final consensus rank = median rank across all benchmarks
+
+            **Upsides**:
+            - Reduces bias from models tested only on easier/harder benchmarks
+            - Uses the correlation structure to make informed predictions
+            - Focuses on relative ranking rather than absolute scores
+            - More robust to outliers and scale differences
+            - Median consensus rank is less affected by extreme outlier rankings
+
+            **Statistics**:
+            - Total rank estimates made: {metadata['total_estimates']:,}
+            - Models with estimated ranks: {metadata['models_with_estimates']}
+            """)
+    else:
+        with st.expander("ℹ️ Simple Ranking Method"):
+            st.write("""
+            **Method**: Median rank of available rankings
+
+            **Limitation**: Models tested on fewer or easier benchmarks may appear artificially better.
+
+            **Recommendation**: Enable rank-based estimation for fairer comparisons.
+            """)
+
+    # Model comparison section
     st.subheader("Model Comparison")
 
     # Benchmark selection for radar chart (always visible)
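The expander's claim that a median consensus rank resists outlier rankings is easy to verify. A tiny sketch with made-up ranks (not from the commit):

```python
import pandas as pd

ranks = pd.Series([2, 3, 2, 40])  # one pathological benchmark rank
print(ranks.mean())    # 11.75 -- the mean is dragged toward the outlier
print(ranks.median())  # 2.5   -- the median stays near the typical placing
```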
@@ -771,11 +992,11 @@ def show_model_performance(df):
     available_models_for_selection = df_display.index.tolist()
     models_info = f"({len(available_models_for_selection)} models total)"
 
-    # Model selection with filtered list
+    # Model selection with filtered list - use top ranked models as default
    if available_models_for_selection:
-        # Get top performers from
-
-        default_selection =
+        # Get top performers from ranking
+        top_models_from_ranking = ranking_df['Full_Model_Name'].head(5).tolist()
+        default_selection = [m for m in top_models_from_ranking if m in available_models_for_selection][:3]
     else:
         default_selection = []
 
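Finally, the new default-selection logic in the hunk above simply intersects the top-ranked models with whatever survived the display filters, keeping at most three. A quick illustration with hypothetical model names:

```python
# Hypothetical inputs; in the app these come from ranking_df and df_display
top_models_from_ranking = ["org/model_a", "org/model_b", "org/model_c",
                           "org/model_d", "org/model_e"]
available_models_for_selection = ["org/model_b", "org/model_d",
                                  "org/model_e", "org/model_f"]

default_selection = [m for m in top_models_from_ranking
                     if m in available_models_for_selection][:3]
print(default_selection)  # ['org/model_b', 'org/model_d', 'org/model_e']
```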