jmercat committed on
Commit
f7fb142
·
1 Parent(s): 7362f99

Uncertainty aware correlation heatmap

Browse files
Files changed (1) hide show
  1. app.py +545 -19
app.py CHANGED
@@ -216,7 +216,7 @@ def create_interactive_heatmap(corr_matrix, title="Correlation Heatmap"):
216
  hover_row = []
217
  for j, bench2 in enumerate(corr_matrix.columns):
218
  if i == j:
219
- hover_row.append(f"{clean_names[i]}<br>Reliability: 100%")
220
  else:
221
  corr_val = corr_matrix_pct.iloc[i, j]
222
  if pd.isna(corr_val):
@@ -461,6 +461,361 @@ def create_consensus_ranking(df, method='spearman', use_rank_imputation=True):
461
 
462
  return ranking_df, df_ranks, metadata
463
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
464
  def main():
465
  """Main application."""
466
  st.markdown('<h1 class="main-header">OpenThoughts Evalchemy Benchmark Explorer</h1>',
@@ -681,32 +1036,181 @@ def show_overview_dashboard(df, stderr_df):
681
  st.write(f"Total pairs analyzed: {len(pairs)}")
682
 
683
  def show_interactive_heatmap(df):
684
- """Show the interactive heatmap."""
685
  st.header("πŸ”₯ Interactive Correlation Heatmap")
686
 
687
- # Correlation method selection
688
- col1, col2 = st.columns([3, 1])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
689
 
690
  with col2:
691
- corr_method = st.selectbox(
692
- "Correlation Method",
693
- ["pearson", "spearman", "kendall"],
694
- help="**Pearson's r** is a parametric measure of linear correlation that is sensitive to outliers and can be less appropriate for ordinal data.\n" +
695
- "**Spearman's rho** is a non-parametric measure of rank correlation that is less sensitive to outliers and can be more appropriate for ordinal data.\n" +
696
- "**Kendall's tau** is a non-parametric measure of rank correlation that is less sensitive to outliers and can be more appropriate for ordinal data."
697
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
698
 
699
  # Compute correlation matrix
700
- corr_matrix = compute_correlations(df, corr_method)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
701
 
702
- # Create and display heatmap
703
- fig = create_interactive_heatmap(corr_matrix, f"{corr_method.capitalize()} Correlation Matrix")
704
  st.plotly_chart(fig, use_container_width=True)
705
 
706
- # Correlation statistics
707
  st.subheader("Correlation Statistics")
708
 
709
- # Get all off-diagonal correlations
710
  mask = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)
711
  corr_values = corr_matrix.where(mask).stack().dropna()
712
 
@@ -724,14 +1228,36 @@ def show_interactive_heatmap(df):
724
  with col4:
725
  st.metric("Min Correlation", f"{corr_values.min():.3f}")
726
 
727
- # Distribution of correlations
728
  st.subheader("Correlation Distribution")
729
 
730
- fig = px.histogram(corr_values,
731
  nbins=20,
732
  title="Distribution of Pairwise Correlations",
733
  labels={'value': 'Correlation Coefficient', 'count': 'Frequency'})
734
- st.plotly_chart(fig, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
735
 
736
  def show_scatter_explorer(df, stderr_df):
737
  """Show the scatter plot explorer."""
 
216
  hover_row = []
217
  for j, bench2 in enumerate(corr_matrix.columns):
218
  if i == j:
219
+ hover_row.append(f"{clean_names[i]}<br>Correlation: 100%")
220
  else:
221
  corr_val = corr_matrix_pct.iloc[i, j]
222
  if pd.isna(corr_val):
 
461
 
462
  return ranking_df, df_ranks, metadata
463
 
464
def weighted_correlation(x, y, weights):
    """Compute a weighted Pearson correlation coefficient and approximate p-value.

    Args:
        x: 1-D array-like of observations.
        y: 1-D array-like of observations, same length as ``x``.
        weights: 1-D array-like of weights, same length as ``x``. Entries that
            are non-finite or not strictly positive are ignored.

    Returns:
        A ``(corr, p_value)`` tuple. Both values are NaN when fewer than three
        usable points remain or when either variable has zero weighted
        variance. ``p_value`` alone is NaN when the effective sample size is
        3 or less.
    """
    # Accept lists / Series as well as arrays; boolean masking needs ndarrays.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    weights = np.asarray(weights, dtype=float)

    # A point is usable only if every component is finite and it carries a
    # positive weight. Dropping zero-weight points does not change the weighted
    # moments, but it prevents np.average from dividing by a zero weight sum.
    usable = (
        np.isfinite(x)
        & np.isfinite(y)
        & np.isfinite(weights)
        & (weights > 0)
    )
    if usable.sum() < 3:
        return np.nan, np.nan

    x_clean = x[usable]
    y_clean = y[usable]
    w_clean = weights[usable]

    # Weighted means
    x_mean = np.average(x_clean, weights=w_clean)
    y_mean = np.average(y_clean, weights=w_clean)

    # Weighted covariance and variances
    x_dev = x_clean - x_mean
    y_dev = y_clean - y_mean
    cov = np.average(x_dev * y_dev, weights=w_clean)
    var_x = np.average(x_dev ** 2, weights=w_clean)
    var_y = np.average(y_dev ** 2, weights=w_clean)

    if var_x == 0 or var_y == 0:
        return np.nan, np.nan

    # Rounding can push the ratio marginally outside [-1, 1], which would make
    # the standard-error term below the square root of a negative number.
    corr = np.clip(cov / np.sqrt(var_x * var_y), -1.0, 1.0)

    # Effective sample size approximation: (sum w)^2 / sum(w^2)
    eff_n = np.sum(w_clean) ** 2 / np.sum(w_clean ** 2)
    if eff_n <= 3:
        return corr, np.nan

    dof = eff_n - 2
    if abs(corr) == 1.0:
        # Perfect correlation: the t statistic is infinite, so p is exactly 0.
        return corr, 0.0

    from scipy.stats import t as student_t

    t_stat = corr * np.sqrt(dof / (1.0 - corr ** 2))
    # The survival function keeps precision for very small tail probabilities.
    p_value = 2 * student_t.sf(abs(t_stat), dof)

    return corr, p_value
506
+
507
def match_scores_with_stderr(scores_df, stderr_df, target_benchmarks, min_non_null=10):
    """Match score columns with their corresponding standard-error columns.

    Args:
        scores_df: Score table. Not read by this function; kept so existing
            callers keep working.
        stderr_df: DataFrame whose columns are searched for standard-error
            columns.
        target_benchmarks: Iterable of score column names to find matches for.
        min_non_null: Minimum number of non-null entries a standard-error
            column must have to be accepted.

    Returns:
        Dict mapping each score column name to the first candidate
        standard-error column that exists in ``stderr_df`` and has at least
        ``min_non_null`` non-null values. Score columns without such a match
        are omitted.
    """
    available_columns = set(stderr_df.columns)
    score_to_stderr_mapping = {}

    for col in target_benchmarks:
        base_name = col.replace('_accuracy_avg', '').replace('_accuracy', '')

        # Candidate names in order of preference. dict.fromkeys removes
        # duplicates while preserving that order.
        candidates = dict.fromkeys([
            f"{col}_std_err",                                  # direct match
            col.replace('_accuracy', '_accuracy_std_err'),     # X_accuracy
            col.replace('_accuracy_avg', '_accuracy_std_err'), # X_accuracy_avg
            f"{base_name}x2_accuracy_std_err",                 # 'x2' variants (e.g. MATH500x2)
            f"{base_name}_accuracy_std_err",
        ])

        for stderr_name in candidates:
            # str.replace is a no-op when the pattern is absent, which would
            # otherwise offer the score column as its own standard error.
            if stderr_name == col or stderr_name not in available_columns:
                continue
            if stderr_df[stderr_name].notna().sum() >= min_non_null:
                score_to_stderr_mapping[col] = stderr_name
                break

    return score_to_stderr_mapping
551
+
552
def create_uncertainty_aware_correlation_matrix(scores_df, stderr_df, score_to_stderr_mapping):
    """Create correlation matrices that account for measurement uncertainty.

    Args:
        scores_df: DataFrame of benchmark scores, one column per benchmark.
        stderr_df: DataFrame of standard errors. Rows are matched to
            ``scores_df`` by index label.
        score_to_stderr_mapping: Dict mapping a score column name to the name
            of its standard-error column in ``stderr_df``.

    Returns:
        ``(corr_df, pvalue_df, weighted_corr_df, weighted_pvalue_df)``, four
        square DataFrames indexed by the mapping's keys.

        * ``corr_df`` / ``pvalue_df``: plain Pearson correlation and p-value;
          the diagonal is fixed at 1.0 / 0.0.
        * ``weighted_corr_df`` / ``weighted_pvalue_df``: correlation weighted
          by ``1 / (stderr1**2 + stderr2**2)``. The diagonal holds a
          reliability coefficient, ``1 - mean(stderr**2) / var(scores)``
          floored at 0, or NaN when fewer than three paired values exist.
          When fewer than three rows have both standard errors, the
          off-diagonal cell falls back to the unweighted result.
    """
    benchmarks = list(score_to_stderr_mapping)
    n_benchmarks = len(benchmarks)

    # The off-diagonal code pairs scores and standard errors by position, so
    # make sure both frames share the same row order first.
    if not stderr_df.index.equals(scores_df.index):
        stderr_df = stderr_df.reindex(scores_df.index)

    # Pull each column out once instead of once per matrix cell.
    score_values = {
        bench: scores_df[bench].to_numpy(dtype=float) for bench in benchmarks
    }
    stderr_values = {
        bench: stderr_df[score_to_stderr_mapping[bench]].to_numpy(dtype=float)
        for bench in benchmarks
    }

    # Initialize matrices
    shape = (n_benchmarks, n_benchmarks)
    corr_matrix = np.full(shape, np.nan)
    pvalue_matrix = np.full(shape, np.nan)
    weighted_corr_matrix = np.full(shape, np.nan)
    weighted_pvalue_matrix = np.full(shape, np.nan)

    for i, bench1 in enumerate(benchmarks):
        x = score_values[bench1]
        stderr1 = stderr_values[bench1]

        # --- Diagonal -------------------------------------------------------
        # Regular matrix: self-correlation is 1 by definition.
        corr_matrix[i, i] = 1.0
        pvalue_matrix[i, i] = 0.0

        # Weighted matrix: reliability = 1 - (error variance / total variance).
        paired = ~(np.isnan(x) | np.isnan(stderr1))
        if paired.sum() >= 3:
            total_variance = np.var(x[paired], ddof=1)
            mean_error_variance = np.mean(stderr1[paired] ** 2)
            if total_variance > 0:
                reliability = max(0.0, 1 - (mean_error_variance / total_variance))
            else:
                reliability = 0.0
            weighted_corr_matrix[i, i] = reliability
            weighted_pvalue_matrix[i, i] = 0.0
        # With insufficient data the weighted diagonal stays NaN.

        # --- Off-diagonal ---------------------------------------------------
        # Both measures are symmetric, so compute the upper triangle only and
        # mirror the result.
        for j in range(i + 1, n_benchmarks):
            bench2 = benchmarks[j]
            y = score_values[bench2]
            stderr2 = stderr_values[bench2]

            # Standard (unweighted) correlation
            valid_mask = ~(np.isnan(x) | np.isnan(y))
            enough_scores = valid_mask.sum() >= 3
            if enough_scores:
                corr, p_val = pearsonr(x[valid_mask], y[valid_mask])
                corr_matrix[i, j] = corr_matrix[j, i] = corr
                pvalue_matrix[i, j] = pvalue_matrix[j, i] = p_val

            # Inverse-variance weighting accounts for error in both variables.
            valid_stderr_mask = valid_mask & ~(np.isnan(stderr1) | np.isnan(stderr2))
            if valid_stderr_mask.sum() >= 3:
                combined_variance = (
                    stderr1[valid_stderr_mask] ** 2 + stderr2[valid_stderr_mask] ** 2
                )
                # Rows with zero combined variance get weight 0 rather than inf.
                weights = np.zeros_like(combined_variance)
                np.divide(1.0, combined_variance, out=weights, where=combined_variance > 0)

                if weights.sum() > 0:
                    w_corr, w_p_val = weighted_correlation(
                        x[valid_stderr_mask],
                        y[valid_stderr_mask],
                        weights,
                    )
                    weighted_corr_matrix[i, j] = weighted_corr_matrix[j, i] = w_corr
                    weighted_pvalue_matrix[i, j] = weighted_pvalue_matrix[j, i] = w_p_val
            elif enough_scores:
                # Not enough standard errors: reuse the regular correlation.
                weighted_corr_matrix[i, j] = weighted_corr_matrix[j, i] = corr_matrix[i, j]
                weighted_pvalue_matrix[i, j] = weighted_pvalue_matrix[j, i] = pvalue_matrix[i, j]

    # Convert to DataFrames
    corr_df = pd.DataFrame(corr_matrix, index=benchmarks, columns=benchmarks)
    pvalue_df = pd.DataFrame(pvalue_matrix, index=benchmarks, columns=benchmarks)
    weighted_corr_df = pd.DataFrame(weighted_corr_matrix, index=benchmarks, columns=benchmarks)
    weighted_pvalue_df = pd.DataFrame(weighted_pvalue_matrix, index=benchmarks, columns=benchmarks)

    return corr_df, pvalue_df, weighted_corr_df, weighted_pvalue_df
660
+
661
def create_uncertainty_weighted_heatmap_plotly(weighted_corr_df, title_prefix="Uncertainty-Weighted Correlation Analysis"):
    """Create a single uncertainty-weighted correlation heatmap using Plotly.

    Args:
        weighted_corr_df: Square DataFrame of weighted correlations. Its
            diagonal is presented as reliability coefficients.
        title_prefix: Text placed before the fixed subtitle in the figure title.

    Returns:
        A ``go.Figure`` containing one heatmap trace. Cell text shows values
        as percentages rounded to one decimal; hover text distinguishes
        diagonal (reliability) cells from off-diagonal (correlation) cells.
    """
    # Get clean names for display
    clean_names = [clean_benchmark_name(name) for name in weighted_corr_df.columns]

    # Percent representation used for cell text and off-diagonal hover text
    weighted_corr_pct = (weighted_corr_df * 100).round(1)

    def hover_label(i, j):
        """Return the hover string for cell (i, j)."""
        if i == j:
            reliability = weighted_corr_df.iloc[i, j]
            if pd.isna(reliability):
                return f"{clean_names[i]}<br>Reliability: Unknown"
            return f"{clean_names[i]}<br>Reliability: {reliability*100:.1f}%"

        corr_val = weighted_corr_pct.iloc[i, j]
        if pd.isna(corr_val):
            return f"{clean_names[i]} vs {clean_names[j]}<br>No weighted data"
        return f"{clean_names[i]} vs {clean_names[j]}<br>Weighted correlation: {corr_val:.1f}%"

    n_cols = len(clean_names)
    hover_text_weighted = [
        [hover_label(i, j) for j in range(n_cols)]
        for i in range(n_cols)
    ]

    # Create the heatmap
    fig = go.Figure(data=go.Heatmap(
        z=weighted_corr_df.values,
        x=clean_names,
        y=clean_names,
        colorscale='RdBu_r',
        zmid=0,
        text=weighted_corr_pct.values,
        texttemplate="%{text}",
        textfont={"size": 10},
        hoverinfo='text',
        hovertext=hover_text_weighted,
        colorbar=dict(title="Correlation"),
    ))

    # Update layout
    fig.update_layout(
        title=f"{title_prefix}<br><sub>Diagonal shows reliability coefficients (signal-to-noise ratios)</sub>",
        width=800,
        height=700,
        font=dict(size=12),
        xaxis=dict(tickangle=45),
        yaxis=dict(tickangle=0),
    )

    return fig
716
+
717
def create_uncertainty_aware_heatmap_plotly(corr_df, weighted_corr_df, title_prefix="Correlation Analysis"):
    """Create side-by-side Plotly heatmaps comparing regular vs weighted correlations.

    Args:
        corr_df: Square DataFrame of regular correlations (left panel).
        weighted_corr_df: Square DataFrame of weighted correlations whose
            diagonal holds reliability coefficients (right panel). Display
            names are taken from ``corr_df.columns`` for both panels.
        title_prefix: Text placed before the fixed subtitle in the figure title.

    Returns:
        A figure with two heatmap traces in a 1x2 subplot grid; only the
        right-hand trace shows a colour bar.
    """
    # Get clean names for display
    clean_names = [clean_benchmark_name(name) for name in corr_df.columns]

    def hover_grid(frame, diagonal_label, off_diagonal_label):
        """Build the 2-D list of hover strings for one square frame."""
        size = len(frame.columns)
        return [
            [
                diagonal_label(i) if i == j else off_diagonal_label(i, j)
                for j in range(size)
            ]
            for i in range(size)
        ]

    # Create subplots
    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=('Regular Correlation Matrix<br>(Equal weighting)',
                        'Uncertainty-Weighted Correlation Matrix<br>(Inverse variance weighting)'),
        horizontal_spacing=0.15,
    )

    # ---- Regular correlation heatmap --------------------------------------
    corr_matrix_pct = (corr_df * 100).round(1)

    def regular_diagonal(i):
        return f"{clean_names[i]}<br>Self-correlation: 100%"

    def regular_off_diagonal(i, j):
        corr_val = corr_matrix_pct.iloc[i, j]
        if pd.isna(corr_val):
            return f"{clean_names[i]} vs {clean_names[j]}<br>No data"
        return f"{clean_names[i]} vs {clean_names[j]}<br>Correlation: {corr_val:.1f}%"

    hover_text_regular = hover_grid(corr_df, regular_diagonal, regular_off_diagonal)

    fig.add_trace(go.Heatmap(
        z=corr_df.values,
        x=clean_names,
        y=clean_names,
        colorscale='RdBu_r',
        zmid=0,
        text=corr_matrix_pct.values,
        texttemplate="%{text}",
        textfont={"size": 8},
        hoverinfo='text',
        hovertext=hover_text_regular,
        showscale=False,
        name="Regular",
    ), row=1, col=1)

    # ---- Weighted correlation heatmap -------------------------------------
    weighted_corr_pct = (weighted_corr_df * 100).round(1)

    def weighted_diagonal(i):
        reliability = weighted_corr_df.iloc[i, i]
        if pd.isna(reliability):
            return f"{clean_names[i]}<br>Reliability: Unknown"
        return f"{clean_names[i]}<br>Reliability: {reliability*100:.1f}%"

    def weighted_off_diagonal(i, j):
        corr_val = weighted_corr_pct.iloc[i, j]
        if pd.isna(corr_val):
            return f"{clean_names[i]} vs {clean_names[j]}<br>No weighted data"
        return f"{clean_names[i]} vs {clean_names[j]}<br>Weighted correlation: {corr_val:.1f}%"

    hover_text_weighted = hover_grid(weighted_corr_df, weighted_diagonal, weighted_off_diagonal)

    fig.add_trace(go.Heatmap(
        z=weighted_corr_df.values,
        x=clean_names,
        y=clean_names,
        colorscale='RdBu_r',
        zmid=0,
        text=weighted_corr_pct.values,
        texttemplate="%{text}",
        textfont={"size": 8},
        hoverinfo='text',
        hovertext=hover_text_weighted,
        showscale=True,
        colorbar=dict(title="Correlation", x=1.02),
        name="Weighted",
    ), row=1, col=2)

    # Update layout
    fig.update_layout(
        title=f"{title_prefix}<br><sub>Diagonal shows reliability coefficients for weighted matrix</sub>",
        width=1400,
        height=700,
        font=dict(size=12),
    )

    # Update axes: slanted x labels, horizontal y labels on both panels
    for panel_col in (1, 2):
        fig.update_xaxes(tickangle=45, row=1, col=panel_col)
        fig.update_yaxes(tickangle=0, row=1, col=panel_col)

    return fig
818
+
819
  def main():
820
  """Main application."""
821
  st.markdown('<h1 class="main-header">OpenThoughts Evalchemy Benchmark Explorer</h1>',
 
1036
  st.write(f"Total pairs analyzed: {len(pairs)}")
1037
 
1038
  def show_interactive_heatmap(df):
1039
+ """Display interactive correlation heatmap with various options."""
1040
  st.header("πŸ”₯ Interactive Correlation Heatmap")
1041
 
1042
+ # Check if stderr data is available
1043
+ stderr_df = load_stderr_data()
1044
+
1045
+ col1, col2, col3 = st.columns(3)
1046
+
1047
+ with col1:
1048
+ # Check if stderr data is available for the uncertainty-aware checkbox
1049
+ stderr_available = stderr_df is not None
1050
+ uncertainty_aware = False
1051
+ if stderr_available:
1052
+ uncertainty_aware = st.checkbox(
1053
+ "πŸ”¬ Uncertainty-Aware Analysis",
1054
+ value=False,
1055
+ help="Use measurement uncertainties to weight correlations (requires standard error data)"
1056
+ )
1057
+
1058
+ # Adjust method selector based on uncertainty-aware mode
1059
+ if uncertainty_aware:
1060
+ st.selectbox(
1061
+ "Correlation Method",
1062
+ ["pearson"],
1063
+ index=0,
1064
+ disabled=True,
1065
+ help="**Uncertainty-aware analysis uses Pearson correlations only**\n\nWeighted correlations require parametric methods to properly account for measurement uncertainties."
1066
+ )
1067
+ method = "pearson" # Force Pearson for uncertainty-aware analysis
1068
+ else:
1069
+ method = st.selectbox(
1070
+ "Correlation Method",
1071
+ ["kendall", "spearman", "pearson"],
1072
+ index=0,
1073
+ help="**Pearson's r** is a parametric measure of linear correlation that is sensitive to outliers and can be less appropriate for ordinal data.\n" +
1074
+ "**Spearman's rho** is a non-parametric measure of rank correlation that is less sensitive to outliers and can be more appropriate for ordinal data.\n" +
1075
+ "**Kendall's tau** is a non-parametric measure of rank correlation that is less sensitive to outliers and can be more appropriate for ordinal data."
1076
+ )
1077
 
1078
  with col2:
1079
+ show_values = st.checkbox("Show correlation values", value=True)
1080
+
1081
+ # Additional options
1082
+ if uncertainty_aware and stderr_df is not None:
1083
+ st.info("πŸ”¬ **Uncertainty-Aware Mode**: Correlations are weighted by inverse measurement variance. "
1084
+ "Diagonal shows reliability coefficients (proportion of variance that is 'true signal' vs measurement error).")
1085
+
1086
+ # Match scores with stderr data
1087
+ available_benchmarks = list(df.columns)
1088
+ score_to_stderr_mapping = match_scores_with_stderr(df, stderr_df, available_benchmarks)
1089
+
1090
+ if len(score_to_stderr_mapping) == 0:
1091
+ st.warning("No matching standard error data found for the selected benchmarks. "
1092
+ "Falling back to regular correlation analysis.")
1093
+ uncertainty_aware = False
1094
+ else:
1095
+ # Filter to benchmarks with stderr data
1096
+ benchmarks_with_stderr = list(score_to_stderr_mapping.keys())
1097
+ df_stderr = df[benchmarks_with_stderr].copy()
1098
+
1099
+ st.success(f"Found standard error data for {len(score_to_stderr_mapping)} benchmarks: "
1100
+ f"{', '.join([clean_benchmark_name(b) for b in benchmarks_with_stderr])}")
1101
+
1102
+ # Align dataframes
1103
+ common_models = df_stderr.index.intersection(stderr_df.index)
1104
+ df_aligned = df_stderr.loc[common_models]
1105
+ stderr_aligned = stderr_df.loc[common_models]
1106
+
1107
+ st.write(f"**Analysis scope**: {len(common_models)} models with both scores and standard errors")
1108
+
1109
+ # Compute uncertainty-aware correlations
1110
+ with st.spinner("Computing uncertainty-weighted correlations..."):
1111
+ corr_df, pvalue_df, weighted_corr_df, weighted_pvalue_df = create_uncertainty_aware_correlation_matrix(
1112
+ df_aligned, stderr_aligned, score_to_stderr_mapping
1113
+ )
1114
+
1115
+ # Create and display uncertainty-aware heatmap
1116
+ fig = create_uncertainty_weighted_heatmap_plotly(
1117
+ weighted_corr_df,
1118
+ title_prefix=f"Uncertainty-Weighted {method.capitalize()} Correlations"
1119
+ )
1120
+
1121
+ st.plotly_chart(fig, use_container_width=True)
1122
+
1123
+ # Show reliability statistics
1124
+ with st.expander("πŸ“Š Reliability Statistics", expanded=False):
1125
+ st.write("**Benchmark Reliability Coefficients** (proportion of variance that is true signal):")
1126
+ reliability_data = []
1127
+ for bench in weighted_corr_df.columns:
1128
+ diag_val = weighted_corr_df.loc[bench, bench]
1129
+ if not pd.isna(diag_val):
1130
+ reliability_data.append({
1131
+ 'Benchmark': clean_benchmark_name(bench),
1132
+ 'Reliability': f"{diag_val*100:.1f}%",
1133
+ 'Category': next((cat for cat, benchs in get_focused_benchmark_mapping()[1].items()
1134
+ for b in benchs if get_focused_benchmark_mapping()[0].get(b) == bench), 'Unknown')
1135
+ })
1136
+
1137
+ if reliability_data:
1138
+ reliability_df = pd.DataFrame(reliability_data)
1139
+ st.dataframe(reliability_df, use_container_width=True)
1140
+
1141
+ avg_reliability = pd.to_numeric([d['Reliability'].rstrip('%') for d in reliability_data]).mean() / 100
1142
+ st.metric("Average Reliability", f"{avg_reliability:.3f} ({avg_reliability*100:.1f}%)")
1143
+
1144
+ # Show correlation differences
1145
+ with st.expander("πŸ“ˆ Impact of Uncertainty Weighting", expanded=False):
1146
+ st.write("**Correlation Changes** (Weighted - Regular):")
1147
+
1148
+ diff_data = []
1149
+ for i, bench1 in enumerate(corr_df.columns):
1150
+ for j, bench2 in enumerate(corr_df.columns):
1151
+ if i < j: # Only upper triangle
1152
+ regular_corr = corr_df.iloc[i, j]
1153
+ weighted_corr = weighted_corr_df.iloc[i, j]
1154
+
1155
+ if not (pd.isna(regular_corr) or pd.isna(weighted_corr)):
1156
+ diff = weighted_corr - regular_corr
1157
+ diff_data.append({
1158
+ 'Benchmark Pair': f"{clean_benchmark_name(bench1)} vs {clean_benchmark_name(bench2)}",
1159
+ 'Regular': f"{regular_corr:.3f}",
1160
+ 'Weighted': f"{weighted_corr:.3f}",
1161
+ 'Difference': f"{diff:+.3f}",
1162
+ 'Abs Difference': abs(diff)
1163
+ })
1164
+
1165
+ if diff_data:
1166
+ diff_df = pd.DataFrame(diff_data)
1167
+ # Sort by absolute difference
1168
+ diff_df_sorted = diff_df.sort_values('Abs Difference', ascending=False)
1169
+ st.dataframe(diff_df_sorted.drop('Abs Difference', axis=1), use_container_width=True)
1170
+
1171
+ # Summary stats
1172
+ diffs = [float(d['Difference']) for d in diff_data]
1173
+ col1, col2, col3 = st.columns(3)
1174
+ with col1:
1175
+ st.metric("Mean Change", f"{np.mean(diffs):+.4f}")
1176
+ with col2:
1177
+ st.metric("Max |Change|", f"{max(abs(d) for d in diffs):.4f}")
1178
+ with col3:
1179
+ st.metric("Large Changes (|Ξ”| > 0.1)", f"{sum(1 for d in diffs if abs(d) > 0.1)}")
1180
+
1181
+ return # Exit early for uncertainty-aware analysis
1182
+
1183
+ # Regular correlation analysis (original functionality restored)
1184
+ if df.empty:
1185
+ st.error("No data available.")
1186
+ return
1187
 
1188
  # Compute correlation matrix
1189
+ corr_matrix = compute_correlations(df, method)
1190
+
1191
+ if corr_matrix.empty:
1192
+ st.error("Unable to compute correlations.")
1193
+ return
1194
+
1195
+ # Create and display regular heatmap (original way)
1196
+ fig = create_interactive_heatmap(corr_matrix, f"{method.capitalize()} Correlation Matrix")
1197
+
1198
+ # Add correlation values as text annotations if requested
1199
+ if show_values:
1200
+ # Convert correlations to percentages for display
1201
+ corr_text = (corr_matrix * 100).round().astype(str)
1202
+ fig.update_traces(
1203
+ text=corr_text.values,
1204
+ texttemplate="%{text}",
1205
+ textfont={"size": 8}
1206
+ )
1207
 
 
 
1208
  st.plotly_chart(fig, use_container_width=True)
1209
 
1210
+ # Correlation statistics (original)
1211
  st.subheader("Correlation Statistics")
1212
 
1213
+ # Get all off-diagonal correlations (original method)
1214
  mask = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)
1215
  corr_values = corr_matrix.where(mask).stack().dropna()
1216
 
 
1228
  with col4:
1229
  st.metric("Min Correlation", f"{corr_values.min():.3f}")
1230
 
1231
+ # Distribution of correlations (original)
1232
  st.subheader("Correlation Distribution")
1233
 
1234
+ fig_hist = px.histogram(corr_values,
1235
  nbins=20,
1236
  title="Distribution of Pairwise Correlations",
1237
  labels={'value': 'Correlation Coefficient', 'count': 'Frequency'})
1238
+ st.plotly_chart(fig_hist, use_container_width=True)
1239
+
1240
+ # Methodology note
1241
+ with st.expander("ℹ️ About Correlation Methods", expanded=False):
1242
+ st.markdown("""
1243
+ **Pearson**: Measures linear relationships. Values range from -1 to +1.
1244
+ - +1: Perfect positive linear relationship
1245
+ - 0: No linear relationship
1246
+ - -1: Perfect negative linear relationship
1247
+
1248
+ **Spearman**: Measures monotonic relationships using ranks. More robust to outliers.
1249
+ - Good for non-linear but monotonic relationships
1250
+ - Less sensitive to extreme values
1251
+
1252
+ **Kendall**: Measures ordinal association using concordant/discordant pairs.
1253
+ - More robust than Spearman for small samples
1254
+ - Better for data with many tied values
1255
+
1256
+ **Uncertainty-Aware Analysis**: When available, uses measurement standard errors to:
1257
+ - Weight correlations by inverse measurement variance
1258
+ - Show reliability coefficients (signal-to-noise ratios) on diagonal
1259
+ - Provide more accurate correlation estimates for noisy data
1260
+ """)
1261
 
1262
  def show_scatter_explorer(df, stderr_df):
1263
  """Show the scatter plot explorer."""