Uncertainty aware correlation heatmap
Browse files
app.py
CHANGED
|
@@ -216,7 +216,7 @@ def create_interactive_heatmap(corr_matrix, title="Correlation Heatmap"):
|
|
| 216 |
hover_row = []
|
| 217 |
for j, bench2 in enumerate(corr_matrix.columns):
|
| 218 |
if i == j:
|
| 219 |
-
hover_row.append(f"{clean_names[i]}<br>
|
| 220 |
else:
|
| 221 |
corr_val = corr_matrix_pct.iloc[i, j]
|
| 222 |
if pd.isna(corr_val):
|
|
@@ -461,6 +461,361 @@ def create_consensus_ranking(df, method='spearman', use_rank_imputation=True):
|
|
| 461 |
|
| 462 |
return ranking_df, df_ranks, metadata
|
| 463 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 464 |
def main():
|
| 465 |
"""Main application."""
|
| 466 |
st.markdown('<h1 class="main-header">OpenThoughts Evalchemy Benchmark Explorer</h1>',
|
|
@@ -681,32 +1036,181 @@ def show_overview_dashboard(df, stderr_df):
|
|
| 681 |
st.write(f"Total pairs analyzed: {len(pairs)}")
|
| 682 |
|
| 683 |
def show_interactive_heatmap(df):
|
| 684 |
-
"""
|
| 685 |
st.header("π₯ Interactive Correlation Heatmap")
|
| 686 |
|
| 687 |
-
#
|
| 688 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 689 |
|
| 690 |
with col2:
|
| 691 |
-
|
| 692 |
-
|
| 693 |
-
|
| 694 |
-
|
| 695 |
-
|
| 696 |
-
|
| 697 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 698 |
|
| 699 |
# Compute correlation matrix
|
| 700 |
-
corr_matrix = compute_correlations(df,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 701 |
|
| 702 |
-
# Create and display heatmap
|
| 703 |
-
fig = create_interactive_heatmap(corr_matrix, f"{corr_method.capitalize()} Correlation Matrix")
|
| 704 |
st.plotly_chart(fig, use_container_width=True)
|
| 705 |
|
| 706 |
-
# Correlation statistics
|
| 707 |
st.subheader("Correlation Statistics")
|
| 708 |
|
| 709 |
-
# Get all off-diagonal correlations
|
| 710 |
mask = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)
|
| 711 |
corr_values = corr_matrix.where(mask).stack().dropna()
|
| 712 |
|
|
@@ -724,14 +1228,36 @@ def show_interactive_heatmap(df):
|
|
| 724 |
with col4:
|
| 725 |
st.metric("Min Correlation", f"{corr_values.min():.3f}")
|
| 726 |
|
| 727 |
-
# Distribution of correlations
|
| 728 |
st.subheader("Correlation Distribution")
|
| 729 |
|
| 730 |
-
|
| 731 |
nbins=20,
|
| 732 |
title="Distribution of Pairwise Correlations",
|
| 733 |
labels={'value': 'Correlation Coefficient', 'count': 'Frequency'})
|
| 734 |
-
st.plotly_chart(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 735 |
|
| 736 |
def show_scatter_explorer(df, stderr_df):
|
| 737 |
"""Show the scatter plot explorer."""
|
|
|
|
| 216 |
hover_row = []
|
| 217 |
for j, bench2 in enumerate(corr_matrix.columns):
|
| 218 |
if i == j:
|
| 219 |
+
hover_row.append(f"{clean_names[i]}<br>Correlation: 100%")
|
| 220 |
else:
|
| 221 |
corr_val = corr_matrix_pct.iloc[i, j]
|
| 222 |
if pd.isna(corr_val):
|
|
|
|
| 461 |
|
| 462 |
return ranking_df, df_ranks, metadata
|
| 463 |
|
| 464 |
+
def weighted_correlation(x, y, weights):
    """Compute a weighted Pearson correlation and an approximate p-value.

    Observations where ``x``, ``y`` or ``weights`` is NaN are dropped before
    anything is computed.

    Args:
        x: 1-D array-like holding the first variable.
        y: 1-D array-like holding the second variable (same length as ``x``).
        weights: 1-D array-like of observation weights (same length as ``x``).

    Returns:
        A ``(corr, p_value)`` tuple. Both values are NaN when fewer than three
        valid observations remain, when the weights sum to zero, or when
        either variable has zero weighted variance. Only ``p_value`` is NaN
        when the effective sample size is not greater than 3.
    """
    # Accept lists / Series as well as ndarrays; boolean masking needs arrays.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    weights = np.asarray(weights, dtype=float)

    # Remove NaN values
    valid_mask = ~(np.isnan(x) | np.isnan(y) | np.isnan(weights))
    if valid_mask.sum() < 3:
        return np.nan, np.nan

    x_clean = x[valid_mask]
    y_clean = y[valid_mask]
    w_clean = weights[valid_mask]

    # np.average raises ZeroDivisionError when the weights sum to zero;
    # report "undefined" instead, like the other degenerate cases.
    sum_w = np.sum(w_clean)
    if sum_w == 0:
        return np.nan, np.nan

    # Weighted means
    x_mean = np.average(x_clean, weights=w_clean)
    y_mean = np.average(y_clean, weights=w_clean)

    # Weighted covariance and variances
    x_dev = x_clean - x_mean
    y_dev = y_clean - y_mean
    cov = np.average(x_dev * y_dev, weights=w_clean)
    var_x = np.average(x_dev**2, weights=w_clean)
    var_y = np.average(y_dev**2, weights=w_clean)

    if var_x == 0 or var_y == 0:
        return np.nan, np.nan

    # Rounding can push |r| a hair above 1, which would make 1 - r**2
    # negative further down; clamp to the valid range.
    corr = np.clip(cov / np.sqrt(var_x * var_y), -1.0, 1.0)

    # Effective sample size (sum w)^2 / sum w^2 stands in for n in the t-test.
    eff_n = sum_w**2 / np.sum(w_clean**2)
    if not eff_n > 3:
        return corr, np.nan

    # A perfect correlation has zero standard error: the t statistic is
    # infinite and the two-sided p-value is exactly 0.
    if abs(corr) == 1.0:
        return corr, 0.0

    from scipy.stats import t

    dof = eff_n - 2
    se_corr = np.sqrt((1 - corr**2) / dof)
    t_stat = corr / se_corr
    # sf() is the survival function 1 - cdf, accurate for tiny tail areas.
    p_value = 2 * t.sf(abs(t_stat), dof)

    return corr, p_value
|
| 506 |
+
|
| 507 |
+
def match_scores_with_stderr(scores_df, stderr_df, target_benchmarks):
    """Match score columns with their corresponding stderr columns.

    For every name in ``target_benchmarks`` a list of candidate stderr column
    names is tried in order of preference; the first candidate that exists in
    ``stderr_df`` and has at least 10 non-null entries is selected.

    Args:
        scores_df: Not read by this function; kept so existing callers that
            pass it positionally keep working.
        stderr_df: DataFrame whose columns are searched for stderr data.
        target_benchmarks: Iterable of score column names to match.

    Returns:
        Dict mapping score column name -> stderr column name. Benchmarks
        without a usable stderr column are omitted.
    """
    score_to_stderr_mapping = {}

    for col in target_benchmarks:
        base_name = col.replace('_accuracy_avg', '').replace('_accuracy', '')

        # Candidates in order of preference. The x2 variants also cover
        # MATH500_accuracy -> MATH500x2_accuracy_std_err.
        potential_stderr_names = [
            f"{col}_std_err",  # Direct match
            col.replace('_accuracy', '_accuracy_std_err'),
            col.replace('_accuracy_avg', '_accuracy_std_err'),
            f"{base_name}x2_accuracy_std_err",
            f"{base_name}_accuracy_std_err",
        ]

        # dict.fromkeys drops repeated candidates while keeping their order.
        for stderr_name in dict.fromkeys(potential_stderr_names):
            # str.replace returns the name unchanged when the pattern is
            # absent; a score column must never be taken as its own stderr.
            if stderr_name == col or stderr_name not in stderr_df.columns:
                continue
            # Require sufficient data (at least 10 models)
            if stderr_df[stderr_name].notna().sum() >= 10:
                score_to_stderr_mapping[col] = stderr_name
                break

    return score_to_stderr_mapping
|
| 551 |
+
|
| 552 |
+
def create_uncertainty_aware_correlation_matrix(scores_df, stderr_df, score_to_stderr_mapping):
    """Create correlation matrix accounting for measurement uncertainties.

    Args:
        scores_df: DataFrame of scores; every key of
            ``score_to_stderr_mapping`` must be one of its columns.
        stderr_df: DataFrame of standard errors; every value of
            ``score_to_stderr_mapping`` must be one of its columns.
        score_to_stderr_mapping: Dict mapping score column -> stderr column.

    Returns:
        Tuple ``(corr_df, pvalue_df, weighted_corr_df, weighted_pvalue_df)``
        of square DataFrames labelled by the mapping's keys.

        * ``corr_df`` / ``pvalue_df``: ``pearsonr`` results on rows where both
          scores are present (NaN with fewer than 3 such rows); the diagonal
          is 1.0 / 0.0.
        * ``weighted_corr_df`` / ``weighted_pvalue_df``: off-diagonal cells
          come from ``weighted_correlation`` with weights
          ``1 / (stderr1**2 + stderr2**2)``; when fewer than 3 rows have both
          stderrs the unweighted values are copied instead. The diagonal
          holds ``max(0, 1 - mean(stderr**2) / var(score))``, 0.0 when the
          score variance is not positive, NaN with fewer than 3 rows.
    """
    benchmarks = list(score_to_stderr_mapping.keys())
    n_benchmarks = len(benchmarks)

    # Off-diagonal cells pair scores and stderrs by row position, so put the
    # stderr rows in the same order as the score rows first. Skipped when the
    # stderr index has duplicates, because reindex() would raise.
    if not stderr_df.index.equals(scores_df.index) and stderr_df.index.is_unique:
        stderr_df = stderr_df.reindex(scores_df.index)

    # Initialize matrices
    corr_matrix = np.full((n_benchmarks, n_benchmarks), np.nan)
    pvalue_matrix = np.full((n_benchmarks, n_benchmarks), np.nan)
    weighted_corr_matrix = np.full((n_benchmarks, n_benchmarks), np.nan)
    weighted_pvalue_matrix = np.full((n_benchmarks, n_benchmarks), np.nan)

    # Pull each column out once instead of once per pair.
    score_values = {b: scores_df[b].values for b in benchmarks}
    stderr_values = {b: stderr_df[score_to_stderr_mapping[b]].values for b in benchmarks}

    for i, bench1 in enumerate(benchmarks):
        # --- Diagonal: reliability coefficient ---------------------------
        # reliability = 1 - (measurement_error_variance / total_variance)
        scores = scores_df[bench1].dropna()
        stderrs = stderr_df[score_to_stderr_mapping[bench1]].dropna()
        common_idx = scores.index.intersection(stderrs.index)

        # The plain correlation of a benchmark with itself is 1 by definition.
        corr_matrix[i, i] = 1.0
        pvalue_matrix[i, i] = 0.0

        if len(common_idx) >= 3:
            total_variance = scores.loc[common_idx].var()
            mean_error_variance = (stderrs.loc[common_idx] ** 2).mean()
            if total_variance > 0:
                reliability = max(0, 1 - (mean_error_variance / total_variance))
            else:
                reliability = 0.0
            weighted_corr_matrix[i, i] = reliability
            weighted_pvalue_matrix[i, i] = 0.0
        # else: insufficient data, the weighted diagonal stays NaN

        # --- Off-diagonal: each pair is computed once and mirrored -------
        x = score_values[bench1]
        stderr1 = stderr_values[bench1]

        for j in range(i + 1, n_benchmarks):
            bench2 = benchmarks[j]
            y = score_values[bench2]
            stderr2 = stderr_values[bench2]

            # Standard (unweighted) correlation
            valid_mask = ~(np.isnan(x) | np.isnan(y))
            has_scores = valid_mask.sum() >= 3
            if has_scores:
                corr, p_val = pearsonr(x[valid_mask], y[valid_mask])
                corr_matrix[i, j] = corr_matrix[j, i] = corr
                pvalue_matrix[i, j] = pvalue_matrix[j, i] = p_val

            # Weighted correlation using inverse variance weighting
            # Weight = 1 / (stderr1^2 + stderr2^2) - error in both variables
            valid_stderr_mask = ~(np.isnan(stderr1) | np.isnan(stderr2)) & valid_mask
            if valid_stderr_mask.sum() >= 3:
                combined_variance = stderr1[valid_stderr_mask] ** 2 + stderr2[valid_stderr_mask] ** 2
                # Divide only where the variance is positive (weight 0
                # elsewhere) so no divide-by-zero warning is raised.
                weights = np.divide(
                    1.0,
                    combined_variance,
                    out=np.zeros(combined_variance.shape, dtype=float),
                    where=combined_variance > 0,
                )

                if weights.sum() > 0:
                    w_corr, w_p_val = weighted_correlation(
                        x[valid_stderr_mask],
                        y[valid_stderr_mask],
                        weights,
                    )
                    weighted_corr_matrix[i, j] = weighted_corr_matrix[j, i] = w_corr
                    weighted_pvalue_matrix[i, j] = weighted_pvalue_matrix[j, i] = w_p_val
            elif has_scores:
                # Use regular correlation for weighted matrix too
                weighted_corr_matrix[i, j] = weighted_corr_matrix[j, i] = corr_matrix[i, j]
                weighted_pvalue_matrix[i, j] = weighted_pvalue_matrix[j, i] = pvalue_matrix[i, j]

    # Convert to DataFrames
    corr_df = pd.DataFrame(corr_matrix, index=benchmarks, columns=benchmarks)
    pvalue_df = pd.DataFrame(pvalue_matrix, index=benchmarks, columns=benchmarks)
    weighted_corr_df = pd.DataFrame(weighted_corr_matrix, index=benchmarks, columns=benchmarks)
    weighted_pvalue_df = pd.DataFrame(weighted_pvalue_matrix, index=benchmarks, columns=benchmarks)

    return corr_df, pvalue_df, weighted_corr_df, weighted_pvalue_df
|
| 660 |
+
|
| 661 |
+
def create_uncertainty_weighted_heatmap_plotly(weighted_corr_df, title_prefix="Uncertainty-Weighted Correlation Analysis"):
    """Create a single uncertainty-weighted heatmap using Plotly.

    Args:
        weighted_corr_df: Square DataFrame of weighted correlations whose
            diagonal is presented as a reliability coefficient.
        title_prefix: Text placed in front of the fixed subtitle.

    Returns:
        A ``go.Figure`` with one heatmap trace. Cells are coloured by the raw
        values and labelled with the values as percentages rounded to one
        decimal place.
    """
    # Get clean names for display
    clean_names = [clean_benchmark_name(name) for name in weighted_corr_df.columns]

    # Percent values are used for cell labels and off-diagonal hover text.
    weighted_corr_pct = (weighted_corr_df * 100).round(1)

    def hover_label(i, j):
        """Return the hover text for cell (i, j)."""
        if i == j:
            reliability = weighted_corr_df.iloc[i, j]
            if pd.isna(reliability):
                return f"{clean_names[i]}<br>Reliability: Unknown"
            return f"{clean_names[i]}<br>Reliability: {reliability*100:.1f}%"
        corr_val = weighted_corr_pct.iloc[i, j]
        if pd.isna(corr_val):
            return f"{clean_names[i]} vs {clean_names[j]}<br>No weighted data"
        return f"{clean_names[i]} vs {clean_names[j]}<br>Weighted correlation: {corr_val:.1f}%"

    size = len(weighted_corr_df.columns)
    hover_text_weighted = [[hover_label(i, j) for j in range(size)] for i in range(size)]

    # Create the heatmap
    fig = go.Figure(data=go.Heatmap(
        z=weighted_corr_df.values,
        x=clean_names,
        y=clean_names,
        colorscale='RdBu_r',
        zmid=0,
        text=weighted_corr_pct.values,
        texttemplate="%{text}",
        textfont={"size": 10},
        hoverinfo='text',
        hovertext=hover_text_weighted,
        colorbar=dict(title="Correlation")
    ))

    # Update layout
    fig.update_layout(
        title=f"{title_prefix}<br><sub>Diagonal shows reliability coefficients (signal-to-noise ratios)</sub>",
        width=800,
        height=700,
        font=dict(size=12),
        xaxis=dict(tickangle=45),
        yaxis=dict(tickangle=0)
    )

    return fig
|
| 716 |
+
|
| 717 |
+
def create_uncertainty_aware_heatmap_plotly(corr_df, weighted_corr_df, title_prefix="Correlation Analysis"):
    """Create side-by-side heatmaps comparing regular vs weighted correlations.

    Args:
        corr_df: Square DataFrame of regular correlations (left panel).
        weighted_corr_df: Square DataFrame of weighted correlations whose
            diagonal is presented as a reliability coefficient (right panel).
        title_prefix: Text placed in front of the fixed subtitle.

    Returns:
        A Plotly figure with two heatmap traces in one row; only the right
        panel shows a colour bar. Axis labels for both panels are derived
        from ``corr_df.columns``.
    """
    # Get clean names for display
    clean_names = [clean_benchmark_name(name) for name in corr_df.columns]

    # Percent values are used for cell labels and off-diagonal hover text.
    corr_matrix_pct = (corr_df * 100).round(1)
    weighted_corr_pct = (weighted_corr_df * 100).round(1)

    def regular_label(i, j):
        """Return the hover text for cell (i, j) of the regular matrix."""
        if i == j:
            return f"{clean_names[i]}<br>Self-correlation: 100%"
        corr_val = corr_matrix_pct.iloc[i, j]
        if pd.isna(corr_val):
            return f"{clean_names[i]} vs {clean_names[j]}<br>No data"
        return f"{clean_names[i]} vs {clean_names[j]}<br>Correlation: {corr_val:.1f}%"

    def weighted_label(i, j):
        """Return the hover text for cell (i, j) of the weighted matrix."""
        if i == j:
            reliability = weighted_corr_df.iloc[i, j]
            if pd.isna(reliability):
                return f"{clean_names[i]}<br>Reliability: Unknown"
            return f"{clean_names[i]}<br>Reliability: {reliability*100:.1f}%"
        corr_val = weighted_corr_pct.iloc[i, j]
        if pd.isna(corr_val):
            return f"{clean_names[i]} vs {clean_names[j]}<br>No weighted data"
        return f"{clean_names[i]} vs {clean_names[j]}<br>Weighted correlation: {corr_val:.1f}%"

    def hover_grid(label, size):
        """Build a size x size nested list of hover strings."""
        return [[label(i, j) for j in range(size)] for i in range(size)]

    def heatmap_trace(values_df, pct_df, hovertext, **extra):
        """Build one heatmap trace with the styling shared by both panels."""
        return go.Heatmap(
            z=values_df.values,
            x=clean_names,
            y=clean_names,
            colorscale='RdBu_r',
            zmid=0,
            text=pct_df.values,
            texttemplate="%{text}",
            textfont={"size": 8},
            hoverinfo='text',
            hovertext=hovertext,
            **extra
        )

    hover_text_regular = hover_grid(regular_label, len(corr_df.columns))
    hover_text_weighted = hover_grid(weighted_label, len(weighted_corr_df.columns))

    # Create subplots
    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=('Regular Correlation Matrix<br>(Equal weighting)',
                        'Uncertainty-Weighted Correlation Matrix<br>(Inverse variance weighting)'),
        horizontal_spacing=0.15
    )

    # Regular correlation heatmap (left, no colour bar)
    fig.add_trace(
        heatmap_trace(corr_df, corr_matrix_pct, hover_text_regular,
                      showscale=False, name="Regular"),
        row=1, col=1,
    )

    # Weighted correlation heatmap (right, carries the shared colour bar)
    fig.add_trace(
        heatmap_trace(weighted_corr_df, weighted_corr_pct, hover_text_weighted,
                      showscale=True, colorbar=dict(title="Correlation", x=1.02),
                      name="Weighted"),
        row=1, col=2,
    )

    # Update layout
    fig.update_layout(
        title=f"{title_prefix}<br><sub>Diagonal shows reliability coefficients for weighted matrix</sub>",
        width=1400,
        height=700,
        font=dict(size=12)
    )

    # Update axes
    for panel in (1, 2):
        fig.update_xaxes(tickangle=45, row=1, col=panel)
    for panel in (1, 2):
        fig.update_yaxes(tickangle=0, row=1, col=panel)

    return fig
|
| 818 |
+
|
| 819 |
def main():
|
| 820 |
"""Main application."""
|
| 821 |
st.markdown('<h1 class="main-header">OpenThoughts Evalchemy Benchmark Explorer</h1>',
|
|
|
|
| 1036 |
st.write(f"Total pairs analyzed: {len(pairs)}")
|
| 1037 |
|
| 1038 |
def show_interactive_heatmap(df):
|
| 1039 |
+
"""Display interactive correlation heatmap with various options."""
|
| 1040 |
st.header("π₯ Interactive Correlation Heatmap")
|
| 1041 |
|
| 1042 |
+
# Check if stderr data is available
|
| 1043 |
+
stderr_df = load_stderr_data()
|
| 1044 |
+
|
| 1045 |
+
col1, col2, col3 = st.columns(3)
|
| 1046 |
+
|
| 1047 |
+
with col1:
|
| 1048 |
+
# Check if stderr data is available for the uncertainty-aware checkbox
|
| 1049 |
+
stderr_available = stderr_df is not None
|
| 1050 |
+
uncertainty_aware = False
|
| 1051 |
+
if stderr_available:
|
| 1052 |
+
uncertainty_aware = st.checkbox(
|
| 1053 |
+
"π¬ Uncertainty-Aware Analysis",
|
| 1054 |
+
value=False,
|
| 1055 |
+
help="Use measurement uncertainties to weight correlations (requires standard error data)"
|
| 1056 |
+
)
|
| 1057 |
+
|
| 1058 |
+
# Adjust method selector based on uncertainty-aware mode
|
| 1059 |
+
if uncertainty_aware:
|
| 1060 |
+
st.selectbox(
|
| 1061 |
+
"Correlation Method",
|
| 1062 |
+
["pearson"],
|
| 1063 |
+
index=0,
|
| 1064 |
+
disabled=True,
|
| 1065 |
+
help="**Uncertainty-aware analysis uses Pearson correlations only**\n\nWeighted correlations require parametric methods to properly account for measurement uncertainties."
|
| 1066 |
+
)
|
| 1067 |
+
method = "pearson" # Force Pearson for uncertainty-aware analysis
|
| 1068 |
+
else:
|
| 1069 |
+
method = st.selectbox(
|
| 1070 |
+
"Correlation Method",
|
| 1071 |
+
["kendall", "spearman", "pearson"],
|
| 1072 |
+
index=0,
|
| 1073 |
+
help="**Pearson's r** is a parametric measure of linear correlation that is sensitive to outliers and can be less appropriate for ordinal data.\n" +
|
| 1074 |
+
"**Spearman's rho** is a non-parametric measure of rank correlation that is less sensitive to outliers and can be more appropriate for ordinal data.\n" +
|
| 1075 |
+
"**Kendall's tau** is a non-parametric measure of rank correlation that is less sensitive to outliers and can be more appropriate for ordinal data."
|
| 1076 |
+
)
|
| 1077 |
|
| 1078 |
with col2:
|
| 1079 |
+
show_values = st.checkbox("Show correlation values", value=True)
|
| 1080 |
+
|
| 1081 |
+
# Additional options
|
| 1082 |
+
if uncertainty_aware and stderr_df is not None:
|
| 1083 |
+
st.info("π¬ **Uncertainty-Aware Mode**: Correlations are weighted by inverse measurement variance. "
|
| 1084 |
+
"Diagonal shows reliability coefficients (proportion of variance that is 'true signal' vs measurement error).")
|
| 1085 |
+
|
| 1086 |
+
# Match scores with stderr data
|
| 1087 |
+
available_benchmarks = list(df.columns)
|
| 1088 |
+
score_to_stderr_mapping = match_scores_with_stderr(df, stderr_df, available_benchmarks)
|
| 1089 |
+
|
| 1090 |
+
if len(score_to_stderr_mapping) == 0:
|
| 1091 |
+
st.warning("No matching standard error data found for the selected benchmarks. "
|
| 1092 |
+
"Falling back to regular correlation analysis.")
|
| 1093 |
+
uncertainty_aware = False
|
| 1094 |
+
else:
|
| 1095 |
+
# Filter to benchmarks with stderr data
|
| 1096 |
+
benchmarks_with_stderr = list(score_to_stderr_mapping.keys())
|
| 1097 |
+
df_stderr = df[benchmarks_with_stderr].copy()
|
| 1098 |
+
|
| 1099 |
+
st.success(f"Found standard error data for {len(score_to_stderr_mapping)} benchmarks: "
|
| 1100 |
+
f"{', '.join([clean_benchmark_name(b) for b in benchmarks_with_stderr])}")
|
| 1101 |
+
|
| 1102 |
+
# Align dataframes
|
| 1103 |
+
common_models = df_stderr.index.intersection(stderr_df.index)
|
| 1104 |
+
df_aligned = df_stderr.loc[common_models]
|
| 1105 |
+
stderr_aligned = stderr_df.loc[common_models]
|
| 1106 |
+
|
| 1107 |
+
st.write(f"**Analysis scope**: {len(common_models)} models with both scores and standard errors")
|
| 1108 |
+
|
| 1109 |
+
# Compute uncertainty-aware correlations
|
| 1110 |
+
with st.spinner("Computing uncertainty-weighted correlations..."):
|
| 1111 |
+
corr_df, pvalue_df, weighted_corr_df, weighted_pvalue_df = create_uncertainty_aware_correlation_matrix(
|
| 1112 |
+
df_aligned, stderr_aligned, score_to_stderr_mapping
|
| 1113 |
+
)
|
| 1114 |
+
|
| 1115 |
+
# Create and display uncertainty-aware heatmap
|
| 1116 |
+
fig = create_uncertainty_weighted_heatmap_plotly(
|
| 1117 |
+
weighted_corr_df,
|
| 1118 |
+
title_prefix=f"Uncertainty-Weighted {method.capitalize()} Correlations"
|
| 1119 |
+
)
|
| 1120 |
+
|
| 1121 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 1122 |
+
|
| 1123 |
+
# Show reliability statistics
|
| 1124 |
+
with st.expander("π Reliability Statistics", expanded=False):
|
| 1125 |
+
st.write("**Benchmark Reliability Coefficients** (proportion of variance that is true signal):")
|
| 1126 |
+
reliability_data = []
|
| 1127 |
+
for bench in weighted_corr_df.columns:
|
| 1128 |
+
diag_val = weighted_corr_df.loc[bench, bench]
|
| 1129 |
+
if not pd.isna(diag_val):
|
| 1130 |
+
reliability_data.append({
|
| 1131 |
+
'Benchmark': clean_benchmark_name(bench),
|
| 1132 |
+
'Reliability': f"{diag_val*100:.1f}%",
|
| 1133 |
+
'Category': next((cat for cat, benchs in get_focused_benchmark_mapping()[1].items()
|
| 1134 |
+
for b in benchs if get_focused_benchmark_mapping()[0].get(b) == bench), 'Unknown')
|
| 1135 |
+
})
|
| 1136 |
+
|
| 1137 |
+
if reliability_data:
|
| 1138 |
+
reliability_df = pd.DataFrame(reliability_data)
|
| 1139 |
+
st.dataframe(reliability_df, use_container_width=True)
|
| 1140 |
+
|
| 1141 |
+
avg_reliability = pd.to_numeric([d['Reliability'].rstrip('%') for d in reliability_data]).mean() / 100
|
| 1142 |
+
st.metric("Average Reliability", f"{avg_reliability:.3f} ({avg_reliability*100:.1f}%)")
|
| 1143 |
+
|
| 1144 |
+
# Show correlation differences
|
| 1145 |
+
with st.expander("π Impact of Uncertainty Weighting", expanded=False):
|
| 1146 |
+
st.write("**Correlation Changes** (Weighted - Regular):")
|
| 1147 |
+
|
| 1148 |
+
diff_data = []
|
| 1149 |
+
for i, bench1 in enumerate(corr_df.columns):
|
| 1150 |
+
for j, bench2 in enumerate(corr_df.columns):
|
| 1151 |
+
if i < j: # Only upper triangle
|
| 1152 |
+
regular_corr = corr_df.iloc[i, j]
|
| 1153 |
+
weighted_corr = weighted_corr_df.iloc[i, j]
|
| 1154 |
+
|
| 1155 |
+
if not (pd.isna(regular_corr) or pd.isna(weighted_corr)):
|
| 1156 |
+
diff = weighted_corr - regular_corr
|
| 1157 |
+
diff_data.append({
|
| 1158 |
+
'Benchmark Pair': f"{clean_benchmark_name(bench1)} vs {clean_benchmark_name(bench2)}",
|
| 1159 |
+
'Regular': f"{regular_corr:.3f}",
|
| 1160 |
+
'Weighted': f"{weighted_corr:.3f}",
|
| 1161 |
+
'Difference': f"{diff:+.3f}",
|
| 1162 |
+
'Abs Difference': abs(diff)
|
| 1163 |
+
})
|
| 1164 |
+
|
| 1165 |
+
if diff_data:
|
| 1166 |
+
diff_df = pd.DataFrame(diff_data)
|
| 1167 |
+
# Sort by absolute difference
|
| 1168 |
+
diff_df_sorted = diff_df.sort_values('Abs Difference', ascending=False)
|
| 1169 |
+
st.dataframe(diff_df_sorted.drop('Abs Difference', axis=1), use_container_width=True)
|
| 1170 |
+
|
| 1171 |
+
# Summary stats
|
| 1172 |
+
diffs = [float(d['Difference']) for d in diff_data]
|
| 1173 |
+
col1, col2, col3 = st.columns(3)
|
| 1174 |
+
with col1:
|
| 1175 |
+
st.metric("Mean Change", f"{np.mean(diffs):+.4f}")
|
| 1176 |
+
with col2:
|
| 1177 |
+
st.metric("Max |Change|", f"{max(abs(d) for d in diffs):.4f}")
|
| 1178 |
+
with col3:
|
| 1179 |
+
st.metric("Large Changes (|Ξ| > 0.1)", f"{sum(1 for d in diffs if abs(d) > 0.1)}")
|
| 1180 |
+
|
| 1181 |
+
return # Exit early for uncertainty-aware analysis
|
| 1182 |
+
|
| 1183 |
+
# Regular correlation analysis (original functionality restored)
|
| 1184 |
+
if df.empty:
|
| 1185 |
+
st.error("No data available.")
|
| 1186 |
+
return
|
| 1187 |
|
| 1188 |
# Compute correlation matrix
|
| 1189 |
+
corr_matrix = compute_correlations(df, method)
|
| 1190 |
+
|
| 1191 |
+
if corr_matrix.empty:
|
| 1192 |
+
st.error("Unable to compute correlations.")
|
| 1193 |
+
return
|
| 1194 |
+
|
| 1195 |
+
# Create and display regular heatmap (original way)
|
| 1196 |
+
fig = create_interactive_heatmap(corr_matrix, f"{method.capitalize()} Correlation Matrix")
|
| 1197 |
+
|
| 1198 |
+
# Add correlation values as text annotations if requested
|
| 1199 |
+
if show_values:
|
| 1200 |
+
# Convert correlations to percentages for display
|
| 1201 |
+
corr_text = (corr_matrix * 100).round().astype(str)
|
| 1202 |
+
fig.update_traces(
|
| 1203 |
+
text=corr_text.values,
|
| 1204 |
+
texttemplate="%{text}",
|
| 1205 |
+
textfont={"size": 8}
|
| 1206 |
+
)
|
| 1207 |
|
|
|
|
|
|
|
| 1208 |
st.plotly_chart(fig, use_container_width=True)
|
| 1209 |
|
| 1210 |
+
# Correlation statistics (original)
|
| 1211 |
st.subheader("Correlation Statistics")
|
| 1212 |
|
| 1213 |
+
# Get all off-diagonal correlations (original method)
|
| 1214 |
mask = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)
|
| 1215 |
corr_values = corr_matrix.where(mask).stack().dropna()
|
| 1216 |
|
|
|
|
| 1228 |
with col4:
|
| 1229 |
st.metric("Min Correlation", f"{corr_values.min():.3f}")
|
| 1230 |
|
| 1231 |
+
# Distribution of correlations (original)
|
| 1232 |
st.subheader("Correlation Distribution")
|
| 1233 |
|
| 1234 |
+
fig_hist = px.histogram(corr_values,
|
| 1235 |
nbins=20,
|
| 1236 |
title="Distribution of Pairwise Correlations",
|
| 1237 |
labels={'value': 'Correlation Coefficient', 'count': 'Frequency'})
|
| 1238 |
+
st.plotly_chart(fig_hist, use_container_width=True)
|
| 1239 |
+
|
| 1240 |
+
# Methodology note
|
| 1241 |
+
with st.expander("βΉοΈ About Correlation Methods", expanded=False):
|
| 1242 |
+
st.markdown("""
|
| 1243 |
+
**Pearson**: Measures linear relationships. Values range from -1 to +1.
|
| 1244 |
+
- +1: Perfect positive linear relationship
|
| 1245 |
+
- 0: No linear relationship
|
| 1246 |
+
- -1: Perfect negative linear relationship
|
| 1247 |
+
|
| 1248 |
+
**Spearman**: Measures monotonic relationships using ranks. More robust to outliers.
|
| 1249 |
+
- Good for non-linear but monotonic relationships
|
| 1250 |
+
- Less sensitive to extreme values
|
| 1251 |
+
|
| 1252 |
+
**Kendall**: Measures ordinal association using concordant/discordant pairs.
|
| 1253 |
+
- More robust than Spearman for small samples
|
| 1254 |
+
- Better for data with many tied values
|
| 1255 |
+
|
| 1256 |
+
**Uncertainty-Aware Analysis**: When available, uses measurement standard errors to:
|
| 1257 |
+
- Weight correlations by inverse measurement variance
|
| 1258 |
+
- Show reliability coefficients (signal-to-noise ratios) on diagonal
|
| 1259 |
+
- Provide more accurate correlation estimates for noisy data
|
| 1260 |
+
""")
|
| 1261 |
|
| 1262 |
def show_scatter_explorer(df, stderr_df):
|
| 1263 |
"""Show the scatter plot explorer."""
|