"""Streamlit app that renders the VeriFact leaderboard and benchmark details."""

import base64
from html import escape
from io import BytesIO

import numpy as np
import pandas as pd
import streamlit as st
from PIL import Image

# Score columns read from the data file: T1..T11, plus the derived "Avg".
TASK_COLS = [f"T{i}" for i in range(1, 12)]
SCORE_COLS = TASK_COLS + ["Avg"]

# --- Page config ---
st.set_page_config(page_title="VeriFact Leaderboard", layout="wide")


# --- Load images ---
@st.cache_data
def load_image(path):
    """Open the image file at *path* with Pillow and return it."""
    return Image.open(path)


def _load_optional_image(path):
    """Return the image at *path*, or ``None`` when the file does not exist."""
    try:
        return load_image(path)
    except FileNotFoundError:
        return None


def _png_data_uri(image):
    """Serialize *image* as PNG and return it as a base64 ``data:`` URI."""
    buf = BytesIO()
    image.save(buf, format="PNG")
    encoded = base64.b64encode(buf.getvalue()).decode("utf-8")
    return f"data:image/png;base64,{encoded}"


def _centered_image_html(image):
    """Return a single-line HTML snippet showing *image* centered on the page."""
    return (
        '<div style="text-align:center;">'
        f'<img src="{_png_data_uri(image)}" style="max-width:100%;" />'
        "</div>"
    )


# Both images are optional: a missing file skips that element instead of
# aborting the whole page.
logo = _load_optional_image("factrbench.png")
chart = _load_optional_image("test.png")

# Display logo
if logo is not None:
    st.markdown(_centered_image_html(logo), unsafe_allow_html=True)

# Header
st.markdown(
    '<div style="text-align:center;">'
    "<h2>VERIFACT: Enhancing Long-Form Factuality Evaluation...</h2>"
    "</div>",
    unsafe_allow_html=True,
)


# --- Load data ---
@st.cache_data
def load_data(path="models.json"):
    """Load model scores and add the average and per-column rank columns.

    Args:
        path: JSON-lines file with one record per model. Each record must
            provide the columns ``T1`` .. ``T11``; the table rendering below
            additionally reads a ``Model`` column.

    Returns:
        A DataFrame with an added ``Avg`` column (row mean of the task
        columns, rounded to one decimal) and, for every score column, a
        ``<col>_rank`` integer column where rank 1 is the highest score.
    """
    df = pd.read_json(path, lines=True)
    df["Avg"] = df[TASK_COLS].mean(axis=1).round(1)
    # Compute rank per column. Ties share the lowest rank (method="min");
    # missing scores are ranked last so the integer cast cannot fail on NaN.
    for col in SCORE_COLS:
        ranks = df[col].rank(ascending=False, method="min", na_option="bottom")
        df[f"{col}_rank"] = ranks.astype(int)
    return df


def _score_cell_style(rank, max_rank):
    """Return the inline CSS for a score cell with the given rank.

    The background is interpolated linearly between white (worst rank) and
    rgb(182,243,255) (rank 1); rank 1 is additionally rendered in bold.
    """
    # ``max_rank - 1 or 1`` avoids dividing by zero when every row has rank 1.
    norm = 1 - (rank - 1) / (max_rank - 1 or 1)
    # NOTE(review): the caption says "green", but with blue fixed at 255 the
    # best-rank colour is a light cyan. Values kept as in the original.
    r = int(255 - norm * (255 - 182))
    g = int(255 - norm * (255 - 243))
    b = 255
    bold = "font-weight:bold;" if rank == 1 else ""
    return f"background-color:rgb({r},{g},{b}); padding:4px;{bold}"


def build_leaderboard_html(df):
    """Render *df* (as returned by ``load_data``) as an HTML table string.

    The output contains no newlines or indentation so that Markdown does not
    interpret any part of it as a code block.
    """
    cols = ["Model"] + SCORE_COLS
    max_ranks = {col: df[f"{col}_rank"].max() for col in SCORE_COLS}

    parts = ["<table style='border-collapse:collapse; width:100%;'>"]
    # header
    parts.append("<tr>")
    parts.extend(f"<th style='padding:4px;'>{c}</th>" for c in cols)
    parts.append("</tr>")
    # rows
    for _, row in df.iterrows():
        parts.append("<tr>")
        # Model names come from the data file; escape them because the table
        # is rendered with unsafe_allow_html=True.
        model = escape(str(row["Model"]))
        parts.append(f"<td style='text-align:left; padding:4px;'>{model}</td>")
        for c in SCORE_COLS:
            style = _score_cell_style(row[f"{c}_rank"], max_ranks[c])
            parts.append(f"<td style='{style}'>{row[c]}</td>")
        parts.append("</tr>")
    parts.append("</table>")
    return "".join(parts)


df = load_data()

# --- Tabs ---
tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])

with tab1:
    st.markdown("**Leaderboard:** Higher scores shaded green; best models bolded.")
    st.markdown(build_leaderboard_html(df), unsafe_allow_html=True)

with tab2:
    if chart is not None:
        st.markdown(_centered_image_html(chart), unsafe_allow_html=True)
    else:
        st.info("Benchmark chart image not found.")