ExpertLongBench / app.py
shezamunir's picture
Create app.py
0075c7c verified
raw
history blame
3.55 kB
import streamlit as st
import pandas as pd
import numpy as np
from PIL import Image
import base64
from io import BytesIO
# --- Page config ---
# Browser-tab title and full-width layout; this is the first Streamlit call
# made by the script.
st.set_page_config(
    page_title="VeriFact Leaderboard",
    layout="wide",
)
# --- Load images ---
@st.cache_data
def load_image(path):
    """Open the image file at *path* and return it with its pixels loaded.

    Args:
        path: Filesystem path of the image to read.

    Returns:
        The PIL image object produced by ``Image.open``.
    """
    # Image.open() is lazy: it only identifies the file and keeps the handle
    # open.  Reading the pixel data inside the ``with`` block lets the handle
    # be closed on exit while the returned image stays fully usable.
    with Image.open(path) as img:
        img.load()
    return img
logo = load_image("factrbench.png")
chart = load_image("test.png")

# Display logo
# The logo is re-encoded as PNG in memory and embedded as a base64 data URI,
# so it can be centred and width-limited with plain HTML/CSS.
with BytesIO() as buf:
    logo.save(buf, format="PNG")
    logo_b64 = base64.b64encode(buf.getvalue()).decode("utf-8")

logo_html = f"""
<div style="text-align:center; margin-bottom:20px;">
<img src="data:image/png;base64,{logo_b64}" style="width:50%; max-width:700px;"/>
</div>
"""
st.markdown(logo_html, unsafe_allow_html=True)
# Header
# Centred title line followed by a links/metadata line, rendered as raw HTML.
# The emoji below were previously mis-decoded (mojibake) and are restored to
# their intended UTF-8 characters.
# NOTE(review): the leading "# " on the links line sits inside the string, so
# it is rendered literally rather than acting as a comment -- confirm whether
# it is intended.  The three href attributes are empty, and the model count
# ("11") and date are hard-coded text.
st.markdown("""
<div style="text-align:center;">
<p style="font-size:22px;">
VERIFACT: Enhancing Long-Form Factuality Evaluation...
</p>
<p style="font-size:20px;">
# 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a> |
⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 11 | Updated: <strong>April 2025</strong>
</p>
</div>
""", unsafe_allow_html=True)
# --- Load data ---
@st.cache_data
def load_data(path="models.json"):
    """Read the score table and add the average and per-column rank columns.

    Args:
        path: JSON Lines file (read with ``lines=True``) whose records
            provide the score columns ``T1`` .. ``T11``.

    Returns:
        The loaded DataFrame with two kinds of added columns:
        ``Avg`` -- row-wise mean of ``T1`` .. ``T11`` rounded to one decimal;
        ``<col>_rank`` -- integer rank of each score column and of ``Avg``,
        where rank 1 is the highest value and ties share the smallest rank.
    """
    task_cols = [f"T{i}" for i in range(1, 12)]

    df = pd.read_json(path, lines=True)
    df["Avg"] = df[task_cols].mean(axis=1).round(1)

    # Compute rank per column
    for col in task_cols + ["Avg"]:
        # na_option="bottom" ranks a missing score last.  With the default
        # ("keep") the rank would be NaN and the integer cast would raise.
        ranks = df[col].rank(ascending=False, method="min", na_option="bottom")
        df[f"{col}_rank"] = ranks.astype(int)
    return df
df = load_data()

# --- Tabs ---
tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])

with tab1:
    st.markdown("**Leaderboard:** Higher scores shaded green; best models bolded.")

    # Column layout: model name first, then the task scores, then the average.
    score_cols = [f"T{i}" for i in range(1, 12)] + ["Avg"]
    cols = ["Model"] + score_cols
    # Largest (worst) rank per score column, used to scale ranks into [0, 1].
    max_ranks = {name: df[f"{name}_rank"].max() for name in score_cols}

    # The table is assembled as a list of fragments and joined once at the end.
    header_cells = "".join(f"<th style='padding:4px;'>{name}</th>" for name in cols)
    parts = [
        "<table style='border-collapse:collapse; width:100%;'>",
        "<tr>" + header_cells + "</tr>",
    ]

    for _, row in df.iterrows():
        parts.append("<tr>")
        parts.append(f"<td style='padding:4px;text-align:left;'>{row['Model']}</td>")
        for name in score_cols:
            rank = row[f"{name}_rank"]
            # 1.0 for rank 1, 0.0 for the worst rank; `or 1` avoids a zero
            # divisor when the worst rank in the column is itself 1.
            strength = 1 - (rank - 1) / (max_ranks[name] - 1 or 1)
            # Fade from white (255,255,255) to rgb(182,243,255) as rank improves.
            # NOTE(review): the blue channel stays at 255, so the tint may not
            # read as the "green" promised by the caption -- confirm palette.
            red = int(255 - strength * (255 - 182))
            green = int(255 - strength * (255 - 243))
            cell_style = f"background-color:rgb({red},{green},255); padding:4px;"
            if rank == 1:
                # Best value in the column is shown in bold.
                cell_style += "font-weight:bold;"
            parts.append(f"<td style='{cell_style}'>{row[name]}</td>")
        parts.append("</tr>")

    parts.append("</table>")
    html = "".join(parts)
    st.markdown(html, unsafe_allow_html=True)
with tab2:
    # Embed the chart as a base64 PNG data URI inside a centred <div>.
    with BytesIO() as buf2:
        chart.save(buf2, format="PNG")
        chart_b64 = base64.b64encode(buf2.getvalue()).decode("utf-8")

    chart_html = f"""
<div style="text-align:center;">
<img src="data:image/png;base64,{chart_b64}" style="width:65%;"/>
</div>
"""
    st.markdown(chart_html, unsafe_allow_html=True)

    # Each entry is a markdown heading followed by its body text.
    sections = (
        ("### What is VERIFACT?", "VERIFACT is a factuality evaluation framework..."),
        (
            "### What is FACTRBENCH?",
            "FACTRBENCH is the first benchmark for long-form factuality evaluation...",
        ),
        ("### Key Findings", "VERIFACT outperforms prior methods [...]"),
    )
    for heading, body in sections:
        st.markdown(heading)
        st.write(body)