Spaces:
Running
Running
File size: 3,894 Bytes
8e68ad1 35c36b4 56e8880 35c36b4 578adcb 901e92c 35c36b4 6594157 d707ec3 6594157 643980c 6594157 643980c 6594157 578adcb 35c36b4 4cfc3d6 35c36b4 578adcb 35c36b4 578adcb 35c36b4 6594157 578adcb 35c36b4 578adcb 35c36b4 578adcb 35c36b4 578adcb 35c36b4 578adcb 35c36b4 578adcb 35c36b4 643980c 578adcb 643980c 578adcb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
import streamlit as st
import pandas as pd
from PIL import Image
import base64
from io import BytesIO
# ─── Page config ──────────────────────────────────────────────────────────────
st.set_page_config(layout="wide", page_title="ExpertLongBench Leaderboard")

# Re-encode the logo as an in-memory PNG and inline it as a base64 data URI,
# so it can be centred with plain HTML inside st.markdown.
logo_image = Image.open("src/ExpertLongBench.png")
png_buffer = BytesIO()
logo_image.save(png_buffer, format="PNG")
logo_b64 = base64.b64encode(png_buffer.getvalue()).decode("utf-8")

logo_html = f"""
<div class="logo-container" style="display:flex; justify-content: center;">
<img src="data:image/png;base64,{logo_b64}" style="width:50%; max-width:700px;"/>
</div>
"""
st.markdown(logo_html, unsafe_allow_html=True)
# Title banner: benchmark name plus a line of links and version metadata.
# TODO: fill in the Paper / GitHub / Kaggle hrefs (currently empty). This note
# used to live inside the HTML string, where it was parsed as a bogus element.
# NOTE(review): the emoji were restored from mis-encoded text. The laptop icon
# is unambiguous; the "Paper" and "Version" icons are best guesses - confirm
# against the original file.
st.markdown(
    '''
<div class="header">
<br/>
<p style="font-size:22px;">
ExpertLongBench: Benchmarking Language Models on Expert-Level Long-Form Generation with Structured Checklists
</p>
<p style="font-size:20px;">
📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | <strong>K</strong> <a href="">Kaggle</a>
⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 12 | Updated: <strong>May 2025</strong>
</p>
</div>
''',
    unsafe_allow_html=True,
)
# ─── Load data ────────────────────────────────────────────────────────────────
@st.cache_data
def load_data(path: str = "src/models.json") -> pd.DataFrame:
    """Load the leaderboard scores and derive the average and rank columns.

    Args:
        path: Path to a JSON Lines file (it is read with ``lines=True``).
            Each record must provide the score columns ``T1`` ... ``T11``.

    Returns:
        The loaded DataFrame with two kinds of added columns:

        * ``Avg`` - row-wise mean of ``T1`` ... ``T11``, rounded to one decimal.
        * ``<col>_rank`` - integer rank for every score column and ``Avg``,
          where 1 is the highest score and ties share the smallest rank.
    """
    df = pd.read_json(path, lines=True)
    score_cols = [f"T{i}" for i in range(1, 12)]
    df["Avg"] = df[score_cols].mean(axis=1).round(1)

    # Compute rank per column (1 = best).
    for col in [*score_cols, "Avg"]:
        # na_option="bottom" ranks missing scores last; with the default
        # ("keep") their rank is NaN and the int cast below raises.
        ranks = df[col].rank(ascending=False, method="min", na_option="bottom")
        df[f"{col}_rank"] = ranks.astype(int)
    return df
# Load the leaderboard table (load_data is wrapped in st.cache_data).
df = load_data()

# Largest rank value per score column; used to normalise the cell shading.
score_cols = [f"T{i}" for i in range(1, 12)]
score_cols.append("Avg")
max_ranks = {}
for col in score_cols:
    max_ranks[col] = df[f"{col}_rank"].max()
# ─── Tabs ─────────────────────────────────────────────────────────────────────
tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])

with tab1:
    # Render the leaderboard as a raw HTML table so that every score cell can
    # carry its own rank-based background colour; the best cell is also bold.
    cols = ["Model"] + [f"T{i}" for i in range(1, 12)] + ["Avg"]

    # Shading endpoints: rank 1 is light green, the worst rank is white.
    # The blue channel was previously pinned at 255, which produced light blue
    # instead of the intended green (182,243,182).
    best_rgb = (182, 243, 182)
    worst_rgb = (255, 255, 255)

    # Collect fragments and join once instead of growing a string with "+=".
    header_cells = "".join(f"<th style='padding:6px;'>{col}</th>" for col in cols)
    parts = [
        "<table style='border-collapse:collapse; width:100%; font-size:14px;'>",
        f"<tr>{header_cells}</tr>",
    ]

    for _, row in df.iterrows():
        parts.append("<tr>")
        for col in cols:
            val = row[col]
            if col == "Model":
                parts.append(f"<td style='padding:6px; text-align:left;'>{val}</td>")
                continue

            rank = int(row[f"{col}_rank"])
            # norm is 1.0 for rank 1 and 0.0 for the largest rank in the column;
            # "or 1" avoids a division by zero when the largest rank is 1.
            norm = 1 - (rank - 1) / ((max_ranks[col] - 1) or 1)
            # Interpolate white -> green on all three channels.
            r, g, b = (
                int(worst - norm * (worst - best))
                for worst, best in zip(worst_rgb, best_rgb)
            )
            bold = "font-weight:bold;" if rank == 1 else ""
            style = f"background-color:rgb({r},{g},{b}); padding:6px; {bold}"
            parts.append(f"<td style='{style}'>{val}</td>")
        parts.append("</tr>")

    parts.append("</table>")
    html = "".join(parts)
    st.markdown(html, unsafe_allow_html=True)
with tab2:
    # Each section is a markdown heading followed by its placeholder body.
    for heading, placeholder in (
        ("## Abstract", "<add final abstract here>"),
        ("## Pipeline", "<add final pipeline figure here>"),
    ):
        st.markdown(heading)
        st.write(placeholder)
|