Spaces:

launch
/

ExpertLongBench

Running

File size: 3,894 Bytes

8e68ad1
35c36b4
56e8880
 
 
35c36b4
578adcb
901e92c
35c36b4
6594157
d707ec3
6594157
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
643980c
6594157
 
643980c
 
6594157
 
 
 
 
578adcb
35c36b4
4cfc3d6
35c36b4
578adcb
 
 
 
35c36b4
 
 
 
 
578adcb
 
 
 
 
35c36b4
 
 
6594157
578adcb
35c36b4
578adcb
35c36b4
578adcb
35c36b4
 
 
578adcb
 
 
 
35c36b4
578adcb
 
 
35c36b4
 
 
578adcb
 
 
35c36b4
 
 
 
 
643980c
578adcb
643980c
 
 
 
 
578adcb

import streamlit as st
import pandas as pd
from PIL import Image
import base64
from io import BytesIO

# ─── Page config ──────────────────────────────────────────────────────────────
st.set_page_config(page_title="ExpertLongBench Leaderboard", layout="wide")


logo_image = Image.open("src/ExpertLongBench.png")

# Display logo
buffered = BytesIO()
logo_image.save(buffered, format="PNG")
img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")

st.markdown(
    f"""
    <div class="logo-container" style="display:flex; justify-content: center;">
        <img src="data:image/png;base64,{img_data}" style="width:50%; max-width:700px;"/>
    </div>
    """,
    unsafe_allow_html=True
)

st.markdown(
    '''
    <div class="header">
        <br/>
        <p style="font-size:22px;">
        ExpertLongBench: Benchmarking Language Models on Expert-Level Long-Form Generation with Structured Checklists
        </p>
        <p style="font-size:20px;">
             📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | <strong>K</strong> <a href="">Kaggle</a> <add links later>
            ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 12 | Updated: <strong>May 2025</strong>
        </p>
    </div>
    ''',
    unsafe_allow_html=True
)
# ─── Load data ────────────────────────────────────────────────────────────────
@st.cache_data
def load_data(path="src/models.json"):
    df = pd.read_json(path, lines=True)
    score_cols = [f"T{i}" for i in range(1, 12)]
    df["Avg"] = df[score_cols].mean(axis=1).round(1)
    # Compute rank per column (1 = best)
    for col in score_cols + ["Avg"]:
        df[f"{col}_rank"] = df[col].rank(ascending=False, method="min").astype(int)
    return df

df = load_data()

# Precompute max ranks for color scaling
score_cols = [f"T{i}" for i in range(1, 12)] + ["Avg"]
max_ranks = {col: df[f"{col}_rank"].max() for col in score_cols}

# ─── Tabs ──────────────────────────────────────────────────────────────────────
tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])

with tab1:
    # st.markdown("**Leaderboard:** higher scores shaded green; best models bolded.")
    # Build raw HTML table
    cols = ["Model"] + [f"T{i}" for i in range(1,12)] + ["Avg"]
    html = "<table style='border-collapse:collapse; width:100%; font-size:14px;'>"
    # header
    html += "<tr>" + "".join(f"<th style='padding:6px;'>{col}</th>" for col in cols) + "</tr>"
    # rows
    for _, row in df.iterrows():
        html += "<tr>"
        for col in cols:
            val = row[col]
            if col == "Model":
                html += f"<td style='padding:6px; text-align:left;'>{val}</td>"
            else:
                rank = int(row[f"{col}_rank"])
                norm = 1 - (rank - 1) / ((max_ranks[col] - 1) or 1)
                # interpolate green (182,243,182) → white (255,255,255)
                r = int(255 - norm*(255-182))
                g = int(255 - norm*(255-243))
                b = 255
                bold = "font-weight:bold;" if rank == 1 else ""
                style = f"background-color:rgb({r},{g},{b}); padding:6px; {bold}"
                html += f"<td style='{style}'>{val}</td>"
        html += "</tr>"
    html += "</table>"
    st.markdown(html, unsafe_allow_html=True)

with tab2:
    st.markdown("## Abstract")
    st.write(
        "<add final abstract here>"
    )
    st.markdown("## Pipeline")
    st.write(
        "<add final pipeline figure here>"
    )