shezamunir committed on
Commit
578adcb
·
verified ·
1 Parent(s): d56c24f

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +30 -68
src/streamlit_app.py CHANGED
@@ -1,98 +1,60 @@
1
  import streamlit as st
2
  import pandas as pd
3
- import numpy as np
4
- from PIL import Image
5
- import base64
6
- from io import BytesIO
7
 
8
- # --- Page config ---
9
  st.set_page_config(page_title="VeriFact Leaderboard", layout="wide")
10
 
11
- # --- Load images ---
12
- @st.cache_data
13
- def load_image(path):
14
- return Image.open(path)
15
-
16
- # logo = load_image("factrbench.png")
17
- # chart = load_image("test.png")
18
-
19
- # Display logo
20
- buf = BytesIO()
21
- logo.save(buf, format="PNG")
22
- logo_b64 = base64.b64encode(buf.getvalue()).decode("utf-8")
23
- st.markdown(f"""
24
- <div style="text-align:center; margin-bottom:20px;">
25
- <img src="data:image/png;base64,{logo_b64}" style="width:50%; max-width:700px;"/>
26
- </div>
27
- """, unsafe_allow_html=True)
28
-
29
- # Header
30
- st.markdown("""
31
- <div style="text-align:center;">
32
- <p style="font-size:22px;">
33
- VERIFACT: Enhancing Long-Form Factuality Evaluation...
34
- </p>
35
- <p style="font-size:20px;">
36
- # 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a> |
37
- ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 11 | Updated: <strong>April 2025</strong>
38
- </p>
39
- </div>
40
- """, unsafe_allow_html=True)
41
-
42
- # --- Load data ---
43
  @st.cache_data
44
  def load_data(path="models.json"):
45
  df = pd.read_json(path, lines=True)
46
- df["Avg"] = df[[f"T{i}" for i in range(1,12)]].mean(axis=1).round(1)
47
- # Compute rank per column
48
- for col in [f"T{i}" for i in range(1,12)] + ["Avg"]:
 
49
  df[f"{col}_rank"] = df[col].rank(ascending=False, method="min").astype(int)
50
  return df
51
 
52
  df = load_data()
53
 
54
- # --- Tabs ---
 
 
 
 
55
  tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
56
 
57
  with tab1:
58
- st.markdown("**Leaderboard:** Higher scores shaded green; best models bolded.")
59
- # Build HTML table
60
  cols = ["Model"] + [f"T{i}" for i in range(1,12)] + ["Avg"]
61
- max_ranks = {col: df[f"{col}_rank"].max() for col in cols if col!="Model"}
62
-
63
- html = "<table style='border-collapse:collapse; width:100%;'>"
64
  # header
65
- html += "<tr>" + "".join(f"<th style='padding:4px;'>{c}</th>" for c in cols) + "</tr>"
66
  # rows
67
  for _, row in df.iterrows():
68
  html += "<tr>"
69
- for c in cols:
70
- val = row[c] if c!="Model" else row[c]
71
- if c=="Model":
72
- html += f"<td style='padding:4px;text-align:left;'>{val}</td>"
73
  else:
74
- # color gradient
75
- rank = row[f"{c}_rank"]
76
- norm = 1 - (rank-1)/(max_ranks[c]-1 or 1)
77
- # interpolate green-white
78
  r = int(255 - norm*(255-182))
79
  g = int(255 - norm*(255-243))
80
  b = 255
81
- style = f"background-color:rgb({r},{g},{b}); padding:4px;"
82
- bold = "font-weight:bold;" if rank==1 else ""
83
- html += f"<td style='{style}{bold}'>{val}</td>"
84
  html += "</tr>"
85
  html += "</table>"
86
  st.markdown(html, unsafe_allow_html=True)
87
 
88
  with tab2:
89
- buf2 = BytesIO()
90
- chart.save(buf2, format="PNG")
91
- chart_b64 = base64.b64encode(buf2.getvalue()).decode("utf-8")
92
- st.markdown(f"""
93
- <div style="text-align:center;">
94
- <img src="data:image/png;base64,{chart_b64}" style="width:65%;"/>
95
- </div>
96
- """, unsafe_allow_html=True)
97
-
98
-
 
1
  import streamlit as st
2
  import pandas as pd
 
 
 
 
3
 
4
+ # ─── Page config ──────────────────────────────────────────────────────────────
5
  st.set_page_config(page_title="VeriFact Leaderboard", layout="wide")
6
 
7
+ # ─── Load data ────────────────────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  @st.cache_data
9
  def load_data(path="models.json"):
10
  df = pd.read_json(path, lines=True)
11
+ score_cols = [f"T{i}" for i in range(1, 12)]
12
+ df["Avg"] = df[score_cols].mean(axis=1).round(1)
13
+ # Compute rank per column (1 = best)
14
+ for col in score_cols + ["Avg"]:
15
  df[f"{col}_rank"] = df[col].rank(ascending=False, method="min").astype(int)
16
  return df
17
 
18
  df = load_data()
19
 
20
+ # Precompute max ranks for color scaling
21
+ score_cols = [f"T{i}" for i in range(1, 12)] + ["Avg"]
22
+ max_ranks = {col: df[f"{col}_rank"].max() for col in score_cols}
23
+
24
+ # ─── Tabs ──────────────────────────────────────────────────────────────────────
25
  tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
26
 
27
  with tab1:
28
+ st.markdown("**Leaderboard:** higher scores shaded green; best models bolded.")
29
+ # Build raw HTML table
30
  cols = ["Model"] + [f"T{i}" for i in range(1,12)] + ["Avg"]
31
+ html = "<table style='border-collapse:collapse; width:100%; font-size:14px;'>"
 
 
32
  # header
33
+ html += "<tr>" + "".join(f"<th style='padding:6px;'>{col}</th>" for col in cols) + "</tr>"
34
  # rows
35
  for _, row in df.iterrows():
36
  html += "<tr>"
37
+ for col in cols:
38
+ val = row[col]
39
+ if col == "Model":
40
+ html += f"<td style='padding:6px; text-align:left;'>{val}</td>"
41
  else:
42
+ rank = int(row[f"{col}_rank"])
43
+ norm = 1 - (rank - 1) / ((max_ranks[col] - 1) or 1)
44
+ # interpolate green (182,243,182) → white (255,255,255)
 
45
  r = int(255 - norm*(255-182))
46
  g = int(255 - norm*(255-243))
47
  b = 255
48
+ bold = "font-weight:bold;" if rank == 1 else ""
49
+ style = f"background-color:rgb({r},{g},{b}); padding:6px; {bold}"
50
+ html += f"<td style='{style}'>{val}</td>"
51
  html += "</tr>"
52
  html += "</table>"
53
  st.markdown(html, unsafe_allow_html=True)
54
 
55
  with tab2:
56
+ st.markdown("### Benchmark Details")
57
+ st.write(
58
+ "VERIFACT is a factuality evaluation framework for long‑form LLM outputs. "
59
+ "FACTRBENCH provides reference fact sets and external evidence across real‑world prompts."
60
+ )