陈俊杰
committed on
Commit
·
496eb7b
1
Parent(s):
a2a84e8
cjj-leaderboard
Browse files
app.py
CHANGED
|
@@ -13,10 +13,9 @@ st.title("NTCIR-18 Automatic Evaluation of LLMs (AEOLLM) Task")
|
|
| 13 |
with st.sidebar:
|
| 14 |
page = option_menu(
|
| 15 |
"Navigation",
|
| 16 |
-
["Introduction", "Methodology", "Datasets", "Important Dates",
|
| 17 |
-
"Evaluation Measures", "Data and File format", "Submit",
|
| 18 |
-
|
| 19 |
-
icons=['house', 'book', 'database', 'calendar', 'clipboard', 'file', 'upload', 'trophy', 'people', 'book'],
|
| 20 |
menu_icon="cast",
|
| 21 |
default_index=0,
|
| 22 |
styles={
|
|
@@ -199,69 +198,113 @@ This leaderboard is used to show the performance of the <strong>automatic evalua
|
|
| 199 |
<p>The Leaderboard will be updated daily around 24:00 Beijing Time.</p>
|
| 200 |
</p>
|
| 201 |
""", unsafe_allow_html=True)
|
| 202 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
"kendall's tau": [0.3243, 0.1739, 0.3042, 0.4167],
|
| 210 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
|
| 213 |
-
df1 = pd.DataFrame(DG)
|
| 214 |
|
| 215 |
-
TE = {
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
}
|
| 222 |
-
df2 = pd.DataFrame(TE)
|
| 223 |
|
| 224 |
-
SG = {
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
}
|
| 231 |
-
df3 = pd.DataFrame(SG)
|
| 232 |
|
| 233 |
-
NFQA = {
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
}
|
| 240 |
-
df4 = pd.DataFrame(NFQA)
|
| 241 |
|
| 242 |
-
df = [df1, df2, df3, df4]
|
| 243 |
-
for d in df:
|
| 244 |
-
|
| 245 |
-
|
| 246 |
|
| 247 |
# 创建标签页
|
| 248 |
-
tab1, tab2, tab3, tab4 = st.tabs(["DG", "TE", "SG", "NFQA"])
|
| 249 |
|
| 250 |
-
with tab1:
|
| 251 |
-
|
| 252 |
-
|
| 253 |
|
| 254 |
-
with tab2:
|
| 255 |
-
|
| 256 |
-
|
| 257 |
|
| 258 |
-
with tab3:
|
| 259 |
-
|
| 260 |
-
|
| 261 |
|
| 262 |
-
with tab4:
|
| 263 |
-
|
| 264 |
-
|
| 265 |
st.markdown("A baseline example can be found in the [baseline_example](https://huggingface.co/spaces/THUIR/AEOLLM/tree/main/baseline_example) folder.")
|
| 266 |
# 获取北京时间
|
| 267 |
time_placeholder = st.empty()
|
|
|
|
| 13 |
with st.sidebar:
|
| 14 |
page = option_menu(
|
| 15 |
"Navigation",
|
| 16 |
+
["LeaderBoard", "Introduction", "Methodology", "Datasets", "Important Dates",
|
| 17 |
+
"Evaluation Measures", "Data and File format", "Submit", "Organisers", "References"],
|
| 18 |
+
icons=['trophy', 'house', 'book', 'database', 'calendar', 'clipboard', 'file', 'upload', 'people', 'book'],
|
|
|
|
| 19 |
menu_icon="cast",
|
| 20 |
default_index=0,
|
| 21 |
styles={
|
|
|
|
| 198 |
<p>The Leaderboard will be updated daily around 24:00 Beijing Time.</p>
|
| 199 |
</p>
|
| 200 |
""", unsafe_allow_html=True)
|
| 201 |
+
# Leaderboard table: baseline results per task (accuracy, kendall's tau,
# spearman) plus an "overall" column averaging all 12 metric values.
# Column order for the rendered table (two-level header: task / metric).
index = pd.MultiIndex.from_tuples([
    ('', 'teamId'),
    ('', 'methods'),
    ('', 'overall'),
    ('Dialogue Generation', 'accuracy'),
    ('Dialogue Generation', "kendall's tau"),
    ('Dialogue Generation', 'spearman'),
    ('Text Expansion', "accuracy"),
    ('Text Expansion', "kendall's tau"),
    ('Text Expansion', 'spearman'),
    ('Summary Generation', 'accuracy'),
    ('Summary Generation', "kendall's tau"),
    ('Summary Generation', 'spearman'),
    ('Non-Factoid QA', "accuracy"),
    ('Non-Factoid QA', "kendall's tau"),
    ('Non-Factoid QA', 'spearman')
])

data = {
    ('', 'teamId'): ['baseline', 'baseline', 'baseline', 'baseline'],
    ('', 'methods'): ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
    ('', 'overall'): [],
    ('Dialogue Generation', 'accuracy'): [0.5806, 0.5483, 0.6001, 0.6472],
    ('Dialogue Generation', "kendall's tau"): [0.3243, 0.1739, 0.3042, 0.4167],
    ('Dialogue Generation', 'spearman'): [0.3505, 0.1857, 0.3264, 0.4512],
    ('Text Expansion', "accuracy"): [0.5107, 0.5050, 0.5461, 0.5581],
    ('Text Expansion', "kendall's tau"): [0.1281, 0.0635, 0.2716, 0.3864],
    ('Text Expansion', 'spearman'): [0.1352, 0.0667, 0.2867, 0.4157],
    ('Summary Generation', 'accuracy'): [0.6504, 0.6014, 0.7162, 0.7441],
    ('Summary Generation', "kendall's tau"): [0.3957, 0.2688, 0.5092, 0.5001],
    ('Summary Generation', 'spearman'): [0.4188, 0.2817, 0.5403, 0.5405],
    ('Non-Factoid QA', "accuracy"): [0.5935, 0.5817, 0.7000, 0.7203],
    ('Non-Factoid QA', "kendall's tau"): [0.2332, 0.2389, 0.4440, 0.4235],
    ('Non-Factoid QA', 'spearman'): [0.2443, 0.2492, 0.4630, 0.4511]
}

# Columns that carry identifiers rather than metric values.
_non_metric_keys = {('', 'teamId'), ('', 'methods'), ('', 'overall')}

# Overall score per method: mean over the 12 metric columns (4 tasks x 3 metrics).
overall = [0.0, 0.0, 0.0, 0.0]
for key, values in data.items():
    if key not in _non_metric_keys:
        for i in range(4):
            overall[i] += values[i]
overall = [total / (3 * 4) for total in overall]
data[('', 'overall')] = overall

# Build the DataFrame with the two-level column header, then render every
# numeric column as a fixed 4-decimal string for display.
# NOTE(fix): the original looped `for d in data:` and called
# `d.select_dtypes(...)` on the dict KEYS (tuples), which raises
# AttributeError; the formatting must run on the DataFrame's columns.
df = pd.DataFrame(data, columns=index)
for col in df.select_dtypes(include=['float64', 'int64']).columns:
    df[col] = df[col].apply(lambda x: f"{x:.4f}")
st.dataframe(df, use_container_width=True)
|
| 247 |
+
# # teamId 唯一标识码
|
| 248 |
+
# DG = {
|
| 249 |
+
# "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
|
| 250 |
+
# "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
|
| 251 |
+
# "accuracy": [0.5806, 0.5483, 0.6001, 0.6472],
|
| 252 |
+
# "kendall's tau": [0.3243, 0.1739, 0.3042, 0.4167],
|
| 253 |
+
# "spearman": [0.3505, 0.1857, 0.3264, 0.4512]
|
| 254 |
+
# }
|
| 255 |
|
| 256 |
+
# df1 = pd.DataFrame(DG)
|
| 257 |
|
| 258 |
+
# TE = {
|
| 259 |
+
# "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
|
| 260 |
+
# "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
|
| 261 |
+
# "accuracy": [0.5107, 0.5050, 0.5461, 0.5581],
|
| 262 |
+
# "kendall's tau": [0.1281, 0.0635, 0.2716, 0.3864],
|
| 263 |
+
# "spearman": [0.1352, 0.0667, 0.2867, 0.4157]
|
| 264 |
+
# }
|
| 265 |
+
# df2 = pd.DataFrame(TE)
|
| 266 |
|
| 267 |
+
# SG = {
|
| 268 |
+
# "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
|
| 269 |
+
# "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
|
| 270 |
+
# "accuracy": [0.6504, 0.6014, 0.7162, 0.7441],
|
| 271 |
+
# "kendall's tau": [0.3957, 0.2688, 0.5092, 0.5001],
|
| 272 |
+
# "spearman": [0.4188, 0.2817, 0.5403, 0.5405],
|
| 273 |
+
# }
|
| 274 |
+
# df3 = pd.DataFrame(SG)
|
| 275 |
|
| 276 |
+
# NFQA = {
|
| 277 |
+
# "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
|
| 278 |
+
# "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
|
| 279 |
+
# "accuracy": [0.5935, 0.5817, 0.7000, 0.7203],
|
| 280 |
+
# "kendall's tau": [0.2332, 0.2389, 0.4440, 0.4235],
|
| 281 |
+
# "spearman": [0.2443, 0.2492, 0.4630, 0.4511]
|
| 282 |
+
# }
|
| 283 |
+
# df4 = pd.DataFrame(NFQA)
|
| 284 |
|
| 285 |
+
# df = [df1, df2, df3, df4]
|
| 286 |
+
# for d in df:
|
| 287 |
+
# for col in d.select_dtypes(include=['float64', 'int64']).columns:
|
| 288 |
+
# d[col] = d[col].apply(lambda x: f"{x:.4f}")
|
| 289 |
|
| 290 |
# 创建标签页
|
| 291 |
+
# tab1, tab2, tab3, tab4 = st.tabs(["DG", "TE", "SG", "NFQA"])
|
| 292 |
|
| 293 |
+
# with tab1:
|
| 294 |
+
# st.markdown("""<p class='main-text'>Task: Dialogue Generation; Dataset: DialyDialog</p>""", unsafe_allow_html=True)
|
| 295 |
+
# st.dataframe(df1, use_container_width=True)
|
| 296 |
|
| 297 |
+
# with tab2:
|
| 298 |
+
# st.markdown("""<p class='main-text'>Task: Text Expansion; Dataset: WritingPrompts</p>""", unsafe_allow_html=True)
|
| 299 |
+
# st.dataframe(df2, use_container_width=True)
|
| 300 |
|
| 301 |
+
# with tab3:
|
| 302 |
+
# st.markdown("""<p class='main-text'>Task: Summary Generation; Dataset: Xsum</p>""", unsafe_allow_html=True)
|
| 303 |
+
# st.dataframe(df3, use_container_width=True)
|
| 304 |
|
| 305 |
+
# with tab4:
|
| 306 |
+
# st.markdown("""<p class='main-text'>Task: Non-Factoid QA; Dataset: NF_CATS</p>""", unsafe_allow_html=True)
|
| 307 |
+
# st.dataframe(df4, use_container_width=True)
|
| 308 |
st.markdown("A baseline example can be found in the [baseline_example](https://huggingface.co/spaces/THUIR/AEOLLM/tree/main/baseline_example) folder.")
|
| 309 |
# 获取北京时间
|
| 310 |
time_placeholder = st.empty()
|
test.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Baseline leaderboard scores, keyed by (task, metric). The empty-string task
# groups the identifier columns; ('', 'overall') is filled in below.
data = {
    ('', 'teamId'): ['baseline', 'baseline', 'baseline', 'baseline'],
    ('', 'methods'): ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
    ('', 'overall'): [],
    ('Dialogue Generation', 'accuracy'): [0.5806, 0.5483, 0.6001, 0.6472],
    ('Dialogue Generation', "kendall's tau"): [0.3243, 0.1739, 0.3042, 0.4167],
    ('Dialogue Generation', 'spearman'): [0.3505, 0.1857, 0.3264, 0.4512],
    ('Text Expansion', "accuracy"): [0.5107, 0.5050, 0.5461, 0.5581],
    ('Text Expansion', "kendall's tau"): [0.1281, 0.0635, 0.2716, 0.3864],
    ('Text Expansion', 'spearman'): [0.1352, 0.0667, 0.2867, 0.4157],
    ('Summary Generation', 'accuracy'): [0.6504, 0.6014, 0.7162, 0.7441],
    ('Summary Generation', "kendall's tau"): [0.3957, 0.2688, 0.5092, 0.5001],
    ('Summary Generation', 'spearman'): [0.4188, 0.2817, 0.5403, 0.5405],
    ('Non-Factoid QA', "accuracy"): [0.5935, 0.5817, 0.7000, 0.7203],
    ('Non-Factoid QA', "kendall's tau"): [0.2332, 0.2389, 0.4440, 0.4235],
    ('Non-Factoid QA', 'spearman'): [0.2443, 0.2492, 0.4630, 0.4511]
}

# Keys that hold identifiers, not metric scores — excluded from the average.
NON_METRIC_KEYS = (('', 'teamId'), ('', 'methods'), ('', 'overall'))

# Accumulate each method's metric values, then average over the
# 12 metric columns (4 tasks x 3 metrics per task).
overall = [0, 0, 0, 0]

for key, scores in data.items():
    if key in NON_METRIC_KEYS:
        continue
    for method_idx, score in enumerate(scores):
        overall[method_idx] += score

overall = [total / (3 * 4) for total in overall]
print(overall)
|