陈俊杰
committed on
Commit
·
496eb7b
1
Parent(s):
a2a84e8
cjj-leaderboard
Browse files
app.py
CHANGED
|
@@ -13,10 +13,9 @@ st.title("NTCIR-18 Automatic Evaluation of LLMs (AEOLLM) Task")
|
|
| 13 |
with st.sidebar:
|
| 14 |
page = option_menu(
|
| 15 |
"Navigation",
|
| 16 |
-
["Introduction", "Methodology", "Datasets", "Important Dates",
|
| 17 |
-
"Evaluation Measures", "Data and File format", "Submit",
|
| 18 |
-
|
| 19 |
-
icons=['house', 'book', 'database', 'calendar', 'clipboard', 'file', 'upload', 'trophy', 'people', 'book'],
|
| 20 |
menu_icon="cast",
|
| 21 |
default_index=0,
|
| 22 |
styles={
|
|
@@ -199,69 +198,113 @@ This leaderboard is used to show the performance of the <strong>automatic evalua
|
|
| 199 |
<p>The Leaderboard will be updated daily around 24:00 Beijing Time.</p>
|
| 200 |
</p>
|
| 201 |
""", unsafe_allow_html=True)
|
| 202 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
"kendall's tau": [0.3243, 0.1739, 0.3042, 0.4167],
|
| 210 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
|
| 213 |
-
df1 = pd.DataFrame(DG)
|
| 214 |
|
| 215 |
-
TE = {
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
}
|
| 222 |
-
df2 = pd.DataFrame(TE)
|
| 223 |
|
| 224 |
-
SG = {
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
}
|
| 231 |
-
df3 = pd.DataFrame(SG)
|
| 232 |
|
| 233 |
-
NFQA = {
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
}
|
| 240 |
-
df4 = pd.DataFrame(NFQA)
|
| 241 |
|
| 242 |
-
df = [df1, df2, df3, df4]
|
| 243 |
-
for d in df:
|
| 244 |
-
|
| 245 |
-
|
| 246 |
|
| 247 |
# 创建标签页
|
| 248 |
-
tab1, tab2, tab3, tab4 = st.tabs(["DG", "TE", "SG", "NFQA"])
|
| 249 |
|
| 250 |
-
with tab1:
|
| 251 |
-
|
| 252 |
-
|
| 253 |
|
| 254 |
-
with tab2:
|
| 255 |
-
|
| 256 |
-
|
| 257 |
|
| 258 |
-
with tab3:
|
| 259 |
-
|
| 260 |
-
|
| 261 |
|
| 262 |
-
with tab4:
|
| 263 |
-
|
| 264 |
-
|
| 265 |
st.markdown("A baseline example can be found in the [baseline_example](https://huggingface.co/spaces/THUIR/AEOLLM/tree/main/baseline_example) folder.")
|
| 266 |
# 获取北京时间
|
| 267 |
time_placeholder = st.empty()
|
|
|
|
| 13 |
with st.sidebar:
|
| 14 |
page = option_menu(
|
| 15 |
"Navigation",
|
| 16 |
+
["LeaderBoard", "Introduction", "Methodology", "Datasets", "Important Dates",
|
| 17 |
+
"Evaluation Measures", "Data and File format", "Submit", "Organisers", "References"],
|
| 18 |
+
icons=['trophy', 'house', 'book', 'database', 'calendar', 'clipboard', 'file', 'upload', 'people', 'book'],
|
|
|
|
| 19 |
menu_icon="cast",
|
| 20 |
default_index=0,
|
| 21 |
styles={
|
|
|
|
| 198 |
<p>The Leaderboard will be updated daily around 24:00 Beijing Time.</p>
|
| 199 |
</p>
|
| 200 |
""", unsafe_allow_html=True)
|
| 201 |
+
# Leaderboard table: baseline results per task (accuracy, kendall's tau,
# spearman) plus an "overall" column averaging all 12 metric values.
# Column order for the rendered table (two-level header: task / metric).
index = pd.MultiIndex.from_tuples([
    ('', 'teamId'),
    ('', 'methods'),
    ('', 'overall'),
    ('Dialogue Generation', 'accuracy'),
    ('Dialogue Generation', "kendall's tau"),
    ('Dialogue Generation', 'spearman'),
    ('Text Expansion', "accuracy"),
    ('Text Expansion', "kendall's tau"),
    ('Text Expansion', 'spearman'),
    ('Summary Generation', 'accuracy'),
    ('Summary Generation', "kendall's tau"),
    ('Summary Generation', 'spearman'),
    ('Non-Factoid QA', "accuracy"),
    ('Non-Factoid QA', "kendall's tau"),
    ('Non-Factoid QA', 'spearman')
])

data = {
    ('', 'teamId'): ['baseline', 'baseline', 'baseline', 'baseline'],
    ('', 'methods'): ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
    ('', 'overall'): [],
    ('Dialogue Generation', 'accuracy'): [0.5806, 0.5483, 0.6001, 0.6472],
    ('Dialogue Generation', "kendall's tau"): [0.3243, 0.1739, 0.3042, 0.4167],
    ('Dialogue Generation', 'spearman'): [0.3505, 0.1857, 0.3264, 0.4512],
    ('Text Expansion', "accuracy"): [0.5107, 0.5050, 0.5461, 0.5581],
    ('Text Expansion', "kendall's tau"): [0.1281, 0.0635, 0.2716, 0.3864],
    ('Text Expansion', 'spearman'): [0.1352, 0.0667, 0.2867, 0.4157],
    ('Summary Generation', 'accuracy'): [0.6504, 0.6014, 0.7162, 0.7441],
    ('Summary Generation', "kendall's tau"): [0.3957, 0.2688, 0.5092, 0.5001],
    ('Summary Generation', 'spearman'): [0.4188, 0.2817, 0.5403, 0.5405],
    ('Non-Factoid QA', "accuracy"): [0.5935, 0.5817, 0.7000, 0.7203],
    ('Non-Factoid QA', "kendall's tau"): [0.2332, 0.2389, 0.4440, 0.4235],
    ('Non-Factoid QA', 'spearman'): [0.2443, 0.2492, 0.4630, 0.4511]
}

# Columns that carry identifiers rather than metric values.
_non_metric_keys = {('', 'teamId'), ('', 'methods'), ('', 'overall')}

# Overall score per method: mean over the 12 metric columns (4 tasks x 3 metrics).
overall = [0.0, 0.0, 0.0, 0.0]
for key, values in data.items():
    if key not in _non_metric_keys:
        for i in range(4):
            overall[i] += values[i]
overall = [total / (3 * 4) for total in overall]
data[('', 'overall')] = overall

# Build the DataFrame with the two-level column header, then render every
# numeric column as a fixed 4-decimal string for display.
# NOTE(fix): the original looped `for d in data:` and called
# `d.select_dtypes(...)` on the dict KEYS (tuples), which raises
# AttributeError; the formatting must run on the DataFrame's columns.
df = pd.DataFrame(data, columns=index)
for col in df.select_dtypes(include=['float64', 'int64']).columns:
    df[col] = df[col].apply(lambda x: f"{x:.4f}")
st.dataframe(df, use_container_width=True)
|
| 247 |
+
# # teamId 唯一标识码
|
| 248 |
+
# DG = {
|
| 249 |
+
# "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
|
| 250 |
+
# "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
|
| 251 |
+
# "accuracy": [0.5806, 0.5483, 0.6001, 0.6472],
|
| 252 |
+
# "kendall's tau": [0.3243, 0.1739, 0.3042, 0.4167],
|
| 253 |
+
# "spearman": [0.3505, 0.1857, 0.3264, 0.4512]
|
| 254 |
+
# }
|
| 255 |
|
| 256 |
+
# df1 = pd.DataFrame(DG)
|
| 257 |
|
| 258 |
+
# TE = {
|
| 259 |
+
# "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
|
| 260 |
+
# "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
|
| 261 |
+
# "accuracy": [0.5107, 0.5050, 0.5461, 0.5581],
|
| 262 |
+
# "kendall's tau": [0.1281, 0.0635, 0.2716, 0.3864],
|
| 263 |
+
# "spearman": [0.1352, 0.0667, 0.2867, 0.4157]
|
| 264 |
+
# }
|
| 265 |
+
# df2 = pd.DataFrame(TE)
|
| 266 |
|
| 267 |
+
# SG = {
|
| 268 |
+
# "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
|
| 269 |
+
# "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
|
| 270 |
+
# "accuracy": [0.6504, 0.6014, 0.7162, 0.7441],
|
| 271 |
+
# "kendall's tau": [0.3957, 0.2688, 0.5092, 0.5001],
|
| 272 |
+
# "spearman": [0.4188, 0.2817, 0.5403, 0.5405],
|
| 273 |
+
# }
|
| 274 |
+
# df3 = pd.DataFrame(SG)
|
| 275 |
|
| 276 |
+
# NFQA = {
|
| 277 |
+
# "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
|
| 278 |
+
# "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
|
| 279 |
+
# "accuracy": [0.5935, 0.5817, 0.7000, 0.7203],
|
| 280 |
+
# "kendall's tau": [0.2332, 0.2389, 0.4440, 0.4235],
|
| 281 |
+
# "spearman": [0.2443, 0.2492, 0.4630, 0.4511]
|
| 282 |
+
# }
|
| 283 |
+
# df4 = pd.DataFrame(NFQA)
|
| 284 |
|
| 285 |
+
# df = [df1, df2, df3, df4]
|
| 286 |
+
# for d in df:
|
| 287 |
+
# for col in d.select_dtypes(include=['float64', 'int64']).columns:
|
| 288 |
+
# d[col] = d[col].apply(lambda x: f"{x:.4f}")
|
| 289 |
|
| 290 |
# 创建标签页
|
| 291 |
+
# tab1, tab2, tab3, tab4 = st.tabs(["DG", "TE", "SG", "NFQA"])
|
| 292 |
|
| 293 |
+
# with tab1:
|
| 294 |
+
# st.markdown("""<p class='main-text'>Task: Dialogue Generation; Dataset: DialyDialog</p>""", unsafe_allow_html=True)
|
| 295 |
+
# st.dataframe(df1, use_container_width=True)
|
| 296 |
|
| 297 |
+
# with tab2:
|
| 298 |
+
# st.markdown("""<p class='main-text'>Task: Text Expansion; Dataset: WritingPrompts</p>""", unsafe_allow_html=True)
|
| 299 |
+
# st.dataframe(df2, use_container_width=True)
|
| 300 |
|
| 301 |
+
# with tab3:
|
| 302 |
+
# st.markdown("""<p class='main-text'>Task: Summary Generation; Dataset: Xsum</p>""", unsafe_allow_html=True)
|
| 303 |
+
# st.dataframe(df3, use_container_width=True)
|
| 304 |
|
| 305 |
+
# with tab4:
|
| 306 |
+
# st.markdown("""<p class='main-text'>Task: Non-Factoid QA; Dataset: NF_CATS</p>""", unsafe_allow_html=True)
|
| 307 |
+
# st.dataframe(df4, use_container_width=True)
|
| 308 |
st.markdown("A baseline example can be found in the [baseline_example](https://huggingface.co/spaces/THUIR/AEOLLM/tree/main/baseline_example) folder.")
|
| 309 |
# 获取北京时间
|
| 310 |
time_placeholder = st.empty()
|
test.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Baseline leaderboard scores, keyed by (task, metric). The empty-string task
# groups the identifier columns; ('', 'overall') is filled in below.
data = {
    ('', 'teamId'): ['baseline', 'baseline', 'baseline', 'baseline'],
    ('', 'methods'): ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
    ('', 'overall'): [],
    ('Dialogue Generation', 'accuracy'): [0.5806, 0.5483, 0.6001, 0.6472],
    ('Dialogue Generation', "kendall's tau"): [0.3243, 0.1739, 0.3042, 0.4167],
    ('Dialogue Generation', 'spearman'): [0.3505, 0.1857, 0.3264, 0.4512],
    ('Text Expansion', "accuracy"): [0.5107, 0.5050, 0.5461, 0.5581],
    ('Text Expansion', "kendall's tau"): [0.1281, 0.0635, 0.2716, 0.3864],
    ('Text Expansion', 'spearman'): [0.1352, 0.0667, 0.2867, 0.4157],
    ('Summary Generation', 'accuracy'): [0.6504, 0.6014, 0.7162, 0.7441],
    ('Summary Generation', "kendall's tau"): [0.3957, 0.2688, 0.5092, 0.5001],
    ('Summary Generation', 'spearman'): [0.4188, 0.2817, 0.5403, 0.5405],
    ('Non-Factoid QA', "accuracy"): [0.5935, 0.5817, 0.7000, 0.7203],
    ('Non-Factoid QA', "kendall's tau"): [0.2332, 0.2389, 0.4440, 0.4235],
    ('Non-Factoid QA', 'spearman'): [0.2443, 0.2492, 0.4630, 0.4511]
}

# Keys that hold identifiers, not metric scores — excluded from the average.
NON_METRIC_KEYS = (('', 'teamId'), ('', 'methods'), ('', 'overall'))

# Accumulate each method's metric values, then average over the
# 12 metric columns (4 tasks x 3 metrics per task).
overall = [0, 0, 0, 0]

for key, scores in data.items():
    if key in NON_METRIC_KEYS:
        continue
    for method_idx, score in enumerate(scores):
        overall[method_idx] += score

overall = [total / (3 * 4) for total in overall]
print(overall)
|