Spaces:
Running
Running
update the results format
Browse files- app.py +3 -48
- src/leaderboard/load_results.py +38 -0
app.py
CHANGED
|
@@ -14,6 +14,7 @@ from src.display.about import (
|
|
| 14 |
)
|
| 15 |
from src.display.css_html_js import custom_css
|
| 16 |
from src.envs import API
|
|
|
|
| 17 |
|
| 18 |
# clone / pull the lmeh eval data
|
| 19 |
TOKEN = os.environ.get("TOKEN", None)
|
|
@@ -29,62 +30,16 @@ snapshot_download(
|
|
| 29 |
def restart_space():
|
| 30 |
API.restart_space(repo_id="SeaLLMs/SeaExam_leaderboard", token=TOKEN)
|
| 31 |
|
| 32 |
-
# Load the
|
| 33 |
-
# def load_csv(file_path):
|
| 34 |
-
# data = pd.read_csv(file_path)
|
| 35 |
-
# return data
|
| 36 |
-
|
| 37 |
-
def load_data(data_path):
    """Read the SeaExam results CSV and return (m3exam, mmlu, average) DataFrames.

    The file's first row is a title row (skipped); the header row follows.
    Columns 0-3 are model metadata (Model, type, open?, shot); columns 4-10,
    11-17 and 18-24 hold the per-language scores (en, zh, id, th, vi, avg,
    avg_sea) for M3Exam, MMLU and their average respectively.
    """
    raw = pd.read_csv(data_path, skiprows=1, header=0).dropna()

    header = ['Model', 'type', 'open?', 'shot', 'en', 'zh', 'id', 'th', 'vi', 'avg', 'avg_sea']
    score_cols = ['en', 'zh', 'id', 'th', 'vi', 'avg', 'avg_sea']

    # Split by column position into the three score groups.
    m3exam = raw.iloc[:, :11]                                       # M3Exam columns
    mmlu = raw.iloc[:, [0, 1, 2, 3, 11, 12, 13, 14, 15, 16, 17]]    # MMLU columns
    average = raw.iloc[:, [0, 1, 2, 3, 18, 19, 20, 21, 22, 23, 24]]  # Average columns
    mmlu.columns = header
    average.columns = header

    for table in (m3exam, mmlu, average):
        # Scores are stored as fractions; show them as percentages, 2 decimals.
        table[score_cols] *= 100
        table[score_cols] = table[score_cols].round(2)
        table['rank'] = table['avg'].rank(ascending=False).astype(int)

    # Reorder for display and sort each table by its own average, descending.
    display_order = ['rank', 'Model', 'type', 'open?', 'shot', 'avg', 'avg_sea', 'en', 'zh', 'id', 'th', 'vi']
    m3exam = m3exam[display_order].sort_values(by='avg', ascending=False)
    mmlu = mmlu[display_order].sort_values(by='avg', ascending=False)
    average = average[display_order].sort_values(by='avg', ascending=False)

    # '⬆️' marks the leaderboard's default sort column.
    m3exam = m3exam.rename(columns={'avg': 'avg⬆️'})
    mmlu = mmlu.rename(columns={'avg': 'avg⬆️'})
    average = average.rename(columns={'avg': 'avg⬆️'})

    return m3exam, mmlu, average
|
| 68 |
-
|
| 69 |
-
# Example path to your CSV file
|
| 70 |
csv_path = f'{EVAL_RESULTS_PATH}/SeaExam_results.csv'
|
| 71 |
-
# data = load_csv(csv_path)
|
| 72 |
-
|
| 73 |
df_m3exam, df_mmlu, df_avg = load_data(csv_path)
|
| 74 |
|
| 75 |
-
# def show_data():
|
| 76 |
-
# return data
|
| 77 |
-
|
| 78 |
-
# iface = gr.Interface(fn=show_data, inputs = None, outputs="dataframe", title="SeaExam Leaderboard",
|
| 79 |
-
# description="Leaderboard for the SeaExam competition.")
|
| 80 |
-
# iface.launch()
|
| 81 |
-
|
| 82 |
demo = gr.Blocks(css=custom_css)
|
| 83 |
with demo:
|
| 84 |
gr.HTML(TITLE)
|
| 85 |
# gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 86 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 87 |
-
with gr.TabItem("🏅
|
| 88 |
leaderboard_table = gr.components.Dataframe(
|
| 89 |
value=df_avg,
|
| 90 |
# value=leaderboard_df[
|
|
|
|
| 14 |
)
|
| 15 |
from src.display.css_html_js import custom_css
|
| 16 |
from src.envs import API
|
| 17 |
+
from src.leaderboard.load_results import load_data
|
| 18 |
|
| 19 |
# clone / pull the lmeh eval data
|
| 20 |
TOKEN = os.environ.get("TOKEN", None)
|
|
|
|
| 30 |
def restart_space():
|
| 31 |
API.restart_space(repo_id="SeaLLMs/SeaExam_leaderboard", token=TOKEN)
|
| 32 |
|
| 33 |
+
# Load the data from the csv file
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
csv_path = f'{EVAL_RESULTS_PATH}/SeaExam_results.csv'
|
|
|
|
|
|
|
| 35 |
df_m3exam, df_mmlu, df_avg = load_data(csv_path)
|
| 36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
demo = gr.Blocks(css=custom_css)
|
| 38 |
with demo:
|
| 39 |
gr.HTML(TITLE)
|
| 40 |
# gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 41 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 42 |
+
with gr.TabItem("🏅 Overall", elem_id="llm-benchmark-Sum", id=0):
|
| 43 |
leaderboard_table = gr.components.Dataframe(
|
| 44 |
value=df_avg,
|
| 45 |
# value=leaderboard_df[
|
src/leaderboard/load_results.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import pandas as pd


def load_data(data_path):
    """Load the SeaExam results CSV and split it into M3Exam, MMLU and average tables.

    Parameters
    ----------
    data_path : str
        Path to the results CSV. The first row is a title row and is skipped;
        the second row is the header. Columns 0-3 are model metadata
        (Model, type, open?, shot); columns 4-10, 11-17 and 18-24 hold the
        per-language scores (en, zh, id, th, vi, avg, avg_sea) for M3Exam,
        MMLU and their average, respectively. Scores are fractions in [0, 1].

    Returns
    -------
    tuple of pd.DataFrame
        (df_m3exam, df_mmlu, df_avg), each with columns
        ['rank', 'type', 'Model', 'open?', 'shot', 'avg⬆️', 'avg_sea',
        'en', 'zh', 'id', 'th', 'vi'], scores as percentages rounded to two
        decimals, 'type' rendered as 🟢 (base) / 🔶 (chat), sorted by the
        average score descending.
    """
    df = pd.read_csv(data_path, skiprows=1, header=0).dropna()

    columns = ['Model', 'type', 'open?', 'shot', 'en', 'zh', 'id', 'th', 'vi', 'avg', 'avg_sea']
    score_cols = ['en', 'zh', 'id', 'th', 'vi', 'avg', 'avg_sea']

    # Splitting into three separate DataFrames based on the groups M3Exam,
    # MMLU and average. .copy() avoids pandas SettingWithCopyWarning (and
    # potentially silent no-op writes) when the slices are mutated below.
    df_m3exam = df.iloc[:, :11].copy()                                      # M3Exam columns
    df_mmlu = df.iloc[:, [0, 1, 2, 3, 11, 12, 13, 14, 15, 16, 17]].copy()   # MMLU columns
    df_avg = df.iloc[:, [0, 1, 2, 3, 18, 19, 20, 21, 22, 23, 24]].copy()    # Average columns
    # Normalize headers on every split — previously the M3Exam slice silently
    # depended on the CSV header spelling its first 11 columns exactly.
    df_m3exam.columns = columns
    df_mmlu.columns = columns
    df_avg.columns = columns

    columns_sorted = ['rank', 'type', 'Model', 'open?', 'shot', 'avg', 'avg_sea', 'en', 'zh', 'id', 'th', 'vi']
    tables = []
    for df_tmp in (df_m3exam, df_mmlu, df_avg):
        # Scores are fractions; display them as percentages with 2 decimals.
        df_tmp[score_cols] = (df_tmp[score_cols] * 100).round(2)
        # method='min' keeps ranks integral on ties; the default 'average'
        # method yields e.g. 1.5, which astype(int) truncates incorrectly.
        df_tmp['rank'] = df_tmp['avg'].rank(ascending=False, method='min').astype(int)
        # Render model type as a colored marker: base -> 🟢, chat -> 🔶.
        df_tmp['type'] = df_tmp['type'].map({'base': '🟢', 'chat': '🔶'})
        df_tmp = df_tmp[columns_sorted].sort_values(by='avg', ascending=False)
        # '⬆️' marks the leaderboard's default sort column.
        tables.append(df_tmp.rename(columns={'avg': 'avg⬆️'}))

    df_m3exam, df_mmlu, df_avg = tables
    return df_m3exam, df_mmlu, df_avg
|