Adam Jirkovsky committed
Commit 2fc1b8f
1 Parent(s): e3e7110
Add graphical results comparison

Files changed:
- app.py: +29 -8
- src/display/about.py: +2 -1
- src/display/utils.py: +2 -0
- src/populate.py: +2 -9
app.py CHANGED

@@ -14,7 +14,7 @@ from src.display.about import (
     TABLE_DESC,
 )
 from src.display.css_html_js import custom_css
-from src.display.formatting import styled_error, styled_message, styled_warning
+from src.display.formatting import styled_error, styled_message, styled_warning, model_hyperlink
 from src.display.utils import (
     BENCHMARK_COLS,
     COLS,
@@ -35,6 +35,7 @@ from src.submission.submit import add_new_eval
 from captcha.image import ImageCaptcha
 from PIL import Image
 import random, string
+import matplotlib.pyplot as plt
 
 
 original_df = None
@@ -44,6 +45,12 @@ leaderboard_df = None
 def restart_space():
     API.restart_space(repo_id=REPO_ID, token=TOKEN)
 
+def add_model_hyperlinks(row):
+    if row["Model URL"] is None or row["Model URL"] == "":
+        return row["Model"]
+    else:
+        return model_hyperlink(row["Model URL"], row["Model"])
+
 def download_data():
     global original_df
     global leaderboard_df
@@ -65,7 +72,8 @@ def download_data():
 
     _, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
     leaderboard_df = original_df.copy()
-
+    leaderboard_df["Model"] = leaderboard_df.apply(add_model_hyperlinks, axis=1)
+    leaderboard_df.sort_values(by=["Aggregate Score"], ascending=False, inplace=True)
 
 download_data()
 
@@ -88,8 +96,6 @@ def update_table(
     #filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
     filtered_df = filter_queries(query, hidden_df)
     df = select_columns(filtered_df, columns)
-    print("TF")
-    print(df)
     return df
 
 
@@ -234,7 +240,6 @@ with demo:
             )
             """
             gr.Markdown(TABLE_DESC, elem_classes="markdown-text")
-            #print(shown_columns.value)
             leaderboard_table = gr.Dataframe(
                 value=leaderboard_df[
                     [c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value
@@ -244,10 +249,8 @@ with demo:
                 elem_id="leaderboard-table",
                 interactive=False,
                 visible=True,
-                wrap=False
+                wrap=False,
             )
-            print(leaderboard_table.value)
-            print(leaderboard_table.headers)
 
             # Dummy leaderboard for handling the case when the user uses backspace key
             hidden_leaderboard_table_for_search = gr.Dataframe(
@@ -278,6 +281,24 @@ with demo:
                 leaderboard_table,
                 queue=True,
             )
+
+            model_num = len(original_df)
+            graph_df = original_df.drop(columns=["_", "Precision", "Model URL"]).set_index("Model").T
+            graph_ax = graph_df.plot(
+                kind="barh",
+                title="Graphical performance comparison",
+                xlabel="Accuracy [%]",
+                ylabel="Model",
+                width=0.9,
+                figsize=(15, 7 + 2*model_num),
+            )
+            graph_ax.invert_yaxis()
+            for container in graph_ax.containers:
+                graph_ax.bar_label(container, fontsize=8, fmt="%.1f")
+            graph_ax.legend(loc='center left', bbox_to_anchor=(1.01, 0.95))
+            plt.tight_layout(rect=[0, 0, 0.95, 1])
+
+            plot = gr.Plot(graph_ax.get_figure(), label="Graphical performance comparison")
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
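Note on the new `add_model_hyperlinks` helper: it moves here from src/populate.py (diff below) so that `original_df` keeps plain model names for the new chart while only the displayed `leaderboard_df` gets markdown links. A minimal sketch of what it does, with a stand-in for the imported `model_hyperlink` (defined in src/display/formatting.py, which is not part of this diff) and made-up rows:

import pandas as pd

# Stand-in: the real model_hyperlink lives in src/display/formatting.py and
# likely adds styling; this only illustrates the shape of the output.
def model_hyperlink(link: str, model_name: str) -> str:
    return f'<a target="_blank" href="{link}">{model_name}</a>'

def add_model_hyperlinks(row):
    # Fall back to the plain name when no URL was submitted.
    if row["Model URL"] is None or row["Model URL"] == "":
        return row["Model"]
    else:
        return model_hyperlink(row["Model URL"], row["Model"])

df = pd.DataFrame({
    "Model": ["model-a", "model-b"],                          # hypothetical
    "Model URL": ["https://huggingface.co/org/model-a", ""],  # hypothetical
})
print(df.apply(add_model_hyperlinks, axis=1).tolist())
# ['<a target="_blank" href="https://huggingface.co/org/model-a">model-a</a>', 'model-b']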
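The new chart block can be previewed outside Gradio. Below is a self-contained sketch of the plotting logic added above, run on a hypothetical two-model frame standing in for `original_df` (the `drop(columns=...)` step is omitted because the dummy frame has no `_`, `Precision`, or `Model URL` columns, and `savefig` stands in for `gr.Plot`):

import matplotlib.pyplot as plt
import pandas as pd

original_df = pd.DataFrame({
    "Model": ["model-a", "model-b"],  # hypothetical models and scores
    "Grammar (Avg.)": [81.2, 74.5],
    "Math (Avg.)": [43.0, 38.6],
})

model_num = len(original_df)
# Transpose so each benchmark becomes one bar group, with one bar per model.
graph_df = original_df.set_index("Model").T
graph_ax = graph_df.plot(
    kind="barh",
    title="Graphical performance comparison",
    xlabel="Accuracy [%]",
    ylabel="Model",
    width=0.9,
    figsize=(15, 7 + 2 * model_num),  # grow the figure with the model count
)
graph_ax.invert_yaxis()  # keep the first benchmark at the top
for container in graph_ax.containers:
    graph_ax.bar_label(container, fontsize=8, fmt="%.1f")
graph_ax.legend(loc="center left", bbox_to_anchor=(1.01, 0.95))
plt.tight_layout(rect=[0, 0, 0.95, 1])  # leave room for the legend
graph_ax.get_figure().savefig("comparison.png")

After the transpose the y-axis lists benchmarks rather than models, with one labeled bar per model inside each group; the legend placed outside the axes carries the model names.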
src/display/about.py CHANGED

@@ -31,7 +31,7 @@ class Tasks(Enum):
 # Your leaderboard name
 TITLE = """<h1 align="center" id="space-title">🇨🇿 CzechBench Leaderboard</h1>"""
 
-TABLE_DESC = "The values shown in the table represent the accuracy metric in percentage."
+TABLE_DESC = "The values shown in the leaderboard table represent the accuracy metric in percentage."
 
 # What does your leaderboard evaluate?
 INTRODUCTION_OLD = """
@@ -94,6 +94,7 @@ The leaderboard table also displays aggregated scores across task categories, in
 - **Reasoning (Avg.):** ANLI, Belebele, CTKFacts, SQAD
 - **Math (Avg.):** GSM8K, Klokanek
 - **Classification (Avg.):** Czech News, Facebook Comments, Mall Reviews, Subjectivity
+- **Aggregate Score:** Average over above categories
 
 ## Evaluation Process
 
src/display/utils.py CHANGED

@@ -51,6 +51,7 @@ auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_
 auto_eval_column_dict.append(["eval_name", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", True)])
 auto_eval_column_dict.append(["hf_model_id", ColumnContent, ColumnContent("Model URL", "str", False)])
+auto_eval_column_dict.append(["aggregate_score", ColumnContent, ColumnContent("Aggregate Score", "number", True)])
 auto_eval_column_dict.append(["grammar_avg", ColumnContent, ColumnContent("Grammar (Avg.)", "number", True)])
 auto_eval_column_dict.append(["knowledge_avg", ColumnContent, ColumnContent("Knowledge (Avg.)", "number", True)])
 auto_eval_column_dict.append(["reasoning_avg", ColumnContent, ColumnContent("Reasoning (Avg.)", "number", True)])
@@ -100,6 +101,7 @@ HEADER_MAP = {
     "subjectivity_cs": "Subjectivity",
     "truthfulqa_cs": "TruthfulQA",
     "dummy": "_",
+    "aggregate_score": "Aggregate Score",
 }
 
 
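For context, a minimal sketch of how an `auto_eval_column_dict` entry turns into a displayed column. The `ColumnContent` dataclass, `fields()` helper, and `make_dataclass` call below follow the stock Hugging Face leaderboard template; the actual definitions in this repo's utils.py are outside the diff and may differ:

from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

def fields(raw_class):
    # Collect the ColumnContent defaults stored as class attributes.
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__"]

auto_eval_column_dict = []
auto_eval_column_dict.append(["eval_name", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
auto_eval_column_dict.append(["aggregate_score", ColumnContent, ColumnContent("Aggregate Score", "number", True)])

AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
print([c.name for c in fields(AutoEvalColumn)])  # ['Model', 'Aggregate Score']

With the entry registered, the new `HEADER_MAP["aggregate_score"]` line presumably renames the raw result key to the same display header the column declares.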
src/populate.py CHANGED

@@ -9,13 +9,6 @@ from src.display.utils import AutoEvalColumn, EvalQueueColumn, HEADER_MAP
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
-def add_model_hyperlinks(row):
-    if row["Model URL"] is None or row["Model URL"] == "":
-        return row["Model"]
-    else:
-        return model_hyperlink(row["Model URL"], row["Model"])
-
-
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     raw_data = get_raw_eval_results(results_path, requests_path)
     #all_data_json = [v.to_dict() for v in raw_data]
@@ -28,14 +21,14 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     df["Reasoning (Avg.)"] = df[["ANLI", "Belebele", "CTKFacts", "SQAD"]].mean(axis=1)
     df["Math (Avg.)"] = df[["GSM8K", "Klokanek"]].mean(axis=1)
     df["Classification (Avg.)"] = df[["Czech News", "Facebook Comments", "Mall Reviews", "Subjectivity"]].mean(axis=1)
+    df["Aggregate Score"] = df[["Grammar (Avg.)", "Knowledge (Avg.)", "Reasoning (Avg.)", "Math (Avg.)", "Classification (Avg.)"]].mean(axis=1)
     df["_"] = "" # The dataframe does not display the last column - BUG in gradio?
     df = df[cols].round(decimals=2)
     df.replace(r'\s+', np.nan, regex=True)
     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
-    df['Model'] = df.apply(add_model_hyperlinks, axis=1)
 
-    return raw_data, df
+    return raw_data, df,
 
 
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
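A worked example of the new `Aggregate Score` computation, using hypothetical category scores:

import pandas as pd

category_cols = ["Grammar (Avg.)", "Knowledge (Avg.)", "Reasoning (Avg.)",
                 "Math (Avg.)", "Classification (Avg.)"]
# One hypothetical model's category averages:
df = pd.DataFrame([[80.0, 60.0, 70.0, 40.0, 75.0]], columns=category_cols)

# Unweighted mean of the five category averages, exactly as in the diff:
df["Aggregate Score"] = df[category_cols].mean(axis=1)
print(df["Aggregate Score"].iloc[0])  # (80 + 60 + 70 + 40 + 75) / 5 = 65.0

Because each category average is itself a mean over its tasks, the aggregate weights the five categories equally rather than the individual tasks.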