feat: adapt UI in app.py
Files changed:
- app.py (+79 −93)
- src/benchmarks.py (+4 −1)
- src/envs.py (+4 −4)
- src/populate.py (+5 −3)
- tests/src/test_populate.py (+2 −2)
app.py (CHANGED)

@@ -18,28 +18,28 @@ from src.display.utils import (
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_leaderboard_df
 from utils import update_table
+from src.benchmarks import DOMAIN_COLS_QA, LANG_COLS_QA, metric_list


 def restart_space():
     API.restart_space(repo_id=REPO_ID)

-[old lines 26-41 removed; not legible in the rendered diff]
-restart_space()
+# try:
+#     print(EVAL_REQUESTS_PATH)
+#     snapshot_download(
+#         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
+#         token=TOKEN
+#     )
+# except Exception:
+#     restart_space()
+# try:
+#     print(EVAL_RESULTS_PATH)
+#     snapshot_download(
+#         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
+#         token=TOKEN
+#     )
+# except Exception:
+#     restart_space()

 raw_data_qa, original_df_qa = get_leaderboard_df(
     EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, QA_BENCHMARK_COLS, task='qa', metric='ndcg_at_1')

@@ -58,7 +58,7 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("  [rest of the old line truncated in the rendered diff]
+        with gr.TabItem("QA", elem_id="llm-benchmark-tab-table", id=0):
             with gr.Row():
                 with gr.Column():
                     with gr.Row():

@@ -67,56 +67,49 @@ with demo:
                             show_label=False,
                             elem_id="search-bar",
                         )
+                    # select domain
                     with gr.Row():
-[old line 71 removed; not legible in the rendered diff]
-                            choices=  [rest of the old line truncated in the rendered diff]
-[old lines 73-75 removed; not legible in the rendered diff]
-                            ],
-                            value=[
-                                c.name
-                                for c in fields(AutoEvalColumnQA)
-                                if c.displayed_by_default and not c.hidden and not c.never_hidden
-                            ],
-                            label="Select columns to show",
-                            elem_id="column-select",
+                        selected_domains = gr.CheckboxGroup(
+                            choices=DOMAIN_COLS_QA,
+                            value=DOMAIN_COLS_QA,
+                            label="Select the domains",
+                            elem_id="domain-column-select",
                             interactive=True,
                         )
+                    # select language
+                    with gr.Row():
+                        selected_langs = gr.CheckboxGroup(
+                            choices=LANG_COLS_QA,
+                            value=LANG_COLS_QA,
+                            label="Select the languages",
+                            elem_id="language-column-select",
+                            interactive=True
+                        )
+                    # select reranking models
+                    reranking_models = list(frozenset([eval_result.retrieval_model for eval_result in raw_data_qa]))
                     with gr.Row():
-[old lines 87-88 removed; not legible in the rendered diff]
+                        selected_rerankings = gr.CheckboxGroup(
+                            choices=reranking_models,
+                            value=reranking_models,
+                            label="Select the reranking models",
+                            elem_id="reranking-select",
+                            interactive=True
                         )
                 with gr.Column(min_width=320):
-[old lines 91-94 removed; not legible in the rendered diff]
-                        value=[t.to_str() for t in ModelType],
-                        interactive=True,
-                        elem_id="filter-columns-type",
-                    )
-                    filter_columns_precision = gr.CheckboxGroup(
-                        label="Precision",
-                        choices=[i.value.name for i in Precision],
-                        value=[i.value.name for i in Precision],
+                    selected_metric = gr.Dropdown(
+                        choices=metric_list,
+                        value=metric_list,
+                        label="Select the metric",
                         interactive=True,
-                        elem_id="  [rest of the old line truncated in the rendered diff]
-                    )
-                    filter_columns_size = gr.CheckboxGroup(
-                        label="Model sizes (in billions of parameters)",
-                        choices=list(NUMERIC_INTERVALS.keys()),
-                        value=list(NUMERIC_INTERVALS.keys()),
-                        interactive=True,
-                        elem_id="filter-columns-size",
+                        elem_id="metric-select",
                     )
+                # update shown_columns when selected_langs and selected_domains are changed
+                shown_columns = leaderboard_df.columns

+            # reload the leaderboard_df and raw_data when selected_metric is changed
             leaderboard_table = gr.components.Dataframe(
-                value=leaderboard_df  [rest of the old line truncated in the rendered diff]
-[old line 116 removed; not legible in the rendered diff]
-                        + shown_columns.value
-                ],
-                headers=[c.name for c in fields(AutoEvalColumnQA) if c.never_hidden] + shown_columns.value,
+                value=leaderboard_df,
+                # headers=shown_columns,
                 datatype=TYPES,
                 elem_id="leaderboard-table",
                 interactive=False,

@@ -124,41 +117,34 @@ with demo:
             )

             # Dummy leaderboard for handling the case when the user uses backspace key
-            hidden_leaderboard_table_for_search = gr.components.Dataframe(
-[old lines 128-131 removed; not legible in the rendered diff]
-            )
-            search_bar.submit(
-[old lines 134-154 removed; not legible in the rendered diff]
-                    filter_columns_size,
-                    deleted_models_visibility,
-                    search_bar,
-                ],
-                leaderboard_table,
-                queue=True,
-            )
+            # hidden_leaderboard_table_for_search = gr.components.Dataframe(
+            #     value=original_df_qa[COLS],
+            #     headers=COLS,
+            #     datatype=TYPES,
+            #     visible=False,
+            # )
+            # search_bar.submit(
+            #     update_table,
+            #     [
+            #         hidden_leaderboard_table_for_search,
+            #         shown_columns,
+            #         selected_rerankings,
+            #         search_bar,
+            #     ],
+            #     leaderboard_table,
+            # )
+            # for selector in [shown_columns, selected_rerankings, search_bar]:
+            #     selector.change(
+            #         update_table,
+            #         [
+            #             hidden_leaderboard_table_for_search,
+            #             shown_columns,
+            #             selected_rerankings,
+            #             search_bar,
+            #         ],
+            #         leaderboard_table,
+            #         queue=True,
+            #     )

         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
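The new domain/language/reranking selectors are created above, but the table-refresh wiring (`update_table` via `search_bar.submit` and `selector.change`) is still commented out. A minimal sketch of what that filter callback could look like, assuming benchmark columns follow the `<domain>_<lang>` naming seen in the toy test data (`wiki_en`, `wiki_zh`); `update_qa_table` and its exact signature are hypothetical, not part of this commit:

import pandas as pd

def update_qa_table(df: pd.DataFrame, domains: list, langs: list, rerankers: list) -> pd.DataFrame:
    # Hypothetical helper: keep the fixed identity columns plus the benchmark
    # columns whose "<domain>_<lang>" name matches the ticked checkboxes.
    fixed_cols = [c for c in ("Retrieval Model", "Reranking Model", "Average ⬆️") if c in df.columns]
    benchmark_cols = [c for c in df.columns if any(c == f"{d}_{l}" for d in domains for l in langs)]
    # Keep only rows whose reranking model is still selected.
    if "Reranking Model" in df.columns:
        df = df[df["Reranking Model"].isin(rerankers)]
    return df[fixed_cols + benchmark_cols]

# Toy usage with made-up scores:
toy = pd.DataFrame([
    {"Retrieval Model": "bge-m3", "Reranking Model": "no-reranker", "wiki_en": 0.8, "wiki_zh": 0.7, "news_en": 0.6},
])
print(update_qa_table(toy, domains=["wiki"], langs=["en"], rerankers=["no-reranker"]))

Wired up, each selector would re-render the Dataframe much like the commented-out `selector.change(update_table, [...], leaderboard_table)` calls above.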
src/benchmarks.py (CHANGED)

@@ -135,4 +135,7 @@ for task, domain_dict in dataset_dict.items():
 BenchmarksQA = Enum('BenchmarksQA', qa_benchmark_dict)
 BenchmarksLongDoc = Enum('BenchmarksLongDoc', long_doc_benchmark_dict)

-BENCHMARK_COLS_QA = [c.col_name for c in qa_benchmark_dict.values()]
+BENCHMARK_COLS_QA = [c.col_name for c in qa_benchmark_dict.values()]
+
+DOMAIN_COLS_QA = list(frozenset([c.domain for c in qa_benchmark_dict.values()]))
+LANG_COLS_QA = list(frozenset([c.lang for c in qa_benchmark_dict.values()]))
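For reference, a self-contained sketch of what the new constants compute; the `Benchmark` dataclass and the entries below are toy stand-ins for whatever actually populates `qa_benchmark_dict`:

from dataclasses import dataclass

@dataclass
class Benchmark:  # toy stand-in with the fields the list comprehensions rely on
    domain: str
    lang: str
    col_name: str

qa_benchmark_dict = {
    "wiki_en": Benchmark("wiki", "en", "wiki_en"),
    "wiki_zh": Benchmark("wiki", "zh", "wiki_zh"),
    "news_en": Benchmark("news", "en", "news_en"),
}

BENCHMARK_COLS_QA = [c.col_name for c in qa_benchmark_dict.values()]
# frozenset deduplicates but does not preserve insertion order, so the checkbox
# ordering in the UI can differ between runs; sorted(...) would make it stable.
DOMAIN_COLS_QA = list(frozenset([c.domain for c in qa_benchmark_dict.values()]))
LANG_COLS_QA = list(frozenset([c.lang for c in qa_benchmark_dict.values()]))
print(DOMAIN_COLS_QA, LANG_COLS_QA)  # e.g. ['wiki', 'news'] ['en', 'zh'] in some order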
src/envs.py (CHANGED)

@@ -17,9 +17,9 @@ RESULTS_REPO = f"{OWNER}/results"
 CACHE_PATH = os.getenv("HF_HOME", ".")

 # Local caches
-EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
-EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
-EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
-EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
+EVAL_REQUESTS_PATH = "/Users/nanwang/Codes/huggingface/nan/leaderboard/tests/toydata/test_requests"  # os.path.join(CACHE_PATH, "eval-queue")
+EVAL_RESULTS_PATH = "/Users/nanwang/Codes/huggingface/nan/leaderboard/tests/toydata/test_results"  # os.path.join(CACHE_PATH, "eval-results")
+# EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
+# EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")

 API = HfApi(token=TOKEN)
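The paths above are hard-coded to a local checkout, presumably to run against the toy test data. If both the local layout and the Space cache need to work, one option is an environment override; a sketch with hypothetical variable names (`LOCAL_EVAL_REQUESTS`, `LOCAL_EVAL_RESULTS`), not part of this commit:

import os

CACHE_PATH = os.getenv("HF_HOME", ".")

# Fall back to the Space cache paths unless a local override is set.
EVAL_REQUESTS_PATH = os.getenv("LOCAL_EVAL_REQUESTS", os.path.join(CACHE_PATH, "eval-queue"))
EVAL_RESULTS_PATH = os.getenv("LOCAL_EVAL_RESULTS", os.path.join(CACHE_PATH, "eval-results"))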
src/populate.py (CHANGED)

@@ -17,13 +17,15 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
         all_data_json += v.to_dict(task=task, metric=metric)

     df = pd.DataFrame.from_records(all_data_json)
-[old line 20 removed; not legible in the rendered diff]
+    _benchmark_cols = frozenset(benchmark_cols).intersection(frozenset(df.columns.to_list()))
+    df[AutoEvalColumnQA.average.name] = df[list(_benchmark_cols)].mean(axis=1)
     df = df.sort_values(by=[AutoEvalColumnQA.average.name], ascending=False)
     df.reset_index(inplace=True)
-[old line 23 removed; not legible in the rendered diff]
+    _cols = frozenset(cols).intersection(frozenset(df.columns.to_list()))
+    df = df[_cols].round(decimals=2)

     # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df,  [rest of the old line truncated in the rendered diff]
+    df = df[has_no_nan_values(df, _benchmark_cols)]
     return raw_data, df
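A small, self-contained rerun of the new aggregation steps on made-up scores, to show how the intersection with `benchmark_cols` and the NaN filter interact; the plain column names, toy reranker names, and the inlined check are stand-ins for the project's `AutoEvalColumnQA` and `has_no_nan_values` helpers:

import pandas as pd

benchmark_cols = ["wiki_en", "wiki_zh", "news_en"]  # "news_en" never appears in the results below
records = [
    {"Retrieval Model": "bge-m3", "Reranking Model": "toy-reranker", "wiki_en": 0.80, "wiki_zh": 0.70},
    {"Retrieval Model": "bge-m3", "Reranking Model": "no-reranker",  "wiki_en": 0.60, "wiki_zh": None},
]
df = pd.DataFrame.from_records(records)

# Average only over benchmark columns that actually exist in the results.
_benchmark_cols = frozenset(benchmark_cols).intersection(frozenset(df.columns.to_list()))
df["Average ⬆️"] = df[list(_benchmark_cols)].mean(axis=1)
df = df.sort_values(by=["Average ⬆️"], ascending=False).reset_index(drop=True)

# Stand-in for has_no_nan_values: drop rows missing any produced benchmark score.
df = df[~df[list(_benchmark_cols)].isna().any(axis=1)].round(decimals=2)
print(df)  # only the fully-scored "toy-reranker" row survives, with Average ⬆️ == 0.75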
tests/src/test_populate.py (CHANGED)

@@ -9,9 +9,9 @@ def test_get_leaderboard_df():
     results_path = cur_fp.parents[1] / "toydata" / "test_results"
     cols = ['Retrieval Model', 'Reranking Model', 'Average ⬆️', 'wiki_en', 'wiki_zh',]
     benchmark_cols = ['wiki_en', 'wiki_zh',]
-    raw_data, df = get_leaderboard_df(results_path, requests_path, cols, benchmark_cols)
+    raw_data, df = get_leaderboard_df(results_path, requests_path, cols, benchmark_cols, 'qa', 'ndcg_at_1')
     assert df.shape[0] == 2
-    # the results
+    # the results contain only one embedding model
     for i in range(2):
         assert df["Retrieval Model"][i] == "bge-m3"
     # the results contains only two reranking model