feat: add the ranking only tab for qa
app.py CHANGED
@@ -11,7 +11,7 @@ from src.about import (
 from src.benchmarks import DOMAIN_COLS_QA, LANG_COLS_QA, DOMAIN_COLS_LONG_DOC, LANG_COLS_LONG_DOC, METRIC_LIST, \
     DEFAULT_METRIC_QA, DEFAULT_METRIC_LONG_DOC
 from src.display.css_html_js import custom_css
-from src.display.utils import COL_NAME_IS_ANONYMOUS, COL_NAME_REVISION, COL_NAME_TIMESTAMP, COL_NAME_RERANKING_MODEL
+from src.display.utils import COL_NAME_IS_ANONYMOUS, COL_NAME_REVISION, COL_NAME_TIMESTAMP, COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL
 from src.envs import API, EVAL_RESULTS_PATH, REPO_ID, RESULTS_REPO, TOKEN
 from src.read_evals import get_raw_eval_results, get_leaderboard_df
 from src.utils import update_metric, upload_file, get_default_cols, submit_results, reset_rank
@@ -23,14 +23,14 @@ def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
 
-try:
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
-        token=TOKEN
-    )
-except Exception as e:
-    print(f'failed to download')
-    restart_space()
+# try:
+#     snapshot_download(
+#         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
+#         token=TOKEN
+#     )
+# except Exception as e:
+#     print(f'failed to download')
+#     restart_space()
 
 raw_data = get_raw_eval_results(f"{EVAL_RESULTS_PATH}/AIR-Bench_24.04")
 
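This hunk disables the startup sync of evaluation results from the Hub, so the Space serves whatever already sits under EVAL_RESULTS_PATH. For reference, a minimal standalone sketch of what the commented-out block was doing, assuming placeholder values for the src.envs constants (the real RESULTS_REPO, EVAL_RESULTS_PATH, and TOKEN live in that module):

from huggingface_hub import snapshot_download

# Placeholder stand-ins for src.envs constants (assumptions, not the repo's actual values).
RESULTS_REPO = "AIR-Bench/eval_results"   # hypothetical dataset repo id
EVAL_RESULTS_PATH = "./eval_results"
TOKEN = None  # set a HF access token if the dataset repo is private

try:
    # Mirror the results dataset into the local directory the app reads from.
    snapshot_download(
        repo_id=RESULTS_REPO,
        local_dir=EVAL_RESULTS_PATH,
        repo_type="dataset",
        etag_timeout=30,
        token=TOKEN,
    )
except Exception as e:
    print(f"failed to download: {e}")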
@@ -110,7 +110,7 @@ with demo:
                     show_revision_and_timestamp = get_revision_and_ts_checkbox()
 
             with gr.Tabs(elem_classes="tab-buttons") as sub_tabs:
-                with gr.TabItem("
+                with gr.TabItem("Retrieval + Reranking", id=10):
                     with gr.Row():
                         # search retrieval models
                         with gr.Column():
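Together with the two sub-tabs added further down, the QA leaderboard now has three views. A minimal sketch of the resulting sub-tab layout (labels and ids are taken from this diff; the bodies are simplified stand-ins):

import gradio as gr

with gr.Blocks() as demo:
    with gr.Tabs(elem_classes="tab-buttons") as sub_tabs:
        with gr.TabItem("Retrieval + Reranking", id=10):
            gr.Markdown("full leaderboard: every retriever/reranker pair")
        with gr.TabItem("Retrieval Only", id=11):
            gr.Markdown("rows with reranker == 'NoReranker'")
        with gr.TabItem("Reranking Only", id=12):
            gr.Markdown("rows with retriever fixed to 'BM25'")

demo.launch()

The explicit ids let other events select a tab programmatically via the Tabs component's selected id.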
@@ -149,17 +149,17 @@ with demo:
                         leaderboard_table,
                         queue=True
                     )
-                with gr.TabItem("
+                with gr.TabItem("Retrieval Only", id=11):
                     with gr.Column():
                         search_bar_retriever = get_search_bar()
                         selected_noreranker = get_noreranking_dropdown()
                         lb_df_retriever = leaderboard_df_qa[leaderboard_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
                         lb_df_retriever = reset_rank(lb_df_retriever)
-                        hidden_lb_db_retriever = original_df_qa[original_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
-                        hidden_lb_db_retriever = reset_rank(hidden_lb_db_retriever)
                         lb_table_retriever = get_leaderboard_table(lb_df_retriever, types_qa)
                         # Dummy leaderboard for handling the case when the user uses backspace key
-                        hidden_lb_table_retriever = get_leaderboard_table(hidden_lb_db_retriever, types_qa, visible=False)
+                        hidden_lb_df_retriever = original_df_qa[original_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
+                        hidden_lb_df_retriever = reset_rank(hidden_lb_df_retriever)
+                        hidden_lb_table_retriever = get_leaderboard_table(hidden_lb_df_retriever, types_qa, visible=False)
 
                     set_listeners(
                         "qa",
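The "Retrieval Only" view filters the QA leaderboard down to rows whose reranking model is "NoReranker" and then renumbers the rank column, since ranks carried over from the full table would have gaps after filtering. A small pandas sketch of that filter-then-rerank pattern (the real reset_rank lives in src.utils; this stand-in and the column names are assumptions):

import pandas as pd

# Hypothetical equivalent of src.utils.reset_rank: assumes the frame is already
# sorted by score and simply makes ranks contiguous again (1, 2, 3, ...).
def reset_rank(df: pd.DataFrame, rank_col: str = "Rank") -> pd.DataFrame:
    df = df.reset_index(drop=True)
    df[rank_col] = df.index + 1
    return df

leaderboard_df_qa = pd.DataFrame({
    "Reranking Model": ["bge-reranker-large", "NoReranker", "NoReranker"],
    "Average": [0.55, 0.41, 0.38],
    "Rank": [1, 2, 3],
})
lb_df_retriever = leaderboard_df_qa[leaderboard_df_qa["Reranking Model"] == "NoReranker"]
print(reset_rank(lb_df_retriever))  # former ranks 2 and 3 become 1 and 2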
@@ -188,7 +188,47 @@ with demo:
                         lb_table_retriever,
                         queue=True
                     )
+                with gr.TabItem("Reranking Only", id=12):
+                    with gr.Row():
+                        with gr.Column(scale=1):
+                            selected_rerankings_reranker = get_reranking_dropdown(reranking_models)
+                        with gr.Column(scale=1):
+                            search_bar_reranker = gr.Textbox(show_label=False, visible=False)
+                    lb_df_reranker = leaderboard_df_qa[leaderboard_df_qa[COL_NAME_RETRIEVAL_MODEL] == "BM25"]
+                    lb_df_reranker = reset_rank(lb_df_reranker)
+                    lb_table_reranker = get_leaderboard_table(lb_df_reranker, types_qa)
+                    hidden_lb_df_reranker = original_df_qa[original_df_qa[COL_NAME_RETRIEVAL_MODEL] == "BM25"]
+                    hidden_lb_df_reranker = reset_rank(hidden_lb_df_reranker)
+                    hidden_lb_table_reranker = get_leaderboard_table(
+                        hidden_lb_df_reranker, types_qa, visible=False
+                    )
 
+                    set_listeners(
+                        "qa",
+                        lb_table_reranker,
+                        hidden_lb_table_reranker,
+                        search_bar_reranker,
+                        selected_domains,
+                        selected_langs,
+                        selected_rerankings_reranker,
+                        show_anonymous,
+                        show_revision_and_timestamp,
+                    )
+                    # set metric listener
+                    selected_metric.change(
+                        update_metric_qa,
+                        [
+                            selected_metric,
+                            selected_domains,
+                            selected_langs,
+                            selected_rerankings_reranker,
+                            search_bar_reranker,
+                            show_anonymous,
+                            show_revision_and_timestamp,
+                        ],
+                        lb_table_reranker,
+                        queue=True
+                    )
         with gr.TabItem("Long Doc", elem_id="long-doc-benchmark-tab-table", id=1):
             with gr.Row():
                 with gr.Column(min_width=320):
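Fixing the retriever to "BM25" in the new tab mirrors the "NoReranker" trick above: holding one side of the pipeline constant makes scores comparable across rerankers. The metric listener at the end follows Gradio's standard event wiring of fn, a list of input components, and the output table. A self-contained sketch of that .change pattern (the data and update function are simplified stand-ins for update_metric_qa and the real leaderboard):

import gradio as gr
import pandas as pd

SCORES = {
    "ndcg_at_10": {"bge-reranker-large": 0.55, "NoReranker": 0.41},
    "recall_at_100": {"bge-reranker-large": 0.78, "NoReranker": 0.66},
}

def update_table(metric: str) -> pd.DataFrame:
    # Rebuild the table for the chosen metric, best score first.
    rows = sorted(SCORES[metric].items(), key=lambda kv: -kv[1])
    return pd.DataFrame(rows, columns=["Reranking Model", metric])

with gr.Blocks() as demo:
    selected_metric = gr.Dropdown(choices=list(SCORES), value="ndcg_at_10", label="Metric")
    lb_table_reranker = gr.Dataframe(value=update_table("ndcg_at_10"))
    # Same shape as the listener in this diff: fn, inputs, output table.
    selected_metric.change(update_table, [selected_metric], lb_table_reranker, queue=True)

demo.launch()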