refactor: reformat

- app.py +28 -26
- src/loaders.py +6 -9
- src/models.py +11 -4
- src/utils.py +13 -9
- tests/src/display/test_utils.py +0 -1
- tests/test_utils.py +9 -2
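The hunks below are consistent with a mechanical formatting pass: imports are regrouped and alphabetized, overlong calls are wrapped, and needlessly wrapped expressions are collapsed back onto one line. As a rough illustration only — the commit does not say which tools, if any, were used — the same kind of change can be produced programmatically with isort and black:

# Hedged sketch: isort and black are assumptions, not tools named by this commit.
import black
import isort

src = "from src.columns import COL_NAME_RETRIEVAL_MODEL, COL_NAME_RERANKING_MODEL\n"

src = isort.code(src)  # alphabetizes imported names and groups import sections
# line_length=119 is a guess inferred from which lines get wrapped in this diff
src = black.format_str(src, mode=black.Mode(line_length=119))

print(src)  # from src.columns import COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL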
app.py CHANGED

@@ -6,6 +6,7 @@ from huggingface_hub import snapshot_download

 from src.about import BENCHMARKS_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT, TITLE
 from src.benchmarks import LongDocBenchmarks, QABenchmarks
+from src.columns import COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL
 from src.components import (
     get_anonymous_checkbox,
     get_domain_dropdown,
@@ -31,7 +32,6 @@ from src.envs import (
     RESULTS_REPO,
     TOKEN,
 )
-from src.columns import COL_NAME_RETRIEVAL_MODEL, COL_NAME_RERANKING_MODEL
 from src.loaders import load_eval_results
 from src.models import TaskType, model_hyperlink
 from src.utils import remove_html, reset_rank, set_listeners, submit_results, update_metric, upload_file
@@ -81,7 +81,7 @@ def update_qa_metric(
         reranking_model,
         query,
         show_anonymous,
-        show_revision_and_timestamp
+        show_revision_and_timestamp,
     )


@@ -173,7 +173,9 @@ with demo:
             # shown_table
             qa_df_elem_ret_rerank = get_leaderboard_table(datastore.qa_fmt_df, datastore.qa_types)
             # Dummy leaderboard for handling the case when the user uses backspace key
-            qa_df_elem_ret_rerank_hidden = get_leaderboard_table(datastore.qa_raw_df, datastore.qa_types, visible=False)
+            qa_df_elem_ret_rerank_hidden = get_leaderboard_table(
+                datastore.qa_raw_df, datastore.qa_types, visible=False
+            )

             version.change(
                 update_qa_version,
@@ -214,14 +216,24 @@ with demo:
             qa_df_elem_ret = get_leaderboard_table(_qa_df_ret, datastore.qa_types)

             # Dummy leaderboard for handling the case when the user uses backspace key
-            _qa_df_ret_hidden = datastore.qa_raw_df[datastore.qa_raw_df[COL_NAME_RERANKING_MODEL] == "NoReranker"]
+            _qa_df_ret_hidden = datastore.qa_raw_df[
+                datastore.qa_raw_df[COL_NAME_RERANKING_MODEL] == "NoReranker"
+            ]
             _qa_df_ret_hidden = reset_rank(_qa_df_ret_hidden)
-            qa_df_elem_ret_hidden = get_leaderboard_table(_qa_df_ret_hidden, datastore.qa_types, visible=False)
+            qa_df_elem_ret_hidden = get_leaderboard_table(
+                _qa_df_ret_hidden, datastore.qa_types, visible=False
+            )

             version.change(
                 update_qa_version,
                 version,
-                [domains, langs, models_ret, qa_df_elem_ret, qa_df_elem_ret_hidden],
+                [
+                    domains,
+                    langs,
+                    models_ret,
+                    qa_df_elem_ret,
+                    qa_df_elem_ret_hidden,
+                ],
             )

             set_listeners(
@@ -253,13 +265,9 @@ with demo:
             )

         with gr.TabItem("Reranking Only", id=12):
-            _qa_df_rerank = datastore.qa_fmt_df[
-                datastore.qa_fmt_df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
-            ]
+            _qa_df_rerank = datastore.qa_fmt_df[datastore.qa_fmt_df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
             _qa_df_rerank = reset_rank(_qa_df_rerank)
-            qa_rerank_models = (
-                _qa_df_rerank[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
-            )
+            qa_rerank_models = _qa_df_rerank[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
             with gr.Row():
                 with gr.Column(scale=1):
                     qa_models_rerank = get_reranking_dropdown(qa_rerank_models)
@@ -269,7 +277,7 @@ with demo:

             _qa_df_rerank_hidden = datastore.qa_raw_df[
                 datastore.qa_raw_df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
-                ]
+            ]
             _qa_df_rerank_hidden = reset_rank(_qa_df_rerank_hidden)
             qa_df_elem_rerank_hidden = get_leaderboard_table(
                 _qa_df_rerank_hidden, datastore.qa_types, visible=False
@@ -333,9 +341,7 @@ with demo:
             with gr.Column():
                 models = get_reranking_dropdown(datastore.reranking_models)

-        doc_df_elem_ret_rerank = get_leaderboard_table(
-            datastore.doc_fmt_df, datastore.doc_types
-        )
+        doc_df_elem_ret_rerank = get_leaderboard_table(datastore.doc_fmt_df, datastore.doc_types)

         # Dummy leaderboard for handling the case when the user uses backspace key
         doc_df_elem_ret_rerank_hidden = get_leaderboard_table(
@@ -385,15 +391,13 @@ with demo:

             _doc_df_ret = datastore.doc_fmt_df[
                 datastore.doc_fmt_df[COL_NAME_RERANKING_MODEL] == "NoReranker"
-                ]
+            ]
             _doc_df_ret = reset_rank(_doc_df_ret)
-            doc_df_elem_ret = get_leaderboard_table(
-                _doc_df_ret, datastore.doc_types
-            )
+            doc_df_elem_ret = get_leaderboard_table(_doc_df_ret, datastore.doc_types)

             _doc_df_ret_hidden = datastore.doc_raw_df[
                 datastore.doc_raw_df[COL_NAME_RERANKING_MODEL] == "NoReranker"
-                ]
+            ]
             _doc_df_ret_hidden = reset_rank(_doc_df_ret_hidden)
             doc_df_elem_ret_hidden = get_leaderboard_table(
                 _doc_df_ret_hidden, datastore.doc_types, visible=False
@@ -435,22 +439,20 @@ with demo:
         with gr.TabItem("Reranking Only", id=22):
             _doc_df_rerank = datastore.doc_fmt_df[
                 datastore.doc_fmt_df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
-                ]
+            ]
             _doc_df_rerank = reset_rank(_doc_df_rerank)
             doc_rerank_models = (
                 _doc_df_rerank[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
             )
             with gr.Row():
                 with gr.Column(scale=1):
-                    doc_models_rerank = get_reranking_dropdown(
-                        doc_rerank_models
-                    )
+                    doc_models_rerank = get_reranking_dropdown(doc_rerank_models)
                 with gr.Column(scale=1):
                     doc_search_bar_rerank = gr.Textbox(show_label=False, visible=False)
             doc_df_elem_rerank = get_leaderboard_table(_doc_df_rerank, datastore.doc_types)
             _doc_df_rerank_hidden = datastore.doc_raw_df[
                 datastore.doc_raw_df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
-                ]
+            ]
             _doc_df_rerank_hidden = reset_rank(_doc_df_rerank_hidden)
             doc_df_elem_rerank_hidden = get_leaderboard_table(
                 _doc_df_rerank_hidden, datastore.doc_types, visible=False
src/loaders.py CHANGED

@@ -1,14 +1,10 @@
 import os.path
-from typing import List, Dict
+from typing import Dict, List

 import pandas as pd

-from src.envs import (
-    BENCHMARK_VERSION_LIST,
-    DEFAULT_METRIC_LONG_DOC,
-    DEFAULT_METRIC_QA,
-)
-from src.columns import COL_NAME_REVISION, COL_NAME_TIMESTAMP, COL_NAME_IS_ANONYMOUS
+from src.columns import COL_NAME_IS_ANONYMOUS, COL_NAME_REVISION, COL_NAME_TIMESTAMP
+from src.envs import BENCHMARK_VERSION_LIST, DEFAULT_METRIC_LONG_DOC, DEFAULT_METRIC_QA
 from src.models import FullEvalResult, LeaderboardDataStore, TaskType
 from src.utils import get_default_cols, get_leaderboard_df

@@ -80,8 +76,9 @@ def load_leaderboard_datastore(file_path, version) -> LeaderboardDataStore:
     datastore.doc_fmt_df = datastore.doc_fmt_df[~datastore.doc_fmt_df[COL_NAME_IS_ANONYMOUS]][doc_cols]
     datastore.doc_fmt_df.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)

-    datastore.reranking_models = sorted(list(frozenset(
-        [eval_result.reranking_model for eval_result in datastore.raw_data])))
+    datastore.reranking_models = sorted(
+        list(frozenset([eval_result.reranking_model for eval_result in datastore.raw_data]))
+    )
     return datastore

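One detail worth noting in the rewrapped expression: sorted() already returns a list, so the list(...) wrapper is redundant, and frozenset can consume a generator directly. The commit only rewraps the line; an equivalent, tighter form would be:

# Standalone illustration of the dedupe-and-sort idiom used for reranking_models;
# the names here are made up. sorted() already returns a list, so
# sorted(list(frozenset(...))) can be just sorted(frozenset(...)).
names = ["NoReranker", "bge-reranker-large", "NoReranker"]
assert sorted(frozenset(names)) == ["NoReranker", "bge-reranker-large"]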
src/models.py CHANGED

@@ -1,14 +1,20 @@
 import json
-from enum import Enum
-
 from collections import defaultdict
 from dataclasses import dataclass
+from enum import Enum
 from typing import List, Optional

 import pandas as pd

-from src.columns import (COL_NAME_IS_ANONYMOUS, COL_NAME_RERANKING_MODEL, COL_NAME_RERANKING_MODEL_LINK,
-    COL_NAME_RETRIEVAL_MODEL, COL_NAME_RETRIEVAL_MODEL_LINK, COL_NAME_REVISION, COL_NAME_TIMESTAMP)
+from src.columns import (
+    COL_NAME_IS_ANONYMOUS,
+    COL_NAME_RERANKING_MODEL,
+    COL_NAME_RERANKING_MODEL_LINK,
+    COL_NAME_RETRIEVAL_MODEL,
+    COL_NAME_RETRIEVAL_MODEL_LINK,
+    COL_NAME_REVISION,
+    COL_NAME_TIMESTAMP,
+)


 def get_safe_name(name: str):
@@ -16,6 +22,7 @@ def get_safe_name(name: str):
     name = name.replace("-", "_")
     return "".join(character.lower() for character in name if (character.isalnum() or character == "_"))

+
 @dataclass
 class EvalResult:
     """
src/utils.py CHANGED

@@ -6,16 +6,20 @@ from pathlib import Path

 import pandas as pd

-from src.models import TaskType
 from src.benchmarks import LongDocBenchmarks, QABenchmarks
-from src.columns import (COL_NAME_AVG,
-    COL_NAME_RANK,
-    COL_NAME_IS_ANONYMOUS,
-    COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL,
-    COL_NAME_REVISION, COL_NAME_TIMESTAMP,
-    get_default_col_names_and_types,
-    get_fixed_col_names_and_types,
+from src.columns import (
+    COL_NAME_AVG,
+    COL_NAME_IS_ANONYMOUS,
+    COL_NAME_RANK,
+    COL_NAME_RERANKING_MODEL,
+    COL_NAME_RETRIEVAL_MODEL,
+    COL_NAME_REVISION,
+    COL_NAME_TIMESTAMP,
+    get_default_col_names_and_types,
+    get_fixed_col_names_and_types,
 )
+from src.envs import API, LATEST_BENCHMARK_VERSION, SEARCH_RESULTS_REPO
+from src.models import TaskType


 def calculate_mean(row):
@@ -200,7 +204,7 @@ def update_metric(
     elif task == TaskType.long_doc:
         update_func = update_doc_df_elem
     else:
-        raise
+        raise NotImplementedError
     df_elem = get_leaderboard_df(datastore, task=task, metric=metric)
     version = datastore.version
     return update_func(
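The second hunk replaces a bare raise with raise NotImplementedError. Outside an except block, a bare raise has no active exception to re-raise and itself fails with "RuntimeError: No active exception to re-raise", so naming the exception makes the unsupported-task failure explicit. A simplified sketch of the dispatch shape (this TaskType and the returned strings are stand-ins for the real objects in src.models and src.utils):

from enum import Enum


class TaskType(Enum):  # simplified stand-in for src.models.TaskType
    qa = "qa"
    long_doc = "long_doc"


def pick_update_func(task: TaskType) -> str:
    if task == TaskType.qa:
        return "update_qa_df_elem"  # placeholders for the real update callables
    elif task == TaskType.long_doc:
        return "update_doc_df_elem"
    else:
        # Explicit error for unhandled task types; a bare `raise` here would
        # itself fail with "RuntimeError: No active exception to re-raise".
        raise NotImplementedError(f"unsupported task: {task}")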
tests/src/display/test_utils.py CHANGED

@@ -1,4 +1,3 @@
-
 from src.display.utils import (
     COLS_LONG_DOC,
     COLS_QA,
tests/test_utils.py CHANGED

@@ -2,8 +2,15 @@ import pandas as pd
 import pytest

 from app import update_table
-from src.columns import (COL_NAME_AVG, COL_NAME_IS_ANONYMOUS, COL_NAME_RANK, COL_NAME_RERANKING_MODEL,
-    COL_NAME_RETRIEVAL_MODEL, COL_NAME_REVISION, COL_NAME_TIMESTAMP)
+from src.columns import (
+    COL_NAME_AVG,
+    COL_NAME_IS_ANONYMOUS,
+    COL_NAME_RANK,
+    COL_NAME_RERANKING_MODEL,
+    COL_NAME_RETRIEVAL_MODEL,
+    COL_NAME_REVISION,
+    COL_NAME_TIMESTAMP,
+)
 from src.utils import (
     filter_models,
     filter_queries,