feat: improve the layout
- app.py +39 -37
- src/benchmarks.py +4 -2
- utils.py +3 -3
app.py
CHANGED
@@ -13,7 +13,7 @@ from src.leaderboard.read_evals import get_raw_eval_results, get_leaderboard_df
 
 from src.envs import API, EVAL_RESULTS_PATH, REPO_ID, RESULTS_REPO, TOKEN
 from utils import update_table, update_metric, update_table_long_doc, upload_file, get_default_cols, submit_results
-from src.benchmarks import DOMAIN_COLS_QA, LANG_COLS_QA, DOMAIN_COLS_LONG_DOC, LANG_COLS_LONG_DOC,
+from src.benchmarks import DOMAIN_COLS_QA, LANG_COLS_QA, DOMAIN_COLS_LONG_DOC, LANG_COLS_LONG_DOC, METRIC_LIST, DEFAULT_METRIC
 from src.display.utils import TYPES_QA, TYPES_LONG_DOC
 
 
@@ -31,9 +31,9 @@ except Exception:
 raw_data = get_raw_eval_results(f"{EVAL_RESULTS_PATH}/AIR-Bench_24.04")
 
 original_df_qa = get_leaderboard_df(
-    raw_data, task='qa', metric=
+    raw_data, task='qa', metric=DEFAULT_METRIC)
 original_df_long_doc = get_leaderboard_df(
-    raw_data, task='long-doc', metric=
+    raw_data, task='long-doc', metric=DEFAULT_METRIC)
 print(f'raw data: {len(raw_data)}')
 print(f'QA data loaded: {original_df_qa.shape}')
 print(f'Long-Doc data loaded: {len(original_df_long_doc)}')
@@ -75,22 +75,33 @@ with demo:
         with gr.TabItem("QA", elem_id="qa-benchmark-tab-table", id=0):
             with gr.Row():
                 with gr.Column():
-                    # search
+                    # search retrieval models
                     with gr.Row():
                         search_bar = gr.Textbox(
-                            placeholder=" 🔍 Search for
+                            placeholder=" 🔍 Search for retrieval models (separate multiple queries with `;`) and press ENTER...",
                             show_label=False,
                             elem_id="search-bar",
+                            info="Search the retrieval models"
                         )
+                    # select reranking model
+                    reranking_models = list(frozenset([eval_result.reranking_model for eval_result in raw_data]))
+                    with gr.Row():
+                        selected_rerankings = gr.CheckboxGroup(
+                            choices=reranking_models,
+                            value=reranking_models,
+                            label="Select the reranking models",
+                            elem_id="reranking-select",
+                            interactive=True
+                        )
+                with gr.Column(min_width=320):
                     # select the metric
                     selected_metric = gr.Dropdown(
-                        choices=
-                        value=
+                        choices=METRIC_LIST,
+                        value=DEFAULT_METRIC,
                         label="Select the metric",
                         interactive=True,
                         elem_id="metric-select",
                     )
-                with gr.Column(min_width=320):
                     # select domain
                     with gr.Row():
                         selected_domains = gr.CheckboxGroup(
@@ -110,16 +121,6 @@ with demo:
                             multiselect=True,
                             interactive=True
                         )
-                    # select reranking model
-                    reranking_models = list(frozenset([eval_result.reranking_model for eval_result in raw_data]))
-                    with gr.Row():
-                        selected_rerankings = gr.CheckboxGroup(
-                            choices=reranking_models,
-                            value=reranking_models,
-                            label="Select the reranking models",
-                            elem_id="reranking-select",
-                            interactive=True
-                        )
 
             leaderboard_table = gr.components.Dataframe(
                 value=leaderboard_df_qa,
@@ -187,19 +188,30 @@ with demo:
                 with gr.Column():
                     with gr.Row():
                         search_bar = gr.Textbox(
-                            placeholder=" 🔍 Search for
+                            placeholder=" 🔍 Search for retrieval models (separate multiple queries with `;`) and press ENTER...",
                             show_label=False,
                             elem_id="search-bar-long-doc",
                        )
-
-
-
-
-
-
-
-
+                    # select reranking model
+                    reranking_models = list(frozenset([eval_result.reranking_model for eval_result in raw_data]))
+                    with gr.Row():
+                        selected_rerankings = gr.CheckboxGroup(
+                            choices=reranking_models,
+                            value=reranking_models,
+                            label="Select the reranking models",
+                            elem_id="reranking-select-long-doc",
+                            interactive=True
+                        )
                 with gr.Column(min_width=320):
+                    # select the metric
+                    with gr.Row():
+                        selected_metric = gr.Dropdown(
+                            choices=METRIC_LIST,
+                            value=DEFAULT_METRIC,
+                            label="Select the metric",
+                            interactive=True,
+                            elem_id="metric-select-long-doc",
+                        )
                     # select domain
                     with gr.Row():
                         selected_domains = gr.CheckboxGroup(
@@ -219,16 +231,6 @@ with demo:
                             multiselect=True,
                             interactive=True
                        )
-                    # select reranking model
-                    reranking_models = list(frozenset([eval_result.reranking_model for eval_result in raw_data]))
-                    with gr.Row():
-                        selected_rerankings = gr.CheckboxGroup(
-                            choices=reranking_models,
-                            value=reranking_models,
-                            label="Select the reranking models",
-                            elem_id="reranking-select-long-doc",
-                            interactive=True
-                        )
 
             leaderboard_table_long_doc = gr.components.Dataframe(
                 value=leaderboard_df_long_doc,
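
For context, the layout change groups the reranking-model checkbox list with the search bar in the left column and moves the metric dropdown into the right column, for both the QA and Long-Doc tabs. The snippet below is a minimal, self-contained sketch of that control arrangement; the reranking-model names and metric values are placeholders (the Space builds them from raw_data and src/benchmarks.py), and the event wiring to the leaderboard table is omitted.

# Minimal sketch of the new QA-tab controls (placeholder data, no event wiring).
import gradio as gr

METRIC_LIST = ["ndcg_at_1", "ndcg_at_3", "ndcg_at_5", "ndcg_at_10"]  # assumed subset
DEFAULT_METRIC = "ndcg_at_10"
reranking_models = ["NoReranker", "some-reranker"]  # placeholder values

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            # search retrieval models
            with gr.Row():
                search_bar = gr.Textbox(
                    placeholder=" 🔍 Search for retrieval models (separate multiple queries with `;`) and press ENTER...",
                    show_label=False,
                    info="Search the retrieval models",
                )
            # select reranking model: all models checked by default
            with gr.Row():
                selected_rerankings = gr.CheckboxGroup(
                    choices=reranking_models,
                    value=reranking_models,
                    label="Select the reranking models",
                    interactive=True,
                )
        with gr.Column(min_width=320):
            # select the metric
            selected_metric = gr.Dropdown(
                choices=METRIC_LIST,
                value=DEFAULT_METRIC,
                label="Select the metric",
                interactive=True,
            )

demo.launch()
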
src/benchmarks.py
CHANGED
@@ -70,7 +70,7 @@ dataset_dict = {
     }
 }
 
-
+METRIC_LIST = [
     "ndcg_at_1",
     "ndcg_at_3",
     "ndcg_at_5",
@@ -130,7 +130,7 @@ for task, domain_dict in dataset_dict.items():
             benchmark_name = f"{domain}_{lang}_{dataset}"
             benchmark_name = get_safe_name(benchmark_name)
             col_name = benchmark_name
-            for metric in
+            for metric in METRIC_LIST:
                 long_doc_benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name, domain,
                                                                     lang, task)
 
@@ -145,3 +145,5 @@ LANG_COLS_QA = list(frozenset([c.lang for c in qa_benchmark_dict.values()]))
 
 DOMAIN_COLS_LONG_DOC = list(frozenset([c.domain for c in long_doc_benchmark_dict.values()]))
 LANG_COLS_LONG_DOC = list(frozenset([c.lang for c in long_doc_benchmark_dict.values()]))
+
+DEFAULT_METRIC = "ndcg_at_10"
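
A hedged sketch of how the new METRIC_LIST and DEFAULT_METRIC constants are consumed by the long-doc benchmark loop follows. The Benchmark dataclass and the sample benchmark name are stand-ins inferred from the positional call visible in the diff, not the module's real definitions.

# Simplified stand-in for the METRIC_LIST / DEFAULT_METRIC usage in src/benchmarks.py.
from dataclasses import dataclass

METRIC_LIST = [
    "ndcg_at_1",
    "ndcg_at_3",
    "ndcg_at_5",
    "ndcg_at_10",  # assumed member, since it is the default
]
DEFAULT_METRIC = "ndcg_at_10"

@dataclass
class Benchmark:  # fields mirror the positional arguments shown in the diff
    name: str
    metric: str
    col_name: str
    domain: str
    lang: str
    task: str

long_doc_benchmark_dict = {}
benchmark_name = "law_en_sample"  # hypothetical benchmark name for illustration
for metric in METRIC_LIST:
    # keyed by benchmark name only, so each metric overwrites the previous entry,
    # as in the original loop
    long_doc_benchmark_dict[benchmark_name] = Benchmark(
        benchmark_name, metric, benchmark_name, "law", "en", "long-doc"
    )

print(long_doc_benchmark_dict[benchmark_name].metric)  # -> "ndcg_at_10"
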
utils.py
CHANGED
@@ -33,8 +33,8 @@ def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
         filtered_df = pd.concat(final_df)
         filtered_df = filtered_df.drop_duplicates(
             subset=[
-
-
+                COL_NAME_RETRIEVAL_MODEL,
+                COL_NAME_RERANKING_MODEL,
             ]
         )
 
@@ -42,7 +42,7 @@ def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
 
 
 def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
-    return df[(df[
+    return df[(df[COL_NAME_RETRIEVAL_MODEL].str.contains(query, case=False))]
 
 
 def get_default_cols(task: str, columns: list, add_fix_cols: bool=True) -> list:
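
The two utils.py changes switch hard-coded column references to the named constants. Below is a small pandas sketch of the resulting behaviour, using hypothetical values for COL_NAME_RETRIEVAL_MODEL and COL_NAME_RERANKING_MODEL and toy sample rows (the real constants and data live elsewhere in the repo).

# Sketch of search_table and the drop_duplicates subset, with assumed constants.
import pandas as pd

COL_NAME_RETRIEVAL_MODEL = "Retrieval Model"   # assumed value
COL_NAME_RERANKING_MODEL = "Reranking Model"   # assumed value

df = pd.DataFrame({
    COL_NAME_RETRIEVAL_MODEL: ["model-a", "model-a", "model-b"],
    COL_NAME_RERANKING_MODEL: ["NoReranker", "NoReranker", "NoReranker"],
    "ndcg_at_10": [0.61, 0.61, 0.55],  # toy numbers, not benchmark results
})

# search_table: case-insensitive substring match on the retrieval-model column
def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
    return df[df[COL_NAME_RETRIEVAL_MODEL].str.contains(query, case=False)]

print(search_table(df, "MODEL-A"))

# filter_queries: after concatenating per-query matches, duplicates are dropped
# on the (retrieval model, reranking model) pair
print(df.drop_duplicates(subset=[COL_NAME_RETRIEVAL_MODEL, COL_NAME_RERANKING_MODEL]))
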