Commit 4d67578 · Parent: 803802d
Add German Clustering; Rmv Models w/o score; Rmv dups; Increment ds
app.py CHANGED
@@ -48,6 +48,13 @@ TASK_LIST_CLUSTERING = [
     "TwentyNewsgroupsClustering",
 ]
 
+TASK_LIST_CLUSTERING_DE = [
+    "BlurbsClusteringP2P",
+    "BlurbsClusteringS2S",
+    "TenKGnadClusteringP2P",
+    "TenKGnadClusteringS2S",
+]
+
 TASK_LIST_PAIR_CLASSIFICATION = [
     "SprintDuplicateQuestions",
     "TwitterSemEval2015",
@@ -117,6 +124,7 @@ TASK_LIST_EN = TASK_LIST_CLASSIFICATION + TASK_LIST_CLUSTERING + TASK_LIST_PAIR_
 TASK_TO_METRIC = {
     "BitextMining": "f1",
     "Clustering": "v_measure",
+    "Clustering (DE)": "v_measure",
     "Classification": "accuracy",
     "PairClassification": "cos_sim_ap",
     "Reranking": "map",
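The German clustering tab is scored with the same metric as the English one: V-measure, the harmonic mean of homogeneity and completeness over cluster assignments. A minimal illustration of how such a score behaves, using scikit-learn and invented toy labels (not MTEB data):

```python
# V-measure compares predicted clusters against ground-truth labels and
# is invariant to permutations of the cluster ids. Toy labels only.
from sklearn.metrics import v_measure_score

true_labels = [0, 0, 1, 1, 2, 2]
predicted_clusters = [1, 1, 0, 0, 2, 2]  # same grouping, permuted ids

print(round(v_measure_score(true_labels, predicted_clusters), 2))  # 1.0
```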
@@ -255,6 +263,9 @@ MODELS_TO_SKIP = {
     "radames/e5-large", # Duplicate
     "gentlebowl/instructor-large-safetensors", # Duplicate
     "Consensus/instructor-base", # Duplicate
+    "GovCompete/instructor-xl", # Duplicate
+    "GovCompete/e5-large-v2", # Duplicate
+    "t12e/instructor-base", # Duplicate
 }
 
 
@@ -271,7 +282,7 @@ def add_task(examples):
     # Could be added to the dataset loading script instead
     if examples["mteb_dataset_name"] in TASK_LIST_CLASSIFICATION_NORM:
         examples["mteb_task"] = "Classification"
-    elif examples["mteb_dataset_name"] in TASK_LIST_CLUSTERING:
+    elif examples["mteb_dataset_name"] in TASK_LIST_CLUSTERING + TASK_LIST_CLUSTERING_DE:
         examples["mteb_task"] = "Clustering"
     elif examples["mteb_dataset_name"] in TASK_LIST_PAIR_CLASSIFICATION:
         examples["mteb_task"] = "PairClassification"
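Without this change the four German datasets would fall through add_task unmatched, since they live in their own list; concatenating the two lists keeps them under the single "Clustering" task type. A standalone sketch of the lookup, with both task lists shortened to one entry each for brevity:

```python
# Minimal stand-in for the membership check in add_task; the one-element
# lists abbreviate the full task lists defined in app.py.
TASK_LIST_CLUSTERING = ["TwentyNewsgroupsClustering"]
TASK_LIST_CLUSTERING_DE = ["BlurbsClusteringP2P"]

def add_task(examples):
    if examples["mteb_dataset_name"] in TASK_LIST_CLUSTERING + TASK_LIST_CLUSTERING_DE:
        examples["mteb_task"] = "Clustering"
    return examples

print(add_task({"mteb_dataset_name": "BlurbsClusteringP2P"}))
# {'mteb_dataset_name': 'BlurbsClusteringP2P', 'mteb_task': 'Clustering'}
```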
@@ -288,7 +299,7 @@ def add_task(examples):
     return examples
 
 for model in EXTERNAL_MODELS:
-    ds = load_dataset("mteb/results", model)
+    ds = load_dataset("mteb/results", model)#, download_mode='force_redownload', verification_mode="no_checks")
     # For local debugging:
     #, download_mode='force_redownload', verification_mode="no_checks")
     ds = ds.map(add_lang)
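The appended comment keeps the cache-busting arguments one uncomment away. Both are real load_dataset keyword arguments; a sketch of the debugging variant with them enabled, assuming a datasets release that supports verification_mode (2.9+) and using a hypothetical config name:

```python
# Debugging variant: force a fresh download of one model's results and
# skip checksum/split verification. "e5-large" is a hypothetical config
# name for illustration; app.py iterates real names from EXTERNAL_MODELS.
from datasets import load_dataset

ds = load_dataset(
    "mteb/results",
    "e5-large",
    download_mode="force_redownload",
    verification_mode="no_checks",
)
```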
@@ -321,14 +332,16 @@ def get_emb_dim(model):
     return dim
 
 
-def get_mteb_data(tasks=["Clustering"], langs=[], fillna=True, add_emb_dim=False, task_to_metric=TASK_TO_METRIC):
+def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_emb_dim=False, task_to_metric=TASK_TO_METRIC):
     api = HfApi()
     models = api.list_models(filter="mteb")
     # Initialize list to models that we cannot fetch metadata from
     df_list = []
     for model in EXTERNAL_MODEL_RESULTS:
         results_list = [res for task in tasks for res in EXTERNAL_MODEL_RESULTS[model][task][task_to_metric[task]]]
-        if langs:
+        if len(datasets) > 0:
+            res = {k: v for d in results_list for k, v in d.items() if (k == "Model") or any([x in k for x in datasets])}
+        elif langs:
             # Would be cleaner to rely on an extra language column instead
             langs_format = [f"({lang})" for lang in langs]
             res = {k: v for d in results_list for k, v in d.items() if any([k.split(" ")[-1] in (k, x) for x in langs_format])}
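The new datasets parameter takes precedence over langs and filters result columns by substring match on the dataset name, always keeping the "Model" key. A toy run of the same comprehension (the score dicts are invented):

```python
# Toy demonstration of the dataset filter added to get_mteb_data;
# results_list mimics the merged per-task score dicts (values made up).
results_list = [
    {"Model": "my-model"},
    {"BlurbsClusteringP2P": 35.1},
    {"TwentyNewsgroupsClustering (en)": 50.2},
]
datasets = ["BlurbsClusteringP2P", "BlurbsClusteringS2S"]

res = {k: v for d in results_list for k, v in d.items()
       if (k == "Model") or any([x in k for x in datasets])}
print(res)  # {'Model': 'my-model', 'BlurbsClusteringP2P': 35.1}
```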
@@ -359,16 +372,20 @@ def get_mteb_data(tasks=["Clustering"], langs=[], fillna=True, add_emb_dim=False
             # ],
             # },
             # Use "get" instead of dict indexing to skip incompat metadata instead of erroring out
-            if langs:
+            if len(datasets) > 0:
+                task_results = [sub_res for sub_res in meta["model-index"][0]["results"] if (sub_res.get("task", {}).get("type", "") in tasks) and any([x in sub_res.get("dataset", {}).get("name", "") for x in datasets])]
+            elif langs:
                 task_results = [sub_res for sub_res in meta["model-index"][0]["results"] if (sub_res.get("task", {}).get("type", "") in tasks) and (sub_res.get("dataset", {}).get("config", "default") in ("default", *langs))]
             else:
                 task_results = [sub_res for sub_res in meta["model-index"][0]["results"] if (sub_res.get("task", {}).get("type", "") in tasks)]
             out = [{res["dataset"]["name"].replace("MTEB ", ""): [round(score["value"], 2) for score in res["metrics"] if score["type"] == task_to_metric.get(res["task"]["type"])][0]} for res in task_results]
             out = {k: v for d in out for k, v in d.items()}
             out["Model"] = make_clickable_model(model.modelId)
-            if add_emb_dim:
-                out["Embedding Dimensions"] = get_emb_dim(model)
-            df_list.append(out)
+            # Model & at least one result
+            if len(out) > 1:
+                if add_emb_dim:
+                    out["Embedding Dimensions"] = get_emb_dim(model)
+                df_list.append(out)
     df = pd.DataFrame(df_list)
     # Put 'Model' column first
     cols = sorted(list(df.columns))
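The len(out) > 1 guard is the "Rmv Models w/o score" part of the commit: out always carries the "Model" key, so the condition holds only when at least one dataset score survived filtering, and scoreless models no longer produce empty rows. A minimal illustration:

```python
# out always contains "Model"; a model whose scores were all filtered
# out has len(out) == 1 and is skipped instead of adding an empty row.
df_list = []
candidates = [
    {"Model": "model-without-scores"},
    {"Model": "model-with-scores", "BlurbsClusteringP2P": 35.1},
]
for out in candidates:
    if len(out) > 1:  # "Model" plus at least one dataset score
        df_list.append(out)
print(df_list)  # only model-with-scores survives
```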
@@ -437,7 +454,7 @@ with block:
     gr.Markdown(f"""
     Massive Text Embedding Benchmark (MTEB) Leaderboard. To submit, refer to the <a href="https://github.com/embeddings-benchmark/mteb#leaderboard" target="_blank" style="text-decoration: underline">MTEB GitHub repository</a> 🤗
 
-    - **Total Datasets**: 58
+    - **Total Datasets**: 62
     - **Total Languages**: 112
     - **Total Scores**: >{NUM_SCORES}
     - **Total Models**: {len(DATA_OVERALL)}
@@ -531,27 +548,53 @@ with block:
                 outputs=data_classification,
             )
         with gr.TabItem("Clustering"):
-            with gr.Row():
-                gr.Markdown("""
-                **Clustering Leaderboard ✨**
-
-                - **Metric:** Validity Measure (v_measure)
-                - **Languages:** English
-                """)
-            with gr.Row():
-                data_clustering = gr.components.Dataframe(
-                    DATA_CLUSTERING,
-                    datatype=["markdown"] + ["number"] * len(DATA_CLUSTERING.columns),
-                    type="pandas",
-                )
-            with gr.Row():
-                data_run = gr.Button("Refresh")
-                task_clustering = gr.Variable(value=["Clustering"])
-                data_run.click(
-                    get_mteb_data,
-                    inputs=[task_clustering],
-                    outputs=data_clustering,
-                )
+            with gr.TabItem("English"):
+                with gr.Row():
+                    gr.Markdown("""
+                    **Clustering Leaderboard ✨**
+
+                    - **Metric:** Validity Measure (v_measure)
+                    - **Languages:** English
+                    """)
+                with gr.Row():
+                    data_clustering = gr.components.Dataframe(
+                        DATA_CLUSTERING,
+                        datatype=["markdown"] + ["number"] * len(DATA_CLUSTERING.columns),
+                        type="pandas",
+                    )
+                with gr.Row():
+                    data_run = gr.Button("Refresh")
+                    task_clustering = gr.Variable(value=["Clustering"])
+                    empty = gr.Variable(value=[])
+                    datasets_clustering = gr.Variable(value=TASK_LIST_CLUSTERING)
+                    data_run.click(
+                        get_mteb_data,
+                        inputs=[task_clustering, empty, datasets_clustering],
+                        outputs=data_clustering,
+                    )
+            with gr.TabItem("German"):
+                with gr.Row():
+                    gr.Markdown("""
+                    **Clustering Leaderboard ✨🇩🇪**
+
+                    - **Metric:** Validity Measure (v_measure)
+                    - **Languages:** German
+                    """)
+                with gr.Row():
+                    data_clustering_de = gr.components.Dataframe(
+                        datatype=["markdown"] + ["number"] * len(TASK_LIST_CLUSTERING_DE),
+                        type="pandas",
+                    )
+                with gr.Row():
+                    data_run = gr.Button("Refresh")
+                    task_clustering_de = gr.Variable(value=["Clustering"])
+                    empty_de = gr.Variable(value=[])
+                    datasets_clustering_de = gr.Variable(value=TASK_LIST_CLUSTERING_DE)
+                    data_run.click(
+                        get_mteb_data,
+                        inputs=[task_clustering_de, empty_de, datasets_clustering_de],
+                        outputs=data_clustering_de,
+                    )
         with gr.TabItem("Pair Classification"):
             with gr.Row():
                 gr.Markdown("""
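Both clustering sub-tabs use the same wiring pattern: constant inputs are stashed in gr.Variable holders so the shared get_mteb_data callback receives the task list, an empty langs list, and the per-tab dataset list, both on Refresh and on page load. A stripped-down sketch of that pattern, assuming Gradio 3.x (where gr.Variable is a deprecated alias of gr.State) and with a stub callback in place of the real loader:

```python
# Sketch of the Refresh/load wiring, assuming Gradio 3.x. fetch_scores
# is a stub standing in for get_mteb_data; it just echoes its inputs.
import gradio as gr
import pandas as pd

def fetch_scores(tasks, langs, datasets):
    return pd.DataFrame({"task": tasks * len(datasets), "dataset": datasets})

with gr.Blocks() as block:
    table = gr.Dataframe(type="pandas")
    refresh = gr.Button("Refresh")
    tasks = gr.Variable(value=["Clustering"])
    langs = gr.Variable(value=[])
    datasets = gr.Variable(value=["BlurbsClusteringP2P", "BlurbsClusteringS2S"])
    # The same constant triple feeds the click handler and the page load.
    refresh.click(fetch_scores, inputs=[tasks, langs, datasets], outputs=table)
    block.load(fetch_scores, inputs=[tasks, langs, datasets], outputs=table)

block.launch()
```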
@@ -681,9 +724,7 @@ with block:
     )
     gr.Markdown(r"""
 
-    Made with ❤️ for NLP
-
-    If this work is useful to you, please consider citing:
+    Made with ❤️ for NLP. If this work is useful to you, please consider citing:
 
     ```bibtex
     @article{muennighoff2022mteb,
@@ -702,7 +743,8 @@ with block:
     block.load(get_mteb_data, inputs=[task_bitext_mining], outputs=data_bitext_mining)
     block.load(get_mteb_data, inputs=[task_classification_en, lang_classification_en], outputs=data_classification_en)
     block.load(get_mteb_data, inputs=[task_classification], outputs=data_classification)
-    block.load(get_mteb_data, inputs=[task_clustering], outputs=data_clustering)
+    block.load(get_mteb_data, inputs=[task_clustering, empty, datasets_clustering], outputs=data_clustering)
+    block.load(get_mteb_data, inputs=[task_clustering_de, empty_de, datasets_clustering_de], outputs=data_clustering_de)
     block.load(get_mteb_data, inputs=[task_pair_classification], outputs=data_pair_classification)
     block.load(get_mteb_data, inputs=[task_retrieval], outputs=data_retrieval)
     block.load(get_mteb_data, inputs=[task_reranking], outputs=data_reranking)