Spaces:
Runtime error
Runtime error
Commit
·
a1e84d6
1
Parent(s):
e220002
Add Polish Retrieval
Browse files
app.py
CHANGED
|
@@ -398,8 +398,8 @@ EXTERNAL_MODEL_TO_SEQLEN = {
|
|
| 398 |
"dfm-sentence-encoder-large-1": 512,
|
| 399 |
"distiluse-base-multilingual-cased-v2": 512,
|
| 400 |
"e5-base": 512,
|
| 401 |
-
"e5-large": 512,
|
| 402 |
-
"e5-small": 512,
|
| 403 |
"electra-small-nordic": 512,
|
| 404 |
"electra-small-swedish-cased-discriminator": 512,
|
| 405 |
"gbert-base": 512,
|
|
@@ -452,18 +452,18 @@ EXTERNAL_MODEL_TO_SIZE = {
|
|
| 452 |
"allenai-specter": 0.44,
|
| 453 |
"all-MiniLM-L12-v2": 0.13,
|
| 454 |
"all-MiniLM-L6-v2": 0.09,
|
| 455 |
-
"all-mpnet-base-v2": 0.44,
|
| 456 |
-
"bert-base-uncased": 0.44,
|
| 457 |
"bert-base-swedish-cased": 0.50,
|
| 458 |
"cross-en-de-roberta-sentence-transformer": 1.11,
|
| 459 |
-
"contriever-base-msmarco": 0.44,
|
| 460 |
"DanskBERT": 0.50,
|
| 461 |
"distiluse-base-multilingual-cased-v2": 0.54,
|
| 462 |
"dfm-encoder-large-v1": 1.42,
|
| 463 |
"dfm-sentence-encoder-large-1": 1.63,
|
| 464 |
"e5-base": 0.44,
|
| 465 |
"e5-small": 0.13,
|
| 466 |
-
"e5-large": 1.34,
|
| 467 |
"electra-small-nordic": 0.09,
|
| 468 |
"electra-small-swedish-cased-discriminator": 0.06,
|
| 469 |
"gbert-base": 0.44,
|
|
@@ -471,18 +471,18 @@ EXTERNAL_MODEL_TO_SIZE = {
|
|
| 471 |
"gelectra-base": 0.44,
|
| 472 |
"gelectra-large": 1.34,
|
| 473 |
"glove.6B.300d": 0.48,
|
| 474 |
-
"gottbert-base": 0.51,
|
| 475 |
"gtr-t5-base": 0.22,
|
| 476 |
"gtr-t5-large": 0.67,
|
| 477 |
"gtr-t5-xl": 2.48,
|
| 478 |
"gtr-t5-xxl": 9.73,
|
| 479 |
-
"komninos": 0.27,
|
| 480 |
"LASER2": 0.17,
|
| 481 |
"LaBSE": 1.88,
|
| 482 |
"msmarco-bert-co-condensor": 0.44,
|
| 483 |
"multilingual-e5-base": 1.11,
|
| 484 |
"multilingual-e5-small": 0.47,
|
| 485 |
-
"multilingual-e5-large": 2.24,
|
| 486 |
"nb-bert-base": 0.71,
|
| 487 |
"nb-bert-large": 1.42,
|
| 488 |
"norbert3-base": 0.52,
|
|
@@ -496,7 +496,7 @@ EXTERNAL_MODEL_TO_SIZE = {
|
|
| 496 |
"sentence-t5-xxl": 9.73,
|
| 497 |
"sup-simcse-bert-base-uncased": 0.44,
|
| 498 |
"unsup-simcse-bert-base-uncased": 0.44,
|
| 499 |
-
"use-cmlm-multilingual": 1.89,
|
| 500 |
"xlm-roberta-base": 1.12,
|
| 501 |
"xlm-roberta-large": 2.24,
|
| 502 |
}
|
|
@@ -522,6 +522,7 @@ MODELS_TO_SKIP = {
|
|
| 522 |
"newsrx/instructor-large",
|
| 523 |
"newsrx/instructor-xl",
|
| 524 |
"dmlls/all-mpnet-base-v2",
|
|
|
|
| 525 |
}
|
| 526 |
|
| 527 |
|
|
@@ -544,7 +545,7 @@ def add_task(examples):
|
|
| 544 |
examples["mteb_task"] = "PairClassification"
|
| 545 |
elif examples["mteb_dataset_name"] in TASK_LIST_RERANKING:
|
| 546 |
examples["mteb_task"] = "Reranking"
|
| 547 |
-
elif examples["mteb_dataset_name"] in TASK_LIST_RETRIEVAL_NORM:
|
| 548 |
examples["mteb_task"] = "Retrieval"
|
| 549 |
elif examples["mteb_dataset_name"] in TASK_LIST_STS_NORM:
|
| 550 |
examples["mteb_task"] = "STS"
|
|
@@ -749,7 +750,7 @@ DATA_CLASSIFICATION_NB = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIF
|
|
| 749 |
DATA_CLASSIFICATION_SV = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_SV)
|
| 750 |
DATA_CLASSIFICATION_OTHER = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_OTHER)
|
| 751 |
DATA_CLUSTERING_GERMAN = get_mteb_data(["Clustering"], [], TASK_LIST_CLUSTERING_DE)
|
| 752 |
-
|
| 753 |
DATA_STS = get_mteb_data(["STS"])
|
| 754 |
|
| 755 |
# Exact, add all non-nan integer values for every dataset
|
|
@@ -815,11 +816,11 @@ with block:
|
|
| 815 |
with gr.Row():
|
| 816 |
data_run = gr.Button("Refresh")
|
| 817 |
task_bitext_mining = gr.Variable(value=["BitextMining"])
|
| 818 |
-
|
| 819 |
-
|
| 820 |
data_run.click(
|
| 821 |
get_mteb_data,
|
| 822 |
-
inputs=[task_bitext_mining,
|
| 823 |
outputs=data_bitext_mining,
|
| 824 |
)
|
| 825 |
with gr.TabItem("Danish"):
|
|
@@ -832,24 +833,24 @@ with block:
|
|
| 832 |
- **Credits:** [Kenneth Enevoldsen](https://github.com/KennethEnevoldsen), [scandinavian-embedding-benchmark](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/)
|
| 833 |
""")
|
| 834 |
with gr.Row():
|
| 835 |
-
|
| 836 |
DATA_BITEXT_MINING_OTHER,
|
| 837 |
datatype=["number", "markdown"] + ["number"] * len(DATA_BITEXT_MINING_OTHER.columns),
|
| 838 |
type="pandas",
|
| 839 |
)
|
| 840 |
with gr.Row():
|
| 841 |
data_run = gr.Button("Refresh")
|
| 842 |
-
|
| 843 |
-
|
| 844 |
-
|
| 845 |
data_run.click(
|
| 846 |
get_mteb_data,
|
| 847 |
inputs=[
|
| 848 |
-
|
| 849 |
-
|
| 850 |
-
|
| 851 |
],
|
| 852 |
-
outputs=
|
| 853 |
)
|
| 854 |
with gr.TabItem("Classification"):
|
| 855 |
with gr.TabItem("English"):
|
|
@@ -1011,11 +1012,11 @@ with block:
|
|
| 1011 |
with gr.Row():
|
| 1012 |
data_run = gr.Button("Refresh")
|
| 1013 |
task_clustering = gr.Variable(value=["Clustering"])
|
| 1014 |
-
|
| 1015 |
datasets_clustering = gr.Variable(value=TASK_LIST_CLUSTERING)
|
| 1016 |
data_run.click(
|
| 1017 |
get_mteb_data,
|
| 1018 |
-
inputs=[task_clustering,
|
| 1019 |
outputs=data_clustering,
|
| 1020 |
)
|
| 1021 |
with gr.TabItem("German"):
|
|
@@ -1036,11 +1037,11 @@ with block:
|
|
| 1036 |
with gr.Row():
|
| 1037 |
data_run = gr.Button("Refresh")
|
| 1038 |
task_clustering_de = gr.Variable(value=["Clustering"])
|
| 1039 |
-
|
| 1040 |
datasets_clustering_de = gr.Variable(value=TASK_LIST_CLUSTERING_DE)
|
| 1041 |
data_run.click(
|
| 1042 |
get_mteb_data,
|
| 1043 |
-
inputs=[task_clustering_de,
|
| 1044 |
outputs=data_clustering_de,
|
| 1045 |
)
|
| 1046 |
with gr.TabItem("Pair Classification"):
|
|
@@ -1108,7 +1109,6 @@ with block:
|
|
| 1108 |
data_run.click(
|
| 1109 |
get_mteb_data, inputs=[task_retrieval], outputs=data_retrieval
|
| 1110 |
)
|
| 1111 |
-
'''
|
| 1112 |
with gr.TabItem("Polish"):
|
| 1113 |
with gr.Row():
|
| 1114 |
gr.Markdown("""
|
|
@@ -1128,10 +1128,13 @@ with block:
|
|
| 1128 |
with gr.Row():
|
| 1129 |
data_run = gr.Button("Refresh")
|
| 1130 |
task_retrieval_pl = gr.Variable(value=["Retrieval"])
|
|
|
|
|
|
|
| 1131 |
data_run.click(
|
| 1132 |
-
get_mteb_data,
|
| 1133 |
-
|
| 1134 |
-
|
|
|
|
| 1135 |
with gr.TabItem("STS"):
|
| 1136 |
with gr.TabItem("English"):
|
| 1137 |
with gr.Row():
|
|
|
|
| 398 |
"dfm-sentence-encoder-large-1": 512,
|
| 399 |
"distiluse-base-multilingual-cased-v2": 512,
|
| 400 |
"e5-base": 512,
|
| 401 |
+
"e5-large": 512,
|
| 402 |
+
"e5-small": 512,
|
| 403 |
"electra-small-nordic": 512,
|
| 404 |
"electra-small-swedish-cased-discriminator": 512,
|
| 405 |
"gbert-base": 512,
|
|
|
|
| 452 |
"allenai-specter": 0.44,
|
| 453 |
"all-MiniLM-L12-v2": 0.13,
|
| 454 |
"all-MiniLM-L6-v2": 0.09,
|
| 455 |
+
"all-mpnet-base-v2": 0.44,
|
| 456 |
+
"bert-base-uncased": 0.44,
|
| 457 |
"bert-base-swedish-cased": 0.50,
|
| 458 |
"cross-en-de-roberta-sentence-transformer": 1.11,
|
| 459 |
+
"contriever-base-msmarco": 0.44,
|
| 460 |
"DanskBERT": 0.50,
|
| 461 |
"distiluse-base-multilingual-cased-v2": 0.54,
|
| 462 |
"dfm-encoder-large-v1": 1.42,
|
| 463 |
"dfm-sentence-encoder-large-1": 1.63,
|
| 464 |
"e5-base": 0.44,
|
| 465 |
"e5-small": 0.13,
|
| 466 |
+
"e5-large": 1.34,
|
| 467 |
"electra-small-nordic": 0.09,
|
| 468 |
"electra-small-swedish-cased-discriminator": 0.06,
|
| 469 |
"gbert-base": 0.44,
|
|
|
|
| 471 |
"gelectra-base": 0.44,
|
| 472 |
"gelectra-large": 1.34,
|
| 473 |
"glove.6B.300d": 0.48,
|
| 474 |
+
"gottbert-base": 0.51,
|
| 475 |
"gtr-t5-base": 0.22,
|
| 476 |
"gtr-t5-large": 0.67,
|
| 477 |
"gtr-t5-xl": 2.48,
|
| 478 |
"gtr-t5-xxl": 9.73,
|
| 479 |
+
"komninos": 0.27,
|
| 480 |
"LASER2": 0.17,
|
| 481 |
"LaBSE": 1.88,
|
| 482 |
"msmarco-bert-co-condensor": 0.44,
|
| 483 |
"multilingual-e5-base": 1.11,
|
| 484 |
"multilingual-e5-small": 0.47,
|
| 485 |
+
"multilingual-e5-large": 2.24,
|
| 486 |
"nb-bert-base": 0.71,
|
| 487 |
"nb-bert-large": 1.42,
|
| 488 |
"norbert3-base": 0.52,
|
|
|
|
| 496 |
"sentence-t5-xxl": 9.73,
|
| 497 |
"sup-simcse-bert-base-uncased": 0.44,
|
| 498 |
"unsup-simcse-bert-base-uncased": 0.44,
|
| 499 |
+
"use-cmlm-multilingual": 1.89,
|
| 500 |
"xlm-roberta-base": 1.12,
|
| 501 |
"xlm-roberta-large": 2.24,
|
| 502 |
}
|
|
|
|
| 522 |
"newsrx/instructor-large",
|
| 523 |
"newsrx/instructor-xl",
|
| 524 |
"dmlls/all-mpnet-base-v2",
|
| 525 |
+
"cgldo/semanticClone",
|
| 526 |
}
|
| 527 |
|
| 528 |
|
|
|
|
| 545 |
examples["mteb_task"] = "PairClassification"
|
| 546 |
elif examples["mteb_dataset_name"] in TASK_LIST_RERANKING:
|
| 547 |
examples["mteb_task"] = "Reranking"
|
| 548 |
+
elif examples["mteb_dataset_name"] in TASK_LIST_RETRIEVAL_NORM + TASK_LIST_RETRIEVAL_PL:
|
| 549 |
examples["mteb_task"] = "Retrieval"
|
| 550 |
elif examples["mteb_dataset_name"] in TASK_LIST_STS_NORM:
|
| 551 |
examples["mteb_task"] = "STS"
|
|
|
|
| 750 |
DATA_CLASSIFICATION_SV = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_SV)
|
| 751 |
DATA_CLASSIFICATION_OTHER = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_OTHER)
|
| 752 |
DATA_CLUSTERING_GERMAN = get_mteb_data(["Clustering"], [], TASK_LIST_CLUSTERING_DE)
|
| 753 |
+
DATA_RETRIEVAL_PL = get_mteb_data(["Retrieval"], [], TASK_LIST_RETRIEVAL_PL)
|
| 754 |
DATA_STS = get_mteb_data(["STS"])
|
| 755 |
|
| 756 |
# Exact, add all non-nan integer values for every dataset
|
|
|
|
| 816 |
with gr.Row():
|
| 817 |
data_run = gr.Button("Refresh")
|
| 818 |
task_bitext_mining = gr.Variable(value=["BitextMining"])
|
| 819 |
+
lang_bitext_mining = gr.Variable(value=[])
|
| 820 |
+
datasets_bitext_mining = gr.Variable(value=TASK_LIST_BITEXT_MINING)
|
| 821 |
data_run.click(
|
| 822 |
get_mteb_data,
|
| 823 |
+
inputs=[task_bitext_mining, lang_bitext_mining, datasets_bitext_mining],
|
| 824 |
outputs=data_bitext_mining,
|
| 825 |
)
|
| 826 |
with gr.TabItem("Danish"):
|
|
|
|
| 833 |
- **Credits:** [Kenneth Enevoldsen](https://github.com/KennethEnevoldsen), [scandinavian-embedding-benchmark](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/)
|
| 834 |
""")
|
| 835 |
with gr.Row():
|
| 836 |
+
data_bitext_mining_da = gr.components.Dataframe(
|
| 837 |
DATA_BITEXT_MINING_OTHER,
|
| 838 |
datatype=["number", "markdown"] + ["number"] * len(DATA_BITEXT_MINING_OTHER.columns),
|
| 839 |
type="pandas",
|
| 840 |
)
|
| 841 |
with gr.Row():
|
| 842 |
data_run = gr.Button("Refresh")
|
| 843 |
+
task_bitext_mining_da = gr.Variable(value=["BitextMining"])
|
| 844 |
+
lang_bitext_mining_da = gr.Variable(value=[])
|
| 845 |
+
datasets_bitext_mining_da = gr.Variable(value=TASK_LIST_BITEXT_MINING_OTHER)
|
| 846 |
data_run.click(
|
| 847 |
get_mteb_data,
|
| 848 |
inputs=[
|
| 849 |
+
task_bitext_mining_da,
|
| 850 |
+
lang_bitext_mining_da,
|
| 851 |
+
datasets_bitext_mining_da,
|
| 852 |
],
|
| 853 |
+
outputs=data_bitext_mining_da,
|
| 854 |
)
|
| 855 |
with gr.TabItem("Classification"):
|
| 856 |
with gr.TabItem("English"):
|
|
|
|
| 1012 |
with gr.Row():
|
| 1013 |
data_run = gr.Button("Refresh")
|
| 1014 |
task_clustering = gr.Variable(value=["Clustering"])
|
| 1015 |
+
lang_clustering = gr.Variable(value=[])
|
| 1016 |
datasets_clustering = gr.Variable(value=TASK_LIST_CLUSTERING)
|
| 1017 |
data_run.click(
|
| 1018 |
get_mteb_data,
|
| 1019 |
+
inputs=[task_clustering, lang_clustering, datasets_clustering],
|
| 1020 |
outputs=data_clustering,
|
| 1021 |
)
|
| 1022 |
with gr.TabItem("German"):
|
|
|
|
| 1037 |
with gr.Row():
|
| 1038 |
data_run = gr.Button("Refresh")
|
| 1039 |
task_clustering_de = gr.Variable(value=["Clustering"])
|
| 1040 |
+
lang_clustering_de = gr.Variable(value=[])
|
| 1041 |
datasets_clustering_de = gr.Variable(value=TASK_LIST_CLUSTERING_DE)
|
| 1042 |
data_run.click(
|
| 1043 |
get_mteb_data,
|
| 1044 |
+
inputs=[task_clustering_de, lang_clustering_de, datasets_clustering_de],
|
| 1045 |
outputs=data_clustering_de,
|
| 1046 |
)
|
| 1047 |
with gr.TabItem("Pair Classification"):
|
|
|
|
| 1109 |
data_run.click(
|
| 1110 |
get_mteb_data, inputs=[task_retrieval], outputs=data_retrieval
|
| 1111 |
)
|
|
|
|
| 1112 |
with gr.TabItem("Polish"):
|
| 1113 |
with gr.Row():
|
| 1114 |
gr.Markdown("""
|
|
|
|
| 1128 |
with gr.Row():
|
| 1129 |
data_run = gr.Button("Refresh")
|
| 1130 |
task_retrieval_pl = gr.Variable(value=["Retrieval"])
|
| 1131 |
+
lang_retrieval_pl = gr.Variable(value=[])
|
| 1132 |
+
datasets_retrieval_pl = gr.Variable(value=TASK_LIST_RETRIEVAL_PL)
|
| 1133 |
data_run.click(
|
| 1134 |
+
get_mteb_data,
|
| 1135 |
+
inputs=[task_retrieval_pl, lang_retrieval_pl, datasets_retrieval_pl],
|
| 1136 |
+
outputs=data_retrieval_pl
|
| 1137 |
+
)
|
| 1138 |
with gr.TabItem("STS"):
|
| 1139 |
with gr.TabItem("English"):
|
| 1140 |
with gr.Row():
|