Roman Solomatin committed: align results with models card

Files changed:
- EXTERNAL_MODEL_RESULTS.json +0 -0
- config.yaml +42 -42
- refresh.py +9 -5
EXTERNAL_MODEL_RESULTS.json
CHANGED

The diff for this file is too large to render. See raw diff.
config.yaml
CHANGED

@@ -23,7 +23,7 @@ tasks:
     metric: max_ap
     metric_description: "Average Precision (AP) based on the models similarity metric (usually cosine)"
     task_description: "Pair classification is the task of determining whether two texts are similar."
-  Reranking:
+  Reranking:
     icon: "🥈"
     metric: map
     metric_description: "Mean Average Precision (MAP)"
@@ -345,35 +345,35 @@ boards:
     credits: "[Roman Solomatin](https://github.com/Samoed) and SaluteDevices: [Alena Fenogenova](https://github.com/Alenush), [Aleksandr Abramov](https://github.com/Ab1992ao), [Artem Snegirev](https://github.com/artemsnegirev), [Anna Maksimova](https://github.com/anpalmak2003), [Maria Tikhonova](https://github.com/MariyaTikhonova)"
     tasks:
       Classification:
-        - GeoreviewClassification
-        - HeadlineClassification
-        - InappropriatenessClassification
-        - KinopoiskClassification
-        - RuReviewsClassification
-        - RuSciBenchGRNTIClassification
-        - RuSciBenchOECDClassification
-        - MassiveIntentClassification (
-        - MassiveScenarioClassification (
+        - GeoreviewClassification
+        - HeadlineClassification
+        - InappropriatenessClassification
+        - KinopoiskClassification
+        - RuReviewsClassification
+        - RuSciBenchGRNTIClassification
+        - RuSciBenchOECDClassification
+        - MassiveIntentClassification (ru)
+        - MassiveScenarioClassification (ru)
       Clustering:
-        - GeoreviewClusteringP2P
-        - RuSciBenchGRNTIClusteringP2P
-        - RuSciBenchOECDClusteringP2P
+        - GeoreviewClusteringP2P
+        - RuSciBenchGRNTIClusteringP2P
+        - RuSciBenchOECDClusteringP2P
       PairClassification:
-        - TERRa
+        - TERRa
       Reranking:
-        - RuBQReranking
-        - MIRACLReranking (
+        - RuBQReranking
+        - MIRACLReranking (ru)
       Retrieval:
-        - RiaNewsRetrieval
-        - RuBQRetrieval
-        - MIRACLRetrieval (
+        - RiaNewsRetrieval
+        - RuBQRetrieval
+        - MIRACLRetrieval (ru)
       STS:
-        - RUParaPhraserSTS
-        - RuSTSBenchmarkSTS
-        - STS22 (
+        - RUParaPhraserSTS
+        - RuSTSBenchmarkSTS
+        - STS22 (ru)
       MultilabelClassification:
-        - CEDRClassification
-        - SensitiveTopicsClassification
+        - CEDRClassification
+        - SensitiveTopicsClassification
   se:
     title: Swedish
     language_long: Swedish
@@ -530,23 +530,23 @@ boards:
     metric: nDCG@10
     tasks:
       Retrieval:
-        - AppsRetrieval
-        - CodeFeedbackMT
-        - CodeFeedbackST
-        - CodeSearchNetCCRetrieval (python
-        - CodeSearchNetCCRetrieval (javascript
-        - CodeSearchNetCCRetrieval (go
-        - CodeSearchNetCCRetrieval (ruby
-        - CodeSearchNetCCRetrieval (java
-        - CodeSearchNetCCRetrieval (php
-        - CodeSearchNetRetrieval (python
-        - CodeSearchNetRetrieval (javascript
-        - CodeSearchNetRetrieval (go
-        - CodeSearchNetRetrieval (ruby
-        - CodeSearchNetRetrieval (java
-        - CodeSearchNetRetrieval (php
-        - CodeTransOceanContest
+        - AppsRetrieval
+        - CodeFeedbackMT
+        - CodeFeedbackST
+        - CodeSearchNetCCRetrieval (python)
+        - CodeSearchNetCCRetrieval (javascript)
+        - CodeSearchNetCCRetrieval (go)
+        - CodeSearchNetCCRetrieval (ruby)
+        - CodeSearchNetCCRetrieval (java)
+        - CodeSearchNetCCRetrieval (php)
+        - CodeSearchNetRetrieval (python)
+        - CodeSearchNetRetrieval (javascript)
+        - CodeSearchNetRetrieval (go)
+        - CodeSearchNetRetrieval (ruby)
+        - CodeSearchNetRetrieval (java)
+        - CodeSearchNetRetrieval (php)
+        - CodeTransOceanContest
         - CodeTransOceanDL
-        - CosQA
+        - CosQA
         - StackOverflowQA
-        - SyntheticText2SQL
+        - SyntheticText2SQL
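For reference, the edited entries follow a boards -> board -> tasks -> task type -> dataset list nesting, with per-subset datasets written as "Name (subset)". A minimal sketch (not part of the commit) of reading one board's task lists back out of config.yaml, assuming the file sits next to the script and that the Russian board's key is "ru" (the key itself is not visible in the hunk above):

import yaml  # requires pyyaml

with open("config.yaml") as f:
    config = yaml.safe_load(f)

# Print every task type and its dataset list for the (assumed) "ru" board.
for task_type, datasets in config["boards"]["ru"]["tasks"].items():
    print(task_type, datasets)
# e.g. Reranking ['RuBQReranking', 'MIRACLReranking (ru)']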
refresh.py
CHANGED

@@ -132,11 +132,11 @@ def make_clickable_model(model_name: str, link: None | str = None) -> str:
 
 
 def add_lang(examples):
-    if not (examples["
+    if not (examples["hf_subset"]) or (examples["hf_subset"] == "default"):
         examples["mteb_dataset_name_with_lang"] = examples["mteb_dataset_name"]
     else:
         examples["mteb_dataset_name_with_lang"] = (
-            examples["mteb_dataset_name"] + f' ({examples["
+            examples["mteb_dataset_name"] + f' ({examples["hf_subset"]})'
         )
     return examples
 
@@ -313,7 +313,7 @@ def get_external_model_results():
 
     # Save & cache EXTERNAL_MODEL_RESULTS
     with open("EXTERNAL_MODEL_RESULTS.json", "w") as f:
-        json.dump(EXTERNAL_MODEL_RESULTS, f, indent=4)
+        json.dump(dict(sorted(EXTERNAL_MODEL_RESULTS.items())), f, indent=4)
 
     return EXTERNAL_MODEL_RESULTS
 
@@ -332,6 +332,10 @@ def download_or_use_cache(modelId: str):
     return meta
 
 
+def simplify_dataset_name(name):
+    return name.replace("MTEB ", "").replace(" (default)", "")
+
+
 def get_mteb_data(
     tasks: list = ["Clustering"],
     langs: list = [],
@@ -450,11 +454,11 @@ def get_mteb_data(
         try:
             out = [
                 {
-                    res["dataset"]["name"]
+                    simplify_dataset_name(res["dataset"]["name"]): [
                         round(score["value"], 2)
                         for score in res["metrics"]
                         if filter_metric_fetched(
-                            res["dataset"]["name"]
+                            simplify_dataset_name(res["dataset"]["name"]),
                             score["type"],
                             task_to_metric.get(res["task"]["type"]),
                             res["dataset"]["split"],
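As a rough illustration of how the updated helpers behave, a short hypothetical usage sketch (not part of the commit; it assumes the definitions above can be imported from refresh.py):

# e.g. from refresh import add_lang, simplify_dataset_name

# simplify_dataset_name strips the "MTEB " prefix and a trailing " (default)" subset.
print(simplify_dataset_name("MTEB TERRa (default)"))       # TERRa
print(simplify_dataset_name("MTEB MIRACLRetrieval (ru)"))  # MIRACLRetrieval (ru)

# add_lang appends hf_subset in parentheses unless it is empty or "default",
# matching the "(ru)" style entries now used in config.yaml.
row = {"mteb_dataset_name": "STS22", "hf_subset": "ru"}
print(add_lang(row)["mteb_dataset_name_with_lang"])        # STS22 (ru)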