Commit · 6181979
1 Parent(s): 3be8255
Fix metric names & metadata new format
- EXTERNAL_MODEL_RESULTS.json +0 -0
- app.py +24 -18
- config.yaml +9 -9
EXTERNAL_MODEL_RESULTS.json CHANGED
The diff for this file is too large to render. See raw diff.
app.py CHANGED
@@ -23,7 +23,15 @@ PRETTY_NAMES = {
     "BitextMining": "Bitext Mining",
 }
 
-TASK_TO_METRIC = {k: v["metric"] for k, v in TASKS_CONFIG.items()}
+TASK_TO_METRIC = {k: [v["metric"]] for k, v in TASKS_CONFIG.items()}
+# Add legacy metric names
+TASK_TO_METRIC["STS"].append("cos_sim_spearman")
+TASK_TO_METRIC["STS"].append("cosine_spearman")
+TASK_TO_METRIC["Summarization"].append("cos_sim_spearman")
+TASK_TO_METRIC["Summarization"].append("cosine_spearman")
+TASK_TO_METRIC["PairClassification"].append("cos_sim_ap")
+TASK_TO_METRIC["PairClassification"].append("cosine_ap")
+
 
 def make_clickable_model(model_name, link=None):
     if link is None:
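Net effect of this hunk: each task now maps to a list of accepted metric names instead of a single string, with the canonical new-format name first. A minimal standalone sketch using a toy two-task TASKS_CONFIG (the real one comes from config.yaml, changed below):

TASKS_CONFIG = {"STS": {"metric": "spearman"}, "PairClassification": {"metric": "ap"}}  # toy config

TASK_TO_METRIC = {k: [v["metric"]] for k, v in TASKS_CONFIG.items()}
TASK_TO_METRIC["STS"] += ["cos_sim_spearman", "cosine_spearman"]        # legacy aliases
TASK_TO_METRIC["PairClassification"] += ["cos_sim_ap", "cosine_ap"]

assert TASK_TO_METRIC["STS"][0] == "spearman"  # first entry = canonical name used for storage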
@@ -93,16 +101,16 @@ def add_task(examples):
     examples["mteb_task"] = "Unknown"
     return examples
 
-def filter_metric_external(x, task,
+def filter_metric_external(x, task, metrics):
     # This is a hack for the passkey and needle retrieval test, which reports ndcg_at_1 (i.e. accuracy), rather than the ndcg_at_10 that is commonly used for retrieval tasks.
     if x['mteb_dataset_name'] in ['LEMBNeedleRetrieval', 'LEMBPasskeyRetrieval']:
         return x["mteb_task"] == task and x['metric'] == 'ndcg_at_1'
     else:
-        return x["mteb_task"] == task and x["metric"]
+        return x["mteb_task"] == task and x["metric"] in metrics
 
-def filter_metric_fetched(name, metric,
+def filter_metric_fetched(name, metric, expected_metrics):
     # This is a hack for the passkey and needle retrieval test, which reports ndcg_at_1 (i.e. accuracy), rather than the ndcg_at_10 that is commonly used for retrieval tasks.
-    return metric == 'ndcg_at_1' if name in ['LEMBNeedleRetrieval', 'LEMBPasskeyRetrieval'] else metric
+    return metric == 'ndcg_at_1' if name in ['LEMBNeedleRetrieval', 'LEMBPasskeyRetrieval'] else metric in expected_metrics
 
 if os.path.exists("EXTERNAL_MODEL_RESULTS.json"):
     with open("EXTERNAL_MODEL_RESULTS.json") as f:
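Both filter helpers now take the full list of accepted names, so results reported under either a new-format or a legacy metric name pass through. A small self-contained sketch of the fetched-results filter; "STS12" is just an illustrative dataset name:

def filter_metric_fetched(name, metric, expected_metrics):
    # Passkey/needle retrieval report ndcg_at_1; everything else must match one of the accepted names.
    return metric == 'ndcg_at_1' if name in ['LEMBNeedleRetrieval', 'LEMBPasskeyRetrieval'] else metric in expected_metrics

sts_metrics = ["spearman", "cos_sim_spearman", "cosine_spearman"]
assert filter_metric_fetched("STS12", "spearman", sts_metrics)          # new-format name
assert filter_metric_fetched("STS12", "cos_sim_spearman", sts_metrics)  # legacy name still accepted
assert filter_metric_fetched("LEMBPasskeyRetrieval", "ndcg_at_1", ["ndcg_at_10"])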
@@ -112,9 +120,9 @@ if os.path.exists("EXTERNAL_MODEL_RESULTS.json"):
     for model in EXTERNAL_MODELS:
         if model not in EXTERNAL_MODEL_RESULTS:
             models_to_run.append(model)
-            EXTERNAL_MODEL_RESULTS[model] = {k: {v: []} for k, v in TASK_TO_METRIC.items()}
+            EXTERNAL_MODEL_RESULTS[model] = {k: {v[0]: []} for k, v in TASK_TO_METRIC.items()}
 else:
-    EXTERNAL_MODEL_RESULTS = {model: {k: {v: []} for k, v in TASK_TO_METRIC.items()} for model in EXTERNAL_MODELS}
+    EXTERNAL_MODEL_RESULTS = {model: {k: {v[0]: []} for k, v in TASK_TO_METRIC.items()} for model in EXTERNAL_MODELS}
     models_to_run = EXTERNAL_MODELS
 
 pbar = tqdm(models_to_run, desc="Fetching external model results")
@@ -127,10 +135,11 @@ for model in pbar:
     ds = ds.map(add_task)
     base_dict = {"Model": make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))}
 
-    for task,
-        ds_dict = ds.filter(lambda x: filter_metric_external(x, task,
+    for task, metrics in TASK_TO_METRIC.items():
+        ds_dict = ds.filter(lambda x: filter_metric_external(x, task, metrics))["test"].to_dict()
         ds_dict = {k: round(v, 2) for k, v in zip(ds_dict["mteb_dataset_name_with_lang"], ds_dict["score"])}
-
+        # metrics[0] is the main name for this metric; other names in the list are legacy for backward-compat
+        EXTERNAL_MODEL_RESULTS[model][task][metrics[0]].append({**base_dict, **ds_dict})
 
     # Save & cache EXTERNAL_MODEL_RESULTS
     with open("EXTERNAL_MODEL_RESULTS.json", "w") as f:
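With this change, cached external results are written under the canonical metric name only (metrics[0]); legacy aliases are accepted when filtering but never used as storage keys. A rough sketch of the resulting EXTERNAL_MODEL_RESULTS.json shape, with a hypothetical model name and made-up scores:

# Hypothetical model name and made-up numbers, shaped like the cache written above.
EXTERNAL_MODEL_RESULTS = {
    "some-external-model": {
        "STS": {
            "spearman": [  # canonical name, i.e. metrics[0]; legacy aliases are not keys here
                {"Model": "<clickable model link>", "STS12 (en)": 72.5},
            ],
        },
    },
}

# Read-back (see the get_mteb_data hunk below) also goes through the first, canonical name.
rows = EXTERNAL_MODEL_RESULTS["some-external-model"]["STS"]["spearman"]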
@@ -204,9 +213,8 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
         results_list = []
         for task in tasks:
             # Not all models have InstructionRetrieval, other new tasks
-            if task not in EXTERNAL_MODEL_RESULTS[model]:
-
-            results_list += EXTERNAL_MODEL_RESULTS[model][task][task_to_metric[task]]
+            if task not in EXTERNAL_MODEL_RESULTS[model]: continue
+            results_list += EXTERNAL_MODEL_RESULTS[model][task][task_to_metric[task][0]]
 
         if len(datasets) > 0:
             res = {k: v for d in results_list for k, v in d.items() if (k == "Model") or any([x in k for x in datasets])}
@@ -262,7 +270,8 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
         # import pdb; pdb.set_trace()
         try:
             out = [{res["dataset"]["name"].replace("MTEB ", ""): [round(score["value"], 2) for score in res["metrics"] if filter_metric_fetched(res["dataset"]["name"].replace("MTEB ", ""), score["type"], task_to_metric.get(res["task"]["type"]))][0]} for res in task_results]
-        except:
+        except Exception as e:
+            import pdb; pdb.set_trace()
             print("ERROR", model.modelId)
             continue
         out = {k: v for d in out for k, v in d.items()}
@@ -304,10 +313,7 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
     if len(datasets) > 0:
         # Update legacy column names to be merged with newer ones
         # Update 'MLSUMClusteringP2P (fr)' with values from 'MLSUMClusteringP2P'
-        #if ('MLSUMClusteringP2P (fr)' in datasets):
-        # import pdb; pdb.set_trace()
         if ('MLSUMClusteringP2P (fr)' in datasets) and ('MLSUMClusteringP2P' in cols):
-            #import pdb; pdb.set_trace()
             df['MLSUMClusteringP2P (fr)'] = df['MLSUMClusteringP2P (fr)'].fillna(df['MLSUMClusteringP2P'])
             datasets.remove('MLSUMClusteringP2P')
         if ('MLSUMClusteringS2S (fr)' in datasets) and ('MLSUMClusteringS2S' in cols):
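For reference, the fillna call kept here is what merges scores reported under the legacy column name into the newer language-suffixed column. A standalone sketch with toy values (not real leaderboard scores):

import pandas as pd

df = pd.DataFrame({
    "MLSUMClusteringP2P (fr)": [None, 41.2],   # new-format column, partially filled
    "MLSUMClusteringP2P": [38.7, None],        # legacy column name
})
# Take the legacy value wherever the new-format column is missing one:
df["MLSUMClusteringP2P (fr)"] = df["MLSUMClusteringP2P (fr)"].fillna(df["MLSUMClusteringP2P"])
print(df["MLSUMClusteringP2P (fr)"].tolist())  # [38.7, 41.2]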
@@ -656,7 +662,7 @@ with gr.Blocks(css=css) as block:
                 gr.Markdown(f"""
                 {item['description']}
 
-                - **Metric:** {
+                - **Metric:** {specific_metric}
                 - **Languages:** {item['language_long'] if 'language_long' in item else item['language']}
                 {"- **Credits:** " + item['credits'] if ("credits" in item and item["credits"] is not None) else ''}
                 """)
config.yaml CHANGED
@@ -16,12 +16,12 @@ tasks:
   Clustering:
     icon: "β¨"
     metric: v_measure
-    metric_description: "Validity Measure (
+    metric_description: "Validity Measure (V-measure)"
     task_description: "Clustering is the task of grouping similar documents together."
   PairClassification:
     icon: "π"
-    metric:
-    metric_description: "Average Precision based on
+    metric: ap
+    metric_description: "Average Precision (AP) based on the models similarity metric (usually cosine)"
     task_description: "Pair classification is the task of determining whether two texts are similar."
   Reranking:
     icon: "π₯"
@@ -31,22 +31,22 @@ tasks:
   Retrieval:
     icon: "π"
     metric: ndcg_at_10
-    metric_description: "Normalized Discounted Cumulative Gain @
+    metric_description: "Normalized Discounted Cumulative Gain @ 10 (nDCG@10)"
     task_description: "Retrieval is the task of finding relevant documents for a query."
   STS:
     icon: "βοΈ"
-    metric:
-    metric_description: "Spearman correlation based on
+    metric: spearman
+    metric_description: "Spearman correlation based on the model's similarity metric (usually cosine)"
     task_description: "Semantic Textual Similarity is the task of determining how similar two texts are."
   Summarization:
     icon: "π"
-    metric:
-    metric_description: "Spearman correlation
+    metric: spearman
+    metric_description: "Spearman correlation based on the model's similarity metric (usually cosine)"
     task_description: "Summarization is the task of generating a summary of a text."
   InstructionRetrieval:
     icon: "ππ"
     metric: "p-MRR"
-    metric_description: "paired mean reciprocal rank"
+    metric_description: "paired mean reciprocal rank (p-MRR)"
     task_description: "Retrieval w/Instructions is the task of finding relevant documents for a query that has detailed instructions."
 boards:
   en:
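The metric fields above are the source of app.py's TASKS_CONFIG and hence TASK_TO_METRIC. A minimal sketch of reading them, assuming a plain yaml.safe_load of config.yaml with the top-level tasks: key shown in these hunks (the actual loading code is not part of this diff):

import yaml

with open("config.yaml") as f:
    config = yaml.safe_load(f)

TASKS_CONFIG = config["tasks"]  # assumes the top-level "tasks:" key seen in the hunk headers
print(TASKS_CONFIG["STS"]["metric"])        # "spearman" after this commit
print(TASKS_CONFIG["Retrieval"]["metric"])  # "ndcg_at_10"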