Caden Shokat
committed
Commit fa6c34a · 1 Parent(s): f0943d0
added model evaluation for both models
- src/eval/bert_baseline.py +0 -78
- src/eval/ir_eval.py +36 -0
- src/eval/log_metrics.py +22 -0
- src/eval/model_baseline.py +0 -10
- src/eval/run_base_eval.py +17 -0
- src/eval/run_ft_eval.py +17 -0
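In short: src/eval/ir_eval.py builds a SequentialEvaluator of per-dimension InformationRetrievalEvaluators over the combined train+test corpus, src/eval/log_metrics.py pretty-prints the resulting metrics, and the two run_*_eval.py scripts apply that evaluator to the base model (CFG.model_id) and the fine-tuned model (CFG.output_dir). The earlier ad-hoc BERT baseline scripts are removed.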
src/eval/bert_baseline.py
DELETED
@@ -1,78 +0,0 @@
-from transformers import AutoModel, AutoTokenizer
-from datasets import load_dataset
-from sklearn.cluster import KMeans
-from torch.nn.functional import normalize
-from scipy.stats import spearmanr
-from sklearn.datasets import fetch_20newsgroups
-import torch
-import numpy as np
-
-
-if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
-    device = torch.device("mps")
-    print("Using Apple MPS")
-else:
-    device = torch.device("cpu")
-    print("Using CPU")
-
-def embed_texts(texts, model, tokenizer, device=device):
-    ins = tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(device)
-
-    with torch.no_grad():
-        out = model(**ins).last_hidden_state
-
-    vecs = out.mean(dim=1)
-    return normalize(vecs, dim=-1).cpu().numpy()
-
-def spearman_eval(model_name="bert-base-uncased", split="validation"):
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModel.from_pretrained(model_name).eval().to(device)
-    ds = load_dataset("glue", "stsb", split=split)
-
-    sims, gold = [], []
-    for ex in ds:
-        u = embed_texts([ex["sentence1"]], model, tokenizer)[0]
-        v = embed_texts([ex["sentence2"]], model, tokenizer)[0]
-
-        sims.append(float(np.dot(u, v)))
-        gold.append(ex["label"] / 5.0)
-
-    corr, _ = spearmanr(sims, gold)
-    print(f"BERT Baseline Spearman: {corr:.4f}")
-
-
-def embed_in_batches(texts, model, tokenizer, batch_size=100):
-    all_vecs = []
-    for i in range(0, len(texts), batch_size):
-        batch = texts[i : i + batch_size]
-        vecs = embed_texts(batch, model, tokenizer)
-        all_vecs.append(vecs)
-        if device.type == "mps":
-            torch.mps.empty_cache()
-    return np.vstack(all_vecs)
-
-
-def clustering_purity(model_name="bert-base-uncased", sample_size=2000, batch_size=100):
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModel.from_pretrained(model_name).eval().to(device)
-
-    ds = load_dataset("SetFit/20_newsgroups", split="train")
-    texts = ds["text"][:sample_size]
-    labels = np.array(ds["label"][:sample_size])
-
-    vecs = embed_in_batches(texts, model, tokenizer, batch_size)
-
-    clusters = KMeans(n_clusters=len(set(labels)),
-                      random_state=0).fit_predict(vecs)
-    purity = (clusters == labels).sum() / len(labels)
-    print(f"Purity (N={sample_size}): {purity:.4f}")
-
-
-
-if __name__ == "__main__":
-    # spearman_eval()
-    clustering_purity()
-
-
-
-
src/eval/ir_eval.py
ADDED
@@ -0,0 +1,36 @@
+from datasets import load_dataset, concatenate_datasets
+from sentence_transformers.evaluation import InformationRetrievalEvaluator, SequentialEvaluator
+from sentence_transformers.util import cos_sim
+from src.utils.paths import TRAIN_JSON, TEST_JSON
+
+def build_eval(matryoshka_dims: list[int] | tuple[int, ...]):
+    test_dataset = load_dataset("json", data_files=str(TEST_JSON), split="train")
+    train_dataset = load_dataset("json", data_files=str(TRAIN_JSON), split="train")
+
+    aws_dataset = concatenate_datasets([train_dataset, test_dataset])
+
+    corpus = dict(zip(aws_dataset["id"], aws_dataset["positive"]))
+
+    queries = dict(zip(test_dataset["id"], test_dataset["anchor"]))
+
+    relevant_docs: dict[int, list[int]] = {}
+    g2c = {}
+    for cid, g in zip(aws_dataset["id"], aws_dataset["global_id"]):
+        g2c.setdefault(g, []).append(cid)
+
+    for qid, g in zip(test_dataset["id"], test_dataset["global_id"]):
+        relevant_docs[qid] = g2c.get(g, [])
+
+    evaluators = []
+    for dim in matryoshka_dims:
+        ir = InformationRetrievalEvaluator(
+            queries=queries,
+            corpus=corpus,
+            relevant_docs=relevant_docs,
+            name=f"dim_{dim}",
+            truncate_dim=dim,
+            score_functions={"cosine": cos_sim},
+        )
+        evaluators.append(ir)
+
+    return SequentialEvaluator(evaluators)
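To make the relevance construction concrete, here is a tiny worked example of the g2c grouping: every test query is treated as relevant to all corpus chunks that share its global_id. The ids below are invented for illustration; the real ones come from TRAIN_JSON/TEST_JSON.

# Hypothetical data: three corpus chunks, two of which come from the same source document ("g1").
corpus_ids = [101, 102, 103]
global_ids = ["g1", "g1", "g2"]

g2c = {}
for cid, g in zip(corpus_ids, global_ids):
    g2c.setdefault(g, []).append(cid)
# g2c == {"g1": [101, 102], "g2": [103]}

# A test query generated from document "g1" is considered relevant to every chunk of "g1".
query_ids = [7]
query_global_ids = ["g1"]
relevant_docs = {qid: g2c.get(g, []) for qid, g in zip(query_ids, query_global_ids)}
# relevant_docs == {7: [101, 102]}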
src/eval/log_metrics.py
ADDED
@@ -0,0 +1,22 @@
+def print_results_table(title: str, results: dict, dims: list[int] | tuple[int, ...]):
+    print(f"\n{title}")
+    print("-" * 85)
+    header = f"{'Metric':15} " + " ".join([f"{d:>12}d" for d in dims])
+    print(header)
+    print("-" * 85)
+
+    metrics = [
+        "ndcg@10", "mrr@10", "map@100",
+        "accuracy@1", "accuracy@3", "accuracy@5", "accuracy@10",
+        "precision@1", "precision@3", "precision@5", "precision@10",
+        "recall@1", "recall@3", "recall@5", "recall@10",
+    ]
+
+    for m in metrics:
+        row = [f"{'=='+m+'==' if m=='ndcg@10' else m:15}"]
+        for d in dims:
+            key = f"dim_{d}_cosine_{m}"
+            row.append(f"{results[key]:12.4f}")
+        print(" ".join(row))
+    print("-" * 85)
+    print(f"seq_score: {results['sequential_score']:1f}")
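A minimal, self-contained way to exercise the printer, assuming the repository root is on PYTHONPATH; the metric values here are fabricated placeholders, whereas in the runners the dict comes straight from the evaluator.

from src.eval.log_metrics import print_results_table

dims = (256, 64)  # illustrative dims, not the project's actual CFG.matryoshka_dims
metric_names = ["ndcg@10", "mrr@10", "map@100",
                "accuracy@1", "accuracy@3", "accuracy@5", "accuracy@10",
                "precision@1", "precision@3", "precision@5", "precision@10",
                "recall@1", "recall@3", "recall@5", "recall@10"]
# Keys follow the f"dim_{d}_cosine_{metric}" pattern the printer expects.
results = {f"dim_{d}_cosine_{m}": 0.5 for d in dims for m in metric_names}
results["sequential_score"] = 0.5
print_results_table("Smoke Test", results, dims)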
src/eval/model_baseline.py
DELETED
@@ -1,10 +0,0 @@
-import torch
-
-from sentence_transformers import SentenceTransformer, SentenceTransformerModelCardData, SentenceTransformerTrainingArguments, SentenceTransformerTrainer
-from sentence_transformers.evaluation import InformationRetrievalEvaluator, SequentialEvaluator
-from sentence_transformers.util import cos_sim
-from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss
-from sentence_transformers.training_args import BatchSamplers
-
-from datasets import load_dataset, concatenate_datasets
-
src/eval/run_base_eval.py
ADDED
@@ -0,0 +1,17 @@
+import torch
+from sentence_transformers import SentenceTransformer
+from src.utils.config import CFG
+from src.eval.ir_eval import build_eval
+from src.eval.log_metrics import print_results_table
+
+def main():
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model = SentenceTransformer(CFG.model_id, device=device)
+
+    evaluator = build_eval(CFG.matryoshka_dims)
+    base_results = evaluator(model)
+
+    print_results_table("Base Model Evaluation Results", base_results, CFG.matryoshka_dims)
+
+if __name__ == "__main__":
+    main()
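Since both runner scripts import from the src package, presumably they are launched from the repository root as modules, e.g. python -m src.eval.run_base_eval (an assumption; the commit adds no launcher or docs).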
src/eval/run_ft_eval.py
ADDED
@@ -0,0 +1,17 @@
+import torch
+from sentence_transformers import SentenceTransformer
+from src.utils.config import CFG
+from src.eval.ir_eval import build_eval
+from src.eval.log_metrics import print_results_table
+
+def main():
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model = SentenceTransformer(CFG.output_dir, device=device)
+
+    evaluator = build_eval(CFG.matryoshka_dims)
+    ft_results = evaluator(model)
+
+    print_results_table("Fine Tuned Model Evaluation Results", ft_results, CFG.matryoshka_dims)
+
+if __name__ == "__main__":
+    main()
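run_ft_eval.py mirrors run_base_eval.py, differing only in loading the fine-tuned checkpoint from CFG.output_dir rather than CFG.model_id, so the two printed tables are directly comparable dimension by dimension.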