Caden Shokat committed
Commit fa6c34a · 1 Parent(s): f0943d0

added model evaluation for both models
src/eval/bert_baseline.py DELETED
@@ -1,78 +0,0 @@
- from transformers import AutoModel, AutoTokenizer
- from datasets import load_dataset
- from sklearn.cluster import KMeans
- from torch.nn.functional import normalize
- from scipy.stats import spearmanr
- from sklearn.datasets import fetch_20newsgroups
- import torch
- import numpy as np
-
-
- if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
-     device = torch.device("mps")
-     print("Using Apple MPS")
- else:
-     device = torch.device("cpu")
-     print("Using CPU")
-
- def embed_texts(texts, model, tokenizer, device=device):
-     ins = tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(device)
-
-     with torch.no_grad():
-         out = model(**ins).last_hidden_state
-
-     vecs = out.mean(dim=1)
-     return normalize(vecs, dim=-1).cpu().numpy()
-
- def spearman_eval(model_name="bert-base-uncased", split="validation"):
-     tokenizer = AutoTokenizer.from_pretrained(model_name)
-     model = AutoModel.from_pretrained(model_name).eval().to(device)
-     ds = load_dataset("glue", "stsb", split=split)
-
-     sims, gold = [], []
-     for ex in ds:
-         u = embed_texts([ex["sentence1"]], model, tokenizer)[0]
-         v = embed_texts([ex["sentence2"]], model, tokenizer)[0]
-
-         sims.append(float(np.dot(u, v)))
-         gold.append(ex["label"] / 5.0)
-
-     corr, _ = spearmanr(sims, gold)
-     print(f"BERT Baseline Spearman: {corr:.4f}")
-
-
- def embed_in_batches(texts, model, tokenizer, batch_size=100):
-     all_vecs = []
-     for i in range(0, len(texts), batch_size):
-         batch = texts[i : i + batch_size]
-         vecs = embed_texts(batch, model, tokenizer)
-         all_vecs.append(vecs)
-         if device.type == "mps":
-             torch.mps.empty_cache()
-     return np.vstack(all_vecs)
-
-
- def clustering_purity(model_name="bert-base-uncased", sample_size=2000, batch_size=100):
-     tokenizer = AutoTokenizer.from_pretrained(model_name)
-     model = AutoModel.from_pretrained(model_name).eval().to(device)
-
-     ds = load_dataset("SetFit/20_newsgroups", split="train")
-     texts = ds["text"][:sample_size]
-     labels = np.array(ds["label"][:sample_size])
-
-     vecs = embed_in_batches(texts, model, tokenizer, batch_size)
-
-     clusters = KMeans(n_clusters=len(set(labels)), random_state=0).fit_predict(vecs)
-     purity = (clusters == labels).sum() / len(labels)
-     print(f"Purity (N={sample_size}): {purity:.4f}")
-
-
- if __name__ == "__main__":
-     # spearman_eval()
-     clustering_purity()
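One note on the removed baseline: its purity computation compared raw KMeans cluster IDs to the true label IDs (`(clusters == labels).sum() / len(labels)`), which only measures agreement if the arbitrary cluster numbering happens to coincide with the label numbering. Standard purity takes the majority true label within each cluster; a minimal sketch for reference (the helper name is hypothetical):

    import numpy as np

    def cluster_purity(clusters, labels):
        # For each cluster, count its most common true label,
        # then divide the summed majority counts by the sample count.
        total = 0
        for c in np.unique(clusters):
            members = labels[clusters == c]
            total += np.bincount(members).max()
        return total / len(labels)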
src/eval/ir_eval.py ADDED
@@ -0,0 +1,36 @@
+ from datasets import load_dataset, concatenate_datasets
+ from sentence_transformers.evaluation import InformationRetrievalEvaluator, SequentialEvaluator
+ from sentence_transformers.util import cos_sim
+ from src.utils.paths import TRAIN_JSON, TEST_JSON
+
+ def build_eval(matryoshka_dims: list[int] | tuple[int, ...]):
+     test_dataset = load_dataset("json", data_files=str(TEST_JSON), split="train")
+     train_dataset = load_dataset("json", data_files=str(TRAIN_JSON), split="train")
+
+     aws_dataset = concatenate_datasets([train_dataset, test_dataset])
+
+     corpus = dict(zip(aws_dataset["id"], aws_dataset["positive"]))
+
+     queries = dict(zip(test_dataset["id"], test_dataset["anchor"]))
+
+     relevant_docs: dict[int, list[int]] = {}
+     g2c = {}
+     for cid, g in zip(aws_dataset["id"], aws_dataset["global_id"]):
+         g2c.setdefault(g, []).append(cid)
+
+     for qid, g in zip(test_dataset["id"], test_dataset["global_id"]):
+         relevant_docs[qid] = g2c.get(g, [])
+
+     evaluators = []
+     for dim in matryoshka_dims:
+         ir = InformationRetrievalEvaluator(
+             queries=queries,
+             corpus=corpus,
+             relevant_docs=relevant_docs,
+             name=f"dim_{dim}",
+             truncate_dim=dim,
+             score_functions={"cosine": cos_sim},
+         )
+         evaluators.append(ir)
+
+     return SequentialEvaluator(evaluators)
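A caveat on the wiring above: sentence-transformers documents `relevant_docs` for `InformationRetrievalEvaluator` as a mapping from query ID to a set of document IDs (`dict[str, set[str]]`), while `build_eval` passes lists keyed by whatever type the `id` column holds. If the dataset IDs are not already strings, a defensive conversion along these lines (a sketch reusing the variables above) matches the documented contract:

    relevant_docs = {
        str(qid): {str(cid) for cid in g2c.get(g, [])}
        for qid, g in zip(test_dataset["id"], test_dataset["global_id"])
    }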
src/eval/log_metrics.py ADDED
@@ -0,0 +1,22 @@
+ def print_results_table(title: str, results: dict, dims: list[int] | tuple[int, ...]):
+     print(f"\n{title}")
+     print("-" * 85)
+     header = f"{'Metric':15} " + " ".join([f"{d:>12}d" for d in dims])
+     print(header)
+     print("-" * 85)
+
+     metrics = [
+         "ndcg@10", "mrr@10", "map@100",
+         "accuracy@1", "accuracy@3", "accuracy@5", "accuracy@10",
+         "precision@1", "precision@3", "precision@5", "precision@10",
+         "recall@1", "recall@3", "recall@5", "recall@10",
+     ]
+
+     for m in metrics:
+         row = [f"{'==' + m + '==' if m == 'ndcg@10' else m:15}"]
+         for d in dims:
+             key = f"dim_{d}_cosine_{m}"
+             row.append(f"{results[key]:12.4f}")
+         print(" ".join(row))
+     print("-" * 85)
+     print(f"seq_score: {results['sequential_score']:.4f}")
src/eval/model_baseline.py DELETED
@@ -1,10 +0,0 @@
- import torch
-
- from sentence_transformers import SentenceTransformer, SentenceTransformerModelCardData, SentenceTransformerTrainingArguments, SentenceTransformerTrainer
- from sentence_transformers.evaluation import InformationRetrievalEvaluator, SequentialEvaluator
- from sentence_transformers.util import cos_sim
- from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss
- from sentence_transformers.training_args import BatchSamplers
-
- from datasets import load_dataset, concatenate_datasets
-
src/eval/run_base_eval.py ADDED
@@ -0,0 +1,17 @@
+ import torch
+ from sentence_transformers import SentenceTransformer
+ from src.utils.config import CFG
+ from src.eval.ir_eval import build_eval
+ from src.eval.log_metrics import print_results_table
+
+ def main():
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     model = SentenceTransformer(CFG.model_id, device=device)
+
+     evaluator = build_eval(CFG.matryoshka_dims)
+     base_results = evaluator(model)
+
+     print_results_table("Base Model Evaluation Results", base_results, CFG.matryoshka_dims)
+
+ if __name__ == "__main__":
+     main()
src/eval/run_ft_eval.py ADDED
@@ -0,0 +1,17 @@
+ import torch
+ from sentence_transformers import SentenceTransformer
+ from src.utils.config import CFG
+ from src.eval.ir_eval import build_eval
+ from src.eval.log_metrics import print_results_table
+
+ def main():
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     model = SentenceTransformer(CFG.output_dir, device=device)
+
+     evaluator = build_eval(CFG.matryoshka_dims)
+     ft_results = evaluator(model)
+
+     print_results_table("Fine Tuned Model Evaluation Results", ft_results, CFG.matryoshka_dims)
+
+ if __name__ == "__main__":
+     main()
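Both runners import from `src.*`, so they presumably need to be launched from the repository root as modules (assuming the package layout matches the import paths):

    python -m src.eval.run_base_eval   # evaluates CFG.model_id (base checkpoint)
    python -m src.eval.run_ft_eval     # evaluates CFG.output_dir (fine-tuned model)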