Update README.md
README.md (changed)
````diff
@@ -37,6 +37,7 @@ import torch
 from transformers import AutoModel, AutoTokenizer
 from transformers import AutoModelForTextEncoding
 
+
 def load_model_and_tokenizer(model_name: str) -> Tuple[AutoModel, AutoTokenizer]:
     """
     Load the model and tokenizer based on the given model name.
@@ -47,7 +48,9 @@ def load_model_and_tokenizer(model_name: str) -> Tuple[AutoModel, AutoTokenizer]
     Returns:
         Tuple[AutoModelForTextEncoding, AutoTokenizer]: The loaded model and tokenizer.
     """
-    model = AutoModelForTextEncoding.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")
+    model = AutoModelForTextEncoding.from_pretrained(
+        model_name, torch_dtype=torch.bfloat16, device_map="auto"
+    )
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     model.eval()  # Deactivate Dropout
     return model, tokenizer
@@ -64,9 +67,11 @@ This computes the embeddings for the given texts given the model and tokenizer via weighted mean pooling across seq_len
 
 
 ```python
-def get_embeddings(model: AutoModel, tokenizer: AutoTokenizer, texts: List[str]) -> torch.Tensor:
+def get_embeddings(
+    model: AutoModel, tokenizer: AutoTokenizer, texts: List[str]
+) -> torch.Tensor:
     """
-
+    compute text embeddings via weighted mean pooling across seq_len
 
     Args:
         model (AutoModel): The model to be used for getting embeddings.
@@ -81,7 +86,9 @@ def get_embeddings(model: AutoModel, tokenizer: AutoTokenizer, texts: List[str])
 
     # Get the embeddings
     with torch.no_grad():
-        last_hidden_state = model(**batch_tokens, output_hidden_states=True, return_dict=True).last_hidden_state
+        last_hidden_state = model(
+            **batch_tokens, output_hidden_states=True, return_dict=True
+        ).last_hidden_state
 
         # Get weights
         weights = (
@@ -89,7 +96,8 @@ def get_embeddings(model: AutoModel, tokenizer: AutoTokenizer, texts: List[str])
             .unsqueeze(0)
             .unsqueeze(-1)
             .expand(last_hidden_state.size())
-            .float()
+            .float()
+            .to(last_hidden_state.device)
         )
 
         # Get attn mask
@@ -123,17 +131,14 @@ Helper fn to compute and print out cosine similarity
 from scipy.spatial.distance import cosine
 
 def calculate_cosine_similarity(embeddings: torch.Tensor, texts: List[str]) -> None:
-    """
-    Calculate and print the cosine similarity between the first text and all other texts.
-
-    Args:
-        embeddings (torch.Tensor): The embeddings for the texts.
-        texts (List[str]): The texts for which cosine similarity is to be calculated.
-    """
+    """compute and print the cosine sim between the first text and all others"""
    # Calculate cosine similarities
    for i in range(1, len(embeddings)):
        cosine_sim = 1 - cosine(embeddings[0], embeddings[i])
-        print('Cosine similarity between "%s" and "%s" is: %.3f' % (texts[0], texts[i], cosine_sim))
+        print(
+            'Cosine similarity between "%s" and "%s" is: %.3f'
+            % (texts[0], texts[i], cosine_sim)
+        )
 ```
 
 </details>
````
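The diff only shows fragments of the pooling math (the `weights` chain being expanded to the shape of `last_hidden_state`, and the attention-mask handling that follows). For context, here is a minimal, self-contained sketch of weighted mean pooling across `seq_len`, assuming position-based weights `1, 2, ..., seq_len` as the visible `.unsqueeze(0).unsqueeze(-1).expand(...)` chain suggests; the function name `weighted_mean_pool` and the exact weighting scheme are illustrative, not necessarily what the README uses elsewhere.

```python
import torch


def weighted_mean_pool(
    last_hidden_state: torch.Tensor, attention_mask: torch.Tensor
) -> torch.Tensor:
    """Sketch: pool token states into one vector per text, later tokens weighted more.

    last_hidden_state: (batch, seq_len, hidden)
    attention_mask:    (batch, seq_len)
    """
    # Position-based weights 1..seq_len, broadcast to the hidden-state shape
    # (this mirrors the .unsqueeze/.expand chain visible in the diff).
    weights = (
        torch.arange(start=1, end=last_hidden_state.shape[1] + 1)
        .unsqueeze(0)
        .unsqueeze(-1)
        .expand(last_hidden_state.size())
        .float()
        .to(last_hidden_state.device)
    )

    # Expand the attention mask so padded positions contribute nothing.
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
    )

    # Weighted sum over the sequence, normalized by the summed weights.
    sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded * weights, dim=1)
    sum_mask = torch.sum(input_mask_expanded * weights, dim=1)
    return sum_embeddings / sum_mask
```

Dividing by the summed weights (rather than the raw token count) keeps the pooled vectors comparable across inputs of different lengths.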
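Taken together, the three helpers in the updated README are presumably meant to be chained roughly as below; the checkpoint name and the example sentences are placeholders for illustration only.

```python
# Assumes load_model_and_tokenizer, get_embeddings and calculate_cosine_similarity
# from the README snippets above are already in scope.
model_name = "some-encoder-checkpoint"  # placeholder, not a name from the README

texts = [
    "The cat sat on the mat.",        # illustrative inputs only
    "A feline rested on the rug.",
    "The stock market rallied today.",
]

model, tokenizer = load_model_and_tokenizer(model_name)
embeddings = get_embeddings(model, tokenizer, texts)
calculate_cosine_similarity(embeddings, texts)  # prints similarity of texts[0] vs. the rest
```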