Update README.md
README.md (changed)
````diff
@@ -37,6 +37,7 @@ import torch
 from transformers import AutoModel, AutoTokenizer
 from transformers import AutoModelForTextEncoding
 
+
 def load_model_and_tokenizer(model_name: str) -> Tuple[AutoModel, AutoTokenizer]:
     """
     Load the model and tokenizer based on the given model name.
@@ -47,7 +48,9 @@ def load_model_and_tokenizer(model_name: str) -> Tuple[AutoModel, AutoTokenizer]
     Returns:
         Tuple[AutoModelForTextEncoding, AutoTokenizer]: The loaded model and tokenizer.
     """
-    model = AutoModelForTextEncoding.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")
+    model = AutoModelForTextEncoding.from_pretrained(
+        model_name, torch_dtype=torch.bfloat16, device_map="auto"
+    )
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     model.eval()  # Deactivate Dropout
     return model, tokenizer
@@ -64,9 +67,11 @@ This computes the embeddings for the given texts given the model and tokenizer via weighted mean pooling across seq_len
 
 
 ```python
-def get_embeddings(model: AutoModel, tokenizer: AutoTokenizer, texts: List[str]) -> torch.Tensor:
+def get_embeddings(
+    model: AutoModel, tokenizer: AutoTokenizer, texts: List[str]
+) -> torch.Tensor:
     """
-
+    compute text embeddings via weighted mean pooling across seq_len
 
     Args:
         model (AutoModel): The model to be used for getting embeddings.
@@ -81,7 +86,9 @@ def get_embeddings(model: AutoModel, tokenizer: AutoTokenizer, texts: List[str])
 
     # Get the embeddings
     with torch.no_grad():
-        last_hidden_state = model(**batch_tokens, output_hidden_states=True, return_dict=True).last_hidden_state
+        last_hidden_state = model(
+            **batch_tokens, output_hidden_states=True, return_dict=True
+        ).last_hidden_state
 
         # Get weights
         weights = (
@@ -89,7 +96,8 @@ def get_embeddings(model: AutoModel, tokenizer: AutoTokenizer, texts: List[str])
             .unsqueeze(0)
             .unsqueeze(-1)
             .expand(last_hidden_state.size())
-            .float()
+            .float()
+            .to(last_hidden_state.device)
         )
 
         # Get attn mask
@@ -123,17 +131,14 @@ Helper fn to compute and print out cosine similarity
 from scipy.spatial.distance import cosine
 
 def calculate_cosine_similarity(embeddings: torch.Tensor, texts: List[str]) -> None:
-    """
-    Calculate and print the cosine similarity between the first text and all other texts.
-
-    Args:
-        embeddings (torch.Tensor): The embeddings for the texts.
-        texts (List[str]): The texts for which cosine similarity is to be calculated.
-    """
+    """compute and print the cosine sim between the first text and all others"""
    # Calculate cosine similarities
    for i in range(1, len(embeddings)):
        cosine_sim = 1 - cosine(embeddings[0], embeddings[i])
-        print('Cosine similarity between "%s" and "%s" is: %.3f' % (texts[0], texts[i], cosine_sim))
+        print(
+            'Cosine similarity between "%s" and "%s" is: %.3f'
+            % (texts[0], texts[i], cosine_sim)
+        )
 ```
 
 </details>
````
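The diff only shows fragments of the pooling math (the `weights` chain being expanded to the shape of `last_hidden_state`, and the attention-mask handling that follows). For context, here is a minimal, self-contained sketch of weighted mean pooling across `seq_len`, assuming position-based weights `1, 2, ..., seq_len` as the visible `.unsqueeze(0).unsqueeze(-1).expand(...)` chain suggests; the function name `weighted_mean_pool` and the exact weighting scheme are illustrative, not necessarily what the README uses elsewhere.

```python
import torch


def weighted_mean_pool(
    last_hidden_state: torch.Tensor, attention_mask: torch.Tensor
) -> torch.Tensor:
    """Sketch: pool token states into one vector per text, later tokens weighted more.

    last_hidden_state: (batch, seq_len, hidden)
    attention_mask:    (batch, seq_len)
    """
    # Position-based weights 1..seq_len, broadcast to the hidden-state shape
    # (this mirrors the .unsqueeze/.expand chain visible in the diff).
    weights = (
        torch.arange(start=1, end=last_hidden_state.shape[1] + 1)
        .unsqueeze(0)
        .unsqueeze(-1)
        .expand(last_hidden_state.size())
        .float()
        .to(last_hidden_state.device)
    )

    # Expand the attention mask so padded positions contribute nothing.
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
    )

    # Weighted sum over the sequence, normalized by the summed weights.
    sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded * weights, dim=1)
    sum_mask = torch.sum(input_mask_expanded * weights, dim=1)
    return sum_embeddings / sum_mask
```

Dividing by the summed weights (rather than the raw token count) keeps the pooled vectors comparable across inputs of different lengths.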
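Taken together, the three helpers in the updated README are presumably meant to be chained roughly as below; the checkpoint name and the example sentences are placeholders for illustration only.

```python
# Assumes load_model_and_tokenizer, get_embeddings and calculate_cosine_similarity
# from the README snippets above are already in scope.
model_name = "some-encoder-checkpoint"  # placeholder, not a name from the README

texts = [
    "The cat sat on the mat.",        # illustrative inputs only
    "A feline rested on the rug.",
    "The stock market rallied today.",
]

model, tokenizer = load_model_and_tokenizer(model_name)
embeddings = get_embeddings(model, tokenizer, texts)
calculate_cosine_similarity(embeddings, texts)  # prints similarity of texts[0] vs. the rest
```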