pszemraj committed
Commit 21b3e85 · 1 Parent(s): 77f7907

Update README.md

Files changed (1)
  1. README.md +18 -13
README.md CHANGED
@@ -37,6 +37,7 @@ import torch
 from transformers import AutoModel, AutoTokenizer
 from transformers import AutoModelForTextEncoding
 
+
 def load_model_and_tokenizer(model_name: str) -> Tuple[AutoModel, AutoTokenizer]:
     """
     Load the model and tokenizer based on the given model name.
@@ -47,7 +48,9 @@ def load_model_and_tokenizer(model_name: str) -> Tuple[AutoModel, AutoTokenizer]
     Returns:
         Tuple[AutoModelForTextEncoding, AutoTokenizer]: The loaded model and tokenizer.
     """
-    model = AutoModelForTextEncoding.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")
+    model = AutoModelForTextEncoding.from_pretrained(
+        model_name, torch_dtype=torch.bfloat16, device_map="auto"
+    )
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     model.eval() # Deactivate Dropout
     return model, tokenizer
@@ -64,9 +67,11 @@ This computes the embeddings for the given texts given the model and tokenizer v
 
 
 ```python
-def get_embeddings(model: AutoModel, tokenizer: AutoTokenizer, texts: List[str]) -> torch.Tensor:
+def get_embeddings(
+    model: AutoModel, tokenizer: AutoTokenizer, texts: List[str]
+) -> torch.Tensor:
     """
-    Get the embeddings via weighted mean pooling across seq_len
+    compute text embeddings via weighted mean pooling across seq_len
 
     Args:
         model (AutoModel): The model to be used for getting embeddings.
@@ -81,7 +86,9 @@ def get_embeddings(model: AutoModel, tokenizer: AutoTokenizer, texts: List[str])
 
     # Get the embeddings
     with torch.no_grad():
-        last_hidden_state = model(**batch_tokens, output_hidden_states=True, return_dict=True).last_hidden_state
+        last_hidden_state = model(
+            **batch_tokens, output_hidden_states=True, return_dict=True
+        ).last_hidden_state
 
     # Get weights
     weights = (
@@ -89,7 +96,8 @@ def get_embeddings(model: AutoModel, tokenizer: AutoTokenizer, texts: List[str])
         .unsqueeze(0)
         .unsqueeze(-1)
         .expand(last_hidden_state.size())
-        .float().to(last_hidden_state.device)
+        .float()
+        .to(last_hidden_state.device)
     )
 
     # Get attn mask
@@ -123,17 +131,14 @@ Helper fn to compute and print out cosine similarity
 from scipy.spatial.distance import cosine
 
 def calculate_cosine_similarity(embeddings: torch.Tensor, texts: List[str]) -> None:
-    """
-    Calculate and print the cosine similarity between the first text and all other texts.
-
-    Args:
-        embeddings (torch.Tensor): The embeddings for the texts.
-        texts (List[str]): The texts for which cosine similarity is to be calculated.
-    """
+    """compute and print the cosine sim between the first text and all others"""
     # Calculate cosine similarities
     for i in range(1, len(embeddings)):
         cosine_sim = 1 - cosine(embeddings[0], embeddings[i])
-        print("Cosine similarity between \"%s\" and \"%s\" is: %.3f" % (texts[0], texts[i], cosine_sim))
+        print(
+            'Cosine similarity between "%s" and "%s" is: %.3f'
+            % (texts[0], texts[i], cosine_sim)
+        )
 ```
 
 </details>
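
The hunks above only show fragments of the pooling logic inside `get_embeddings`, so for readers skimming the diff, here is a self-contained sketch of position-weighted mean pooling that is consistent with the visible `weights` construction. The function name `weighted_mean_pool` and the exact weighting scheme are illustrative assumptions, not necessarily identical to the full code in the README:

```python
import torch


def weighted_mean_pool(
    last_hidden_state: torch.Tensor, attention_mask: torch.Tensor
) -> torch.Tensor:
    """Illustrative position-weighted mean pooling over the sequence dimension."""
    # position weights 1, 2, ..., seq_len, broadcast to (batch, seq_len, hidden)
    weights = (
        torch.arange(start=1, end=last_hidden_state.shape[1] + 1)
        .unsqueeze(0)
        .unsqueeze(-1)
        .expand(last_hidden_state.size())
        .float()
        .to(last_hidden_state.device)
    )

    # broadcast the attention mask to the same shape so padding tokens drop out
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
    )

    # weighted sum over real tokens, normalized by the total weight of real tokens
    sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded * weights, dim=1)
    sum_mask = torch.sum(input_mask_expanded * weights, dim=1)
    return sum_embeddings / sum_mask
```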
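
For context, the three helpers touched by this commit compose into a small pipeline. A minimal usage sketch, assuming the functions are defined as in the README; the checkpoint name and example sentences below are placeholders, not taken from this repo:

```python
# hypothetical end-to-end usage of the README helpers
model, tokenizer = load_model_and_tokenizer("your-checkpoint-name")  # placeholder name

texts = [
    "The quick brown fox jumps over the lazy dog.",
    "A fast auburn fox leaps over a sleepy hound.",
    "Quarterly earnings were announced this morning.",
]

# embed all texts, then print similarity of the first text against the rest
embeddings = get_embeddings(model, tokenizer, texts)
calculate_cosine_similarity(embeddings, texts)
```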