# embedder.py
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

# Use a model with PyTorch weights available
MODEL_NAME = "thenlper/gte-small"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model.eval()  # Inference mode: disables dropout so embeddings are deterministic

def get_embeddings(texts, max_length=512):
    """
    Generate embeddings for long text by chunking and averaging.

    Args:
        texts (str or list): One or more texts to embed.
        max_length (int): Maximum tokens per chunk (default 512).

    Returns:
        np.ndarray: One averaged embedding vector per input text.
    """
    if isinstance(texts, str):
        texts = [texts]

    final_embeddings = []
    for text in texts:
        # Tokenize, then split into fixed-size chunks. Reserve two slots
        # per chunk for the [CLS]/[SEP] special tokens added below, so a
        # full chunk still fits within the model's 512-position limit.
        tokens = tokenizer.tokenize(text)
        chunk_size = max_length - 2
        chunks = [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]

        chunk_embeddings = []
        for chunk in chunks:
            input_ids = tokenizer.convert_tokens_to_ids(chunk)
            # Add [CLS]/[SEP] so each chunk matches the model's training format.
            input_ids = tokenizer.build_inputs_with_special_tokens(input_ids)
            input_ids = torch.tensor([input_ids])
            with torch.no_grad():
                output = model(input_ids=input_ids)
                embedding = output.last_hidden_state.mean(dim=1)  # Mean pooling
            chunk_embeddings.append(embedding)

        # Average the chunk embeddings into a single vector per text;
        # fall back to a zero vector for empty input.
        if chunk_embeddings:
            avg_embedding = torch.stack(chunk_embeddings).mean(dim=0)
            final_embeddings.append(avg_embedding.squeeze(0).numpy())
        else:
            final_embeddings.append(np.zeros(model.config.hidden_size))

    return np.array(final_embeddings)
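

# A minimal usage sketch, not part of the original module: the two sample
# texts and the inline cosine-similarity computation below are illustrative
# assumptions, added to show how get_embeddings() might be called.
if __name__ == "__main__":
    docs = [
        "Machine learning engineer with five years of PyTorch experience.",
        "Data scientist skilled in NLP and transformer models.",
    ]
    vecs = get_embeddings(docs)
    print("Embedding matrix shape:", vecs.shape)  # (2, 384) for gte-small

    # Cosine similarity between the two document vectors.
    a, b = vecs[0], vecs[1]
    cos = float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
    print(f"Cosine similarity: {cos:.4f}")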