from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from tqdm import tqdm

# Load the SPECTER2 base encoder and move it to the GPU if one is available.
tokenizer = AutoTokenizer.from_pretrained("allenai/specter2_base")
model = AutoModel.from_pretrained("allenai/specter2_base")
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


def embed_texts_specter2(texts: list[str], batch_size: int = 16) -> np.ndarray:
    """Embed a list of texts with SPECTER2 and return L2-normalized CLS vectors."""
    embeddings = []

    for i in tqdm(range(0, len(texts), batch_size), desc="Embedding with SPECTER2"):
        batch_texts = texts[i:i + batch_size]
        inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        ).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        # SPECTER2 uses the [CLS] token representation as the document embedding.
        cls_embeddings = outputs.last_hidden_state[:, 0, :]
        # L2-normalize so that dot products correspond to cosine similarities.
        cls_embeddings = torch.nn.functional.normalize(cls_embeddings, p=2, dim=1)
        embeddings.append(cls_embeddings.cpu().numpy())

    return np.vstack(embeddings)
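

# A minimal usage sketch (illustrative, not part of the original code): the SPECTER2
# model card suggests feeding "title + sep_token + abstract" as the input text. The
# paper records below are placeholders with truncated abstracts, used only to show
# the expected input format and how to compare the resulting vectors.
if __name__ == "__main__":
    papers = [
        {"title": "Attention Is All You Need",
         "abstract": "We propose a new network architecture, the Transformer..."},
        {"title": "BERT: Pre-training of Deep Bidirectional Transformers",
         "abstract": "We introduce a new language representation model called BERT..."},
    ]
    texts = [p["title"] + tokenizer.sep_token + (p.get("abstract") or "") for p in papers]
    vectors = embed_texts_specter2(texts)
    # Vectors are L2-normalized, so the dot product equals the cosine similarity.
    similarity = float(vectors[0] @ vectors[1])
    print(vectors.shape, similarity)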