|
import torch.distributed |
|
import faiss |
|
import pandas as pd |
|
import faiss |
|
import numpy as np |
|
import jsonlines, json |
|
from transformers import AutoModel |
|
import os |
|
import torch |
|
'''
Input data format: a JSONL file with one JSON object per line, each shaped like

{
    "bibkey": "some_bibkey",
    "text": "The abstract or text of the paper."
}

Example:

{
    "bibkey": "arxivid1234.5678",
    "text": "Title: A Study on Something\nAbstract: This paper discusses the findings of a study on something important in the field of research.\nAuthors: John Doe"
}
'''
|
|
|
# Hugging Face checkpoint used to embed the corpus.
model_name = "openbmb/MiniCPM-Embedding-Light"

# Load the checkpoint in half precision with the FlashAttention-2 attention
# implementation. trust_remote_code=True allows transformers to execute the
# modeling code shipped with the checkpoint.
model = AutoModel.from_pretrained(
    model_name,
    trust_remote_code=True,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.float16,
)

# Move the weights to the GPU.
model = model.to("cuda")
|
|
|
# JSONL corpus to index; each record is read as one entry of survey_data.
input_path = "./data/arxiv.jsonl"

# Read the whole file into memory as a list of records.
with jsonlines.open(input_path) as reader:
    survey_data = [*reader]

# Two parallel lists: the string id of each record and the text to embed.
xids = [record["bibkey"] for record in survey_data]
passages = [record["text"] for record in survey_data]
|
|
|
# Embed every passage. encode_corpus returns a pair; only the first element
# is used below, the second is bound to `_` and ignored.
# NOTE(review): max_length=1024 presumably caps the tokenized input length --
# confirm against the model's encode_corpus implementation.
encoded = model.encode_corpus(passages, max_length=1024)
embeddings_doc_dense, _ = encoded

# Flat inner-product index sized to the embedding width (second axis).
embedding_dim = embeddings_doc_dense.shape[1]
index = faiss.IndexFlatIP(embedding_dim)

# Wrap it so vectors can be added under explicit integer ids.
id_map_index = faiss.IndexIDMap(index)

# Clone the id-mapped index onto all GPUs; `index` now refers to the GPU copy.
index = faiss.index_cpu_to_all_gpus(id_map_index)
|
|
|
# The id table is stored under ./index; create the directory first so that
# to_csv does not fail when it is missing.
id_table_path = "./index/str_int_ids_abstract.csv"
os.makedirs(os.path.dirname(id_table_path), exist_ok=True)

# Each document is identified inside FAISS by its position in the corpus.
# The dtype is pinned to int64 so the ids do not depend on the platform's
# default integer width.
x_ids_int = np.arange(len(xids), dtype=np.int64)

# In-memory str -> int lookup. If a bibkey occurs more than once, the last
# position wins.
str_int_ids = dict(zip(xids, x_ids_int))

# Persist one row per document, in corpus order, so that every integer id
# added to the index has a matching row in the CSV (including documents whose
# bibkey is repeated). Building the frame column-wise avoids creating a
# one-row frame with one column per document and transposing it.
str_int_ids_df = pd.DataFrame({"str_id": xids, "int_id": x_ids_int})
str_int_ids_df.to_csv(id_table_path, index=False)
|
|
|
# Make sure the output directory exists before doing any further work, so
# write_index cannot fail on a missing ./index directory.
index_path = "./index/index_abstract.faiss"
os.makedirs(os.path.dirname(index_path), exist_ok=True)

# FAISS expects C-contiguous float32 vectors and int64 ids. The model is
# loaded in float16, so normalise both arrays explicitly; these calls are
# no-ops when the arrays already have the right dtype and layout.
# NOTE(review): assumes encode_corpus returns a NumPy array (or something
# NumPy can convert on the CPU) -- confirm.
vectors = np.ascontiguousarray(embeddings_doc_dense, dtype=np.float32)
vector_ids = np.ascontiguousarray(x_ids_int, dtype=np.int64)
index.add_with_ids(vectors, vector_ids)

# Bring the populated index back to the CPU and serialise it to disk.
index = faiss.index_gpu_to_cpu(index)
faiss.write_index(index, index_path)
|
|