from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import ElasticsearchStore


class PDFEmbedding:

    def __init__(self, model_path="dragonkue/BGE-m3-ko", pdf_dir="./data/pdf",
                 es_url="http://localhost:9200", index_name="pdf_embeddings"):
        # Korean-tuned BGE-m3 embedding model, run on the first GPU.
        # Normalized embeddings make cosine similarity and dot product
        # interchangeable at query time.
        self.embeddings = HuggingFaceEmbeddings(
            model_name=model_path,
            model_kwargs={'device': 'cuda:0'},
            encode_kwargs={'normalize_embeddings': True}
        )
        self.pdf_dir = pdf_dir
        self.es_url = es_url
        self.index_name = index_name

    def load_pdf_directory(self):
        # Load every PDF under pdf_dir; each page becomes one Document.
        loader = PyPDFDirectoryLoader(self.pdf_dir)
        pages = loader.load()

        for page in pages:
            # Rejoin words hyphenated across line breaks, then flatten the
            # remaining newlines so sentences read as continuous text.
            page.page_content = page.page_content.replace("-\n", "")
            page.page_content = page.page_content.replace("\n", " ")

        return pages

    def split_documents(self, documents):
        # Regex separators are tried in order: paragraph breaks, single
        # newlines, sentence enders, clause punctuation, then spaces.
        # Note: load_pdf_directory() flattens newlines, so sentence and
        # clause punctuation do most of the splitting in practice.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=400,
            chunk_overlap=50,
            length_function=len,
            separators=[r"\n{2,}", r"\n", r"[.!?]", r"[,;:]", r" "],
            is_separator_regex=True
        )
        return text_splitter.split_documents(documents)

    def process_and_store(self):
        # Full pipeline: load and clean the PDFs, chunk them, then embed
        # and index every chunk in Elasticsearch.
        pdf_data = self.load_pdf_directory()
        chunks = self.split_documents(pdf_data)

        vectorstore = ElasticsearchStore(
            es_url=self.es_url,
            index_name=self.index_name,
            embedding=self.embeddings
        )
        vectorstore.add_documents(chunks)
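

# A minimal usage sketch, assuming an Elasticsearch node is reachable at the
# default URL and PDFs exist under ./data/pdf (both are just the constructor
# defaults above; adjust model_path/device for your hardware):
if __name__ == "__main__":
    embedder = PDFEmbedding()
    embedder.process_and_store()

    # Optional follow-up, assuming the index above was populated: query it
    # with the same embedding model. similarity_search() embeds the query
    # text and returns the k nearest chunks.
    store = ElasticsearchStore(
        es_url="http://localhost:9200",
        index_name="pdf_embeddings",
        embedding=embedder.embeddings
    )
    for doc in store.similarity_search("example query", k=3):
        print(doc.page_content[:200])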