from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import ElasticsearchStore


class PDFEmbedding:
    def __init__(self, model_path="dragonkue/BGE-m3-ko", pdf_dir="./data/pdf",
                 es_url="http://localhost:9200", index_name="pdf_embeddings"):
        self.embeddings = HuggingFaceEmbeddings(
            model_name=model_path,
            model_kwargs={'device': 'cuda:0'},
            encode_kwargs={'normalize_embeddings': True}
        )
        self.pdf_dir = pdf_dir
        self.es_url = es_url
        self.index_name = index_name

    def load_pdf_directory(self):
        loader = PyPDFDirectoryLoader(self.pdf_dir)
        pages = loader.load()
        # Clean up line-break noise from PDF extraction
        for page in pages:
            # Rejoin words hyphenated across line breaks
            page.page_content = page.page_content.replace("-\n", "")
            # Convert remaining line breaks to spaces
            page.page_content = page.page_content.replace("\n", " ")
        return pages

    def split_documents(self, documents):
        # Note: load_pdf_directory() already collapses newlines to spaces, so the
        # newline separators below rarely fire; the sentence/clause punctuation
        # separators do most of the splitting work in practice.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=400,
            chunk_overlap=50,
            length_function=len,
            separators=[r"\n{2,}", r"\n", r"[.!?]", r"[,;:]", r" "],
            is_separator_regex=True
        )
        return text_splitter.split_documents(documents)

    def process_and_store(self):
        # Load the PDFs
        pdf_data = self.load_pdf_directory()
        # Split the documents into chunks
        chunks = self.split_documents(pdf_data)
        # Create the Elasticsearch vector store
        vectorstore = ElasticsearchStore(
            es_url=self.es_url,
            index_name=self.index_name,
            embedding=self.embeddings
        )
        # Embed and store the chunks
        vectorstore.add_documents(chunks)
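

# --- Usage sketch ---
# A minimal example of driving the class above, not part of the original code.
# Assumptions: an Elasticsearch instance is reachable at the default es_url,
# the "dragonkue/BGE-m3-ko" model can be downloaded from the Hugging Face Hub,
# and a CUDA GPU is available for 'cuda:0' (pass
# model_kwargs={'device': 'cpu'} to HuggingFaceEmbeddings otherwise).
if __name__ == "__main__":
    embedder = PDFEmbedding(pdf_dir="./data/pdf")
    embedder.process_and_store()

    # Query the populated index back through the same store configuration.
    # similarity_search embeds the query with the same embedding model and
    # returns the k nearest chunks as Document objects.
    store = ElasticsearchStore(
        es_url=embedder.es_url,
        index_name=embedder.index_name,
        embedding=embedder.embeddings
    )
    for doc in store.similarity_search("example query", k=3):
        print(doc.page_content[:100])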