Spaces:
Build error
Build error
| import os | |
| import glob | |
| from langchain.schema.document import Document | |
| from e5_embeddings import E5Embeddings | |
| from langchain_community.vectorstores import FAISS | |
| from document_processor import load_pdf_with_pymupdf, split_documents | |
# --- Path configuration ---------------------------------------------------
# Directory that is scanned for the cleaned PDF files to embed.
FOLDER = "cleaned_pdfs"
# Directory of the on-disk vector store (loaded from, and saved back to).
VECTOR_STORE_PATH = "vector_db"
# 1. Load the embedding model
def get_embeddings(model_name="intfloat/multilingual-e5-large-instruct", device="cuda"):
    """Build the E5 embedding wrapper used by the vector store.

    Args:
        model_name: Model identifier forwarded to ``E5Embeddings``.
        device: Device string placed in ``model_kwargs`` (defaults to "cuda").

    Returns:
        An ``E5Embeddings`` instance configured with
        ``normalize_embeddings=True`` in its encode kwargs.
    """
    model_options = {'device': device}
    encode_options = {'normalize_embeddings': True}
    return E5Embeddings(
        model_name=model_name,
        model_kwargs=model_options,
        encode_kwargs=encode_options,
    )
# 2. Load existing vector store
def load_vector_store(embeddings, load_path=VECTOR_STORE_PATH):
    """Open the FAISS vector store saved at *load_path*.

    Args:
        embeddings: Embedding object handed to ``FAISS.load_local``.
        load_path: Location of the saved store; defaults to
            ``VECTOR_STORE_PATH``.

    Returns:
        Whatever ``FAISS.load_local`` returns for that path.

    Raises:
        FileNotFoundError: If *load_path* does not exist.
    """
    store_is_present = os.path.exists(load_path)
    if store_is_present:
        # NOTE(review): allow_dangerous_deserialization=True opts in to
        # deserializing the stored files; only point this at stores you trust.
        return FAISS.load_local(
            load_path,
            embeddings,
            allow_dangerous_deserialization=True,
        )
    raise FileNotFoundError(f"Cannot find vector store: {load_path}")
# 3. Embed only the cleaned PDFs
def embed_cleaned_pdfs(folder, vectorstore, embeddings, save_path=None):
    """Embed every ``cleaned*.pdf`` in *folder* into *vectorstore* and save it.

    Args:
        folder: Directory searched (non-recursively) for files whose name
            matches ``cleaned*.pdf``.
        vectorstore: Store object; this function uses its ``index.ntotal``,
            ``add_documents`` and ``save_local`` members.
        embeddings: Not used by this function; kept so existing call sites
            keep working. (Presumably the store embeds with the object it
            was loaded with -- confirm against the vector store in use.)
        save_path: Where the updated store is written. ``None`` (the default)
            means the module-level ``VECTOR_STORE_PATH``.

    Returns:
        None. When there is nothing to add (no matching PDF, only PDFs with
        blank text, or no chunks after splitting) the store is neither
        modified nor saved.
    """
    # glob returns matches in arbitrary order; sort so vectors are appended
    # in the same order on every run.
    pdf_files = sorted(glob.glob(os.path.join(folder, "cleaned*.pdf")))
    print(f"Number of target PDFs: {len(pdf_files)}")

    new_documents = []
    for pdf_path in pdf_files:
        print(f"Processing: {pdf_path}")
        text = load_pdf_with_pymupdf(pdf_path)
        # Skip PDFs whose extracted text is empty or whitespace only.
        if text.strip():
            new_documents.append(
                Document(page_content=text, metadata={"source": pdf_path})
            )
    print(f"Number of documents: {len(new_documents)}")

    # Guard: adding an empty batch can fail inside the embedder/index and
    # would rewrite the store on disk for no reason.
    if not new_documents:
        print("No documents to embed; vector store left unchanged.")
        return

    chunks = split_documents(new_documents, chunk_size=300, chunk_overlap=50)
    print(f"Number of chunks: {len(chunks)}")
    if not chunks:
        print("No chunks to embed; vector store left unchanged.")
        return

    print(f"Vector count before addition: {vectorstore.index.ntotal}")
    vectorstore.add_documents(chunks)
    print(f"Vector count after addition: {vectorstore.index.ntotal}")

    # Resolve the default lazily so callers may redirect the output.
    target_path = VECTOR_STORE_PATH if save_path is None else save_path
    vectorstore.save_local(target_path)
    print(f"Save completed: {target_path}")
# Execution
if __name__ == "__main__":
    # Build the embedder, open the existing store with it, then add the
    # cleaned PDFs found under FOLDER to that store.
    embedder = get_embeddings()
    store = load_vector_store(embeddings=embedder)
    embed_cleaned_pdfs(folder=FOLDER, vectorstore=store, embeddings=embedder)