import os

import pdfplumber
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS


def _collect_pdf_text(folder_path):
    """Return the text of every PDF page found directly in *folder_path*.

    Page texts are joined with a newline so that the last word of one page
    is never fused with the first word of the next page or document.
    """
    # Sort so the concatenation order (and therefore the chunk boundaries)
    # is reproducible; os.listdir returns entries in arbitrary order.
    pdf_paths = sorted(
        os.path.join(folder_path, filename)
        for filename in os.listdir(folder_path)
        # Case-insensitive match so files named e.g. "REPORT.PDF" are included.
        if filename.lower().endswith('.pdf')
    )

    page_texts = []
    for file_path in pdf_paths:
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                # Skip pages for which no text was extracted.
                if page_text:
                    page_texts.append(page_text)

    # A single join avoids repeated string reallocation from "+=" in a loop.
    return "\n".join(page_texts)


def preprocess_pdfs(
    folder_path,
    save_vectorstore_path,
    *,
    chunk_size=10000,
    chunk_overlap=1000,
    model_name="sentence-transformers/all-MiniLM-L6-v2",
):
    """Build a FAISS vector store from the PDFs in a folder and save it.

    The text of all PDFs in *folder_path* is extracted, split into
    overlapping chunks, embedded with a HuggingFace model and indexed with
    FAISS. The index is written to *save_vectorstore_path*, which is
    created if it does not exist. If no text can be extracted, nothing is
    written and a notice is printed instead.

    Args:
        folder_path: Directory whose ``.pdf`` files (non-recursive) are read.
        save_vectorstore_path: Directory the vector store is saved into.
        chunk_size: Maximum chunk size passed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks.
        model_name: Name of the embedding model passed to
            ``HuggingFaceEmbeddings``.

    Returns:
        None.
    """
    all_text = _collect_pdf_text(folder_path)

    # Guard clause: with no (or whitespace-only) text there is nothing to
    # embed, so report it rather than claiming success.
    if not all_text.strip():
        print(f"No extractable text found in PDFs under {folder_path!r}; vector store not created.")
        return

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    text_chunks = text_splitter.split_text(all_text)

    embedding_function = HuggingFaceEmbeddings(model_name=model_name)
    vector_store = FAISS.from_texts(text_chunks, embedding=embedding_function)

    # Ensure the save directory exists
    os.makedirs(save_vectorstore_path, exist_ok=True)
    vector_store.save_local(save_vectorstore_path)
    print("Data preprocessing and vector store creation completed!")


# Define your folder paths
data_folder = 'documents1'  # Replace with the path to your PDFs
vectorstore_path = 'vector_store_data/faiss_vectorstore'  # Path to save vector store

# Run preprocessing
# NOTE(review): this runs at import time as well as when executed as a script;
# kept as-is to preserve existing behavior.
preprocess_pdfs(data_folder, vectorstore_path)