File size: 1,459 Bytes
b3dde21 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
import os
import pdfplumber
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
def preprocess_pdfs(
    folder_path,
    save_vectorstore_path,
    *,
    chunk_size=10000,
    chunk_overlap=1000,
    model_name="sentence-transformers/all-MiniLM-L6-v2",
):
    """Build and save a FAISS vector store from the PDFs in a folder.

    Text is extracted page by page with pdfplumber, split into overlapping
    chunks, embedded with a HuggingFace model, indexed with FAISS and written
    to ``save_vectorstore_path``.

    Args:
        folder_path: Directory scanned (non-recursively) for files whose name
            ends in ``.pdf``, compared case-insensitively.
        save_vectorstore_path: Directory the vector store is saved into. It is
            created if it does not exist.
        chunk_size: Maximum chunk length passed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks passed to the text
            splitter.
        model_name: Embedding model name passed to ``HuggingFaceEmbeddings``.

    Returns:
        None. When no text could be extracted, a notice is printed and nothing
        is written to disk.

    Raises:
        OSError: Propagated from ``os.listdir`` if ``folder_path`` cannot be
            listed (for example, it does not exist).
    """
    # Sort so the order of documents (and therefore the chunk boundaries) is
    # reproducible; os.listdir returns entries in arbitrary order.
    pdf_files = sorted(
        os.path.join(folder_path, filename)
        for filename in os.listdir(folder_path)
        if filename.lower().endswith(".pdf")
    )

    page_texts = []
    for file_path in pdf_files:
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                # extract_text() yields nothing for pages without a text
                # layer (e.g. scanned images); skip those.
                if page_text:
                    page_texts.append(page_text)

    # Separate pages with a blank line so the last word of one page does not
    # fuse with the first word of the next, and so the splitter can prefer
    # page boundaries when it looks for a place to cut.
    all_text = "\n\n".join(page_texts)
    if not all_text:
        print(
            f"No extractable text found in PDF files under {folder_path!r}; "
            "vector store was not created."
        )
        return

    # NOTE(review): the all-MiniLM-L6-v2 model card says inputs longer than
    # 256 word pieces are truncated, so with 10000-character chunks most of
    # each chunk is probably not reflected in its embedding. The default is
    # kept for backward compatibility -- consider passing a smaller chunk_size.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    text_chunks = text_splitter.split_text(all_text)

    embedding_function = HuggingFaceEmbeddings(model_name=model_name)
    vector_store = FAISS.from_texts(text_chunks, embedding=embedding_function)

    # Ensure the save directory exists
    os.makedirs(save_vectorstore_path, exist_ok=True)
    vector_store.save_local(save_vectorstore_path)
    print("Data preprocessing and vector store creation completed!")
# Locations used by the preprocessing run below.
# Folder scanned for PDFs -- change this to wherever your documents live.
data_folder = 'documents1'
# Destination folder for the saved vector store.
vectorstore_path = 'vector_store_data/faiss_vectorstore'

# Kick off preprocessing with the paths configured above.
preprocess_pdfs(
    folder_path=data_folder,
    save_vectorstore_path=vectorstore_path,
)
|