|
import os |
|
import pdfplumber |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from langchain.embeddings import HuggingFaceEmbeddings |
|
from langchain.vectorstores import FAISS |
|
|
|
def preprocess_pdfs(folder_path, save_vectorstore_path):
    """Extract text from every PDF in *folder_path*, split it into chunks,
    embed the chunks, and persist a FAISS vector store on disk.

    Parameters
    ----------
    folder_path : str
        Directory scanned (non-recursively) for files ending in ``.pdf``.
    save_vectorstore_path : str
        Directory where the FAISS index is saved (created if missing).

    Notes
    -----
    If no text can be extracted (no PDFs, or image-only pages), the function
    prints a message and returns without creating a vector store.
    """
    pdf_files = [
        os.path.join(folder_path, filename)
        for filename in os.listdir(folder_path)
        if filename.endswith('.pdf')
    ]

    # Collect page texts in a list and join once at the end: repeated
    # string ``+=`` in a loop is quadratic for large document sets.
    pages = []
    for file_path in pdf_files:
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                # extract_text() returns None for pages with no text layer
                if page_text:
                    pages.append(page_text)
    all_text = "".join(pages)

    # Bug fix: the original fell through to ``vector_store.save_local``
    # even when no text was extracted, raising NameError because
    # ``vector_store`` was only assigned inside ``if all_text:``.
    if not all_text:
        print("No text extracted from PDFs; vector store not created.")
        return

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
    text_chunks = text_splitter.split_text(all_text)

    embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vector_store = FAISS.from_texts(text_chunks, embedding=embedding_function)

    os.makedirs(save_vectorstore_path, exist_ok=True)
    vector_store.save_local(save_vectorstore_path)
    print("Data preprocessing and vector store creation completed!")
|
|
|
|
|
# Default input/output locations for the preprocessing run.
data_folder = 'documents1'
vectorstore_path = 'vector_store_data/faiss_vectorstore'

if __name__ == "__main__":
    # Guard the side-effecting call so importing this module (e.g. to reuse
    # preprocess_pdfs elsewhere) does not trigger a full PDF ingestion run.
    preprocess_pdfs(data_folder, vectorstore_path)
|
|