File size: 1,459 Bytes
b3dde21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import os
import pdfplumber
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

def preprocess_pdfs(folder_path, save_vectorstore_path, chunk_size=10000, chunk_overlap=1000):
    """Build and save a FAISS vector store from the text of every PDF in a folder.

    Text is extracted page by page with pdfplumber, split into overlapping
    chunks, embedded with ``sentence-transformers/all-MiniLM-L6-v2`` and
    written to ``save_vectorstore_path`` via ``FAISS.save_local``.

    Args:
        folder_path: Directory that is scanned (non-recursively) for PDFs.
        save_vectorstore_path: Directory the vector store is saved into;
            created if it does not exist.
        chunk_size: Maximum chunk length passed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks.

    Returns:
        None. If no text could be extracted, nothing is written and a
        message is printed instead.
    """
    # Sort so chunk contents are reproducible (os.listdir order is arbitrary),
    # match the extension case-insensitively, and skip directories that merely
    # happen to be named like a PDF.
    pdf_files = sorted(
        path
        for path in (
            os.path.join(folder_path, filename)
            for filename in os.listdir(folder_path)
            if filename.lower().endswith('.pdf')
        )
        if os.path.isfile(path)
    )

    page_texts = []
    for file_path in pdf_files:
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                # extract_text() can yield None/"" for image-only pages.
                if page_text:
                    page_texts.append(page_text)

    if not page_texts:
        print(f"No extractable text found in PDFs under {folder_path!r}; vector store not created.")
        return

    # Join pages with a blank line: without a separator the last word of one
    # page fuses with the first word of the next, and "\n\n" is the splitter's
    # preferred break point.
    all_text = "\n\n".join(page_texts)

    # NOTE(review): all-MiniLM-L6-v2 is documented to truncate long inputs
    # (256 word pieces by default), so most of a 10,000-character chunk is
    # probably not embedded -- confirm and consider a smaller chunk_size.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    text_chunks = text_splitter.split_text(all_text)

    embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vector_store = FAISS.from_texts(text_chunks, embedding=embedding_function)

    # Ensure the save directory exists
    os.makedirs(save_vectorstore_path, exist_ok=True)
    vector_store.save_local(save_vectorstore_path)
    print("Data preprocessing and vector store creation completed!")

# Input and output locations used by this script
data_folder = 'documents1'  # Replace with the path to your PDFs
vectorstore_path = 'vector_store_data/faiss_vectorstore'  # Path to save vector store

# Extract the PDFs in data_folder and save the resulting vector store
preprocess_pdfs(
    folder_path=data_folder,
    save_vectorstore_path=vectorstore_path,
)