ZeeAI1 commited on
Commit
b3dde21
·
verified ·
1 Parent(s): 9692911

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -0
app.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pdfplumber
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
4
+ from langchain.embeddings import HuggingFaceEmbeddings
5
+ from langchain.vectorstores import FAISS
6
+
7
+ def preprocess_pdfs(folder_path, save_vectorstore_path):
8
+ all_text = ""
9
+ pdf_files = [os.path.join(folder_path, filename) for filename in os.listdir(folder_path) if filename.endswith('.pdf')]
10
+
11
+ for file_path in pdf_files:
12
+ with pdfplumber.open(file_path) as pdf:
13
+ for page in pdf.pages:
14
+ page_text = page.extract_text()
15
+ if page_text:
16
+ all_text += page_text
17
+
18
+ if all_text:
19
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
20
+ text_chunks = text_splitter.split_text(all_text)
21
+
22
+ embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
23
+ vector_store = FAISS.from_texts(text_chunks, embedding=embedding_function)
24
+
25
+ # Ensure the save directory exists
26
+ os.makedirs(save_vectorstore_path, exist_ok=True)
27
+ vector_store.save_local(save_vectorstore_path)
28
+ print("Data preprocessing and vector store creation completed!")
29
+
30
+ # Define your folder paths
31
+ data_folder = 'documents1' # Replace with the path to your PDFs
32
+ vectorstore_path = 'vector_store_data/faiss_vectorstore' # Path to save vector store
33
+
34
+ # Run preprocessing
35
+ preprocess_pdfs(data_folder, vectorstore_path)