jarif committed on
Commit
d343dde
·
verified ·
1 Parent(s): d06c2d8

Update ingest.py

Browse files
Files changed (1) hide show
  1. ingest.py +66 -27
ingest.py CHANGED
@@ -1,39 +1,78 @@
1
  import os
 
2
  from langchain.document_loaders import PyPDFLoader
 
3
  from langchain_community.embeddings import HuggingFaceEmbeddings
4
  from langchain_community.vectorstores import FAISS
5
 
 
 
 
 
6
  def create_faiss_index():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  try:
8
- # Ensure the 'docs' directory exists and contains files
9
- docs_directory = 'docs'
10
- if not os.path.exists(docs_directory) or not os.listdir(docs_directory):
11
- raise ValueError(f"Directory '{docs_directory}' is empty or does not exist.")
12
-
13
- # Load all documents from the 'docs' directory
14
- documents = []
15
- for file in os.listdir(docs_directory):
16
- if file.endswith('.pdf'):
17
- loader = PyPDFLoader(os.path.join(docs_directory, file))
18
- documents.extend(loader.load())
19
-
20
- if not documents:
21
- raise ValueError("No valid documents found in the 'docs' directory.")
22
-
23
- # Create embeddings using HuggingFace's 'sentence-transformers/all-MiniLM-L6-v2' model
24
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
25
-
26
- # Create the FAISS vector store index
27
- faiss_index = FAISS.from_documents(documents, embeddings)
28
-
29
- # Save the FAISS index locally
30
- index_path = "faiss_index"
31
- os.makedirs(index_path, exist_ok=True)
32
- faiss_index.save_local(index_path)
33
-
34
- print("FAISS index created and saved successfully.")
35
  except Exception as e:
36
- print(f"An error occurred during FAISS index creation: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
  if __name__ == "__main__":
39
  create_faiss_index()
 
1
  import os
2
+ import logging
3
  from langchain.document_loaders import PyPDFLoader
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
  from langchain_community.embeddings import HuggingFaceEmbeddings
6
  from langchain_community.vectorstores import FAISS
7
 
# Configure root logging at INFO level and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def _find_pdf_paths(docs_dir):
    """Return the paths of all PDF files under *docs_dir*, in a stable order.

    The tree is walked recursively. The ``.pdf`` suffix is matched
    case-insensitively, and directories and file names are visited in sorted
    order so the result does not depend on filesystem enumeration order.
    """
    pdf_paths = []
    for root, dirs, files in os.walk(docs_dir):
        dirs.sort()  # in-place sort steers os.walk's traversal order
        pdf_paths.extend(
            os.path.join(root, name)
            for name in sorted(files)
            if name.lower().endswith(".pdf")
        )
    return pdf_paths


def _load_pdf_documents(pdf_paths):
    """Load every PDF in *pdf_paths*, logging and skipping files that fail."""
    documents = []
    for file_path in pdf_paths:
        logger.info("Loading document: %s", file_path)
        try:
            documents.extend(PyPDFLoader(file_path).load())
        except Exception as e:
            # Best-effort: one unreadable PDF must not abort the whole run.
            logger.error("Error loading %s: %s", file_path, e)
        else:
            logger.info("Successfully loaded document: %s", file_path)
    return documents


def create_faiss_index(docs_dir="docs", index_dir="faiss_index"):
    """Build a FAISS index from the PDFs under *docs_dir* and save it.

    Steps: collect PDF files recursively, load them with ``PyPDFLoader``,
    split them into chunks (chunk_size=500, chunk_overlap=50), embed the
    chunks with the ``sentence-transformers/all-MiniLM-L6-v2`` model and
    store the resulting FAISS index in *index_dir*.

    Every failure is logged and ends the run early; nothing is raised for
    the handled cases and the function always returns ``None``.

    Args:
        docs_dir: Directory searched for PDF files. Defaults to ``"docs"``.
        index_dir: Directory the index is saved into; created if missing.
            Defaults to ``"faiss_index"``.
    """
    if not os.path.exists(docs_dir):
        logger.error("The directory '%s' does not exist.", docs_dir)
        return
    if not os.path.isdir(docs_dir):
        # A regular file with this name would make os.listdir() raise.
        logger.error("'%s' is not a directory.", docs_dir)
        return
    if not os.listdir(docs_dir):
        logger.error("The directory '%s' is empty.", docs_dir)
        return

    documents = _load_pdf_documents(_find_pdf_paths(docs_dir))
    if not documents:
        logger.error(
            "No documents were loaded. Check the '%s' directory and file paths.",
            docs_dir,
        )
        return
    logger.info("Loaded %d documents.", len(documents))

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    texts = text_splitter.split_documents(documents)
    if not texts:
        logger.error("No text chunks were created. Check the text splitting process.")
        return
    logger.info("Created %d text chunks.", len(texts))

    try:
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
    except Exception as e:
        logger.error("Failed to initialize embeddings: %s", e)
        return

    try:
        db = FAISS.from_documents(texts, embeddings)
    except Exception as e:
        logger.error("Failed to create FAISS index: %s", e)
        return
    logger.info("Created FAISS index with %d vectors", len(texts))

    try:
        # exist_ok avoids the check-then-create race; a failure to create the
        # directory is reported like any other save failure.
        os.makedirs(index_dir, exist_ok=True)
        db.save_local(index_dir)
    except Exception as e:
        logger.error("Failed to save FAISS index: %s", e)
        return
    logger.info("FAISS index successfully saved to %s", index_dir)

    # Diagnostics only: a stat failure here must not be reported as a failed
    # save, because the index has already been written at this point.
    index_path = os.path.join(index_dir, "index.faiss")
    try:
        index_stat = os.stat(index_path)
    except OSError as e:
        logger.warning("Could not inspect index file %s: %s", index_path, e)
        return
    logger.info("Index file size after creation: %s bytes", index_stat.st_size)
    logger.info("Index file permissions: %s", oct(index_stat.st_mode)[-3:])


if __name__ == "__main__":
    create_faiss_index()