jarif committed on
Commit
e502b01
·
verified ·
1 Parent(s): b36c927

Update ingest.py

Browse files
Files changed (1) hide show
  1. ingest.py +81 -99
ingest.py CHANGED
@@ -1,99 +1,81 @@
1
- import os
2
- import logging
3
- from langchain_community.document_loaders import PDFMinerLoader
4
- from langchain.text_splitter import RecursiveCharacterTextSplitter
5
- from langchain_community.embeddings import HuggingFaceEmbeddings
6
- from langchain_community.vectorstores import FAISS
7
-
8
- # Set up logging
9
- logging.basicConfig(level=logging.INFO)
10
- logger = logging.getLogger(__name__)
11
-
12
def create_faiss_index(texts, embeddings):
    """
    Create a FAISS index from text chunks and embeddings.

    :param texts: List of text chunks handed to ``FAISS.from_documents``.
    :param embeddings: HuggingFaceEmbeddings object.
    :return: FAISS index object, or None if index creation failed.
    """
    try:
        db = FAISS.from_documents(texts, embeddings)
        logger.info(f"Created FAISS index with {len(texts)} vectors")
        # The wrapped faiss index reports its size through ``ntotal``;
        # calling len() on it raised and made this function always
        # fall into the except branch and return None.
        vector_count = db.index.ntotal
        if vector_count > 0:
            logger.info(f"FAISS index contains {vector_count} vectors.")
        else:
            logger.error("FAISS index contains 0 vectors after creation. Check the data and embeddings.")
    except Exception as e:
        logger.error(f"Failed to create FAISS index: {e}")
        return None

    return db
32
-
33
def save_faiss_index(db, index_path):
    """
    Write the FAISS index to disk and log the size of the saved file.

    Any exception raised while saving or inspecting the file is logged
    and swallowed; nothing is returned.

    :param db: FAISS index object (must provide ``save_local``).
    :param index_path: Directory the index is saved into.
    """
    try:
        db.save_local(index_path)
        # Inspect the file expected inside the target directory.
        saved_file = os.path.join(index_path, "index.faiss")
        size_in_bytes = os.path.getsize(saved_file)
        logger.info(f"FAISS index saved to {index_path}")
        logger.info(f"Index file size: {size_in_bytes} bytes")
        if not size_in_bytes:
            logger.error(f"Index file '{saved_file}' is empty.")
    except Exception as save_error:
        logger.error(f"Failed to save FAISS index to {index_path}: {save_error}")
50
-
51
def create_faiss_index_from_pdfs():
    """
    Load every ``.pdf`` under ``docs``, split the text into chunks, embed
    them, and store the resulting FAISS index in ``faiss_index``.

    Each stage logs its outcome; the function returns ``None`` early as
    soon as a stage yields nothing or fails.
    """
    source_dir = "docs"
    if not os.path.exists(source_dir):
        logger.error(f"The directory '{source_dir}' does not exist.")
        return

    # Lazily walk the tree and yield the path of each PDF found.
    pdf_paths = (
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(source_dir)
        for name in names
        if name.endswith(".pdf")
    )

    loaded = []
    for pdf_path in pdf_paths:
        logger.info(f"Loading document: {pdf_path}")
        try:
            loaded.extend(PDFMinerLoader(pdf_path).load())
        except Exception as load_error:
            # A single unreadable PDF must not abort the whole ingest.
            logger.error(f"Error loading {pdf_path}: {load_error}")

    if not loaded:
        logger.error("No documents were loaded. Check the 'docs' directory and file paths.")
        return
    logger.info(f"Loaded {len(loaded)} documents.")

    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = splitter.split_documents(loaded)
    if not chunks:
        logger.error("No text chunks were created. Check the text splitting process.")
        return
    logger.info(f"Created {len(chunks)} text chunks.")

    try:
        embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    except Exception as init_error:
        logger.error(f"Failed to initialize embeddings: {init_error}")
        return

    index = create_faiss_index(chunks, embedder)
    if not index:
        return

    target_dir = "faiss_index"
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    save_faiss_index(index, target_dir)
97
-
98
- if __name__ == "__main__":
99
- create_faiss_index_from_pdfs()
 
1
+ import os
2
+ import logging
3
+ from langchain_community.document_loaders import PDFMinerLoader
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
+ from langchain_community.embeddings import HuggingFaceEmbeddings
6
+ from langchain_community.vectorstores import FAISS
7
+
8
+ logging.basicConfig(level=logging.INFO)
9
+ logger = logging.getLogger(__name__)
10
+
11
def create_faiss_index():
    """
    Build a FAISS index from the PDFs under ``docs`` and save it to ``faiss_index``.

    Stages: load every PDF found by walking ``docs``, split the documents
    into 500-character chunks with 50 characters of overlap, embed them with
    ``sentence-transformers/all-MiniLM-L6-v2``, build the index, and save it.
    Progress and failures are reported with ``print``. The function returns
    ``None`` on every path and stops at the first stage that fails or
    produces nothing.
    """
    documents = []
    docs_dir = "docs"

    if not os.path.exists(docs_dir):
        print(f"The directory '{docs_dir}' does not exist.")
        return

    for root, dirs, files in os.walk(docs_dir):
        for file in files:
            # Match the extension case-insensitively so files such as
            # "REPORT.PDF" are not silently skipped.
            if not file.lower().endswith(".pdf"):
                continue
            file_path = os.path.join(root, file)
            print(f"Loading document: {file_path}")
            try:
                loader = PDFMinerLoader(file_path)
                loaded_docs = loader.load()
                if loaded_docs:
                    print(f"Loaded {len(loaded_docs)} documents from {file_path}")
                else:
                    print(f"No documents loaded from {file_path}")
                documents.extend(loaded_docs)
            except Exception as e:
                # One unreadable PDF should not abort the whole ingest.
                print(f"Error loading {file_path}: {e}")

    if not documents:
        print("No documents were loaded. Check the 'docs' directory and file paths.")
        return

    print(f"Loaded {len(documents)} documents.")

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    texts = text_splitter.split_documents(documents)
    print(f"Created {len(texts)} text chunks.")
    if not texts:
        print("No text chunks created. Check the text splitting process.")
        return

    try:
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        print("Embeddings initialized successfully.")
    except Exception as e:
        print(f"Failed to initialize embeddings: {e}")
        return

    try:
        db = FAISS.from_documents(texts, embeddings)
        if db.index.ntotal > 0:
            print(f"Created FAISS index with {db.index.ntotal} vectors.")
        else:
            print("FAISS index contains 0 vectors.")
    except Exception as e:
        print(f"Failed to create FAISS index: {e}")
        return

    index_dir = "faiss_index"

    try:
        # exist_ok avoids the check-then-create race; keeping the call inside
        # the try means a bad target path is reported like any other save failure.
        os.makedirs(index_dir, exist_ok=True)
        db.save_local(index_dir)
        index_file_path = os.path.join(index_dir, "index.faiss")
        file_size = os.path.getsize(index_file_path)
        print(f"FAISS index saved to {index_dir}")
        print(f"Index file size: {file_size} bytes")
        if file_size == 0:
            print(f"Index file '{index_file_path}' is empty.")
    except Exception as e:
        print(f"Failed to save FAISS index: {e}")
79
+
80
+ if __name__ == "__main__":
81
+ create_faiss_index()