jarif committed
Commit 13c2214 · verified · 1 Parent(s): 485bd05

Update ingest.py

Files changed (1)
  1. ingest.py +63 -84
ingest.py CHANGED
@@ -1,84 +1,63 @@
-import os
-import logging
-from langchain_community.document_loaders import PDFMinerLoader
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.embeddings import HuggingFaceEmbeddings
-from langchain_community.vectorstores import FAISS
-
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-def create_faiss_index():
-    documents = []
-    docs_dir = "docs"
-
-    if not os.path.exists(docs_dir):
-        logger.error(f"The directory '{docs_dir}' does not exist.")
-        return
-
-    for root, dirs, files in os.walk(docs_dir):
-        for file in files:
-            if file.endswith(".pdf"):
-                file_path = os.path.join(root, file)
-                logger.info(f"Loading document: {file_path}")
-                try:
-                    loader = PDFMinerLoader(file_path)
-                    loaded_docs = loader.load()
-                    if loaded_docs:
-                        logger.info(f"Loaded {len(loaded_docs)} documents from {file_path}")
-                        documents.extend(loaded_docs)
-                    else:
-                        logger.warning(f"No documents loaded from {file_path}")
-                except Exception as e:
-                    logger.error(f"Error loading {file_path}: {e}")
-
-    if not documents:
-        logger.error("No documents were loaded. Check the 'docs' directory and file paths.")
-        return
-
-    logger.info(f"Loaded {len(documents)} documents.")
-
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
-    texts = text_splitter.split_documents(documents)
-    logger.info(f"Created {len(texts)} text chunks.")
-    if not texts:
-        logger.error("No text chunks created. Check the text splitting process.")
-        return
-
-    try:
-        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-        logger.info("Embeddings initialized successfully.")
-    except Exception as e:
-        logger.error(f"Failed to initialize embeddings: {e}")
-        return
-
-    try:
-        db = FAISS.from_documents(texts, embeddings)
-        if db.index.ntotal > 0:
-            logger.info(f"Created FAISS index with {db.index.ntotal} vectors.")
-        else:
-            logger.error("FAISS index contains 0 vectors.")
-            return
-    except Exception as e:
-        logger.error(f"Failed to create FAISS index: {e}")
-        return
-
-    index_dir = "faiss_index"
-    if not os.path.exists(index_dir):
-        os.makedirs(index_dir)
-
-    try:
-        db.save_local(index_dir)
-        index_file_path = os.path.join(index_dir, "index.faiss")
-        file_size = os.path.getsize(index_file_path)
-        logger.info(f"FAISS index saved to {index_dir}")
-        logger.info(f"Index file size: {file_size} bytes")
-        if file_size == 0:
-            logger.error(f"Index file '{index_file_path}' is empty.")
-        else:
-            logger.info(f"Index file '{index_file_path}' created successfully.")
-    except Exception as e:
-        logger.error(f"Failed to save FAISS index: {e}")
-
-if __name__ == "__main__":
-    create_faiss_index()
+import os
+import logging
+from langchain_community.document_loaders import PDFMinerLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.vectorstores import Chroma
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+def create_chroma_db():
+    documents = []
+    docs_dir = "docs"
+
+    if not os.path.exists(docs_dir):
+        logger.error(f"The directory '{docs_dir}' does not exist.")
+        return
+
+    for root, dirs, files in os.walk(docs_dir):
+        for file in files:
+            if file.endswith(".pdf"):
+                file_path = os.path.join(root, file)
+                logger.info(f"Loading document: {file_path}")
+                try:
+                    loader = PDFMinerLoader(file_path)
+                    loaded_docs = loader.load()
+                    if loaded_docs:
+                        logger.info(f"Loaded {len(loaded_docs)} documents from {file_path}")
+                        documents.extend(loaded_docs)
+                    else:
+                        logger.warning(f"No documents loaded from {file_path}")
+                except Exception as e:
+                    logger.error(f"Error loading {file_path}: {e}")
+
+    if not documents:
+        logger.error("No documents were loaded. Check the 'docs' directory and file paths.")
+        return
+
+    logger.info(f"Loaded {len(documents)} documents.")
+
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+    texts = text_splitter.split_documents(documents)
+    logger.info(f"Created {len(texts)} text chunks.")
+    if not texts:
+        logger.error("No text chunks created. Check the text splitting process.")
+        return
+
+    try:
+        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+        logger.info("Embeddings initialized successfully.")
+    except Exception as e:
+        logger.error(f"Failed to initialize embeddings: {e}")
+        return
+
+    try:
+        db = Chroma.from_documents(texts, embeddings, persist_directory="chroma_db")
+        logger.info(f"Created Chroma database with {len(texts)} vectors.")
+    except Exception as e:
+        logger.error(f"Failed to create Chroma database: {e}")
+        return
+
+if __name__ == "__main__":
+    create_chroma_db()
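
The commit swaps the FAISS index (and its save_local call plus file-size checks) for a Chroma collection persisted to chroma_db. A minimal sketch of how that persisted store could be reopened for querying follows; query_chroma_db and the sample query are hypothetical and not part of the commit, while the chroma_db directory and embedding model are taken from the committed script:

import logging

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def query_chroma_db(query: str, k: int = 4):
    # Hypothetical helper. The embedding model must match the one used at
    # ingest time, since Chroma persists the vectors but not the model.
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    # Reopen the collection that create_chroma_db() persisted to "chroma_db".
    db = Chroma(persist_directory="chroma_db", embedding_function=embeddings)

    # Top-k similarity search over the ingested PDF chunks.
    results = db.similarity_search(query, k=k)
    for i, doc in enumerate(results, 1):
        logger.info(f"Result {i}: {doc.page_content[:200]}")
    return results

if __name__ == "__main__":
    query_chroma_db("example question about the ingested PDFs")

One consequence of the switch: with Chroma 0.4 and later, passing persist_directory to from_documents is enough for the data to be written to disk (older Chroma releases also required an explicit db.persist() call), which is why the manual save-and-verify block from the FAISS version could be dropped.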