jarif committed on
Commit
7ae0de5
·
verified ·
1 Parent(s): e1f0f6b

Update ingest.py

Browse files
Files changed (1) hide show
  1. ingest.py +21 -20
ingest.py CHANGED
@@ -13,53 +13,54 @@ def create_faiss_index():
13
  docs_dir = "docs"
14
 
15
  if not os.path.exists(docs_dir):
16
- print(f"The directory '{docs_dir}' does not exist.")
17
  return
18
 
19
  for root, dirs, files in os.walk(docs_dir):
20
  for file in files:
21
  if file.endswith(".pdf"):
22
  file_path = os.path.join(root, file)
23
- print(f"Loading document: {file_path}")
24
  try:
25
  loader = PDFMinerLoader(file_path)
26
  loaded_docs = loader.load()
27
  if loaded_docs:
28
- print(f"Loaded {len(loaded_docs)} documents from {file_path}")
 
29
  else:
30
- print(f"No documents loaded from {file_path}")
31
- documents.extend(loaded_docs)
32
  except Exception as e:
33
- print(f"Error loading {file_path}: {e}")
34
 
35
  if not documents:
36
- print("No documents were loaded. Check the 'docs' directory and file paths.")
37
  return
38
 
39
- print(f"Loaded {len(documents)} documents.")
40
 
41
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
42
  texts = text_splitter.split_documents(documents)
43
- print(f"Created {len(texts)} text chunks.")
44
  if not texts:
45
- print("No text chunks created. Check the text splitting process.")
46
  return
47
 
48
  try:
49
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
50
- print("Embeddings initialized successfully.")
51
  except Exception as e:
52
- print(f"Failed to initialize embeddings: {e}")
53
  return
54
 
55
  try:
56
  db = FAISS.from_documents(texts, embeddings)
57
  if db.index.ntotal > 0:
58
- print(f"Created FAISS index with {db.index.ntotal} vectors.")
59
  else:
60
- print("FAISS index contains 0 vectors.")
 
61
  except Exception as e:
62
- print(f"Failed to create FAISS index: {e}")
63
  return
64
 
65
  index_dir = "faiss_index"
@@ -70,14 +71,14 @@ def create_faiss_index():
70
  db.save_local(index_dir)
71
  index_file_path = os.path.join(index_dir, "index.faiss")
72
  file_size = os.path.getsize(index_file_path)
73
- print(f"FAISS index saved to {index_dir}")
74
- print(f"Index file size: {file_size} bytes")
75
  if file_size == 0:
76
- print(f"Index file '{index_file_path}' is empty.")
77
  else:
78
- print(f"Index file '{index_file_path}' created successfully.")
79
  except Exception as e:
80
- print(f"Failed to save FAISS index: {e}")
81
 
82
  if __name__ == "__main__":
83
  create_faiss_index()
 
13
  docs_dir = "docs"
14
 
15
  if not os.path.exists(docs_dir):
16
+ logger.error(f"The directory '{docs_dir}' does not exist.")
17
  return
18
 
19
  for root, dirs, files in os.walk(docs_dir):
20
  for file in files:
21
  if file.endswith(".pdf"):
22
  file_path = os.path.join(root, file)
23
+ logger.info(f"Loading document: {file_path}")
24
  try:
25
  loader = PDFMinerLoader(file_path)
26
  loaded_docs = loader.load()
27
  if loaded_docs:
28
+ logger.info(f"Loaded {len(loaded_docs)} documents from {file_path}")
29
+ documents.extend(loaded_docs)
30
  else:
31
+ logger.warning(f"No documents loaded from {file_path}")
 
32
  except Exception as e:
33
+ logger.error(f"Error loading {file_path}: {e}")
34
 
35
  if not documents:
36
+ logger.error("No documents were loaded. Check the 'docs' directory and file paths.")
37
  return
38
 
39
+ logger.info(f"Loaded {len(documents)} documents.")
40
 
41
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
42
  texts = text_splitter.split_documents(documents)
43
+ logger.info(f"Created {len(texts)} text chunks.")
44
  if not texts:
45
+ logger.error("No text chunks created. Check the text splitting process.")
46
  return
47
 
48
  try:
49
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
50
+ logger.info("Embeddings initialized successfully.")
51
  except Exception as e:
52
+ logger.error(f"Failed to initialize embeddings: {e}")
53
  return
54
 
55
  try:
56
  db = FAISS.from_documents(texts, embeddings)
57
  if db.index.ntotal > 0:
58
+ logger.info(f"Created FAISS index with {db.index.ntotal} vectors.")
59
  else:
60
+ logger.error("FAISS index contains 0 vectors.")
61
+ return
62
  except Exception as e:
63
+ logger.error(f"Failed to create FAISS index: {e}")
64
  return
65
 
66
  index_dir = "faiss_index"
 
71
  db.save_local(index_dir)
72
  index_file_path = os.path.join(index_dir, "index.faiss")
73
  file_size = os.path.getsize(index_file_path)
74
+ logger.info(f"FAISS index saved to {index_dir}")
75
+ logger.info(f"Index file size: {file_size} bytes")
76
  if file_size == 0:
77
+ logger.error(f"Index file '{index_file_path}' is empty.")
78
  else:
79
+ logger.info(f"Index file '{index_file_path}' created successfully.")
80
  except Exception as e:
81
+ logger.error(f"Failed to save FAISS index: {e}")
82
 
83
  if __name__ == "__main__":
84
  create_faiss_index()