jarif committed on
Commit
be6c976
·
verified ·
1 Parent(s): 7d0301f

Update ingest.py

Browse files
Files changed (1) hide show
  1. ingest.py +30 -5
ingest.py CHANGED
@@ -1,15 +1,20 @@
1
  import os
2
  import logging
3
- from langchain.document_loaders import PyPDFLoader
4
  from langchain.text_splitter import RecursiveCharacterTextSplitter
5
- from langchain.embeddings import HuggingFaceEmbeddings
6
- from langchain.vectorstores import FAISS
7
 
8
- # Setup logging
9
  logging.basicConfig(level=logging.INFO)
10
  logger = logging.getLogger(__name__)
11
 
12
  def load_documents(docs_dir):
 
 
 
 
 
13
  documents = []
14
  for root, dirs, files in os.walk(docs_dir):
15
  for file in files:
@@ -29,6 +34,11 @@ def load_documents(docs_dir):
29
  return documents
30
 
31
  def split_text(documents):
 
 
 
 
 
32
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
33
  texts = text_splitter.split_documents(documents)
34
 
@@ -43,6 +53,10 @@ def split_text(documents):
43
  return texts
44
 
45
  def create_embeddings():
 
 
 
 
46
  model_name = "sentence-transformers/all-MiniLM-L6-v2"
47
  embeddings = HuggingFaceEmbeddings(model_name=model_name)
48
 
@@ -56,6 +70,12 @@ def create_embeddings():
56
  return embeddings
57
 
58
  def create_faiss_index(texts, embeddings):
 
 
 
 
 
 
59
  try:
60
  db = FAISS.from_documents(texts, embeddings)
61
  logger.info(f"Created FAISS index with {len(texts)} vectors")
@@ -71,6 +91,11 @@ def create_faiss_index(texts, embeddings):
71
  return db
72
 
73
  def save_faiss_index(db, index_path):
 
 
 
 
 
74
  try:
75
  db.save_local(index_path)
76
  logger.info(f"FAISS index saved to {index_path}")
@@ -110,5 +135,5 @@ def main():
110
  # Save FAISS index
111
  save_faiss_index(db, index_path)
112
 
113
- if __name__ == "__main__":
114
  main()
 
1
  import os
2
  import logging
3
+ from langchain_community.document_loaders import PyPDFLoader
4
  from langchain.text_splitter import RecursiveCharacterTextSplitter
5
+ from langchain_community.embeddings import HuggingFaceEmbeddings
6
+ from langchain_community.vectorstores import FAISS
7
 
8
+ # Set up logging
9
  logging.basicConfig(level=logging.INFO)
10
  logger = logging.getLogger(__name__)
11
 
12
  def load_documents(docs_dir):
13
+ """
14
+ Load documents from a directory.
15
+ :param docs_dir: Directory containing PDF documents.
16
+ :return: List of loaded documents.
17
+ """
18
  documents = []
19
  for root, dirs, files in os.walk(docs_dir):
20
  for file in files:
 
34
  return documents
35
 
36
  def split_text(documents):
37
+ """
38
+ Split documents into text chunks.
39
+ :param documents: List of documents to be split.
40
+ :return: List of text chunks.
41
+ """
42
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
43
  texts = text_splitter.split_documents(documents)
44
 
 
53
  return texts
54
 
55
  def create_embeddings():
56
+ """
57
+ Create embeddings using a HuggingFace model.
58
+ :return: HuggingFaceEmbeddings object.
59
+ """
60
  model_name = "sentence-transformers/all-MiniLM-L6-v2"
61
  embeddings = HuggingFaceEmbeddings(model_name=model_name)
62
 
 
70
  return embeddings
71
 
72
  def create_faiss_index(texts, embeddings):
73
+ """
74
+ Create a FAISS index from text chunks and embeddings.
75
+ :param texts: List of text chunks.
76
+ :param embeddings: HuggingFaceEmbeddings object.
77
+ :return: FAISS index object.
78
+ """
79
  try:
80
  db = FAISS.from_documents(texts, embeddings)
81
  logger.info(f"Created FAISS index with {len(texts)} vectors")
 
91
  return db
92
 
93
  def save_faiss_index(db, index_path):
94
+ """
95
+ Save the FAISS index to a specified path.
96
+ :param db: FAISS index object.
97
+ :param index_path: Path to save the index.
98
+ """
99
  try:
100
  db.save_local(index_path)
101
  logger.info(f"FAISS index saved to {index_path}")
 
135
  # Save FAISS index
136
  save_faiss_index(db, index_path)
137
 
138
+ if __name__ == '__main__':
139
  main()