Update ingest.py
Browse files
ingest.py
CHANGED
@@ -1,15 +1,20 @@
|
|
1 |
import os
|
2 |
import logging
|
3 |
-
from
|
4 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
5 |
-
from
|
6 |
-
from
|
7 |
|
8 |
-
#
|
9 |
logging.basicConfig(level=logging.INFO)
|
10 |
logger = logging.getLogger(__name__)
|
11 |
|
12 |
def load_documents(docs_dir):
|
|
|
|
|
|
|
|
|
|
|
13 |
documents = []
|
14 |
for root, dirs, files in os.walk(docs_dir):
|
15 |
for file in files:
|
@@ -29,6 +34,11 @@ def load_documents(docs_dir):
|
|
29 |
return documents
|
30 |
|
31 |
def split_text(documents):
|
|
|
|
|
|
|
|
|
|
|
32 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
|
33 |
texts = text_splitter.split_documents(documents)
|
34 |
|
@@ -43,6 +53,10 @@ def split_text(documents):
|
|
43 |
return texts
|
44 |
|
45 |
def create_embeddings():
|
|
|
|
|
|
|
|
|
46 |
model_name = "sentence-transformers/all-MiniLM-L6-v2"
|
47 |
embeddings = HuggingFaceEmbeddings(model_name=model_name)
|
48 |
|
@@ -56,6 +70,12 @@ def create_embeddings():
|
|
56 |
return embeddings
|
57 |
|
58 |
def create_faiss_index(texts, embeddings):
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
try:
|
60 |
db = FAISS.from_documents(texts, embeddings)
|
61 |
logger.info(f"Created FAISS index with {len(texts)} vectors")
|
@@ -71,6 +91,11 @@ def create_faiss_index(texts, embeddings):
|
|
71 |
return db
|
72 |
|
73 |
def save_faiss_index(db, index_path):
|
|
|
|
|
|
|
|
|
|
|
74 |
try:
|
75 |
db.save_local(index_path)
|
76 |
logger.info(f"FAISS index saved to {index_path}")
|
@@ -110,5 +135,5 @@ def main():
|
|
110 |
# Save FAISS index
|
111 |
save_faiss_index(db, index_path)
|
112 |
|
113 |
-
if __name__ ==
|
114 |
main()
|
|
|
1 |
import os
|
2 |
import logging
|
3 |
+
from langchain_community.document_loaders import PyPDFLoader
|
4 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
5 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
6 |
+
from langchain_community.vectorstores import FAISS
|
7 |
|
8 |
+
# Set up logging
|
9 |
logging.basicConfig(level=logging.INFO)
|
10 |
logger = logging.getLogger(__name__)
|
11 |
|
12 |
def load_documents(docs_dir):
|
13 |
+
"""
|
14 |
+
Load documents from a directory.
|
15 |
+
:param docs_dir: Directory containing PDF documents.
|
16 |
+
:return: List of loaded documents.
|
17 |
+
"""
|
18 |
documents = []
|
19 |
for root, dirs, files in os.walk(docs_dir):
|
20 |
for file in files:
|
|
|
34 |
return documents
|
35 |
|
36 |
def split_text(documents):
|
37 |
+
"""
|
38 |
+
Split documents into text chunks.
|
39 |
+
:param documents: List of documents to be split.
|
40 |
+
:return: List of text chunks.
|
41 |
+
"""
|
42 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
|
43 |
texts = text_splitter.split_documents(documents)
|
44 |
|
|
|
53 |
return texts
|
54 |
|
55 |
def create_embeddings():
|
56 |
+
"""
|
57 |
+
Create embeddings using a HuggingFace model.
|
58 |
+
:return: HuggingFaceEmbeddings object.
|
59 |
+
"""
|
60 |
model_name = "sentence-transformers/all-MiniLM-L6-v2"
|
61 |
embeddings = HuggingFaceEmbeddings(model_name=model_name)
|
62 |
|
|
|
70 |
return embeddings
|
71 |
|
72 |
def create_faiss_index(texts, embeddings):
|
73 |
+
"""
|
74 |
+
Create a FAISS index from text chunks and embeddings.
|
75 |
+
:param texts: List of text chunks.
|
76 |
+
:param embeddings: HuggingFaceEmbeddings object.
|
77 |
+
:return: FAISS index object.
|
78 |
+
"""
|
79 |
try:
|
80 |
db = FAISS.from_documents(texts, embeddings)
|
81 |
logger.info(f"Created FAISS index with {len(texts)} vectors")
|
|
|
91 |
return db
|
92 |
|
93 |
def save_faiss_index(db, index_path):
|
94 |
+
"""
|
95 |
+
Save the FAISS index to a specified path.
|
96 |
+
:param db: FAISS index object.
|
97 |
+
:param index_path: Path to save the index.
|
98 |
+
"""
|
99 |
try:
|
100 |
db.save_local(index_path)
|
101 |
logger.info(f"FAISS index saved to {index_path}")
|
|
|
135 |
# Save FAISS index
|
136 |
save_faiss_index(db, index_path)
|
137 |
|
138 |
+
if __name__ == '__main__':
|
139 |
main()
|