Update ingest.py
Browse files
ingest.py
CHANGED
@@ -1,14 +1,15 @@
|
|
1 |
import os
|
2 |
import logging
|
|
|
|
|
3 |
from langchain_community.document_loaders import PDFMinerLoader
|
4 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
5 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
6 |
-
from langchain_community.vectorstores import Chroma
|
7 |
|
8 |
logging.basicConfig(level=logging.INFO)
|
9 |
logger = logging.getLogger(__name__)
|
10 |
|
11 |
-
def create_chroma_db():
|
12 |
documents = []
|
13 |
docs_dir = "docs"
|
14 |
|
@@ -52,12 +53,17 @@ def create_chroma_db():
|
|
52 |
logger.error(f"Failed to initialize embeddings: {e}")
|
53 |
return
|
54 |
|
|
|
|
|
|
|
55 |
try:
|
56 |
-
|
57 |
-
|
|
|
|
|
58 |
except Exception as e:
|
59 |
-
logger.error(f"Failed to create
|
60 |
return
|
61 |
|
62 |
if __name__ == "__main__":
|
63 |
-
|
|
|
1 |
import os
|
2 |
import logging
|
3 |
+
import faiss
|
4 |
+
import numpy as np
|
5 |
from langchain_community.document_loaders import PDFMinerLoader
|
6 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
7 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
|
|
8 |
|
9 |
logging.basicConfig(level=logging.INFO)
|
10 |
logger = logging.getLogger(__name__)
|
11 |
|
12 |
+
def create_faiss_index():
|
13 |
documents = []
|
14 |
docs_dir = "docs"
|
15 |
|
|
|
53 |
logger.error(f"Failed to initialize embeddings: {e}")
|
54 |
return
|
55 |
|
56 |
+
embedding_vectors = np.array([embeddings.embed(text) for text in texts])
|
57 |
+
dimension = embedding_vectors.shape[1]
|
58 |
+
|
59 |
try:
|
60 |
+
faiss_index = faiss.IndexFlatL2(dimension)
|
61 |
+
faiss_index.add(embedding_vectors)
|
62 |
+
faiss.write_index(faiss_index, "faiss_index.index")
|
63 |
+
logger.info(f"Created FAISS index with {len(texts)} vectors.")
|
64 |
except Exception as e:
|
65 |
+
logger.error(f"Failed to create FAISS index: {e}")
|
66 |
return
|
67 |
|
68 |
if __name__ == "__main__":
|
69 |
+
create_faiss_index()
|