adarsh-maurya committed on
Commit
d883f65
·
verified ·
1 Parent(s): a965de4

Update Ingest.py

Browse files
Files changed (1) hide show
  1. Ingest.py +53 -43
Ingest.py CHANGED
@@ -1,10 +1,11 @@
1
  import ray
2
  import logging
 
3
  from langchain_community.document_loaders import DirectoryLoader
4
  from langchain_community.embeddings import HuggingFaceEmbeddings
5
  from langchain.text_splitter import RecursiveCharacterTextSplitter
6
  from langchain_community.vectorstores import FAISS
7
- from faiss import IndexFlatL2
8
 
9
  # Initialize Ray
10
  ray.init()
@@ -12,62 +13,71 @@ ray.init()
12
  # Set up basic configuration for logging
13
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  # Load documents with logging
16
  logging.info("Loading documents...")
17
  loader = DirectoryLoader('data', glob="./*.txt")
18
  documents = loader.load()
19
 
20
- # Extract text from documents and split into manageable chunks with logging
21
  logging.info("Extracting and splitting texts from documents...")
22
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
23
  texts = []
24
  for document in documents:
25
- try:
26
- if hasattr(document, 'get_text'):
27
- text_content = document.get_text() # Adjust according to actual method
28
- else:
29
- text_content = "" # Default to empty string if no text method is available
30
-
31
- texts.extend(text_splitter.split_text(text_content))
32
- except Exception as e:
33
- logging.error(f"Error processing document {document}: {e}")
34
-
35
- # Initialize embedding model once outside the loop
36
- embeddings_model = HuggingFaceEmbeddings(model_name="law-ai/InLegalBERT")
37
 
38
- # Define embedding function (optimized to use pre-initialized model)
39
  def embedding_function(text):
 
40
  return embeddings_model.embed_query(text)
41
 
42
- # Create FAISS index for embeddings (adjust dimension as needed)
43
- index = IndexFlatL2(768) # Dimension of embeddings, adjust as needed
44
 
45
- # Assuming docstore as a simple dictionary to store document texts
46
- docstore = {i: text for i, text in enumerate(texts)}
47
- index_to_docstore_id = {i: i for i in range(len(texts))}
48
-
49
- # Initialize FAISS
50
- faiss_db = FAISS(embedding_function, index, docstore, index_to_docstore_id)
51
-
52
- # Process and store embeddings
53
- logging.info("Storing embeddings in FAISS...")
54
- for i, text in enumerate(texts):
55
- try:
56
- embedding = embedding_function(text)
57
- faiss_db.add_documents([embedding])
58
- except Exception as e:
59
- logging.error(f"Error embedding document {i}: {e}")
60
-
61
- # Exporting the vector embeddings database with logging
62
- logging.info("Exporting the vector embeddings database...")
63
- try:
64
- faiss_db.save_local("ipc_embed_db")
65
- logging.info("Export completed successfully.")
66
- except Exception as e:
67
- logging.error(f"Error exporting FAISS database: {e}")
68
-
69
- # Log a message to indicate the completion of the process
70
- logging.info("Process completed successfully.")
71
 
72
  # Shutdown Ray after the process
73
  ray.shutdown()
 
 
 
1
  import ray
2
  import logging
3
+ import os
4
  from langchain_community.document_loaders import DirectoryLoader
5
  from langchain_community.embeddings import HuggingFaceEmbeddings
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
7
  from langchain_community.vectorstores import FAISS
8
+ from faiss import IndexFlatL2 # Assuming using L2 distance for simplicity
9
 
10
  # Initialize Ray
11
  ray.init()
 
13
  # Set up basic configuration for logging
14
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
15
 
16
+ # Directory where the FAISS index is saved
17
+ index_directory = 'ipc_embed_db'
18
+ index_filename = 'index.faiss'
19
+ index_path = os.path.join(index_directory, index_filename)
20
+
21
+ # Function to create a new FAISS index if it doesn't exist
22
def create_faiss_index(texts, embedding_function):
    """Build a FAISS vector store from text chunks and save it to disk.

    Args:
        texts: Iterable of text chunks (strings) to index.
        embedding_function: Callable that maps one string to its embedding
            vector (a sequence of floats).

    Returns:
        The FAISS vector store, after it has been written to
        ``index_directory``.
    """
    # Imported locally because only this function needs it.
    from langchain_community.docstore.in_memory import InMemoryDocstore

    logging.info("Creating a new FAISS index...")
    chunks = list(texts)

    logging.info("Adding documents to FAISS index...")
    vectors = [embedding_function(chunk) for chunk in chunks]

    # Size the index from the real embeddings; fall back to the previously
    # hard-coded 768 only when there is nothing to embed.
    dimension = len(vectors[0]) if vectors else 768

    # The vector store needs a real docstore object (not a plain dict) and
    # starts empty; add_embeddings fills the index, the docstore and the
    # index-to-docstore-id mapping together so they stay consistent.
    faiss_db = FAISS(
        embedding_function,
        IndexFlatL2(dimension),
        InMemoryDocstore(),
        {},
    )
    if vectors:
        # add_embeddings takes (text, vector) pairs, so each chunk is
        # embedded exactly once (add_documents expects Document objects,
        # not vectors).
        faiss_db.add_embeddings(list(zip(chunks, vectors)))

    logging.info("Saving FAISS index to disk...")
    faiss_db.save_local(index_directory)
    logging.info("FAISS index saved successfully.")
    return faiss_db
41
+
42
+ # Function to load an existing FAISS index
43
def load_faiss_index(embedding_function):
    """Return the saved FAISS store, creating it first if it does not exist.

    Args:
        embedding_function: Callable that maps one string to its embedding
            vector; passed to the loaded or newly created store.

    Returns:
        The FAISS vector store loaded from ``index_directory``, or a freshly
        built one when ``index_path`` is missing.
    """
    if not os.path.exists(index_path):
        logging.info("FAISS index not found, creating a new one...")
        # NOTE: relies on the module-level `texts` list, which must be
        # populated before this function is called.
        return create_faiss_index(texts, embedding_function)

    logging.info("Loading existing FAISS index...")
    # SECURITY: load_local unpickles the stored docstore. Recent
    # langchain_community releases refuse to do so unless this flag is set.
    # It is acceptable here only because the index directory is written by
    # this same script; never point it at files from an untrusted source.
    faiss_db = FAISS.load_local(
        index_directory,
        embedding_function,
        allow_dangerous_deserialization=True,
    )
    logging.info("FAISS index loaded successfully.")
    return faiss_db
52
+
53
# Load documents with logging
logging.info("Loading documents...")
loader = DirectoryLoader('data', glob="./*.txt")
documents = loader.load()

# Extract text from documents and split into manageable chunks
logging.info("Extracting and splitting texts from documents...")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
texts = []
for document in documents:
    # LangChain Document objects keep their text in `page_content`; they have
    # no `get_text` method, so probing for it made every document come out
    # empty and left the index with nothing in it.
    text_content = getattr(document, "page_content", "") or ""
    if not text_content.strip():
        logging.warning("Skipping document with no text content.")
        continue
    texts.extend(text_splitter.split_text(text_content))
 
 
 
 
 
 
 
68
 
69
+ # Define embedding function
70
def embedding_function(text):
    """Return the InLegalBERT embedding vector for ``text``.

    The HuggingFace model is created on the first call and kept on the
    function object, so later calls reuse it instead of reloading the model
    for every chunk.
    """
    model = getattr(embedding_function, "_model", None)
    if model is None:
        model = HuggingFaceEmbeddings(model_name="law-ai/InLegalBERT")
        embedding_function._model = model
    return model.embed_query(text)
73
 
74
# Load or create the FAISS index dynamically
try:
    faiss_db = load_faiss_index(embedding_function)

    # If you need to perform a search or interact with the FAISS index:
    # db_retriever = faiss_db.as_retriever(search_type="similarity", search_kwargs={"k": 3})

    logging.info("Process completed successfully.")
finally:
    # Shutdown Ray after the process, even when indexing fails part-way
    ray.shutdown()