Spaces:

adarsh-maurya
/

ApnaLawyer

Sleeping

App Files Files Community

adarsh-maurya committed on Apr 14

Commit

d883f65

verified ·

1 Parent(s): a965de4

Update Ingest.py

Browse files

Files changed (1) hide show

Ingest.py +53 -43

Ingest.py CHANGED Viewed

@@ -1,10 +1,11 @@
 import ray
 import logging
 from langchain_community.document_loaders import DirectoryLoader
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
-from faiss import IndexFlatL2
 # Initialize Ray
 ray.init()
@@ -12,62 +13,71 @@ ray.init()
 # Set up basic configuration for logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 # Load documents with logging
 logging.info("Loading documents...")
 loader = DirectoryLoader('data', glob="./*.txt")
 documents = loader.load()
-# Extract text from documents and split into manageable chunks with logging
 logging.info("Extracting and splitting texts from documents...")
 text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
 texts = []
 for document in documents:
-    try:
-        if hasattr(document, 'get_text'):
-            text_content = document.get_text()  # Adjust according to actual method
-        else:
-            text_content = ""  # Default to empty string if no text method is available
-        texts.extend(text_splitter.split_text(text_content))
-    except Exception as e:
-        logging.error(f"Error processing document {document}: {e}")
-# Initialize embedding model once outside the loop
-embeddings_model = HuggingFaceEmbeddings(model_name="law-ai/InLegalBERT")
-# Define embedding function (optimized to use pre-initialized model)
 def embedding_function(text):
     return embeddings_model.embed_query(text)
-# Create FAISS index for embeddings (adjust dimension as needed)
-index = IndexFlatL2(768)  # Dimension of embeddings, adjust as needed
-# Assuming docstore as a simple dictionary to store document texts
-docstore = {i: text for i, text in enumerate(texts)}
-index_to_docstore_id = {i: i for i in range(len(texts))}
-# Initialize FAISS
-faiss_db = FAISS(embedding_function, index, docstore, index_to_docstore_id)
-# Process and store embeddings
-logging.info("Storing embeddings in FAISS...")
-for i, text in enumerate(texts):
-    try:
-        embedding = embedding_function(text)
-        faiss_db.add_documents([embedding])
-    except Exception as e:
-        logging.error(f"Error embedding document {i}: {e}")
-# Exporting the vector embeddings database with logging
-logging.info("Exporting the vector embeddings database...")
-try:
-    faiss_db.save_local("ipc_embed_db")
-    logging.info("Export completed successfully.")
-except Exception as e:
-    logging.error(f"Error exporting FAISS database: {e}")
-# Log a message to indicate the completion of the process
-logging.info("Process completed successfully.")
 # Shutdown Ray after the process
 ray.shutdown()

 import ray
 import logging
+import os
 from langchain_community.document_loaders import DirectoryLoader
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
+from faiss import IndexFlatL2  # Assuming using L2 distance for simplicity
 # Initialize Ray
 ray.init()
 # Set up basic configuration for logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+# Directory where the FAISS index is saved
+index_directory = 'ipc_embed_db'
+index_filename = 'index.faiss'
+index_path = os.path.join(index_directory, index_filename)
+# Function to create a new FAISS index if it doesn't exist
+def create_faiss_index(texts, embedding_function):
+    # Create the FAISS index with L2 distance
+    logging.info("Creating a new FAISS index...")
+    index = IndexFlatL2(768)  # Dimensionality of the embeddings
+    docstore = {i: text for i, text in enumerate(texts)}
+    index_to_docstore_id = {i: i for i in range(len(texts))}
+    faiss_db = FAISS(embedding_function, index, docstore, index_to_docstore_id)
+    # Adding documents to the FAISS index
+    logging.info("Adding documents to FAISS index...")
+    for text in texts:
+        embedding = embedding_function(text)
+        faiss_db.add_documents([embedding])
+    # Save the FAISS index to disk
+    logging.info("Saving FAISS index to disk...")
+    faiss_db.save_local(index_directory)
+    logging.info("FAISS index saved successfully.")
+    return faiss_db
+# Function to load an existing FAISS index
+def load_faiss_index(embedding_function):
+    if os.path.exists(index_path):
+        logging.info("Loading existing FAISS index...")
+        faiss_db = FAISS.load_local(index_directory, embedding_function)
+        logging.info("FAISS index loaded successfully.")
+        return faiss_db
+    else:
+        logging.info("FAISS index not found, creating a new one...")
+        return create_faiss_index(texts, embedding_function)
 # Load documents with logging
 logging.info("Loading documents...")
 loader = DirectoryLoader('data', glob="./*.txt")
 documents = loader.load()
+# Extract text from documents and split into manageable chunks
 logging.info("Extracting and splitting texts from documents...")
 text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
 texts = []
 for document in documents:
+    if hasattr(document, 'get_text'):
+        text_content = document.get_text()  # Adjust according to actual method
+    else:
+        text_content = ""  # Default to empty string if no text method is available
+    texts.extend(text_splitter.split_text(text_content))
+# Define embedding function
 def embedding_function(text):
+    embeddings_model = HuggingFaceEmbeddings(model_name="law-ai/InLegalBERT")
     return embeddings_model.embed_query(text)
+# Load or create the FAISS index dynamically
+faiss_db = load_faiss_index(embedding_function)
+# If you need to perform a search or interact with the FAISS index:
+# db_retriever = faiss_db.as_retriever(search_type="similarity", search_kwargs={"k": 3})
 # Shutdown Ray after the process
 ray.shutdown()
+logging.info("Process completed successfully.")