Spaces:

adarsh-maurya
/

ApnaLawyer

Sleeping

App Files Files Community

adarsh-maurya committed on Apr 14

Commit

a1f5731

verified ·

1 Parent(s): d883f65

Update Ingest.py

Browse files

Files changed (1) hide show

Ingest.py +31 -50

Ingest.py CHANGED Viewed

@@ -5,7 +5,6 @@ from langchain_community.document_loaders import DirectoryLoader
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
-from faiss import IndexFlatL2  # Assuming using L2 distance for simplicity
 # Initialize Ray
 ray.init()
@@ -15,69 +14,51 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(
 # Directory where the FAISS index is saved
 index_directory = 'ipc_embed_db'
-index_filename = 'index.faiss'
-index_path = os.path.join(index_directory, index_filename)
-# Function to create a new FAISS index if it doesn't exist
-def create_faiss_index(texts, embedding_function):
-    # Create the FAISS index with L2 distance
-    logging.info("Creating a new FAISS index...")
-    index = IndexFlatL2(768)  # Dimensionality of the embeddings
-    docstore = {i: text for i, text in enumerate(texts)}
-    index_to_docstore_id = {i: i for i in range(len(texts))}
-    faiss_db = FAISS(embedding_function, index, docstore, index_to_docstore_id)
-    # Adding documents to the FAISS index
-    logging.info("Adding documents to FAISS index...")
-    for text in texts:
-        embedding = embedding_function(text)
-        faiss_db.add_documents([embedding])
-    # Save the FAISS index to disk
-    logging.info("Saving FAISS index to disk...")
     faiss_db.save_local(index_directory)
-    logging.info("FAISS index saved successfully.")
     return faiss_db
-# Function to load an existing FAISS index
-def load_faiss_index(embedding_function):
-    if os.path.exists(index_path):
         logging.info("Loading existing FAISS index...")
-        faiss_db = FAISS.load_local(index_directory, embedding_function)
         logging.info("FAISS index loaded successfully.")
         return faiss_db
     else:
-        logging.info("FAISS index not found, creating a new one...")
-        return create_faiss_index(texts, embedding_function)
-# Load documents with logging
-logging.info("Loading documents...")
-loader = DirectoryLoader('data', glob="./*.txt")
-documents = loader.load()
-# Extract text from documents and split into manageable chunks
-logging.info("Extracting and splitting texts from documents...")
-text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
-texts = []
-for document in documents:
-    if hasattr(document, 'get_text'):
-        text_content = document.get_text()  # Adjust according to actual method
-    else:
-        text_content = ""  # Default to empty string if no text method is available
-    texts.extend(text_splitter.split_text(text_content))
-# Define embedding function
-def embedding_function(text):
-    embeddings_model = HuggingFaceEmbeddings(model_name="law-ai/InLegalBERT")
-    return embeddings_model.embed_query(text)
-# Load or create the FAISS index dynamically
-faiss_db = load_faiss_index(embedding_function)
-# If you need to perform a search or interact with the FAISS index:
 # db_retriever = faiss_db.as_retriever(search_type="similarity", search_kwargs={"k": 3})
 # Shutdown Ray after the process
 ray.shutdown()
 logging.info("Process completed successfully.")

 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
 # Initialize Ray
 ray.init()
 # Directory where the FAISS index is saved
 index_directory = 'ipc_embed_db'
+index_path_faiss = os.path.join(index_directory, 'index.faiss')
+index_path_pkl = os.path.join(index_directory, 'index.pkl')
+# Ensure the index directory exists
+os.makedirs(index_directory, exist_ok=True)
+# Load documents
+logging.info("Loading documents...")
+loader = DirectoryLoader('data', glob="./*.txt")
+documents = loader.load()
+# Split documents into manageable chunks
+logging.info("Splitting documents into chunks...")
+text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
+texts = text_splitter.split_documents(documents)
+# Load embedding model once
+logging.info("Loading embedding model...")
+embeddings = HuggingFaceEmbeddings(model_name="law-ai/InLegalBERT")
+# Function to create and save FAISS index
+def create_faiss_index():
+    logging.info("Creating new FAISS index from documents...")
+    faiss_db = FAISS.from_documents(texts, embeddings)
     faiss_db.save_local(index_directory)
+    logging.info("FAISS index created and saved.")
     return faiss_db
+# Function to load or create FAISS index
+def load_or_create_faiss_index():
+    if os.path.exists(index_path_faiss) and os.path.exists(index_path_pkl):
         logging.info("Loading existing FAISS index...")
+        faiss_db = FAISS.load_local(index_directory, embeddings, allow_dangerous_deserialization=True)
         logging.info("FAISS index loaded successfully.")
         return faiss_db
     else:
+        logging.info("FAISS index not found. Creating a new one...")
+        return create_faiss_index()
+# Load or create the index
+faiss_db = load_or_create_faiss_index()
+# Optional: If you want to use the retriever later
 # db_retriever = faiss_db.as_retriever(search_type="similarity", search_kwargs={"k": 3})
 # Shutdown Ray after the process
 ray.shutdown()
 logging.info("Process completed successfully.")