# ApnaLawyer / Ingest.py
# Author: adarsh-maurya — "Update Ingest.py" (commit 5402334, verified)
import logging
import os
import sys

import ray
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
# Initialize Ray (safe even if already running).
# NOTE(review): no @ray.remote tasks are visible in this file — confirm Ray
# is actually used before keeping the dependency.
ray.init(ignore_reinit_error=True)

# Logging setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Define FAISS index paths: FAISS.save_local() writes these two files
# (index.faiss + index.pkl) inside the index directory.
index_directory = 'ipc_embed_db'
index_path_faiss = os.path.join(index_directory, 'index.faiss')
index_path_pkl = os.path.join(index_directory, 'index.pkl')

# Ensure index directory exists
os.makedirs(index_directory, exist_ok=True)

# Load documents
logging.info("πŸ“ Loading legal documents from 'data/' directory...")
loader = DirectoryLoader('data', glob="**/*.txt")  # Recursively load .txt files
documents = loader.load()

# Abort early if no documents were found — there is nothing to index.
if not documents:
    logging.error("❌ No documents found in 'data/'. Please add .txt files to proceed.")
    ray.shutdown()
    # sys.exit(1) instead of the bare exit() builtin: exit() is injected by
    # the `site` module and is unavailable under `python -S` or in frozen
    # apps; a non-zero status also signals failure to callers/CI.
    sys.exit(1)

# Split documents into overlapping chunks sized for the embedding model.
logging.info("βœ‚οΈ Splitting documents for embedding...")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

# Load the InLegalBERT embedding model
logging.info("πŸ“¦ Loading HuggingFace embedding model: 'law-ai/InLegalBERT'...")
embeddings = HuggingFaceEmbeddings(model_name="law-ai/InLegalBERT")
# Create and save the FAISS index
def create_faiss_index():
    """Build a fresh FAISS index from the pre-split chunks and persist it.

    Embeds the module-level ``texts`` with the module-level ``embeddings``
    model, saves the resulting index files into ``index_directory``, and
    returns the new FAISS vector store.
    """
    logging.info("βš™οΈ Creating new FAISS index...")
    new_db = FAISS.from_documents(texts, embeddings)
    new_db.save_local(index_directory)
    logging.info("βœ… FAISS index saved in '%s'.", index_directory)
    return new_db
# Load existing index or create if missing
def load_or_create_faiss_index():
    """Return a FAISS store, loading the saved index when both files exist.

    Falls back to :func:`create_faiss_index` when the index files are
    missing, or when loading them fails (the failure is logged as a
    warning and the index is rebuilt from scratch).
    """
    index_on_disk = os.path.exists(index_path_faiss) and os.path.exists(index_path_pkl)
    if not index_on_disk:
        logging.info("❌ FAISS index files not found. Creating new index...")
        return create_faiss_index()

    logging.info("πŸ“‚ Loading existing FAISS index...")
    try:
        # allow_dangerous_deserialization: the pickle being loaded is one
        # this script wrote itself, not untrusted input.
        store = FAISS.load_local(index_directory, embeddings, allow_dangerous_deserialization=True)
    except Exception as e:
        logging.warning("⚠️ Failed to load existing index. Recreating... (%s)", str(e))
        return create_faiss_index()

    logging.info("βœ… FAISS index loaded successfully.")
    return store
# Build the index: load the persisted one if present, otherwise embed and
# save a fresh index from the chunks prepared above.
faiss_db = load_or_create_faiss_index()
# Optional: if you want to use the retriever later
# db_retriever = faiss_db.as_retriever(search_type="similarity", search_kwargs={"k": 3})
# Shut down Ray
ray.shutdown()
logging.info("βœ… Indexing process completed successfully.")