"""Build, or load from disk, a FAISS vector index over the text files in ``data/``.

The script loads every ``.txt`` file under ``data/``, splits the documents
into overlapping chunks, embeds them with the ``law-ai/InLegalBERT`` model and
stores the resulting FAISS index in ``ipc_embed_db/``. If both index files
already exist there, the index is loaded instead of being rebuilt.
"""
import logging
import os
import sys

import ray
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Initialize Ray (safe even if already running)
# NOTE(review): nothing visible in this script dispatches work to Ray;
# presumably it is kept for a downstream dependency -- confirm before removing.
ray.init(ignore_reinit_error=True)

# Logging setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Define FAISS index paths
index_directory = 'ipc_embed_db'
index_path_faiss = os.path.join(index_directory, 'index.faiss')
index_path_pkl = os.path.join(index_directory, 'index.pkl')


# Create and save the FAISS index
def create_faiss_index():
    """Build a FAISS index from the module-level ``texts`` and save it.

    Reads the module-level ``texts`` and ``embeddings`` at call time, so both
    must be assigned before this function is called.

    Returns:
        The newly created FAISS vector store, after it has been written to
        ``index_directory``.
    """
    logging.info("⚙️ Creating new FAISS index...")
    faiss_db = FAISS.from_documents(texts, embeddings)
    faiss_db.save_local(index_directory)
    logging.info("✅ FAISS index saved in '%s'.", index_directory)
    return faiss_db


# Load existing index or create if missing
def load_or_create_faiss_index():
    """Return the on-disk FAISS index, rebuilding it when missing or unreadable.

    The existing index is used only when both ``index.faiss`` and
    ``index.pkl`` are present in ``index_directory``. Any failure while
    loading is logged as a warning and the index is recreated from scratch.

    Returns:
        The loaded or freshly created FAISS vector store.
    """
    if os.path.exists(index_path_faiss) and os.path.exists(index_path_pkl):
        logging.info("📂 Loading existing FAISS index...")
        try:
            # SECURITY: this flag opts in to unpickling ``index.pkl``. Only
            # load index directories that this script produced itself; never
            # point it at files from an untrusted source.
            faiss_db = FAISS.load_local(
                index_directory,
                embeddings,
                allow_dangerous_deserialization=True,
            )
        except Exception as e:
            # Deliberate best-effort: a corrupt or incompatible index is not
            # fatal because it can be rebuilt from the source documents.
            logging.warning("⚠️ Failed to load existing index. Recreating... (%s)", e)
        else:
            logging.info("✅ FAISS index loaded successfully.")
            return faiss_db
    else:
        logging.info("❌ FAISS index files not found. Creating new index...")
    return create_faiss_index()


try:
    # Ensure index directory exists
    os.makedirs(index_directory, exist_ok=True)

    # Load documents
    logging.info("📁 Loading legal documents from 'data/' directory...")
    loader = DirectoryLoader('data', glob="**/*.txt")  # Recursively load .txt files
    documents = loader.load()

    # Check if any documents were found
    if not documents:
        logging.error("❌ No documents found in 'data/'. Please add .txt files to proceed.")
        # Exit with a non-zero status so callers can detect the failure;
        # the ``finally`` clause below still shuts Ray down.
        sys.exit(1)

    # Split documents into chunks
    logging.info("✂️ Splitting documents for embedding...")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
    texts = text_splitter.split_documents(documents)

    # Load the InLegalBERT embedding model
    logging.info("📦 Loading HuggingFace embedding model: 'law-ai/InLegalBERT'...")
    embeddings = HuggingFaceEmbeddings(model_name="law-ai/InLegalBERT")

    # Build the index
    faiss_db = load_or_create_faiss_index()

    # Optional: if you want to use the retriever later
    # db_retriever = faiss_db.as_retriever(search_type="similarity", search_kwargs={"k": 3})
finally:
    # Shut down Ray, even when loading, embedding or indexing fails.
    ray.shutdown()

logging.info("✅ Indexing process completed successfully.")