# ApnaLawyer / Ingest.py
# Author: adarsh-maurya — "Update Ingest.py" (commit 5402334, verified)
import logging
import os
import sys

import ray
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
# Initialize Ray (safe even if already running).
# NOTE(review): no @ray.remote tasks are visible in this file — confirm Ray
# is actually used before keeping the dependency.
ray.init(ignore_reinit_error=True)

# Logging setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Define FAISS index paths: FAISS.save_local() writes these two files
# (index.faiss + index.pkl) inside the index directory.
index_directory = 'ipc_embed_db'
index_path_faiss = os.path.join(index_directory, 'index.faiss')
index_path_pkl = os.path.join(index_directory, 'index.pkl')

# Ensure index directory exists
os.makedirs(index_directory, exist_ok=True)

# Load documents
logging.info("πŸ“ Loading legal documents from 'data/' directory...")
loader = DirectoryLoader('data', glob="**/*.txt")  # Recursively load .txt files
documents = loader.load()

# Abort early if no documents were found — there is nothing to index.
if not documents:
    logging.error("❌ No documents found in 'data/'. Please add .txt files to proceed.")
    ray.shutdown()
    # sys.exit(1) instead of the bare exit() builtin: exit() is injected by
    # the `site` module and is unavailable under `python -S` or in frozen
    # apps; a non-zero status also signals failure to callers/CI.
    sys.exit(1)

# Split documents into overlapping chunks sized for the embedding model.
logging.info("βœ‚οΈ Splitting documents for embedding...")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

# Load the InLegalBERT embedding model
logging.info("πŸ“¦ Loading HuggingFace embedding model: 'law-ai/InLegalBERT'...")
embeddings = HuggingFaceEmbeddings(model_name="law-ai/InLegalBERT")
# Create and save the FAISS index
def create_faiss_index():
    """Build a fresh FAISS index from the pre-split chunks and persist it.

    Embeds the module-level ``texts`` with the module-level ``embeddings``
    model, saves the resulting index files into ``index_directory``, and
    returns the new FAISS vector store.
    """
    logging.info("βš™οΈ Creating new FAISS index...")
    new_db = FAISS.from_documents(texts, embeddings)
    new_db.save_local(index_directory)
    logging.info("βœ… FAISS index saved in '%s'.", index_directory)
    return new_db
# Load existing index or create if missing
def load_or_create_faiss_index():
    """Return a FAISS store, loading the saved index when both files exist.

    Falls back to :func:`create_faiss_index` when the index files are
    missing, or when loading them fails (the failure is logged as a
    warning and the index is rebuilt from scratch).
    """
    index_on_disk = os.path.exists(index_path_faiss) and os.path.exists(index_path_pkl)
    if not index_on_disk:
        logging.info("❌ FAISS index files not found. Creating new index...")
        return create_faiss_index()

    logging.info("πŸ“‚ Loading existing FAISS index...")
    try:
        # allow_dangerous_deserialization: the pickle being loaded is one
        # this script wrote itself, not untrusted input.
        store = FAISS.load_local(index_directory, embeddings, allow_dangerous_deserialization=True)
    except Exception as e:
        logging.warning("⚠️ Failed to load existing index. Recreating... (%s)", str(e))
        return create_faiss_index()

    logging.info("βœ… FAISS index loaded successfully.")
    return store
# Build the index: load the persisted one if present, otherwise embed and
# save a fresh index from the chunks prepared above.
faiss_db = load_or_create_faiss_index()
# Optional: if you want to use the retriever later
# db_retriever = faiss_db.as_retriever(search_type="similarity", search_kwargs={"k": 3})
# Shut down Ray
ray.shutdown()
logging.info("βœ… Indexing process completed successfully.")