# NOTE(review): the original paste began with Hugging Face Spaces UI residue
# ("Spaces: Sleeping Sleeping") — not part of the program; kept only as a comment.
import logging
import os

import ray
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Initialize Ray (safe even if already running).
ray.init(ignore_reinit_error=True)

# Logging setup.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# FAISS index location: FAISS.save_local() writes index.faiss + index.pkl here.
index_directory = 'ipc_embed_db'
index_path_faiss = os.path.join(index_directory, 'index.faiss')
index_path_pkl = os.path.join(index_directory, 'index.pkl')

# Ensure the index directory exists before any save/load attempt.
os.makedirs(index_directory, exist_ok=True)

# Load documents: recursively pick up every .txt file under data/.
logging.info("Loading legal documents from 'data/' directory...")
loader = DirectoryLoader('data', glob="**/*.txt")
documents = loader.load()

# Abort cleanly (non-zero status) if there is nothing to index.
if not documents:
    logging.error("No documents found in 'data/'. Please add .txt files to proceed.")
    ray.shutdown()
    raise SystemExit(1)

# Split documents into overlapping chunks suitable for embedding.
logging.info("Splitting documents for embedding...")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

# Load the InLegalBERT embedding model (downloads on first run).
logging.info("Loading HuggingFace embedding model: 'law-ai/InLegalBERT'...")
embeddings = HuggingFaceEmbeddings(model_name="law-ai/InLegalBERT")
# Create and save the FAISS index | |
def create_faiss_index():
    """Build a FAISS index from the pre-split ``texts`` using ``embeddings``.

    Persists the index to ``index_directory`` (index.faiss + index.pkl)
    and returns the in-memory FAISS vector store.
    """
    logging.info("Creating new FAISS index...")
    faiss_db = FAISS.from_documents(texts, embeddings)
    faiss_db.save_local(index_directory)
    logging.info("FAISS index saved in '%s'.", index_directory)
    return faiss_db
# Load existing index or create if missing | |
def load_or_create_faiss_index():
    """Return a FAISS vector store, loading the persisted one when possible.

    If both index files exist, try to load them; on any load failure (or if
    the files are missing) fall back to building a fresh index via
    :func:`create_faiss_index`.

    NOTE: ``allow_dangerous_deserialization=True`` makes load_local unpickle
    index.pkl — acceptable only because this script created the files itself;
    never point it at an untrusted index directory.
    """
    if os.path.exists(index_path_faiss) and os.path.exists(index_path_pkl):
        logging.info("Loading existing FAISS index...")
        try:
            faiss_db = FAISS.load_local(
                index_directory, embeddings, allow_dangerous_deserialization=True
            )
            logging.info("FAISS index loaded successfully.")
            return faiss_db
        except Exception as e:
            # Corrupt or version-incompatible index files: rebuild from scratch.
            logging.warning("Failed to load existing index. Recreating... (%s)", str(e))
    else:
        logging.info("FAISS index files not found. Creating new index...")
    return create_faiss_index()
# Build (or load) the index.
faiss_db = load_or_create_faiss_index()

# Optional: expose a retriever for downstream use.
# db_retriever = faiss_db.as_retriever(search_type="similarity", search_kwargs={"k": 3})

# Release Ray resources now that indexing is done.
ray.shutdown()
logging.info("Indexing process completed successfully.")