import ray
import logging
import os
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
# Initialize Ray (safe even if already running)
ray.init(ignore_reinit_error=True)
# Logging setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Define FAISS index paths
index_directory = 'ipc_embed_db'
index_path_faiss = os.path.join(index_directory, 'index.faiss')
index_path_pkl = os.path.join(index_directory, 'index.pkl')
# Ensure index directory exists
os.makedirs(index_directory, exist_ok=True)
# Load documents
logging.info("π Loading legal documents from 'data/' directory...")
loader = DirectoryLoader('data', glob="**/*.txt") # Recursively load .txt files
documents = loader.load()
# Check if any documents were found
if not documents:
    logging.error("No documents found in 'data/'. Please add .txt files to proceed.")
    ray.shutdown()
    exit()
# Split documents into chunks
logging.info("βοΈ Splitting documents for embedding...")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
texts = text_splitter.split_documents(documents)
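# Optional visibility: log how many chunks were produced (uses only values defined above).
logging.info("Split %d documents into %d chunks.", len(documents), len(texts))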
# Load the InLegalBERT embedding model
logging.info("π¦ Loading HuggingFace embedding model: 'law-ai/InLegalBERT'...")
embeddings = HuggingFaceEmbeddings(model_name="law-ai/InLegalBERT")
# Create and save the FAISS index
def create_faiss_index():
    logging.info("Creating new FAISS index...")
    faiss_db = FAISS.from_documents(texts, embeddings)
    faiss_db.save_local(index_directory)
    logging.info("FAISS index saved in '%s'.", index_directory)
    return faiss_db
# Load existing index or create if missing
def load_or_create_faiss_index():
    if os.path.exists(index_path_faiss) and os.path.exists(index_path_pkl):
        logging.info("Loading existing FAISS index...")
        try:
            faiss_db = FAISS.load_local(index_directory, embeddings, allow_dangerous_deserialization=True)
            logging.info("FAISS index loaded successfully.")
            return faiss_db
        except Exception as e:
            logging.warning("Failed to load existing index. Recreating... (%s)", str(e))
    else:
        logging.info("FAISS index files not found. Creating new index...")
    return create_faiss_index()
# Build the index
faiss_db = load_or_create_faiss_index()
# Optional: if you want to use the retriever later
# db_retriever = faiss_db.as_retriever(search_type="similarity", search_kwargs={"k": 3})
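# Sketch of how such a retriever could be queried (hypothetical example query,
# kept commented out like the retriever above):
# relevant_docs = db_retriever.get_relevant_documents("What does Section 302 of the IPC cover?")
# for doc in relevant_docs:
#     print(doc.page_content[:200])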
# Shut down Ray
ray.shutdown()
logging.info("β
Indexing process completed successfully.")