# src/document_processor/processor.py
import logging
import re

from langchain_text_splitters import RecursiveCharacterTextSplitter  # cite: embed_pipeline.py
from langchain_core.documents import Document  # cite: embed_pipeline.py

from config.settings import CHUNK_SIZE, CHUNK_OVERLAP

logger = logging.getLogger(__name__)


def split_documents(docs: list[Document]) -> list[Document]:
    """
    Split loaded documents into smaller chunks.

    Args:
        docs: A list of LangChain Document objects.

    Returns:
        A list of LangChain Document objects representing the chunks.
    """
    # --- Financial Ministry Adaptation ---
    # TODO: Implement a splitting strategy that understands the structure of
    # financial documents. This might involve splitting by sections or articles,
    # or semantic chunking based on document structure, rather than character
    # count alone. Ensure metadata is carried over or enriched during splitting.
    # A structure-aware sketch, split_documents_by_section, is provided at the
    # end of this module.
    # ------------------------------------
    splitter = RecursiveCharacterTextSplitter(  # cite: embed_pipeline.py
        chunk_size=CHUNK_SIZE,  # cite: embed_pipeline.py
        chunk_overlap=CHUNK_OVERLAP,  # cite: embed_pipeline.py
    )
    chunks = splitter.split_documents(docs)  # cite: embed_pipeline.py
    logger.info(f"Split {len(docs)} documents into {len(chunks)} chunks.")
    return chunks


def extract_metadata(doc: Document) -> dict:
    """
    Extract relevant metadata from a document.

    Args:
        doc: A LangChain Document object.

    Returns:
        A dictionary of extracted metadata.
    """
    # --- Financial Ministry Adaptation ---
    # TODO: Implement robust metadata extraction logic specifically for
    # government rulings. It should parse the document content, or use
    # pre-extracted information, to get:
    #   - Date of ruling
    #   - Relevant law or statute references
    #   - Topic(s) of the ruling
    #   - Case number or identifier
    #   - Source file path (already populated by the loader)
    #   - Any other relevant identifiers or classifications.
    # This metadata is CRITICAL for accurate filtering and retrieval.
    # ------------------------------------
    metadata = doc.metadata.copy()

    # Wire in the sketch helpers defined below. Both are generic placeholders,
    # not ministry-specific logic; see the TODO above.
    try:
        metadata["ruling_date"] = parse_date_from_doc(doc)
    except Exception as e:
        logger.warning(f"Could not extract date for {metadata.get('source', 'unknown')}: {e}")
        metadata["ruling_date"] = None  # Or a sensible default

    metadata["topic"] = extract_topic_from_doc(doc)

    return metadata


def process_documents(docs: list[Document]) -> list[Document]:
    """
    Process a list of raw documents by splitting them and extracting metadata.

    Args:
        docs: A list of raw LangChain Document objects.

    Returns:
        A list of processed LangChain Document chunks with enriched metadata.
    """
    chunks = split_documents(docs)
    processed_chunks = []
    for chunk in chunks:
        # Extract/enrich metadata for each chunk.
        chunk.metadata = extract_metadata(chunk)
        processed_chunks.append(chunk)
    logger.info(f"Processed {len(processed_chunks)} chunks with metadata.")
    return processed_chunks


# Metadata extraction helpers. These are deliberately generic, illustrative
# sketches so the module runs end to end; the ministry-specific logic they
# stand in for is still TODO (see extract_metadata above).

# ASSUMPTION: rulings carry an ISO-style date (YYYY-MM-DD) somewhere in the
# text or the source filename. Adjust to the ministry's actual date format.
_ISO_DATE_RE = re.compile(r"\b\d{4}-\d{2}-\d{2}\b")


def parse_date_from_doc(doc: Document) -> str | None:
    """Sketch of date extraction: first ISO-style date in the content or the
    source path, or None if nothing matches."""
    for text in (doc.page_content, str(doc.metadata.get("source", ""))):
        match = _ISO_DATE_RE.search(text)
        if match:
            return match.group(0)
    return None


def extract_topic_from_doc(doc: Document) -> str | None:
    """Sketch of topic extraction: a naive keyword scan, to be replaced with a
    classifier or rule set matching the ministry's taxonomy."""
    # ASSUMPTION: illustrative keywords only, not an agreed taxonomy.
    keywords = {"tax": "taxation", "customs": "customs", "budget": "budget"}
    content = doc.page_content.lower()
    for keyword, topic in keywords.items():
        if keyword in content:
            return topic
    return None
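
# Sketch of the structure-aware splitting strategy the TODO in split_documents
# calls for: cut each document at article/section headings first, then run the
# character splitter only within each section. The heading regex and the
# "section_heading" metadata key are ASSUMPTIONS for illustration; adapt them
# to the ministry's real document conventions.
_SECTION_RE = re.compile(r"(?im)^(?=(?:article|section)\s+\d)")


def split_documents_by_section(docs: list[Document]) -> list[Document]:
    """Sketch: split at section/article headings, carrying metadata over and
    labelling each chunk with the heading it came from."""
    fallback = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
    )
    chunks: list[Document] = []
    for doc in docs:
        # Zero-width lookahead split keeps each heading with its section text.
        sections = [s for s in _SECTION_RE.split(doc.page_content) if s.strip()]
        for section in sections:
            # First line serves as the heading label; any preamble before the
            # first heading simply contributes its own first line.
            heading = section.strip().splitlines()[0].strip()
            # Oversized sections still pass through the character splitter so
            # no chunk exceeds CHUNK_SIZE.
            for piece in fallback.split_text(section):
                chunks.append(
                    Document(
                        page_content=piece,
                        metadata={**doc.metadata, "section_heading": heading},
                    )
                )
    logger.info(f"Split {len(docs)} documents into {len(chunks)} section-aware chunks.")
    return chunks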
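
# A minimal smoke test, runnable only where config.settings provides CHUNK_SIZE
# and CHUNK_OVERLAP; the sample text and source path are fabricated for
# illustration and are not real ministry data.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    sample = Document(
        page_content="Ruling 2023-05-17 on customs duties. " * 50,
        metadata={"source": "rulings/sample_ruling.pdf"},
    )
    for chunk in process_documents([sample])[:2]:
        print(chunk.metadata, repr(chunk.page_content[:60]))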