# src/document_processor/processor.py
from langchain_text_splitters import RecursiveCharacterTextSplitter  # cite: embed_pipeline.py
from langchain.schema import Document  # cite: embed_pipeline.py

from config.settings import CHUNK_SIZE, CHUNK_OVERLAP

import logging
import re  # used by the illustrative helpers sketched below
from datetime import date  # used by the illustrative helpers sketched below

logger = logging.getLogger(__name__)


def split_documents(docs: list[Document]) -> list[Document]:
    """
    Splits loaded documents into smaller chunks.

    Args:
        docs: A list of LangChain Document objects.

    Returns:
        A list of LangChain Document objects representing the chunks.
    """
    # --- Financial Ministry Adaptation ---
    # TODO: Implement a splitting strategy that understands the structure of
    # financial documents. This might involve splitting by sections, articles,
    # or semantic chunking based on document structure, rather than character
    # count alone. Ensure metadata is carried over or enriched during
    # splitting. (A hedged sketch of a section-aware splitter follows this
    # function.)
    # ------------------------------------
    splitter = RecursiveCharacterTextSplitter(  # cite: embed_pipeline.py
        chunk_size=CHUNK_SIZE,  # cite: embed_pipeline.py
        chunk_overlap=CHUNK_OVERLAP,  # cite: embed_pipeline.py
    )
    chunks = splitter.split_documents(docs)  # cite: embed_pipeline.py
    logger.info(f"Split {len(docs)} documents into {len(chunks)} chunks.")
    return chunks
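

# A hedged sketch of the structure-aware splitting described in the TODO
# above, assuming rulings mark their parts with "Section N" / "Article N"
# headings. The regex and the helper name are illustrative, not part of the
# original pipeline.
_SECTION_HEADING = re.compile(r"^(?:Section|Article)\s+\d+", re.MULTILINE)


def split_by_sections(doc: Document) -> list[Document]:
    """Split one document at section/article headings, preserving metadata."""
    starts = [m.start() for m in _SECTION_HEADING.finditer(doc.page_content)]
    if not starts or starts[0] != 0:
        starts = [0] + starts  # keep any preamble before the first heading
    bounds = starts + [len(doc.page_content)]
    sections = []
    for i, (begin, end) in enumerate(zip(bounds, bounds[1:])):
        text = doc.page_content[begin:end].strip()
        if text:
            # Carry the parent metadata over and record the section index.
            meta = dict(doc.metadata, section_index=i)
            sections.append(Document(page_content=text, metadata=meta))
    return sections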


def extract_metadata(doc: Document) -> dict:
    """
    Extracts relevant metadata from a document.

    Args:
        doc: A LangChain Document object.

    Returns:
        A dictionary of extracted metadata.
    """
    # --- Financial Ministry Adaptation ---
    # TODO: Implement robust metadata extraction logic specifically for
    # government rulings. This should parse the document content or use
    # pre-extracted information to get:
    #   - Date of ruling
    #   - Relevant law or statute references (see the sketch after this function)
    #   - Topic(s) of the ruling
    #   - Case number or identifier
    #   - Source file path (already present in the loader's metadata)
    #   - Any other relevant identifiers or classifications.
    # This metadata is CRITICAL for accurate filtering and retrieval.
    # ------------------------------------
    metadata = doc.metadata.copy()

    # Example: placeholder for parsing a date from content or filename.
    # try:
    #     metadata['ruling_date'] = parse_date_from_doc(doc)
    # except Exception as e:
    #     logger.warning(f"Could not extract date for {metadata.get('source', 'unknown')}: {e}")
    #     metadata['ruling_date'] = None  # Or a default value

    # Example: placeholder for extracting a topic from content.
    # metadata['topic'] = extract_topic_from_doc(doc)
    return metadata
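

# A hedged sketch of two TODO items above: statute references and case
# numbers. Both regexes are assumptions about how the rulings cite laws and
# label cases, and would need tuning against real documents.
def extract_statute_refs(doc: Document) -> dict:
    """Sketch: regex out statute citations and a case identifier."""
    text = doc.page_content
    case = re.search(r"Case\s+No\.?\s*([\w/-]+)", text)
    return {
        "statute_refs": re.findall(r"(?:Law|Statute)\s+No\.?\s*[\d/-]+", text),
        "case_number": case.group(1) if case else None,
    }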


def process_documents(docs: list[Document]) -> list[Document]:
    """
    Processes a list of raw documents by splitting and extracting metadata.

    Args:
        docs: A list of raw LangChain Document objects.

    Returns:
        A list of processed LangChain Document chunks with enriched metadata.
    """
    chunks = split_documents(docs)
    processed_chunks = []
    for chunk in chunks:
        # Extract/enrich metadata for each chunk.
        chunk.metadata = extract_metadata(chunk)
        processed_chunks.append(chunk)
    logger.info(f"Processed {len(chunks)} chunks with metadata.")
    return processed_chunks


# Placeholder metadata helpers, filled in with hedged sketches: the
# YYYY-MM-DD filename pattern and the keyword list are illustrative assumptions.
def parse_date_from_doc(doc: Document) -> date | None:
    """Sketch: pull a YYYY-MM-DD date out of the source file path."""
    match = re.search(r"\d{4}-\d{2}-\d{2}", doc.metadata.get("source", ""))
    return date.fromisoformat(match.group()) if match else None


def extract_topic_from_doc(doc: Document) -> str | None:
    """Sketch: naive keyword scan over the start of the content."""
    head = doc.page_content[:500].lower()
    return next((t for t in ("tax", "customs", "budget") if t in head), None)
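

# A hedged usage sketch with an inline sample document; in the real pipeline
# the documents would come from the loader module instead.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    sample = [
        Document(
            page_content="Section 1\nRuling on tax relief, Case No. 42/2024 ...",
            metadata={"source": "rulings/2024-01-15_case42.pdf"},
        )
    ]
    for chunk in process_documents(sample):
        print(chunk.metadata)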