# src/document_processor/processor.py
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from config.settings import CHUNK_SIZE, CHUNK_OVERLAP
import logging

logger = logging.getLogger(__name__)


def split_documents(docs: list[Document]) -> list[Document]:
    """
    Split loaded documents into smaller chunks.

    Args:
        docs: A list of LangChain Document objects.

    Returns:
        A list of LangChain Document objects representing the chunks.
    """
    # --- Financial Ministry Adaptation ---
    # TODO: Implement a splitting strategy that understands the structure of
    # financial documents. This might involve splitting by sections or
    # articles, or using semantic chunking based on document structure,
    # rather than character count alone. Ensure metadata is carried over or
    # enriched during splitting. (A hedged sketch of one possible approach
    # follows this function.)
    # ------------------------------------
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )
    chunks = splitter.split_documents(docs)
    logger.info(f"Split {len(docs)} documents into {len(chunks)} chunks.")
    return chunks
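

# The TODO above asks for structure-aware splitting. The helper below is a
# minimal, hypothetical sketch of one such approach: pre-split each document
# on "Section N" / "Article N" headings before applying the character
# splitter, so chunk boundaries tend to fall on structural boundaries. The
# heading regex and the 'section' metadata key are assumptions for
# illustration, not part of the existing pipeline.
def split_documents_by_section(docs: list[Document]) -> list[Document]:
    """Sketch: split on section/article headings first, then by size."""
    import re  # local import to keep this sketch self-contained

    heading = re.compile(r"(?=^(?:Section|Article)\s+\d+)", re.MULTILINE)
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )
    sectioned: list[Document] = []
    for doc in docs:
        parts = [p for p in heading.split(doc.page_content) if p.strip()]
        for part in parts:
            # Carry the original metadata over and record the section heading.
            metadata = dict(doc.metadata)
            metadata["section"] = part.strip().splitlines()[0][:100]
            sectioned.append(Document(page_content=part, metadata=metadata))
    # Oversized sections still fall back to plain character splitting.
    return splitter.split_documents(sectioned)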


def extract_metadata(doc: Document) -> dict:
    """
    Extract relevant metadata from a document.

    Args:
        doc: A LangChain Document object.

    Returns:
        A dictionary of extracted metadata.
    """
    # --- Financial Ministry Adaptation ---
    # TODO: Implement robust metadata extraction logic specifically for
    # government rulings. This should parse the document content or use
    # pre-extracted information to get:
    #   - Date of ruling
    #   - Relevant law or statute references
    #   - Topic(s) of the ruling
    #   - Case number or identifier
    #   - Source file path (already included by the loader)
    #   - Any other relevant identifiers or classifications
    # This metadata is CRITICAL for accurate filtering and retrieval.
    # (An illustrative regex-based sketch follows this function.)
    # ------------------------------------
    metadata = doc.metadata.copy()
    # Example: placeholder for parsing a date from content or filename
    # try:
    #     metadata['ruling_date'] = parse_date_from_doc(doc)
    # except Exception as e:
    #     logger.warning(f"Could not extract date for {metadata.get('source', 'unknown')}: {e}")
    #     metadata['ruling_date'] = None  # Or a default value
    # Example: placeholder for extracting a topic from content
    # metadata['topic'] = extract_topic_from_doc(doc)
    return metadata
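

# One possible shape for part of the extraction requested in the TODO above:
# pull a case number and statute references out of the text with regular
# expressions. The patterns and the 'case_number' / 'statute_references'
# metadata keys are illustrative assumptions; real rulings will need patterns
# tuned to their actual citation format.
def extract_reference_metadata(doc: Document) -> dict:
    """Sketch: regex-based case-number and statute-reference extraction."""
    import re  # local import to keep this sketch self-contained

    text = doc.page_content
    extracted: dict = {}
    # Hypothetical case-number format, e.g. "Case No. 2023/0415".
    case_match = re.search(r"Case\s+No\.?\s*(\d{4}/\d+)", text, re.IGNORECASE)
    if case_match:
        extracted["case_number"] = case_match.group(1)
    # Hypothetical statute citation format, e.g. "Section 12 of the Income Tax Act".
    extracted["statute_references"] = re.findall(
        r"Section\s+\d+[A-Za-z]?\s+of\s+the\s+[A-Z][\w\s]{0,40}?Act", text
    )
    return extracted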


def process_documents(docs: list[Document]) -> list[Document]:
    """
    Process a list of raw documents by splitting them and extracting metadata.

    Args:
        docs: A list of raw LangChain Document objects.

    Returns:
        A list of processed LangChain Document chunks with enriched metadata.
    """
    chunks = split_documents(docs)
    processed_chunks = []
    for chunk in chunks:
        # Extract/enrich metadata for each chunk.
        chunk.metadata = extract_metadata(chunk)
        processed_chunks.append(chunk)
    logger.info(f"Processed {len(chunks)} chunks with metadata.")
    return processed_chunks
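

# Minimal usage sketch of the processing step, assuming documents are already
# loaded; in the real pipeline they would come from a loader and the resulting
# chunks would go on to embedding. The inline Document and its metadata are
# made up for illustration.
def _demo_process_documents() -> None:
    """Sketch: run process_documents on a single in-memory document."""
    sample_docs = [
        Document(
            page_content="Section 1. Example ruling text about corporate tax.",
            metadata={"source": "example_ruling.pdf"},
        )
    ]
    for chunk in process_documents(sample_docs):
        print(chunk.metadata)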


# Placeholder functions for metadata extraction (to be implemented).
def parse_date_from_doc(doc: Document):
    """Placeholder for date extraction logic."""
    pass


def extract_topic_from_doc(doc: Document):
    """Placeholder for topic extraction logic."""
    pass
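

# Hedged sketches of what the two placeholders above might become. The ISO
# date pattern, the reliance on a 'source' path, and the topic keyword map are
# assumptions made up for illustration; they are not derived from the real
# corpus and would need tuning.
def parse_date_from_doc_sketch(doc: Document):
    """Sketch: look for an ISO-style date in the content, then in the filename."""
    import re  # local imports keep this sketch self-contained
    from datetime import date

    for candidate in (doc.page_content[:500], str(doc.metadata.get("source", ""))):
        match = re.search(r"\d{4}-\d{2}-\d{2}", candidate)
        if match:
            try:
                return date.fromisoformat(match.group(0))
            except ValueError:
                continue
    return None


def extract_topic_from_doc_sketch(doc: Document):
    """Sketch: naive keyword matching; a proper classifier would likely do better."""
    keyword_topics = {
        "value added tax": "VAT",
        "income tax": "income tax",
        "customs": "customs",
        "subsidy": "subsidies",
    }
    text = doc.page_content.lower()
    return sorted({topic for keyword, topic in keyword_topics.items() if keyword in text})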