# src/document_processor/processor.py
from langchain_text_splitters import RecursiveCharacterTextSplitter  # cite: embed_pipeline.py
from langchain.schema import Document  # cite: embed_pipeline.py

from config.settings import CHUNK_SIZE, CHUNK_OVERLAP

import logging
import re  # used by the illustrative helpers sketched below
from datetime import date  # used by the illustrative helpers sketched below

logger = logging.getLogger(__name__)


def split_documents(docs: list[Document]) -> list[Document]:
    """
    Splits loaded documents into smaller chunks.

    Args:
        docs: A list of LangChain Document objects.

    Returns:
        A list of LangChain Document objects representing the chunks.
    """
    # --- Financial Ministry Adaptation ---
    # TODO: Implement a splitting strategy that understands the structure of
    # financial documents. This might involve splitting by sections, articles,
    # or semantic chunking based on document structure, rather than character
    # count alone. Ensure metadata is carried over or enriched during
    # splitting. (A hedged sketch of a section-aware splitter follows this
    # function.)
    # ------------------------------------
    splitter = RecursiveCharacterTextSplitter(  # cite: embed_pipeline.py
        chunk_size=CHUNK_SIZE,  # cite: embed_pipeline.py
        chunk_overlap=CHUNK_OVERLAP,  # cite: embed_pipeline.py
    )
    chunks = splitter.split_documents(docs)  # cite: embed_pipeline.py
    logger.info(f"Split {len(docs)} documents into {len(chunks)} chunks.")
    return chunks
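

# A hedged sketch of the structure-aware splitting described in the TODO
# above, assuming rulings mark their parts with "Section N" / "Article N"
# headings. The regex and the helper name are illustrative, not part of the
# original pipeline.
_SECTION_HEADING = re.compile(r"^(?:Section|Article)\s+\d+", re.MULTILINE)


def split_by_sections(doc: Document) -> list[Document]:
    """Split one document at section/article headings, preserving metadata."""
    starts = [m.start() for m in _SECTION_HEADING.finditer(doc.page_content)]
    if not starts or starts[0] != 0:
        starts = [0] + starts  # keep any preamble before the first heading
    bounds = starts + [len(doc.page_content)]
    sections = []
    for i, (begin, end) in enumerate(zip(bounds, bounds[1:])):
        text = doc.page_content[begin:end].strip()
        if text:
            # Carry the parent metadata over and record the section index.
            meta = dict(doc.metadata, section_index=i)
            sections.append(Document(page_content=text, metadata=meta))
    return sections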


def extract_metadata(doc: Document) -> dict:
    """
    Extracts relevant metadata from a document.

    Args:
        doc: A LangChain Document object.

    Returns:
        A dictionary of extracted metadata.
    """
    # --- Financial Ministry Adaptation ---
    # TODO: Implement robust metadata extraction logic specifically for
    # government rulings. This should parse the document content or use
    # pre-extracted information to get:
    #   - Date of ruling
    #   - Relevant law or statute references (see the sketch after this function)
    #   - Topic(s) of the ruling
    #   - Case number or identifier
    #   - Source file path (already present in the loader's metadata)
    #   - Any other relevant identifiers or classifications.
    # This metadata is CRITICAL for accurate filtering and retrieval.
    # ------------------------------------
    metadata = doc.metadata.copy()

    # Example: placeholder for parsing a date from content or filename.
    # try:
    #     metadata['ruling_date'] = parse_date_from_doc(doc)
    # except Exception as e:
    #     logger.warning(f"Could not extract date for {metadata.get('source', 'unknown')}: {e}")
    #     metadata['ruling_date'] = None  # Or a default value

    # Example: placeholder for extracting a topic from content.
    # metadata['topic'] = extract_topic_from_doc(doc)
    return metadata
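

# A hedged sketch of two TODO items above: statute references and case
# numbers. Both regexes are assumptions about how the rulings cite laws and
# label cases, and would need tuning against real documents.
def extract_statute_refs(doc: Document) -> dict:
    """Sketch: regex out statute citations and a case identifier."""
    text = doc.page_content
    case = re.search(r"Case\s+No\.?\s*([\w/-]+)", text)
    return {
        "statute_refs": re.findall(r"(?:Law|Statute)\s+No\.?\s*[\d/-]+", text),
        "case_number": case.group(1) if case else None,
    }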


def process_documents(docs: list[Document]) -> list[Document]:
    """
    Processes a list of raw documents by splitting and extracting metadata.

    Args:
        docs: A list of raw LangChain Document objects.

    Returns:
        A list of processed LangChain Document chunks with enriched metadata.
    """
    chunks = split_documents(docs)
    processed_chunks = []
    for chunk in chunks:
        # Extract/enrich metadata for each chunk.
        chunk.metadata = extract_metadata(chunk)
        processed_chunks.append(chunk)
    logger.info(f"Processed {len(chunks)} chunks with metadata.")
    return processed_chunks


# Placeholder metadata helpers, filled in with hedged sketches: the
# YYYY-MM-DD filename pattern and the keyword list are illustrative assumptions.
def parse_date_from_doc(doc: Document) -> date | None:
    """Sketch: pull a YYYY-MM-DD date out of the source file path."""
    match = re.search(r"\d{4}-\d{2}-\d{2}", doc.metadata.get("source", ""))
    return date.fromisoformat(match.group()) if match else None


def extract_topic_from_doc(doc: Document) -> str | None:
    """Sketch: naive keyword scan over the start of the content."""
    head = doc.page_content[:500].lower()
    return next((t for t in ("tax", "customs", "budget") if t in head), None)
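

# A hedged usage sketch with an inline sample document; in the real pipeline
# the documents would come from the loader module instead.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    sample = [
        Document(
            page_content="Section 1\nRuling on tax relief, Case No. 42/2024 ...",
            metadata={"source": "rulings/2024-01-15_case42.pdf"},
        )
    ]
    for chunk in process_documents(sample):
        print(chunk.metadata)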