import json
import logging
import os
import time
from dataclasses import dataclass
from typing import TYPE_CHECKING, Optional

# Don't import DocumentConverter at module level to prevent early Hugging Face
# Hub initialization; it is imported lazily inside the `converter` property.
# from docling.document_converter import DocumentConverter

if TYPE_CHECKING:
    # Project-local type used only in annotations. Guarding the import keeps
    # this module importable even when the `processing` package is absent.
    from processing.sections import SectionExtractor

logger = logging.getLogger(__name__)  # Logger for this module


@dataclass
class DocumentResult:
    """Holds processed results for a document."""

    file_path: str            # Path of the source document that was processed
    structured_markdown: str  # Markdown as exported by Docling, unredacted
    structured_json: dict     # Docling document structure as a plain dict
    redacted_markdown: str    # Markdown after section removal (original if none)
    redacted_json: dict       # JSON after section removal (original if none)


class DocumentProcessor:
    """Handles parsing of documents with Docling and redacting specified sections."""

    def __init__(self, section_extractor: Optional["SectionExtractor"] = None):
        """
        Initialize with an optional SectionExtractor for removing specific sections.

        If None, no redaction will be performed (original structure only).
        The Docling DocumentConverter will be initialized lazily when needed.
        """
        self.section_extractor = section_extractor
        self._converter = None  # Created on first access of `converter`

    @property
    def converter(self):
        """Lazy initialization of DocumentConverter to prevent early Hugging Face Hub initialization."""
        if self._converter is None:
            # Import here to ensure environment variables (cache paths, etc.)
            # are set before docling touches the Hugging Face Hub.
            from docling.document_converter import DocumentConverter
            logger.info("Initializing Docling DocumentConverter...")
            self._converter = DocumentConverter()
            logger.info("Docling DocumentConverter initialized successfully")
        return self._converter

    def process(self, file_path: str) -> DocumentResult:
        """Parse the document and optionally remove specified sections.

        Returns a DocumentResult carrying both the original and the redacted
        markdown/JSON representations.
        """
        logger.info(f"Starting processing for file: {file_path}")
        start_time = time.time()

        # Ensure cache directories exist before docling downloads any models.
        self._ensure_cache_directories()

        # Convert the document using Docling.
        conv_result = self.converter.convert(file_path)
        elapsed = time.time() - start_time
        logger.info(f"Docling conversion completed in {elapsed:.2f} seconds")

        # Export results from Docling.
        structured_md = conv_result.document.export_to_markdown()
        structured_text = conv_result.document.export_to_text()
        doc_json = conv_result.document.export_to_dict()
        logger.info(f"Extracted document content (text length {len(structured_text)} characters)")

        # Use SectionExtractor to remove target sections if provided.
        if self.section_extractor:
            # JSON-based removal is more reliable than text matching.
            redacted_json = self.section_extractor.remove_sections_from_json(doc_json)
            # Re-render markdown from the redacted JSON structure.
            redacted_md = self._export_redacted_markdown(conv_result.document, redacted_json)
            logger.info("Applied section redaction to remove specified sections")
        else:
            redacted_md = structured_md  # No redaction, use original
            redacted_json = doc_json  # No redaction, use original
            logger.info("No section redaction applied (showing original structure)")

        # Persist outputs to files (JSON and redacted text) for auditing.
        self._persist_outputs(file_path, doc_json, redacted_md, redacted_json)

        result = DocumentResult(
            file_path=file_path,
            structured_markdown=structured_md,
            structured_json=doc_json,
            redacted_markdown=redacted_md,
            redacted_json=redacted_json,
        )
        logger.info(f"Finished processing for file: {file_path}")
        return result

    def _persist_outputs(self, file_path: str, doc_json: dict,
                         redacted_md: str, redacted_json: dict) -> None:
        """Write structured/redacted outputs to a temp directory for auditing.

        Persistence is best-effort: write failures are logged, never raised,
        so they cannot break document processing.
        """
        base_name = os.path.splitext(os.path.basename(file_path))[0]

        # Try the app-local temp dir first; fall back to the system temp dir
        # when the current directory is not writable.
        temp_dir = "temp_files"
        try:
            os.makedirs(temp_dir, exist_ok=True)
        except PermissionError:
            import tempfile
            temp_dir = os.path.join(tempfile.gettempdir(), "docling_temp_files")
            os.makedirs(temp_dir, exist_ok=True)

        json_path = os.path.join(temp_dir, f"{base_name}_structured.json")
        redacted_path = os.path.join(temp_dir, f"{base_name}_redacted.txt")
        redacted_json_path = os.path.join(temp_dir, f"{base_name}_redacted.json")
        try:
            with open(json_path, "w", encoding="utf-8") as jf:
                json.dump(doc_json, jf, ensure_ascii=False, indent=2)
            with open(redacted_path, "w", encoding="utf-8") as tf:
                tf.write(redacted_md)
            with open(redacted_json_path, "w", encoding="utf-8") as jf:
                json.dump(redacted_json, jf, ensure_ascii=False, indent=2)
            logger.info(f"Saved structured JSON to {json_path}, redacted text to {redacted_path}, and redacted JSON to {redacted_json_path}")
        except Exception as e:
            logger.error(f"Error saving outputs to files: {e}")

    def _ensure_cache_directories(self):
        """Ensure all necessary cache directories exist before processing.

        Failures are only warnings: docling may still work if some of these
        frameworks' caches are never touched.
        """
        # (env var, default path) pairs for every ML framework cache docling
        # or its dependencies might use.
        env_defaults = (
            ('HF_HOME', '/tmp/docling_temp/huggingface'),
            ('HF_CACHE_HOME', '/tmp/docling_temp/huggingface_cache'),
            ('HF_HUB_CACHE', '/tmp/docling_temp/huggingface_cache'),
            ('TRANSFORMERS_CACHE', '/tmp/docling_temp/transformers_cache'),
            ('HF_DATASETS_CACHE', '/tmp/docling_temp/datasets_cache'),
            ('DIFFUSERS_CACHE', '/tmp/docling_temp/diffusers_cache'),
            ('ACCELERATE_CACHE', '/tmp/docling_temp/accelerate_cache'),
            ('TORCH_HOME', '/tmp/docling_temp/torch'),
            ('TENSORFLOW_HOME', '/tmp/docling_temp/tensorflow'),
            ('KERAS_HOME', '/tmp/docling_temp/keras'),
        )
        for env_var, default in env_defaults:
            cache_dir = os.environ.get(env_var, default)
            try:
                os.makedirs(cache_dir, exist_ok=True)
                logger.debug(f"Ensured cache directory exists: {cache_dir}")
            except Exception as e:
                logger.warning(f"Could not create cache directory {cache_dir}: {e}")

    def _export_redacted_markdown(self, document, redacted_json: dict) -> str:
        """Export redacted markdown using the redacted JSON structure.

        `document` (the original Docling document) is currently unused but is
        kept in the signature for interface stability with existing callers.
        """
        return self._json_to_markdown(redacted_json)

    def _json_to_markdown(self, json_data: dict) -> str:
        """Convert a Docling JSON document structure back to markdown.

        Only the flat "texts" list is rendered; tables and pictures are not
        reconstructed (matching previous behavior).
        """
        markdown_lines = []
        for text_elem in json_data.get("texts", []):
            text_content = text_elem.get("text", "")
            if not text_content.strip():
                continue  # Skip empty/whitespace-only elements

            label = text_elem.get("label", "")
            if label == "section_header":
                # Section header levels are 1-based; default a missing level
                # to 1 (the old code mapped it to "####", the deepest heading)
                # and clamp to markdown's supported 1-6 heading depths.
                # NOTE(review): 1-based default follows docling's schema —
                # confirm against the docling document model.
                level = text_elem.get("level", 1)
                depth = min(max(int(level), 1), 6)
                markdown_lines.append(f"{'#' * depth} {text_content}")
            elif label == "list_item":
                # Preserve the original list marker when present.
                marker = text_elem.get("marker", "-")
                markdown_lines.append(f"{marker} {text_content}")
            else:
                # Plain text and any unknown labels pass through unchanged.
                markdown_lines.append(text_content)

        # Join without extra spacing to match Docling's formatting.
        return "\n".join(markdown_lines)