import os
import time
import logging
import json
from dataclasses import dataclass
from typing import Optional

# Don't import DocumentConverter at module level to prevent early initialization
# from docling.document_converter import DocumentConverter
from processing.sections import SectionExtractor

# No module-level converter instance: the converter is created lazily in
# DocumentProcessor.converter once the environment is configured.
# _docling_converter = DocumentConverter()

logger = logging.getLogger(__name__)  # Logger for this module

@dataclass
class DocumentResult:
    """Holds processed results for a document."""
    file_path: str
    structured_markdown: str
    structured_json: dict
    redacted_markdown: str
    redacted_json: dict

class DocumentProcessor:
    """Handles parsing of documents with Docling and redacting specified sections."""
    def __init__(self, section_extractor: Optional[SectionExtractor] = None):
        """
        Initialize with an optional SectionExtractor for removing specific sections.
        If None, no redaction will be performed (original structure only).
        The Docling DocumentConverter will be initialized lazily when needed.
        """
        self.section_extractor = section_extractor
        self._converter = None  # Lazy initialization
    
    @property
    def converter(self):
        """Lazy initialization of DocumentConverter to prevent early Hugging Face Hub initialization."""
        if self._converter is None:
            # Import here to ensure environment variables are set first
            from docling.document_converter import DocumentConverter
            logger.info("Initializing Docling DocumentConverter...")
            self._converter = DocumentConverter()
            logger.info("Docling DocumentConverter initialized successfully")
        return self._converter
    
    def process(self, file_path: str) -> DocumentResult:
        """Parse the document and optionally remove specified sections. Returns a DocumentResult."""
        logger.info(f"Starting processing for file: {file_path}")
        start_time = time.time()
        
        # Ensure the cache directories exist before processing
        self._ensure_cache_directories()
        
        # Convert the document using Docling
        conv_result = self.converter.convert(file_path)
        elapsed = time.time() - start_time
        logger.info(f"Docling conversion completed in {elapsed:.2f} seconds")
        
        # Export results from Docling
        structured_md = conv_result.document.export_to_markdown()
        structured_text = conv_result.document.export_to_text()
        doc_json = conv_result.document.export_to_dict()
        logger.info(f"Extracted document content (text length {len(structured_text)} characters)")
        
        # Use SectionExtractor to remove target sections if provided
        if self.section_extractor:
            # Remove the target sections directly from the JSON structure
            redacted_json = self.section_extractor.remove_sections_from_json(doc_json)
            
            # Convert the redacted JSON back to markdown via this module's own
            # JSON-to-markdown conversion (see _json_to_markdown)
            redacted_md = self._export_redacted_markdown(conv_result.document, redacted_json)
            logger.info("Applied section redaction to remove specified sections")
        else:
            redacted_md = structured_md  # No redaction, use original
            redacted_json = doc_json  # No redaction, use original
            logger.info("No section redaction applied (showing original structure)")
        
        # Persist outputs to files (JSON and redacted text) for auditing
        base_name = os.path.splitext(os.path.basename(file_path))[0]
        # Write outputs to a temp directory, preferably the same one the main app uses
        temp_dir = "temp_files"
        try:
            os.makedirs(temp_dir, exist_ok=True)
        except PermissionError:
            # Fallback to system temp directory if we can't create in current directory
            import tempfile
            temp_dir = os.path.join(tempfile.gettempdir(), "docling_temp_files")
            os.makedirs(temp_dir, exist_ok=True)
        
        json_path = os.path.join(temp_dir, f"{base_name}_structured.json")
        redacted_path = os.path.join(temp_dir, f"{base_name}_redacted.txt")
        redacted_json_path = os.path.join(temp_dir, f"{base_name}_redacted.json")
        
        try:
            with open(json_path, "w", encoding="utf-8") as jf:
                json.dump(doc_json, jf, ensure_ascii=False, indent=2)
            with open(redacted_path, "w", encoding="utf-8") as tf:
                tf.write(redacted_md)
            with open(redacted_json_path, "w", encoding="utf-8") as jf:
                json.dump(redacted_json, jf, ensure_ascii=False, indent=2)
            logger.info(f"Saved structured JSON to {json_path}, redacted text to {redacted_path}, and redacted JSON to {redacted_json_path}")
        except Exception as e:
            logger.error(f"Error saving outputs to files: {e}")
        
        # Prepare result object
        result = DocumentResult(
            file_path=file_path,
            structured_markdown=structured_md,
            structured_json=doc_json,
            redacted_markdown=redacted_md,
            redacted_json=redacted_json
        )
        logger.info(f"Finished processing for file: {file_path}")
        return result

    def _ensure_cache_directories(self):
        """Ensure all necessary cache directories exist before processing."""
        cache_dirs = [
            os.environ.get('HF_HOME', '/tmp/docling_temp/huggingface'),
            os.environ.get('HF_CACHE_HOME', '/tmp/docling_temp/huggingface_cache'),
            os.environ.get('HF_HUB_CACHE', '/tmp/docling_temp/huggingface_cache'),
            os.environ.get('TRANSFORMERS_CACHE', '/tmp/docling_temp/transformers_cache'),
            os.environ.get('HF_DATASETS_CACHE', '/tmp/docling_temp/datasets_cache'),
            os.environ.get('DIFFUSERS_CACHE', '/tmp/docling_temp/diffusers_cache'),
            os.environ.get('ACCELERATE_CACHE', '/tmp/docling_temp/accelerate_cache'),
            os.environ.get('TORCH_HOME', '/tmp/docling_temp/torch'),
            os.environ.get('TENSORFLOW_HOME', '/tmp/docling_temp/tensorflow'),
            os.environ.get('KERAS_HOME', '/tmp/docling_temp/keras'),
        ]
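        # Note (assumption): these environment variables are expected to be set
        # by the host application before docling is imported, e.g.:
        #   os.environ.setdefault("HF_HOME", "/tmp/docling_temp/huggingface")
        #   os.environ.setdefault("TORCH_HOME", "/tmp/docling_temp/torch")
        # This helper only creates the directories they point to.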
        
        for cache_dir in cache_dirs:
            try:
                os.makedirs(cache_dir, exist_ok=True)
                logger.debug(f"Ensured cache directory exists: {cache_dir}")
            except Exception as e:
                logger.warning(f"Could not create cache directory {cache_dir}: {e}")

    def _export_redacted_markdown(self, document, redacted_json):
        """Export redacted markdown using the redacted JSON structure."""
        # Simply convert the redacted JSON back to markdown
        return self._json_to_markdown(redacted_json)
    
    def _json_to_markdown(self, json_data: dict) -> str:
        """Convert JSON document structure back to markdown format using Docling's structure."""
        markdown_lines = []
        
        # Get all text elements from the JSON
        texts = json_data.get("texts", [])
        
        for text_elem in texts:
            text_content = text_elem.get("text", "")
            label = text_elem.get("label", "")
            level = text_elem.get("level", 0)
            
            if not text_content.strip():
                continue
                
            # Format based on the label and level (following Docling's structure)
            if label == "section_header":
                # Add appropriate markdown headers
                if level == 1:
                    markdown_lines.append(f"# {text_content}")
                elif level == 2:
                    markdown_lines.append(f"## {text_content}")
                elif level == 3:
                    markdown_lines.append(f"### {text_content}")
                else:
                    markdown_lines.append(f"#### {text_content}")
            elif label == "list_item":
                # Handle list items - preserve the original marker
                marker = text_elem.get("marker", "-")
                markdown_lines.append(f"{marker} {text_content}")
            elif label == "text":
                # Regular text content - preserve as-is
                markdown_lines.append(text_content)
            else:
                # Default to regular text
                markdown_lines.append(text_content)
        
        # Join elements with single newlines between them
        return "\n".join(markdown_lines)