import os
import time
import logging
import json
from dataclasses import dataclass
from typing import Optional
# Don't import DocumentConverter at module level to prevent early initialization
# from docling.document_converter import DocumentConverter
from processing.sections import SectionExtractor
# Remove global converter initialization - will be done lazily
# _docling_converter = DocumentConverter()
logger = logging.getLogger(__name__) # Logger for this module
@dataclass
class DocumentResult:
"""Holds processed results for a document."""
file_path: str
structured_markdown: str
structured_json: dict
redacted_markdown: str
redacted_json: dict
class DocumentProcessor:
"""Handles parsing of documents with Docling and redacting specified sections."""
def __init__(self, section_extractor: Optional[SectionExtractor] = None):
"""
Initialize with an optional SectionExtractor for removing specific sections.
If None, no redaction will be performed (original structure only).
The Docling DocumentConverter will be initialized lazily when needed.
"""
self.section_extractor = section_extractor
self._converter = None # Lazy initialization
@property
def converter(self):
"""Lazy initialization of DocumentConverter to prevent early Hugging Face Hub initialization."""
if self._converter is None:
# Import here to ensure environment variables are set first
from docling.document_converter import DocumentConverter
logger.info("Initializing Docling DocumentConverter...")
self._converter = DocumentConverter()
logger.info("Docling DocumentConverter initialized successfully")
return self._converter
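    # Illustrative first-use pattern (paths are assumptions; see
    # _ensure_cache_directories for the defaults this module expects):
    #   os.environ.setdefault("HF_HOME", "/tmp/docling_temp/huggingface")
    #   processor = DocumentProcessor()
    #   _ = processor.converter  # DocumentConverter is created on this first access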
def process(self, file_path: str) -> DocumentResult:
"""Parse the document and optionally remove specified sections. Returns a DocumentResult."""
logger.info(f"Starting processing for file: {file_path}")
start_time = time.time()
# Ensure environment variables are set before processing
self._ensure_cache_directories()
# Convert the document using Docling
conv_result = self.converter.convert(file_path)
elapsed = time.time() - start_time
logger.info(f"Docling conversion completed in {elapsed:.2f} seconds")
# Export results from Docling
structured_md = conv_result.document.export_to_markdown()
structured_text = conv_result.document.export_to_text()
doc_json = conv_result.document.export_to_dict()
logger.info(f"Extracted document content (text length {len(structured_text)} characters)")
# Use SectionExtractor to remove target sections if provided
if self.section_extractor:
            # Remove target sections directly from the JSON structure (more reliable than text-based redaction)
redacted_json = self.section_extractor.remove_sections_from_json(doc_json)
# Convert the redacted JSON back to markdown using Docling's export method
# Create a modified document structure for proper markdown export
redacted_md = self._export_redacted_markdown(conv_result.document, redacted_json)
logger.info("Applied section redaction to remove specified sections")
else:
redacted_md = structured_md # No redaction, use original
redacted_json = doc_json # No redaction, use original
logger.info("No section redaction applied (showing original structure)")
# Persist outputs to files (JSON and redacted text) for auditing
base_name = os.path.splitext(os.path.basename(file_path))[0]
        # Write outputs to a temp directory, preferring the same one the main app uses
temp_dir = "temp_files"
try:
os.makedirs(temp_dir, exist_ok=True)
except PermissionError:
# Fallback to system temp directory if we can't create in current directory
import tempfile
temp_dir = os.path.join(tempfile.gettempdir(), "docling_temp_files")
os.makedirs(temp_dir, exist_ok=True)
json_path = os.path.join(temp_dir, f"{base_name}_structured.json")
redacted_path = os.path.join(temp_dir, f"{base_name}_redacted.txt")
redacted_json_path = os.path.join(temp_dir, f"{base_name}_redacted.json")
try:
with open(json_path, "w", encoding="utf-8") as jf:
json.dump(doc_json, jf, ensure_ascii=False, indent=2)
with open(redacted_path, "w", encoding="utf-8") as tf:
tf.write(redacted_md)
with open(redacted_json_path, "w", encoding="utf-8") as jf:
json.dump(redacted_json, jf, ensure_ascii=False, indent=2)
logger.info(f"Saved structured JSON to {json_path}, redacted text to {redacted_path}, and redacted JSON to {redacted_json_path}")
except Exception as e:
logger.error(f"Error saving outputs to files: {e}")
# Prepare result object
result = DocumentResult(
file_path=file_path,
structured_markdown=structured_md,
structured_json=doc_json,
redacted_markdown=redacted_md,
redacted_json=redacted_json
)
logger.info(f"Finished processing for file: {file_path}")
return result
def _ensure_cache_directories(self):
"""Ensure all necessary cache directories exist before processing."""
cache_dirs = [
os.environ.get('HF_HOME', '/tmp/docling_temp/huggingface'),
os.environ.get('HF_CACHE_HOME', '/tmp/docling_temp/huggingface_cache'),
os.environ.get('HF_HUB_CACHE', '/tmp/docling_temp/huggingface_cache'),
os.environ.get('TRANSFORMERS_CACHE', '/tmp/docling_temp/transformers_cache'),
os.environ.get('HF_DATASETS_CACHE', '/tmp/docling_temp/datasets_cache'),
os.environ.get('DIFFUSERS_CACHE', '/tmp/docling_temp/diffusers_cache'),
os.environ.get('ACCELERATE_CACHE', '/tmp/docling_temp/accelerate_cache'),
os.environ.get('TORCH_HOME', '/tmp/docling_temp/torch'),
os.environ.get('TENSORFLOW_HOME', '/tmp/docling_temp/tensorflow'),
os.environ.get('KERAS_HOME', '/tmp/docling_temp/keras'),
]
for cache_dir in cache_dirs:
try:
os.makedirs(cache_dir, exist_ok=True)
logger.debug(f"Ensured cache directory exists: {cache_dir}")
except Exception as e:
logger.warning(f"Could not create cache directory {cache_dir}: {e}")
    def _export_redacted_markdown(self, document, redacted_json):
        """Export redacted markdown using the redacted JSON structure.

        The original document is accepted for call-site symmetry but is not
        used; the markdown is rebuilt from the redacted JSON alone.
        """
        return self._json_to_markdown(redacted_json)
def _json_to_markdown(self, json_data: dict) -> str:
"""Convert JSON document structure back to markdown format using Docling's structure."""
markdown_lines = []
# Get all text elements from the JSON
texts = json_data.get("texts", [])
for text_elem in texts:
text_content = text_elem.get("text", "")
label = text_elem.get("label", "")
level = text_elem.get("level", 0)
if not text_content.strip():
continue
# Format based on the label and level (following Docling's structure)
if label == "section_header":
# Add appropriate markdown headers
if level == 1:
markdown_lines.append(f"# {text_content}")
elif level == 2:
markdown_lines.append(f"## {text_content}")
elif level == 3:
markdown_lines.append(f"### {text_content}")
else:
markdown_lines.append(f"#### {text_content}")
elif label == "list_item":
# Handle list items - preserve the original marker
marker = text_elem.get("marker", "-")
markdown_lines.append(f"{marker} {text_content}")
elif label == "text":
# Regular text content - preserve as-is
markdown_lines.append(text_content)
else:
# Default to regular text
markdown_lines.append(text_content)
# Join without extra spacing to match Docling's formatting
return "\n".join(markdown_lines)