# docling/src/processing/document_processor.py
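"""Document processing built on Docling.

Parses an input document into structured Markdown/JSON and, when a
SectionExtractor is provided, also produces redacted Markdown/JSON with the
configured sections removed.
"""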
import os
import time
import logging
import json
from dataclasses import dataclass
from typing import Optional

# Don't import DocumentConverter at module level to prevent early initialization
# from docling.document_converter import DocumentConverter
from processing.sections import SectionExtractor

# Remove global converter initialization - will be done lazily
# _docling_converter = DocumentConverter()

logger = logging.getLogger(__name__)  # Logger for this module
@dataclass
class DocumentResult:
    """Holds processed results for a document."""

    file_path: str
    structured_markdown: str
    structured_json: dict
    redacted_markdown: str
    redacted_json: dict
class DocumentProcessor:
    """Handles parsing of documents with Docling and redacting specified sections."""

    def __init__(self, section_extractor: Optional[SectionExtractor] = None):
        """
        Initialize with an optional SectionExtractor for removing specific sections.
        If None, no redaction will be performed (original structure only).
        The Docling DocumentConverter will be initialized lazily when needed.
        """
        self.section_extractor = section_extractor
        self._converter = None  # Lazy initialization

    @property
    def converter(self):
        """Lazy initialization of DocumentConverter to prevent early Hugging Face Hub initialization."""
        if self._converter is None:
            # Import here to ensure environment variables are set first
            from docling.document_converter import DocumentConverter
            logger.info("Initializing Docling DocumentConverter...")
            self._converter = DocumentConverter()
            logger.info("Docling DocumentConverter initialized successfully")
        return self._converter
    def process(self, file_path: str) -> DocumentResult:
        """Parse the document and optionally remove specified sections. Returns a DocumentResult."""
        logger.info(f"Starting processing for file: {file_path}")
        start_time = time.time()

        # Ensure cache directories (configured via environment variables) exist before processing
        self._ensure_cache_directories()

        # Convert the document using Docling
        conv_result = self.converter.convert(file_path)
        elapsed = time.time() - start_time
        logger.info(f"Docling conversion completed in {elapsed:.2f} seconds")

        # Export results from Docling
        structured_md = conv_result.document.export_to_markdown()
        structured_text = conv_result.document.export_to_text()
        doc_json = conv_result.document.export_to_dict()
        logger.info(f"Extracted document content (text length {len(structured_text)} characters)")

        # Use SectionExtractor to remove target sections if provided
        if self.section_extractor:
            # Use the new JSON-based approach for better section removal
            redacted_json = self.section_extractor.remove_sections_from_json(doc_json)
            # Convert the redacted JSON back to markdown using Docling's export method
            # Create a modified document structure for proper markdown export
            redacted_md = self._export_redacted_markdown(conv_result.document, redacted_json)
            logger.info("Applied section redaction to remove specified sections")
        else:
            redacted_md = structured_md  # No redaction, use original
            redacted_json = doc_json  # No redaction, use original
            logger.info("No section redaction applied (showing original structure)")

        # Persist outputs to files (JSON and redacted text) for auditing
        base_name = os.path.splitext(os.path.basename(file_path))[0]
        # Use temp directory for output files - try to use the same temp dir as the main app
        temp_dir = "temp_files"
        try:
            os.makedirs(temp_dir, exist_ok=True)
        except PermissionError:
            # Fallback to system temp directory if we can't create in current directory
            import tempfile
            temp_dir = os.path.join(tempfile.gettempdir(), "docling_temp_files")
            os.makedirs(temp_dir, exist_ok=True)

        json_path = os.path.join(temp_dir, f"{base_name}_structured.json")
        redacted_path = os.path.join(temp_dir, f"{base_name}_redacted.txt")
        redacted_json_path = os.path.join(temp_dir, f"{base_name}_redacted.json")
        try:
            with open(json_path, "w", encoding="utf-8") as jf:
                json.dump(doc_json, jf, ensure_ascii=False, indent=2)
            with open(redacted_path, "w", encoding="utf-8") as tf:
                tf.write(redacted_md)
            with open(redacted_json_path, "w", encoding="utf-8") as jf:
                json.dump(redacted_json, jf, ensure_ascii=False, indent=2)
            logger.info(
                f"Saved structured JSON to {json_path}, redacted text to {redacted_path}, "
                f"and redacted JSON to {redacted_json_path}"
            )
        except Exception as e:
            logger.error(f"Error saving outputs to files: {e}")

        # Prepare result object
        result = DocumentResult(
            file_path=file_path,
            structured_markdown=structured_md,
            structured_json=doc_json,
            redacted_markdown=redacted_md,
            redacted_json=redacted_json
        )
        logger.info(f"Finished processing for file: {file_path}")
        return result
    def _ensure_cache_directories(self):
        """Ensure all necessary cache directories exist before processing."""
        cache_dirs = [
            os.environ.get('HF_HOME', '/tmp/docling_temp/huggingface'),
            os.environ.get('HF_CACHE_HOME', '/tmp/docling_temp/huggingface_cache'),
            os.environ.get('HF_HUB_CACHE', '/tmp/docling_temp/huggingface_cache'),
            os.environ.get('TRANSFORMERS_CACHE', '/tmp/docling_temp/transformers_cache'),
            os.environ.get('HF_DATASETS_CACHE', '/tmp/docling_temp/datasets_cache'),
            os.environ.get('DIFFUSERS_CACHE', '/tmp/docling_temp/diffusers_cache'),
            os.environ.get('ACCELERATE_CACHE', '/tmp/docling_temp/accelerate_cache'),
            os.environ.get('TORCH_HOME', '/tmp/docling_temp/torch'),
            os.environ.get('TENSORFLOW_HOME', '/tmp/docling_temp/tensorflow'),
            os.environ.get('KERAS_HOME', '/tmp/docling_temp/keras'),
        ]
        for cache_dir in cache_dirs:
            try:
                os.makedirs(cache_dir, exist_ok=True)
                logger.debug(f"Ensured cache directory exists: {cache_dir}")
            except Exception as e:
                logger.warning(f"Could not create cache directory {cache_dir}: {e}")
    def _export_redacted_markdown(self, document, redacted_json):
        """Export redacted markdown using the redacted JSON structure."""
        # Simply convert the redacted JSON back to markdown
        return self._json_to_markdown(redacted_json)
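    # NOTE: _json_to_markdown assumes the shape produced by Docling's
    # export_to_dict(), where each entry in "texts" carries a "text" string plus
    # optional "label", "level", and "marker" fields. An illustrative (not
    # exhaustive) example of the entries this mapping handles:
    #   {"text": "Introduction", "label": "section_header", "level": 1}
    #   {"text": "First bullet", "label": "list_item", "marker": "-"}
    #   {"text": "A body paragraph.", "label": "text"}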
    def _json_to_markdown(self, json_data: dict) -> str:
        """Convert JSON document structure back to markdown format using Docling's structure."""
        markdown_lines = []
        # Get all text elements from the JSON
        texts = json_data.get("texts", [])
        for text_elem in texts:
            text_content = text_elem.get("text", "")
            label = text_elem.get("label", "")
            level = text_elem.get("level", 0)
            if not text_content.strip():
                continue
            # Format based on the label and level (following Docling's structure)
            if label == "section_header":
                # Add appropriate markdown headers
                if level == 1:
                    markdown_lines.append(f"# {text_content}")
                elif level == 2:
                    markdown_lines.append(f"## {text_content}")
                elif level == 3:
                    markdown_lines.append(f"### {text_content}")
                else:
                    markdown_lines.append(f"#### {text_content}")
            elif label == "list_item":
                # Handle list items - preserve the original marker
                marker = text_elem.get("marker", "-")
                markdown_lines.append(f"{marker} {text_content}")
            elif label == "text":
                # Regular text content - preserve as-is
                markdown_lines.append(text_content)
            else:
                # Default to regular text
                markdown_lines.append(text_content)
        # Join without extra spacing to match Docling's formatting
        return "\n".join(markdown_lines)