Spaces:

milwright
/

historical-ocr

Running

App Files Files Community

historical-ocr / letterhead_handler.py

milwright

Fix OCR processing variable scope issue by using explicit module reference for apply_preprocessing_to_file

3dd2ff2 6 months ago

raw

history blame

7.44 kB

	"""
	Specialized handler for letterhead and marginalia documents.
	Enhances OCR quality by providing document-specific prompts for common layouts.
	"""

	import re
	import logging
	from pathlib import Path
	from typing import Union, Dict, Any, Optional, List

	# Configure logging.
	# NOTE: basicConfig() runs at import time and targets the root logger, so merely
	# importing this module sets an INFO level and this timestamped format for the
	# process (the stdlib makes the call a no-op if the root logger already has handlers).
	logging.basicConfig(level=logging.INFO,
	format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
	# Module-level logger used by the detection helpers below.
	logger = logging.getLogger(__name__)

	def is_likely_letterhead(image_path: Union[str, Path], features: Optional[Dict[str, Any]] = None) -> bool:
	    """
	    Detect if an image is likely a letterhead document with marginalia.

	    The check is purely heuristic. The lower-cased path is first matched
	    against a list of filename patterns; if none match and ``features`` is
	    provided, its ``'uppercase_sections'`` entry is consulted. The image
	    file itself is never opened here.

	    Args:
	        image_path: Path to the image file (only its text is inspected)
	        features: Optional dict of image features from preprocessing; only
	            the 'uppercase_sections' key is read, and a missing or None
	            value is treated as 0

	    Returns:
	        bool: True if likely a letterhead document
	    """
	    # Convert to a lower-cased string so the patterns match case-insensitively
	    path_str = str(image_path).lower()

	    # Common letterhead filename patterns (all limited to .jpg files).
	    # '[^/]*' keeps a match inside one path component, while '.*' may span
	    # directory separators (e.g. 'baldwin/page1.jpg').
	    letterhead_patterns = [
	        r'letter(head)?[^/]*\.jpg',
	        r'hotel[^/]*\.jpg',
	        r'baldwin.*\.jpg',
	        # Was 'business.letter.\.jpg', which allowed exactly one character
	        # between "letter" and ".jpg"; '.*' matches the sibling patterns.
	        r'business.letter.*\.jpg',
	        r'correspondence.*\.jpg',
	    ]

	    if any(re.search(pattern, path_str) for pattern in letterhead_patterns):
	        logger.info("Detected likely letterhead document: %s", Path(image_path).name)
	        return True

	    # If features are provided, use them for additional detection
	    if features:
	        # More than one ALL CAPS section is taken as a sign of marginalia.
	        # 'or 0' guards against the key being present with a None value.
	        uppercase_sections = features.get('uppercase_sections') or 0
	        if uppercase_sections > 1:
	            logger.info(
	                "Detected likely letterhead document with marginalia by features: %s",
	                Path(image_path).name,
	            )
	            return True

	    return False

	def get_letterhead_prompt(image_path: Union[str, Path], features: Optional[Dict[str, Any]] = None) -> Optional[str]:
	    """
	    Build the specialized OCR prompt for a letterhead document.

	    Args:
	        image_path: Path to the image file
	        features: Optional dict of image features from preprocessing

	    Returns:
	        str: The prompt text to use, or None when is_likely_letterhead()
	        does not classify the image as a letterhead
	    """
	    # Only letterhead-like images get a custom prompt
	    if is_likely_letterhead(image_path, features):
	        # Default prompt for any letterhead
	        generic_prompt = """
	This appears to be a letterhead document. Please extract the text with the following guidelines:

	1. Identify the header/letterhead section with company name, logo, address, etc.
	2. Identify any margin text or notes that appear separate from the main content
	3. Extract the main letter/document body separately
	4. Format the output as follows:
	- LETTERHEAD: [letterhead text]
	- MARGIN_NOTES: [any text in margins]
	- BODY: [main document body]

	Be careful not to duplicate content between sections.
	"""

	        # More detailed prompt for the known-problematic "baldwin" documents
	        baldwin_prompt = """
	This image shows a hotel letterhead with a handwritten letter. Please extract the text with the following guidelines:

	1. Identify and separate the letterhead elements:
	- Header: The hotel name, address, and contact information at the top
	- Marginalia: The amenities description in ALL CAPS along the margins

	2. Extract the main handwritten letter content separately

	3. Note any image captions separately

	4. Format the output as follows:
	- HEADER: [header text]
	- MARGINS: [marginalia text]
	- LETTER: [handwritten letter text]
	- CAPTIONS: [any image captions]

	Be careful not to duplicate content between sections, especially with margin text.
	"""

	        # The path decides which of the two prompts applies
	        lowered_path = str(image_path).lower()
	        return baldwin_prompt if "baldwin" in lowered_path else generic_prompt

	    return None

	def clean_letterhead_ocr_output(text: str) -> str:
	    """
	    Clean OCR output from letterhead documents by regrouping it under
	    standard section headings.

	    A line containing one of the known markers (HEADER:, LETTERHEAD:,
	    MARGINS:, MARGIN_NOTES:, LETTER:, BODY:, CAPTIONS:) starts a new section;
	    text before the first marker is collected as "UNKNOWN". Sections are
	    emitted in a fixed order: letterhead, margin notes, document body,
	    image captions, additional content. When both names of a pair are
	    present (LETTERHEAD/HEADER, MARGIN_NOTES/MARGINS, BODY/LETTER) only the
	    first-listed one is emitted.

	    A marker that occurs more than once has its pieces joined in order of
	    appearance rather than the last one replacing the others, and sections
	    that contain only whitespace are omitted.

	    Args:
	        text: OCR text from letterhead document

	    Returns:
	        str: "" for empty input, the input unchanged when it contains no
	        section marker, otherwise the sections joined by blank lines, each
	        introduced by a "--- <HEADING> ---" line
	    """
	    if not text:
	        return ""

	    # Section markers requested by the specialized prompts, in match order
	    section_markers = (
	        "HEADER:", "LETTERHEAD:", "MARGINS:", "MARGIN_NOTES:",
	        "LETTER:", "BODY:", "CAPTIONS:",
	    )

	    # Without any marker there is nothing to restructure
	    if not any(marker in text for marker in section_markers):
	        return text

	    # Pass 1: cut the text into (section name, lines) segments in reading order.
	    segments = [("UNKNOWN", [])]
	    for line in text.split('\n'):
	        marker = next((m for m in section_markers if m in line), None)
	        if marker is None:
	            segments[-1][1].append(line)
	            continue
	        # Keep only what follows the marker on this line; anything before it
	        # (typically a list bullet such as "- ") is discarded.
	        remainder = line.split(marker, 1)[1].strip()
	        segments.append((marker.rstrip(':'), [remainder] if remainder else []))

	    # Pass 2: merge segments by name. Repeated markers are appended instead
	    # of overwriting earlier content, and empty segments are skipped so they
	    # neither print a bare heading nor shadow a populated fallback section.
	    sections: Dict[str, str] = {}
	    for name, lines in segments:
	        body = '\n'.join(lines).strip()
	        if not body:
	            continue
	        sections[name] = f"{sections[name]}\n{body}" if name in sections else body

	    # Output order: (heading, section names in order of preference)
	    layout = (
	        ("LETTERHEAD", ("LETTERHEAD", "HEADER")),
	        ("MARGIN NOTES", ("MARGIN_NOTES", "MARGINS")),
	        ("DOCUMENT BODY", ("BODY", "LETTER")),
	        ("IMAGE CAPTIONS", ("CAPTIONS",)),
	        ("ADDITIONAL CONTENT", ("UNKNOWN",)),
	    )

	    formatted_sections: List[str] = []
	    for heading, candidates in layout:
	        chosen = next((name for name in candidates if name in sections), None)
	        if chosen is not None:
	            formatted_sections.append(f"--- {heading} ---\n{sections[chosen]}")

	    # Join everything with clear separation
	    return "\n\n".join(formatted_sections)