# historical-ocr / letterhead_handler.py
# Last commit 3dd2ff2 (milwright): Fix OCR processing variable scope issue by
# using explicit module reference for apply_preprocessing_to_file
"""
Specialized handler for letterhead and marginalia documents.
Enhances OCR quality by providing document-specific prompts for common layouts.
"""
import re
import logging
from pathlib import Path
from typing import Union, Dict, Any, Optional, List
# Configure logging.
# NOTE: this runs at import time and configures the ROOT logger (INFO level,
# "timestamp - logger name - level - message" records). logging.basicConfig is
# a no-op when the root logger already has handlers, so an application that
# set up logging before importing this module keeps its own configuration.
logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Module-level logger used by the detection helpers below.
logger = logging.getLogger(__name__)
def is_likely_letterhead(image_path: Union[str, Path], features: Optional[Dict[str, Any]] = None) -> bool:
    """
    Decide whether an image probably shows a letterhead document with marginalia.

    The lower-cased path string is tested against a fixed list of filename
    patterns first; only when none matches are the optional preprocessing
    features consulted.

    Args:
        image_path: Location of the image file (string or Path)
        features: Optional dict of image features from preprocessing; only the
            'uppercase_sections' entry is read

    Returns:
        bool: True when the path matches one of the letterhead patterns, or
        when features report more than one uppercase section; False otherwise
    """
    lowered_path = str(image_path).lower()

    # Filename patterns hinting at letterhead scans (each requires a ".jpg" substring)
    filename_patterns = (
        r'letter(head)?[^/]*\.jpg',
        r'hotel[^/]*\.jpg',
        r'baldwin.*\.jpg',
        r'business.*letter.*\.jpg',
        r'correspondence.*\.jpg',
    )
    matched_by_name = any(re.search(regex, lowered_path) for regex in filename_patterns)
    if matched_by_name:
        logger.info(f"Detected likely letterhead document: {Path(image_path).name}")
        return True

    # Fall back to image features: several ALL CAPS sections may be marginalia
    if features and features.get('uppercase_sections', 0) > 1:
        logger.info(f"Detected likely letterhead document with marginalia by features: {Path(image_path).name}")
        return True

    return False
def get_letterhead_prompt(image_path: Union[str, Path], features: Optional[Dict[str, Any]] = None) -> Optional[str]:
    """
    Build the OCR prompt to use for a letterhead document.

    Args:
        image_path: Location of the image file (string or Path)
        features: Optional dict of image features from preprocessing, passed
            through to is_likely_letterhead

    Returns:
        str: The "baldwin" hotel-letterhead prompt when the lower-cased path
        contains "baldwin", otherwise the general letterhead prompt.
        None when is_likely_letterhead rejects the image.
    """
    # Prompt for the known problematic "baldwin" hotel letterhead documents
    baldwin_prompt = """
This image shows a hotel letterhead with a handwritten letter. Please extract the text with the following guidelines:
1. Identify and separate the letterhead elements:
- Header: The hotel name, address, and contact information at the top
- Marginalia: The amenities description in ALL CAPS along the margins
2. Extract the main handwritten letter content separately
3. Note any image captions separately
4. Format the output as follows:
- HEADER: [header text]
- MARGINS: [marginalia text]
- LETTER: [handwritten letter text]
- CAPTIONS: [any image captions]
Be careful not to duplicate content between sections, especially with margin text.
"""
    # Prompt for every other letterhead document
    general_prompt = """
This appears to be a letterhead document. Please extract the text with the following guidelines:
1. Identify the header/letterhead section with company name, logo, address, etc.
2. Identify any margin text or notes that appear separate from the main content
3. Extract the main letter/document body separately
4. Format the output as follows:
- LETTERHEAD: [letterhead text]
- MARGIN_NOTES: [any text in margins]
- BODY: [main document body]
Be careful not to duplicate content between sections.
"""
    if is_likely_letterhead(image_path, features):
        lowered_path = str(image_path).lower()
        return baldwin_prompt if "baldwin" in lowered_path else general_prompt

    # Not a letterhead document: no specialized prompt applies
    return None
# Maps every marker the specialized prompts may produce to one canonical
# section, so aliases (HEADER/LETTERHEAD, MARGINS/MARGIN_NOTES, LETTER/BODY)
# are merged instead of one silently replacing the other.
_SECTION_ALIASES = {
    "HEADER": "LETTERHEAD",
    "LETTERHEAD": "LETTERHEAD",
    "MARGINS": "MARGIN_NOTES",
    "MARGIN_NOTES": "MARGIN_NOTES",
    "LETTER": "BODY",
    "BODY": "BODY",
    "CAPTIONS": "CAPTIONS",
}

# Output order and the heading printed for each canonical section.
_SECTION_TITLES = (
    ("LETTERHEAD", "LETTERHEAD"),
    ("MARGIN_NOTES", "MARGIN NOTES"),
    ("BODY", "DOCUMENT BODY"),
    ("CAPTIONS", "IMAGE CAPTIONS"),
    ("UNKNOWN", "ADDITIONAL CONTENT"),
)

# A marker is one of the known names followed by ':' and NOT preceded by a word
# character, so ordinary words such as "EVERYBODY:" are not mistaken for "BODY:".
_SECTION_MARKER_RE = re.compile(
    r"(?<![A-Za-z0-9_])"
    r"(LETTERHEAD|MARGIN_NOTES|HEADER|MARGINS|LETTER|BODY|CAPTIONS):"
)


def clean_letterhead_ocr_output(text: str) -> str:
    """
    Clean OCR output from letterhead documents by normalizing section markers.

    Lines are assigned to the section named by the most recent marker
    (HEADER:, LETTERHEAD:, MARGINS:, MARGIN_NOTES:, LETTER:, BODY:, CAPTIONS:).
    Lines before the first marker go to an "additional content" section.
    On a marker line, only the text after the marker is kept; anything before
    it (e.g. a list bullet) is discarded. When a line holds several markers,
    the leftmost one wins.

    Content from repeated markers and from alias markers is concatenated in
    order of appearance rather than overwritten, and sections that end up
    empty are omitted from the output.

    Args:
        text: OCR text from letterhead document

    Returns:
        str: Sections in the fixed order LETTERHEAD, MARGIN NOTES,
        DOCUMENT BODY, IMAGE CAPTIONS, ADDITIONAL CONTENT, each introduced by
        a "--- TITLE ---" line and separated by blank lines. The original text
        is returned unchanged when it contains no marker, and "" is returned
        for empty input.
    """
    if not text:
        return ""

    # Without any section marker there is nothing to restructure
    if not _SECTION_MARKER_RE.search(text):
        return text

    section_lines = {}
    current_section = "UNKNOWN"
    for line in text.split('\n'):
        marker = _SECTION_MARKER_RE.search(line)
        if marker:
            current_section = _SECTION_ALIASES[marker.group(1)]
            # Keep any text after the marker on this line
            line = line[marker.end():].strip()
            if not line:
                continue
        # setdefault + append accumulates content when a section recurs
        section_lines.setdefault(current_section, []).append(line)

    formatted_sections = []
    for section, title in _SECTION_TITLES:
        content = '\n'.join(section_lines.get(section, ())).strip()
        # Skip sections with no content so no bare headers are emitted
        if content:
            formatted_sections.append(f"--- {title} ---\n{content}")

    # Join everything with clear separation
    return "\n\n".join(formatted_sections)