"""
Specialized handler for letterhead and marginalia documents.
Enhances OCR quality by providing document-specific prompts for common layouts.
"""
import logging
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
# Configure logging.
# NOTE: this runs at import time. basicConfig() is a no-op when the root
# logger already has handlers, so an application that configures logging
# before importing this module keeps its own setup.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)
# Filename patterns that suggest a letterhead scan. They are searched against
# the lower-cased path string, so matching is case-insensitive, and every
# pattern requires a ".jpg" suffix somewhere after the keyword.
_LETTERHEAD_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r'letter(head)?[^/]*\.jpg',
        r'hotel[^/]*\.jpg',
        r'baldwin.*\.jpg',
        r'business.*letter.*\.jpg',
        r'correspondence.*\.jpg',
    )
)


def is_likely_letterhead(image_path: Union[str, Path], features: Optional[Dict[str, Any]] = None) -> bool:
    """
    Detect if an image is likely a letterhead document with marginalia.

    Detection first tries the path/filename patterns; if none match and
    image features were supplied, it falls back to the feature heuristic.

    Args:
        image_path: Path to the image file
        features: Optional dict of image features from preprocessing. Only
            the 'uppercase_sections' entry is consulted.

    Returns:
        bool: True if likely a letterhead document
    """
    # Lower-case the whole path so the patterns match regardless of case.
    path_str = str(image_path).lower()

    if any(pattern.search(path_str) for pattern in _LETTERHEAD_PATTERNS):
        logger.info("Detected likely letterhead document: %s", Path(image_path).name)
        return True

    # If features are provided, use them for additional detection
    if features:
        # More than one ALL CAPS section is taken as a sign of marginalia.
        # `or 0` treats a key that is present but None like a missing key,
        # instead of raising TypeError on the comparison.
        uppercase_sections = features.get('uppercase_sections') or 0
        if uppercase_sections > 1:
            logger.info(
                "Detected likely letterhead document with marginalia by features: %s",
                Path(image_path).name,
            )
            return True

    return False
# Prompt used when the path contains "baldwin": a hotel letterhead carrying a
# handwritten letter, with ALL CAPS marginalia and possible image captions.
_BALDWIN_LETTERHEAD_PROMPT = """
This image shows a hotel letterhead with a handwritten letter. Please extract the text with the following guidelines:
1. Identify and separate the letterhead elements:
- Header: The hotel name, address, and contact information at the top
- Marginalia: The amenities description in ALL CAPS along the margins
2. Extract the main handwritten letter content separately
3. Note any image captions separately
4. Format the output as follows:
- HEADER: [header text]
- MARGINS: [marginalia text]
- LETTER: [handwritten letter text]
- CAPTIONS: [any image captions]
Be careful not to duplicate content between sections, especially with margin text.
"""

# Fallback prompt for any other document detected as a letterhead.
_GENERAL_LETTERHEAD_PROMPT = """
This appears to be a letterhead document. Please extract the text with the following guidelines:
1. Identify the header/letterhead section with company name, logo, address, etc.
2. Identify any margin text or notes that appear separate from the main content
3. Extract the main letter/document body separately
4. Format the output as follows:
- LETTERHEAD: [letterhead text]
- MARGIN_NOTES: [any text in margins]
- BODY: [main document body]
Be careful not to duplicate content between sections.
"""


def get_letterhead_prompt(image_path: Union[str, Path], features: Optional[Dict[str, Any]] = None) -> Optional[str]:
    """
    Generate a specialized prompt for letterhead documents to improve OCR quality.

    Args:
        image_path: Path to the image file
        features: Optional dict of image features from preprocessing,
            forwarded unchanged to is_likely_letterhead()

    Returns:
        str: Custom prompt for letterhead OCR, or None if the image is not
        detected as a letterhead document
    """
    if not is_likely_letterhead(image_path, features):
        return None

    # Path-specific customization for known problematic documents: any path
    # containing "baldwin" (case-insensitive) gets the most specialized prompt.
    if "baldwin" in str(image_path).lower():
        return _BALDWIN_LETTERHEAD_PROMPT

    # General letterhead prompt
    return _GENERAL_LETTERHEAD_PROMPT
def _store_section_chunk(sections: Dict[str, List[str]], name: str, lines: List[str]) -> None:
    """Append the stripped text of *lines* to section *name*, skipping empty chunks."""
    chunk = '\n'.join(lines).strip()
    if chunk:
        sections.setdefault(name, []).append(chunk)


def clean_letterhead_ocr_output(text: str) -> str:
    """
    Clean OCR output from letterhead documents by normalizing section markers.

    The markers requested by the letterhead prompts (HEADER:/LETTERHEAD:,
    MARGINS:/MARGIN_NOTES:, LETTER:/BODY:, CAPTIONS:) are located line by
    line. A marker may appear anywhere in a line; text after it on the same
    line is kept, text before it (such as a "- " bullet) is discarded. Text
    preceding the first marker is collected as additional content.

    Alias markers are merged into one section, and a marker that occurs more
    than once contributes every occurrence (in order of appearance) rather
    than only the last one. Sections that contain nothing but whitespace are
    omitted from the result.

    Args:
        text: OCR text from letterhead document

    Returns:
        str: Text regrouped under "--- TITLE ---" headings in a fixed order
        (letterhead, margin notes, body, captions, additional content), or
        the original text unchanged when it contains no section marker.
        An empty or None input yields "".
    """
    if not text:
        return ""

    # Marker -> canonical section name. Insertion order is the order in which
    # markers are tested against each line, so it is kept stable on purpose.
    marker_sections = {
        "HEADER:": "LETTERHEAD",
        "LETTERHEAD:": "LETTERHEAD",
        "MARGINS:": "MARGIN_NOTES",
        "MARGIN_NOTES:": "MARGIN_NOTES",
        "LETTER:": "BODY",
        "BODY:": "BODY",
        "CAPTIONS:": "CAPTIONS",
    }

    # If no section markers were found, return the original text
    if not any(marker in text for marker in marker_sections):
        return text

    sections: Dict[str, List[str]] = {}
    current_section = "UNKNOWN"
    current_lines: List[str] = []

    for line in text.split('\n'):
        marker = next((m for m in marker_sections if m in line), None)
        if marker is None:
            current_lines.append(line)
            continue

        # A marker closes the section collected so far and opens a new one.
        _store_section_chunk(sections, current_section, current_lines)
        current_section = marker_sections[marker]
        current_lines = []

        # Keep any text after the marker on this line
        remainder = line.split(marker, 1)[1].strip()
        if remainder:
            current_lines.append(remainder)

    # Save the last section
    _store_section_chunk(sections, current_section, current_lines)

    # Standard output order with clear section headers.
    output_layout = (
        ("LETTERHEAD", "--- LETTERHEAD ---"),
        ("MARGIN_NOTES", "--- MARGIN NOTES ---"),
        ("BODY", "--- DOCUMENT BODY ---"),
        ("CAPTIONS", "--- IMAGE CAPTIONS ---"),
        ("UNKNOWN", "--- ADDITIONAL CONTENT ---"),
    )
    return "\n\n".join(
        title + "\n" + "\n".join(sections[name])
        for name, title in output_layout
        if name in sections
    )