Spaces:
Running
Running
| """ | |
| Specialized handler for letterhead and marginalia documents. | |
| Enhances OCR quality by providing document-specific prompts for common layouts. | |
| """ | |
| import re | |
| import logging | |
| from pathlib import Path | |
| from typing import Union, Dict, Any, Optional, List | |
# Configure logging
# NOTE: this runs at import time. Per the standard library documentation,
# basicConfig() only installs a handler when the root logger has none yet,
# so an application that configured logging earlier keeps its own setup.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Module-level logger, named after this module, used by the functions below.
logger = logging.getLogger(__name__)
def is_likely_letterhead(image_path: Union[str, Path], features: Optional[Dict[str, Any]] = None) -> bool:
    """
    Decide whether an image probably shows a letterhead document with marginalia.

    Two signals are consulted, in order: the lower-cased path is matched
    against a fixed set of filename patterns, and, when a features dict is
    supplied, its 'uppercase_sections' count is checked.

    Args:
        image_path: Path to the image file
        features: Optional dict of image features from preprocessing
    Returns:
        bool: True if either signal fires, False otherwise
    """
    # Pattern matching is done on the lower-cased string form of the path
    lowered_path = str(image_path).lower()

    # Filename patterns that commonly indicate a letterhead scan
    filename_hints = (
        r'letter(head)?[^/]*\.jpg',
        r'hotel[^/]*\.jpg',
        r'baldwin.*\.jpg',
        r'business.*letter.*\.jpg',
        r'correspondence.*\.jpg',
    )
    if any(re.search(hint, lowered_path) for hint in filename_hints):
        logger.info(f"Detected likely letterhead document: {Path(image_path).name}")
        return True

    # Without (non-empty) features there is nothing further to inspect
    if not features:
        return False

    # More than one ALL CAPS section is taken as a sign of marginalia
    if features.get('uppercase_sections', 0) > 1:
        logger.info(f"Detected likely letterhead document with marginalia by features: {Path(image_path).name}")
        return True

    return False
def get_letterhead_prompt(image_path: Union[str, Path], features: Optional[Dict[str, Any]] = None) -> Optional[str]:
    """
    Build the OCR prompt to use for a letterhead document.

    Args:
        image_path: Path to the image file
        features: Optional dict of image features from preprocessing
    Returns:
        str: The "baldwin" prompt when the lower-cased path contains
        "baldwin", the general letterhead prompt for any other detected
        letterhead, or None when is_likely_letterhead() rejects the image
    """
    # Nothing to customise unless the image looks like a letterhead
    if not is_likely_letterhead(image_path, features):
        return None

    # Most specialized prompt, reserved for the baldwin documents
    baldwin_prompt = """
This image shows a hotel letterhead with a handwritten letter. Please extract the text with the following guidelines:
1. Identify and separate the letterhead elements:
- Header: The hotel name, address, and contact information at the top
- Marginalia: The amenities description in ALL CAPS along the margins
2. Extract the main handwritten letter content separately
3. Note any image captions separately
4. Format the output as follows:
- HEADER: [header text]
- MARGINS: [marginalia text]
- LETTER: [handwritten letter text]
- CAPTIONS: [any image captions]
Be careful not to duplicate content between sections, especially with margin text.
"""

    # Fallback prompt for every other letterhead
    general_prompt = """
This appears to be a letterhead document. Please extract the text with the following guidelines:
1. Identify the header/letterhead section with company name, logo, address, etc.
2. Identify any margin text or notes that appear separate from the main content
3. Extract the main letter/document body separately
4. Format the output as follows:
- LETTERHEAD: [letterhead text]
- MARGIN_NOTES: [any text in margins]
- BODY: [main document body]
Be careful not to duplicate content between sections.
"""

    # Path-specific customisation for known problematic documents
    lowered_path = str(image_path).lower()
    return baldwin_prompt if "baldwin" in lowered_path else general_prompt
def clean_letterhead_ocr_output(text: str) -> str:
    """
    Regroup OCR output from a letterhead document into labelled sections.

    The text is scanned line by line for the section markers requested by the
    letterhead prompts (HEADER:, LETTERHEAD:, MARGINS:, MARGIN_NOTES:,
    LETTER:, BODY:, CAPTIONS:). Lines following a marker belong to that
    section until the next marker; text before the first marker is kept as
    additional content. Alias markers are merged into one section
    (HEADER/LETTERHEAD, MARGINS/MARGIN_NOTES, LETTER/BODY) and a marker that
    occurs several times contributes all of its content, in order of
    appearance, so no recognised text is discarded. Sections without any
    non-whitespace content are omitted.

    Args:
        text: OCR text from letterhead document
    Returns:
        str: "" for empty input; the input unchanged when it contains no
        section marker; otherwise the sections in the fixed order
        LETTERHEAD, MARGIN NOTES, DOCUMENT BODY, IMAGE CAPTIONS,
        ADDITIONAL CONTENT, each introduced by a "--- TITLE ---" line and
        separated by blank lines
    """
    if not text:
        return ""

    # Marker -> canonical section. The order matters: when a line contains
    # several markers, the first one listed here decides the section.
    marker_sections = (
        ("HEADER:", "LETTERHEAD"),
        ("LETTERHEAD:", "LETTERHEAD"),
        ("MARGINS:", "MARGIN_NOTES"),
        ("MARGIN_NOTES:", "MARGIN_NOTES"),
        ("LETTER:", "BODY"),
        ("BODY:", "BODY"),
        ("CAPTIONS:", "CAPTIONS"),
    )

    # If no section markers were found, return the original text
    if not any(marker in text for marker, _ in marker_sections):
        return text

    # Canonical section -> heading, in output order
    output_order = (
        ("LETTERHEAD", "LETTERHEAD"),
        ("MARGIN_NOTES", "MARGIN NOTES"),
        ("BODY", "DOCUMENT BODY"),
        ("CAPTIONS", "IMAGE CAPTIONS"),
        ("UNKNOWN", "ADDITIONAL CONTENT"),
    )

    # Split the text into consecutive (section, lines) segments. Everything
    # before the first marker lands in the initial UNKNOWN segment.
    segments = [("UNKNOWN", [])]
    for line in text.split('\n'):
        hit = next((pair for pair in marker_sections if pair[0] in line), None)
        if hit is None:
            segments[-1][1].append(line)
            continue
        marker, section = hit
        # Keep any text after the marker on this line; text before the
        # marker (typically a list bullet such as "- ") is dropped.
        remainder = line.split(marker, 1)[1].strip()
        segments.append((section, [remainder] if remainder else []))

    # Merge segments per canonical section instead of overwriting, so that
    # repeated markers and alias markers all keep their content.
    merged = {}
    for section, lines in segments:
        content = '\n'.join(lines).strip()
        if content:
            merged.setdefault(section, []).append(content)

    # Join everything with clear separation
    return "\n\n".join(
        f"--- {title} ---\n" + '\n'.join(merged[section])
        for section, title in output_order
        if section in merged
    )