milwright committed
Commit 3dd2ff2 · 1 Parent(s): 73375a3

Fix OCR processing variable scope issue by using explicit module reference for apply_preprocessing_to_file

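The commit message only says "variable scope issue", so the exact failure mode is not recorded here; a typical cause is the from-imported name being shadowed or rebound inside `process_file`. A minimal before/after sketch of the pattern this commit applies (argument names are placeholders, not the real call site):

# Before: a from-import binds a bare module-level name that local
# statements inside the function can shadow, raising NameError or
# UnboundLocalError when the call is reached.
from preprocessing import apply_preprocessing_to_file
temp_path, applied = apply_preprocessing_to_file(file_bytes, file_ext, options, temp_paths)

# After: resolve the function through the module object on every call,
# which is immune to that kind of local-name shadowing.
import preprocessing
temp_path, applied = preprocessing.apply_preprocessing_to_file(file_bytes, file_ext, options, temp_paths)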
image_segmentation.py CHANGED
@@ -1,7 +1,7 @@
  """
  Image segmentation utility for OCR preprocessing.
  Separates text regions from image regions to improve OCR accuracy on mixed-content documents.
- Based on Mistral AI cookbook examples.
+ Uses content-aware adaptive segmentation for improved results across document types.
  """

  import cv2
@@ -18,33 +18,10 @@ logging.basicConfig(level=logging.INFO,
                      format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  logger = logging.getLogger(__name__)

- def determine_segmentation_approach(image_path: Union[str, Path]) -> str:
-     """
-     Determine which segmentation approach to use based on the document type.
-
-     Args:
-         image_path: Path to the image file
-
-     Returns:
-         str: Segmentation approach to use ('simplified' or 'original')
-     """
-     # Convert to string for easier pattern matching
-     filename = str(image_path).lower()
-
-     # Document-specific rules based on testing results
-     if "baldwin" in filename and "north" in filename:
-         # Baldwin documents showed better results with original approach
-         return "original"
-
-     # Default to our simplified approach for most documents
-     return "simplified"
-
  def segment_image_for_ocr(image_path: Union[str, Path], vision_enabled: bool = True, preserve_content: bool = True) -> Dict[str, Union[Image.Image, str]]:
      """
-     Prepare image for OCR processing using the most appropriate segmentation approach.
-     For most documents, this uses a minimal approach that trusts Mistral OCR
-     to handle document understanding and layout analysis. For specific document types
-     that benefit from custom segmentation, a document-specific approach is used.
+     Prepare image for OCR processing using content-aware segmentation.
+     Uses adaptive region detection based on text density analysis.

      Args:
          image_path: Path to the image file
@@ -57,11 +34,8 @@ def segment_image_for_ocr(image_path: Union[str, Path], vision_enabled: bool = T
      # Convert to Path object if string
      image_file = Path(image_path) if isinstance(image_path, str) else image_path

-     # Determine the segmentation approach to use
-     approach = determine_segmentation_approach(image_file)
-
      # Log start of processing
-     logger.info(f"Preparing image for Mistral OCR: {image_file.name} (using {approach} approach)")
+     logger.info(f"Preparing image for Mistral OCR: {image_file.name}")

      try:
          # Open original image with PIL
@@ -88,80 +62,29 @@ def segment_image_for_ocr(image_path: Union[str, Path], vision_enabled: bool = T
          img_np = np.array(pil_img)
          img_width, img_height = pil_img.size

-         # Apply the appropriate segmentation approach based on the document type
-         if approach == "simplified":
-             # SIMPLIFIED APPROACH for most documents:
-             # Let Mistral OCR handle the entire document understanding process
-
-             # For visualization, mark the entire image as a text region
-             full_image_region = [(0, 0, img_width, img_height)]
-
-             # Create visualization with a simple border
-             vis_img = img_np.copy()
-             cv2.rectangle(vis_img, (5, 5), (img_width-5, img_height-5), (0, 255, 0), 5)
-
-             # Add text to indicate this is using Mistral's native processing
-             font = cv2.FONT_HERSHEY_SIMPLEX
-             cv2.putText(vis_img, "Processed by Mistral OCR", (30, 60), font, 1, (0, 255, 0), 2)
-
-             # Create visualizations and masks
-             text_regions_vis = Image.fromarray(vis_img)
-             image_regions_vis = text_regions_vis.copy()
-
-             # Create a mask of the entire image (just for visualization)
-             text_mask = np.ones((img_height, img_width), dtype=np.uint8) * 255
-             _, buffer = cv2.imencode('.png', text_mask)
-             text_mask_base64 = base64.b64encode(buffer).decode('utf-8')
-
-             # Return the original image as the combined result
-             return {
-                 'text_regions': text_regions_vis,
-                 'image_regions': image_regions_vis,
-                 'text_mask_base64': f"data:image/png;base64,{text_mask_base64}",
-                 'combined_result': pil_img,
-                 'text_regions_coordinates': full_image_region,
-                 'region_images': [{
-                     'image': img_np,
-                     'pil_image': pil_img,
-                     'coordinates': (0, 0, img_width, img_height),
-                     'padded_coordinates': (0, 0, img_width, img_height),
-                     'order': 0
-                 }]
-             }
-
-         else:
-             # DOCUMENT-SPECIFIC APPROACH for baldwin-north and similar documents
-             # Use more structured segmentation with customized region detection
-             # This approach is preferred for documents that showed better results in testing
-
-             # Create a visualization with green borders around the text regions
-             vis_img = img_np.copy()
-
-             # For baldwin-north type documents, create a more granular segmentation
-             # Define regions with more detailed segmentation for better text capture
-             # Use 3 overlapping regions instead of 2 distinct ones
-
-             # Define header, middle, and body sections with overlap
-             header_height = int(img_height * 0.3)  # Top 30% as header (increased from 25%)
-             middle_start = int(img_height * 0.2)   # Start middle section with overlap
-             middle_height = int(img_height * 0.4)  # Middle 40%
-             body_start = int(img_height * 0.5)     # Start body with overlap
-             body_height = img_height - body_start  # Remaining height
-
-             # Define regions with overlap to ensure no text is missed
-             regions = [
-                 (0, 0, img_width, header_height),             # Header region
-                 (0, middle_start, img_width, middle_height),  # Middle region with overlap
-                 (0, body_start, img_width, body_height)       # Body region with overlap
-             ]
+         # Analyze text density to determine if advanced segmentation is needed
+         # This replaces document-specific logic with content-aware analysis
+         from utils.image_utils import estimate_text_density
+         text_density = estimate_text_density(img_np)
+
+         # Use adaptive approach for documents with unusual text distribution
+         if text_density['pattern'] == 'varied' or text_density['uppercase_sections'] > 0:
+             logger.info(f"Using adaptive segmentation for document with varied text density pattern={text_density['pattern']}, uppercase_sections={text_density['uppercase_sections']}")
+
+             # Detect content regions based on text density
+             from utils.text_utils import detect_content_regions
+             regions = detect_content_regions(img_np)
+
+             # Create visualization with green borders around the text regions
+             vis_img = img_np.copy()

              # Draw regions on visualization
              for x, y, w, h in regions:
                  cv2.rectangle(vis_img, (x, y), (x+w, y+h), (0, 255, 0), 3)

-             # Add text to indicate we're using the document-specific approach
+             # Add text to indicate we're using adaptive processing
              font = cv2.FONT_HERSHEY_SIMPLEX
-             cv2.putText(vis_img, "Document-specific processing", (30, 60), font, 1, (0, 255, 0), 2)
+             cv2.putText(vis_img, "Adaptive region processing", (30, 60), font, 1, (0, 255, 0), 2)

              # Create visualization images
              text_regions_vis = Image.fromarray(vis_img)
@@ -190,14 +113,56 @@ def segment_image_for_ocr(image_path: Union[str, Path], vision_enabled: bool = T
                  }
                  region_images.append(region_info)

-             # Return the structured segmentation results
+             # Return the adaptive segmentation results
              return {
                  'text_regions': text_regions_vis,
                  'image_regions': image_regions_vis,
                  'text_mask_base64': f"data:image/png;base64,{text_mask_base64}",
                  'combined_result': pil_img,
                  'text_regions_coordinates': regions,
-                 'region_images': region_images
+                 'region_images': region_images,
+                 'segmentation_type': 'adaptive'
+             }
+         else:
+             # SIMPLIFIED APPROACH for most documents
+             # Let Mistral OCR handle the entire document understanding process
+             logger.info(f"Using standard approach for document with uniform text density")
+
+             # For visualization, mark the entire image as a text region
+             full_image_region = [(0, 0, img_width, img_height)]
+
+             # Create visualization with a simple border
+             vis_img = img_np.copy()
+             cv2.rectangle(vis_img, (5, 5), (img_width-5, img_height-5), (0, 255, 0), 5)
+
+             # Add text to indicate this is using Mistral's native processing
+             font = cv2.FONT_HERSHEY_SIMPLEX
+             cv2.putText(vis_img, "Processed by Mistral OCR", (30, 60), font, 1, (0, 255, 0), 2)
+
+             # Create visualizations and masks
+             text_regions_vis = Image.fromarray(vis_img)
+             image_regions_vis = text_regions_vis.copy()
+
+             # Create a mask of the entire image (just for visualization)
+             text_mask = np.ones((img_height, img_width), dtype=np.uint8) * 255
+             _, buffer = cv2.imencode('.png', text_mask)
+             text_mask_base64 = base64.b64encode(buffer).decode('utf-8')
+
+             # Return the original image as the combined result
+             return {
+                 'text_regions': text_regions_vis,
+                 'image_regions': image_regions_vis,
+                 'text_mask_base64': f"data:image/png;base64,{text_mask_base64}",
+                 'combined_result': pil_img,
+                 'text_regions_coordinates': full_image_region,
+                 'region_images': [{
+                     'image': img_np,
+                     'pil_image': pil_img,
+                     'coordinates': (0, 0, img_width, img_height),
+                     'padded_coordinates': (0, 0, img_width, img_height),
+                     'order': 0
+                 }],
+                 'segmentation_type': 'simplified'
              }

      except Exception as e:
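For context, a minimal sketch of calling the updated function (the input path is hypothetical; the returned keys are those in the diff above):

from image_segmentation import segment_image_for_ocr

results = segment_image_for_ocr("input/sample-document.jpg")   # hypothetical path
print(results['segmentation_type'])                  # 'adaptive' or 'simplified'
print(results['text_regions_coordinates'])           # list of (x, y, w, h) tuples
results['text_regions'].save("regions_preview.png")  # PIL visualization with region borders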
letterhead_handler.py ADDED
@@ -0,0 +1,197 @@
+ """
+ Specialized handler for letterhead and marginalia documents.
+ Enhances OCR quality by providing document-specific prompts for common layouts.
+ """
+
+ import re
+ import logging
+ from pathlib import Path
+ from typing import Union, Dict, Any, Optional, List
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO,
+                     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger(__name__)
+
+ def is_likely_letterhead(image_path: Union[str, Path], features: Optional[Dict[str, Any]] = None) -> bool:
+     """
+     Detect if an image is likely a letterhead document with marginalia.
+     Uses path/filename patterns and optional image features (if provided).
+
+     Args:
+         image_path: Path to the image file
+         features: Optional dict of image features from preprocessing
+
+     Returns:
+         bool: True if likely a letterhead document
+     """
+     # Convert to string path for pattern matching
+     path_str = str(image_path).lower()
+
+     # Check for common letterhead filename patterns
+     letterhead_patterns = [
+         r'letter(head)?[^/]*\.jpg',
+         r'hotel[^/]*\.jpg',
+         r'baldwin.*\.jpg',
+         r'business.*letter.*\.jpg',
+         r'correspondence.*\.jpg'
+     ]
+
+     for pattern in letterhead_patterns:
+         if re.search(pattern, path_str):
+             logger.info(f"Detected likely letterhead document: {Path(image_path).name}")
+             return True
+
+     # If features are provided, use them for additional detection
+     if features:
+         # Check for ALL CAPS sections that might be marginalia
+         if features.get('uppercase_sections', 0) > 1:
+             logger.info(f"Detected likely letterhead document with marginalia by features: {Path(image_path).name}")
+             return True
+
+     return False
+
+ def get_letterhead_prompt(image_path: Union[str, Path], features: Optional[Dict[str, Any]] = None) -> Optional[str]:
+     """
+     Generate a specialized prompt for letterhead documents to improve OCR quality.
+
+     Args:
+         image_path: Path to the image file
+         features: Optional dict of image features from preprocessing
+
+     Returns:
+         str: Custom prompt for letterhead OCR or None if not applicable
+     """
+     if not is_likely_letterhead(image_path, features):
+         return None
+
+     # Path-specific customizations for known problematic documents
+     path_str = str(image_path).lower()
+
+     # Most specialized prompt for baldwin documents
+     if "baldwin" in path_str:
+         return """
+ This image shows a hotel letterhead with a handwritten letter. Please extract the text with the following guidelines:
+
+ 1. Identify and separate the letterhead elements:
+    - Header: The hotel name, address, and contact information at the top
+    - Marginalia: The amenities description in ALL CAPS along the margins
+
+ 2. Extract the main handwritten letter content separately
+
+ 3. Note any image captions separately
+
+ 4. Format the output as follows:
+    - HEADER: [header text]
+    - MARGINS: [marginalia text]
+    - LETTER: [handwritten letter text]
+    - CAPTIONS: [any image captions]
+
+ Be careful not to duplicate content between sections, especially with margin text.
+ """
+
+     # General letterhead prompt
+     return """
+ This appears to be a letterhead document. Please extract the text with the following guidelines:
+
+ 1. Identify the header/letterhead section with company name, logo, address, etc.
+ 2. Identify any margin text or notes that appear separate from the main content
+ 3. Extract the main letter/document body separately
+ 4. Format the output as follows:
+    - LETTERHEAD: [letterhead text]
+    - MARGIN_NOTES: [any text in margins]
+    - BODY: [main document body]
+
+ Be careful not to duplicate content between sections.
+ """
+
+ def clean_letterhead_ocr_output(text: str) -> str:
+     """
+     Clean OCR output from letterhead documents by handling section markers
+     and reducing duplication.
+
+     Args:
+         text: OCR text from letterhead document
+
+     Returns:
+         str: Cleaned text with proper section formatting
+     """
+     if not text:
+         return ""
+
+     # Find any section markers added by the specialized prompt
+     section_markers = [
+         "HEADER:", "LETTERHEAD:", "MARGINS:", "MARGIN_NOTES:",
+         "LETTER:", "BODY:", "CAPTIONS:"
+     ]
+
+     # Check if the text has any section markers
+     has_sections = any(marker in text for marker in section_markers)
+
+     if has_sections:
+         # Split text into sections while preserving section headers
+         sections = {}
+         current_section = "UNKNOWN"
+         current_text = []
+
+         for line in text.split('\n'):
+             # Check if this line is a section marker
+             is_marker = False
+             for marker in section_markers:
+                 if marker in line:
+                     # Save previous section
+                     if current_text:
+                         sections[current_section] = '\n'.join(current_text).strip()
+                         current_text = []
+
+                     # Start new section
+                     current_section = marker.replace(':', '')
+                     # Keep any text after the marker on this line
+                     remainder = line.split(marker, 1)[1].strip()
+                     if remainder:
+                         current_text.append(remainder)
+                     is_marker = True
+                     break
+
+             # If not a marker, add to current section
+             if not is_marker:
+                 current_text.append(line)
+
+         # Save the last section
+         if current_text:
+             sections[current_section] = '\n'.join(current_text).strip()
+
+         # Format with standard order and clear section headers
+         formatted_sections = []
+
+         # First add letterhead/header info
+         if "LETTERHEAD" in sections:
+             formatted_sections.append(f"--- LETTERHEAD ---\n{sections['LETTERHEAD']}")
+         elif "HEADER" in sections:
+             formatted_sections.append(f"--- LETTERHEAD ---\n{sections['HEADER']}")
+
+         # Add margins/notes
+         if "MARGIN_NOTES" in sections:
+             formatted_sections.append(f"--- MARGIN NOTES ---\n{sections['MARGIN_NOTES']}")
+         elif "MARGINS" in sections:
+             formatted_sections.append(f"--- MARGIN NOTES ---\n{sections['MARGINS']}")
+
+         # Add main content
+         if "BODY" in sections:
+             formatted_sections.append(f"--- DOCUMENT BODY ---\n{sections['BODY']}")
+         elif "LETTER" in sections:
+             formatted_sections.append(f"--- DOCUMENT BODY ---\n{sections['LETTER']}")
+
+         # Add captions if present
+         if "CAPTIONS" in sections:
+             formatted_sections.append(f"--- IMAGE CAPTIONS ---\n{sections['CAPTIONS']}")
+
+         # Add unknown sections
+         if "UNKNOWN" in sections and sections["UNKNOWN"]:
+             formatted_sections.append(f"--- ADDITIONAL CONTENT ---\n{sections['UNKNOWN']}")
+
+         # Join everything with clear separation
+         return "\n\n".join(formatted_sections)
+     else:
+         # If no section markers were found, return the original text
+         return text
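A short usage sketch of the new module (path and OCR text are hypothetical):

from letterhead_handler import is_likely_letterhead, get_letterhead_prompt, clean_letterhead_ocr_output

path = "input/baldwin-hotel-letter.jpg"      # hypothetical file
if is_likely_letterhead(path):
    prompt = get_letterhead_prompt(path)     # specialized prompt, or None
    # ... run OCR with `prompt`, then normalize the sectioned output:
    raw = "HEADER: Hotel Baldwin\nLETTER: Dear Sir, ..."
    print(clean_letterhead_ocr_output(raw))  # emits "--- LETTERHEAD ---" / "--- DOCUMENT BODY ---" sections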
ocr_processing.py CHANGED
@@ -21,7 +21,7 @@ from structured_ocr import StructuredOCR
  from utils.image_utils import clean_ocr_result
  # Temporarily retain old utils imports until they are fully migrated
  from utils import generate_cache_key, timing, format_timestamp, create_descriptive_filename, extract_subject_tags
- from preprocessing import apply_preprocessing_to_file
+ import preprocessing
  from error_handler import handle_ocr_error, check_file_size
  from image_segmentation import segment_image_for_ocr, process_segmented_image

@@ -182,6 +182,27 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
      doc_type = preprocessing_options.get("document_type", "standard")
      modified_custom_prompt = custom_prompt

+     # Check for letterhead/marginalia document types with specialized handling
+     try:
+         from letterhead_handler import get_letterhead_prompt, is_likely_letterhead
+         # Extract text density features if available
+         features = None
+         if 'text_density' in preprocessing_options:
+             features = preprocessing_options['text_density']
+
+         # Check if this looks like a letterhead document
+         if is_likely_letterhead(temp_path, features):
+             # Get specialized letterhead prompt
+             letterhead_prompt = get_letterhead_prompt(temp_path, features)
+             if letterhead_prompt:
+                 logger.info(f"Using specialized letterhead prompt for document")
+                 modified_custom_prompt = letterhead_prompt
+                 # Set document type for tracking
+                 preprocessing_options["document_type"] = "letterhead"
+                 doc_type = "letterhead"
+     except ImportError:
+         logger.debug("Letterhead handler not available")
+
      # Add document-type specific instructions based on preprocessing options
      if doc_type == "handwritten" and not modified_custom_prompt:
          modified_custom_prompt = "This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
@@ -214,7 +235,7 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
      progress_reporter.update(20, "Preparing image for processing...")

      # Apply preprocessing if needed
-     temp_path, preprocessing_applied = apply_preprocessing_to_file(
+     temp_path, preprocessing_applied = preprocessing.apply_preprocessing_to_file(
          file_bytes,
          file_ext,
          preprocessing_options,
@@ -367,6 +388,27 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
      doc_type = preprocessing_options.get("document_type", "standard")
      modified_custom_prompt = custom_prompt

+     # Check for letterhead/marginalia document types with specialized handling
+     try:
+         from letterhead_handler import get_letterhead_prompt, is_likely_letterhead
+         # Extract text density features if available
+         features = None
+         if 'text_density' in preprocessing_options:
+             features = preprocessing_options['text_density']
+
+         # Check if this looks like a letterhead document
+         if is_likely_letterhead(temp_path, features):
+             # Get specialized letterhead prompt
+             letterhead_prompt = get_letterhead_prompt(temp_path, features)
+             if letterhead_prompt:
+                 logger.info(f"Using specialized letterhead prompt for document")
+                 modified_custom_prompt = letterhead_prompt
+                 # Set document type for tracking
+                 preprocessing_options["document_type"] = "letterhead"
+                 doc_type = "letterhead"
+     except ImportError:
+         logger.debug("Letterhead handler not available")
+
      # Add document-type specific instructions based on preprocessing options
      if doc_type == "handwritten" and not modified_custom_prompt:
          modified_custom_prompt = "This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
@@ -409,6 +451,27 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
      doc_type = preprocessing_options.get("document_type", "standard")
      modified_custom_prompt = custom_prompt

+     # Check for letterhead/marginalia document types with specialized handling
+     try:
+         from letterhead_handler import get_letterhead_prompt, is_likely_letterhead
+         # Extract text density features if available
+         features = None
+         if 'text_density' in preprocessing_options:
+             features = preprocessing_options['text_density']
+
+         # Check if this looks like a letterhead document
+         if is_likely_letterhead(temp_path, features):
+             # Get specialized letterhead prompt
+             letterhead_prompt = get_letterhead_prompt(temp_path, features)
+             if letterhead_prompt:
+                 logger.info(f"Using specialized letterhead prompt for document")
+                 modified_custom_prompt = letterhead_prompt
+                 # Set document type for tracking
+                 preprocessing_options["document_type"] = "letterhead"
+                 doc_type = "letterhead"
+     except ImportError:
+         logger.debug("Letterhead handler not available")
+
      # Add document-type specific instructions based on preprocessing options
      if doc_type == "handwritten" and not modified_custom_prompt:
          modified_custom_prompt = "This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
@@ -437,6 +500,85 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
      # Make sure file_type is explicitly set for PDFs
      if file_type == "pdf":
          result['file_type'] = "pdf"
+
+     # Check for duplicated text patterns that indicate handwritten text issues
+     try:
+         from ocr_text_repair import detect_duplicate_text_issues, get_enhanced_preprocessing_options, get_handwritten_specific_prompt, clean_duplicated_text
+
+         # Check OCR output for duplication issues
+         if result and 'ocr_contents' in result and 'raw_text' in result['ocr_contents']:
+             ocr_text = result['ocr_contents']['raw_text']
+             has_duplication, duplication_details = detect_duplicate_text_issues(ocr_text)
+
+             # If we detect significant duplication in the output
+             if has_duplication and duplication_details.get('duplication_rate', 0) > 0.1:
+                 logger.info(f"Detected text duplication issues. Reprocessing as handwritten document with enhanced settings...")
+                 progress_reporter.update(75, "Detected duplication issues. Reprocessing with enhanced settings...")
+
+                 # Save original result before reprocessing
+                 original_result = result
+
+                 # Get enhanced preprocessing options for handwritten text
+                 enhanced_options = get_enhanced_preprocessing_options(preprocessing_options)
+
+                 # Reprocess with enhanced settings and specialized prompt
+                 handwritten_prompt = get_handwritten_specific_prompt(custom_prompt)
+
+                 # Process the image with the enhanced settings
+                 try:
+                     # Apply enhanced preprocessing to the original image
+                     enhanced_temp_path, _ = preprocessing.apply_preprocessing_to_file(
+                         open(temp_path, 'rb').read(),
+                         Path(temp_path).suffix.lower(),
+                         enhanced_options,
+                         temp_file_paths
+                     )
+
+                     # Process with enhanced settings
+                     processor = StructuredOCR()
+                     enhanced_result = processor.process_file(
+                         file_path=enhanced_temp_path,
+                         file_type="image",
+                         use_vision=use_vision,
+                         custom_prompt=handwritten_prompt,
+                         file_size_mb=file_size_mb
+                     )
+
+                     # Check if the enhanced result is better (less duplication)
+                     if 'ocr_contents' in enhanced_result and 'raw_text' in enhanced_result['ocr_contents']:
+                         enhanced_text = enhanced_result['ocr_contents']['raw_text']
+                         _, enhanced_issues = detect_duplicate_text_issues(enhanced_text)
+
+                         # Use the enhanced result if it's better
+                         if enhanced_issues.get('duplication_rate', 1.0) < duplication_details.get('duplication_rate', 1.0):
+                             logger.info("Enhanced processing improved OCR quality. Using enhanced result.")
+                             result = enhanced_result
+                             # Preserve document type and preprocessing info
+                             result['document_type'] = 'handwritten'
+                             result['preprocessing'] = enhanced_options
+                         else:
+                             # If enhancement didn't help, clean up the original result
+                             logger.info("Enhanced processing did not improve OCR quality. Cleaning original result.")
+                             result = original_result
+                             # Clean up duplication in the text
+                             if 'ocr_contents' in result and 'raw_text' in result['ocr_contents']:
+                                 result['ocr_contents']['raw_text'] = clean_duplicated_text(result['ocr_contents']['raw_text'])
+                     else:
+                         # Fallback to original with cleaning
+                         logger.info("Enhanced processing failed. Cleaning original result.")
+                         result = original_result
+                         # Clean up duplication in the text
+                         if 'ocr_contents' in result and 'raw_text' in result['ocr_contents']:
+                             result['ocr_contents']['raw_text'] = clean_duplicated_text(result['ocr_contents']['raw_text'])
+                 except Exception as enh_error:
+                     logger.warning(f"Enhanced processing failed: {str(enh_error)}. Using cleaned original.")
+                     # Fallback to original with cleaning
+                     result = original_result
+                     # Clean up duplication in the text
+                     if 'ocr_contents' in result and 'raw_text' in result['ocr_contents']:
+                         result['ocr_contents']['raw_text'] = clean_duplicated_text(result['ocr_contents']['raw_text'])
+     except ImportError:
+         logger.debug("OCR text repair module not available")

      # 🔧 ALWAYS normalize result before returning
      result = clean_ocr_result(
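The retry logic above reduces to a duplication-rate comparison. A condensed sketch using the ocr_text_repair functions exactly as imported in the diff (the wrapper function itself is illustrative, not part of the commit; the reprocessing step that produces enhanced_text is elided):

from ocr_text_repair import detect_duplicate_text_issues, clean_duplicated_text

def choose_text(raw_text: str, enhanced_text: str) -> str:
    # Keep whichever output shows the lower duplication rate; if the
    # enhanced pass is no better, clean the original instead.
    has_dup, details = detect_duplicate_text_issues(raw_text)
    if not (has_dup and details.get('duplication_rate', 0) > 0.1):
        return raw_text
    _, enhanced = detect_duplicate_text_issues(enhanced_text)
    if enhanced.get('duplication_rate', 1.0) < details.get('duplication_rate', 1.0):
        return enhanced_text
    return clean_duplicated_text(raw_text)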
test_adaptive_segmentation.py ADDED
@@ -0,0 +1,98 @@
+ #!/usr/bin/env python3
+ """
+ Test script for adaptive content-aware segmentation.
+ Processes sample documents to validate the improved segmentation approach.
+ """
+
+ import os
+ import sys
+ import logging
+ from pathlib import Path
+ import cv2
+ import numpy as np
+ from PIL import Image
+ import json
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO,
+                     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger(__name__)
+
+ # Import segmentation module
+ from image_segmentation import segment_image_for_ocr, process_segmented_image
+
+ # Test documents
+ TEST_DOCUMENTS = [
+     "input/baldwin-15th-north.jpg",  # Document with varied text density and uppercase sections
+     "input/americae-retectio.jpg",   # Historical document
+     "input/handwritten-letter.jpg",  # Handwritten document
+ ]
+
+ def test_adaptive_segmentation():
+     """
+     Run the adaptive segmentation on test documents and visualize the results.
+     """
+     # Create output directory
+     output_dir = Path("output") / "adaptive_test"
+     output_dir.mkdir(parents=True, exist_ok=True)
+
+     results = {}
+
+     # Process each test document
+     for document_path in TEST_DOCUMENTS:
+         document_file = Path(document_path)
+         if not document_file.exists():
+             logger.warning(f"Test document not found: {document_path}")
+             continue
+
+         logger.info(f"Processing test document: {document_file.name}")
+
+         # Process the document
+         segmentation_results = process_segmented_image(document_file, output_dir)
+
+         # Create a combined visualization
+         if segmentation_results.get('text_regions_coordinates'):
+             # Print analysis
+             logger.info(f"Document: {document_file.name}")
+             logger.info(f"Found {len(segmentation_results['text_regions_coordinates'])} text regions")
+             logger.info(f"Output files: {segmentation_results.get('output_files', {})}")
+
+             # Store results
+             results[document_file.name] = {
+                 "regions_count": len(segmentation_results['text_regions_coordinates']),
+                 "output_files": segmentation_results.get('output_files', {}),
+                 "regions": segmentation_results.get('text_regions_coordinates', [])
+             }
+
+     # Save summary report
+     with open(output_dir / "adaptive_segmentation_results.json", "w") as f:
+         json.dump(results, f, indent=2)
+
+     # Create a summary report
+     with open(output_dir / "adaptive_segmentation_report.md", "w") as f:
+         f.write("# Adaptive Segmentation Test Results\n\n")
+         f.write("This report summarizes the results of testing the adaptive content-aware segmentation approach.\n\n")
+
+         for document_name, result in results.items():
+             f.write(f"## {document_name}\n\n")
+             f.write(f"- Regions detected: {result['regions_count']}\n")
+             f.write(f"- Output files:\n")
+             for file_type, file_path in result.get('output_files', {}).items():
+                 f.write(f"  - {file_type}: {file_path}\n")
+             f.write("\n")
+
+             # Add region analysis
+             if result.get('regions'):
+                 f.write("### Region Analysis\n\n")
+                 f.write("| Region | X | Y | Width | Height |\n")
+                 f.write("|--------|---|---|-------|--------|\n")
+                 for i, region in enumerate(result['regions']):
+                     x, y, w, h = region
+                     f.write(f"| {i+1} | {x} | {y} | {w} | {h} |\n")
+                 f.write("\n")
+
+     logger.info(f"Test completed. Results saved to {output_dir}")
+     logger.info(f"Summary report: {output_dir / 'adaptive_segmentation_report.md'}")
+
+ if __name__ == "__main__":
+     test_adaptive_segmentation()
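Once the script has run, its JSON summary can be inspected directly; a small sketch (the path is the one the script writes above):

import json
from pathlib import Path

summary = json.loads(Path("output/adaptive_test/adaptive_segmentation_results.json").read_text())
for name, info in summary.items():
    print(f"{name}: {info['regions_count']} regions")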
utils/image_utils.py CHANGED
@@ -327,6 +327,65 @@ def calculate_image_entropy(pil_img: Image.Image) -> float:
      entropy = -np.sum(hist * np.log2(hist))
      return float(entropy)

+ def estimate_text_density(image_np):
+     """
+     Estimate text density patterns in an image.
+     Returns metrics on text distribution and special cases.
+
+     Args:
+         image_np: Numpy array of the image
+
+     Returns:
+         dict: Text density metrics
+     """
+     # Convert to grayscale
+     if len(image_np.shape) > 2 and image_np.shape[2] == 3:
+         gray = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)
+     else:
+         gray = image_np
+
+     # Binarize image
+     _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
+
+     # Analyze vertical text density profile (important for headers/footers)
+     height, width = gray.shape
+     vertical_profile = np.sum(binary, axis=1) / width
+
+     # Analyze horizontal text density profile
+     horizontal_profile = np.sum(binary, axis=0) / height
+
+     # Calculate statistics
+     v_mean = np.mean(vertical_profile)
+     v_std = np.std(vertical_profile)
+     v_max = np.max(vertical_profile)
+
+     # Detect uppercase text regions (common in headers of Baldwin document)
+     # Uppercase text tends to have more consistent height and uniform vertical density
+     section_height = height // 10  # Divide into 10 vertical sections
+     uppercase_sections = 0
+
+     for i in range(0, height, section_height):
+         section = binary[i:min(i+section_height, height), :]
+         section_profile = np.sum(section, axis=1) / width
+
+         # Uppercase characteristics: high density with low variation
+         if np.mean(section_profile) > v_mean * 1.5 and np.std(section_profile) < v_std * 0.7:
+             uppercase_sections += 1
+
+     # Determine overall pattern
+     if v_std / v_mean > 0.8:
+         pattern = 'varied'   # High variance indicates sections with different text densities
+     else:
+         pattern = 'uniform'  # Low variance indicates uniform text distribution
+
+     return {
+         'mean_density': float(v_mean),
+         'density_variation': float(v_std),
+         'pattern': pattern,
+         'uppercase_sections': uppercase_sections,
+         'max_density': float(v_max)
+     }
+
  def serialize_ocr_object(obj):
      """
      Serialize OCR response objects to JSON serializable format.
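A usage sketch for the new metric (the input path is hypothetical; the image is converted to RGB to match the PIL-based pipeline, and the condition is the one segment_image_for_ocr uses):

import cv2
from utils.image_utils import estimate_text_density

bgr = cv2.imread("input/sample-document.jpg")  # hypothetical path
rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
density = estimate_text_density(rgb)
if density['pattern'] == 'varied' or density['uppercase_sections'] > 0:
    print("adaptive segmentation would be selected")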
utils/text_utils.py CHANGED
@@ -119,6 +119,76 @@ def clean_raw_text(text: str) -> str:

      return text.strip()

+ def detect_content_regions(image_np):
+     """
+     Detect content regions based on text density analysis.
+     Returns regions with adaptive overlapping.
+
+     Args:
+         image_np: Numpy array image
+
+     Returns:
+         list: List of region tuples (x, y, width, height)
+     """
+     # Import necessary modules
+     import numpy as np
+     import cv2
+
+     # Convert to grayscale for text detection
+     if len(image_np.shape) > 2 and image_np.shape[2] == 3:
+         gray = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)
+     else:
+         gray = image_np
+
+     # Create text density profile
+     # Sum pixel values horizontally to get vertical text density
+     v_profile = np.sum(255 - gray, axis=1)
+
+     # Normalize the profile
+     v_profile = v_profile / np.max(v_profile) if np.max(v_profile) > 0 else v_profile
+
+     # Find significant density changes
+     changes = []
+     threshold = 0.2
+     for i in range(1, len(v_profile)):
+         if abs(v_profile[i] - v_profile[i-1]) > threshold:
+             changes.append(i)
+
+     # Create adaptive regions based on density changes
+     img_height, img_width = gray.shape
+
+     # Default to at least 3 regions with overlap
+     if len(changes) < 2:
+         # If no significant changes, use default division with overlapping regions
+         header_height = int(img_height * 0.3)
+         middle_start = int(img_height * 0.2)
+         middle_height = int(img_height * 0.4)
+         body_start = int(img_height * 0.5)
+         body_height = img_height - body_start
+     else:
+         # Use detected density changes for more precise regions
+         changes = sorted(changes)
+         header_height = changes[0] + int(img_height * 0.05)  # Add overlap
+         middle_start = max(0, changes[0] - int(img_height * 0.05))
+
+         if len(changes) > 1:
+             middle_height = (changes[1] - middle_start) + int(img_height * 0.05)
+             body_start = max(0, changes[1] - int(img_height * 0.05))
+         else:
+             middle_height = int(img_height * 0.4)
+             body_start = int(img_height * 0.5)
+
+         body_height = img_height - body_start
+
+     # Define regions with adaptive overlap
+     regions = [
+         (0, 0, img_width, header_height),             # Header region
+         (0, middle_start, img_width, middle_height),  # Middle region with overlap
+         (0, body_start, img_width, body_height)       # Body region with overlap
+     ]
+
+     return regions
+
  def merge_region_texts(regions: List[Dict[str, Any]], min_similarity_threshold: float = 0.7) -> str:
      """
      Intelligently merge text from multiple document regions, handling overlapping content.
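A self-contained check of detect_content_regions on a synthetic page (a white canvas with one dark band standing in for a dense header), showing the three overlapping regions it returns:

import numpy as np
from utils.text_utils import detect_content_regions

# Synthetic 1000x800 page: white background with a dark "header" band
page = np.full((1000, 800), 255, dtype=np.uint8)
page[80:160, :] = 0

for x, y, w, h in detect_content_regions(page):
    print(f"region at ({x}, {y}), {w}x{h}")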