milwright committed
Commit 3dd2ff2 · 1 Parent(s): 73375a3

Fix OCR processing variable scope issue by using explicit module reference for apply_preprocessing_to_file

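The commit message only says "variable scope issue", so the exact failure mode is not recorded here; a typical cause is the from-imported name being shadowed or rebound inside `process_file`. A minimal before/after sketch of the pattern this commit applies (argument names are placeholders, not the real call site):

# Before: a from-import binds a bare module-level name that local
# statements inside the function can shadow, raising NameError or
# UnboundLocalError when the call is reached.
from preprocessing import apply_preprocessing_to_file
temp_path, applied = apply_preprocessing_to_file(file_bytes, file_ext, options, temp_paths)

# After: resolve the function through the module object on every call,
# which is immune to that kind of local-name shadowing.
import preprocessing
temp_path, applied = preprocessing.apply_preprocessing_to_file(file_bytes, file_ext, options, temp_paths)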
image_segmentation.py CHANGED
@@ -1,7 +1,7 @@
  """
  Image segmentation utility for OCR preprocessing.
  Separates text regions from image regions to improve OCR accuracy on mixed-content documents.
- Based on Mistral AI cookbook examples.
+ Uses content-aware adaptive segmentation for improved results across document types.
  """

  import cv2
@@ -18,33 +18,10 @@ logging.basicConfig(level=logging.INFO,
                      format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  logger = logging.getLogger(__name__)

- def determine_segmentation_approach(image_path: Union[str, Path]) -> str:
-     """
-     Determine which segmentation approach to use based on the document type.
-
-     Args:
-         image_path: Path to the image file
-
-     Returns:
-         str: Segmentation approach to use ('simplified' or 'original')
-     """
-     # Convert to string for easier pattern matching
-     filename = str(image_path).lower()
-
-     # Document-specific rules based on testing results
-     if "baldwin" in filename and "north" in filename:
-         # Baldwin documents showed better results with original approach
-         return "original"
-
-     # Default to our simplified approach for most documents
-     return "simplified"
-
  def segment_image_for_ocr(image_path: Union[str, Path], vision_enabled: bool = True, preserve_content: bool = True) -> Dict[str, Union[Image.Image, str]]:
      """
-     Prepare image for OCR processing using the most appropriate segmentation approach.
-     For most documents, this uses a minimal approach that trusts Mistral OCR
-     to handle document understanding and layout analysis. For specific document types
-     that benefit from custom segmentation, a document-specific approach is used.
+     Prepare image for OCR processing using content-aware segmentation.
+     Uses adaptive region detection based on text density analysis.

      Args:
          image_path: Path to the image file
@@ -57,11 +34,8 @@ def segment_image_for_ocr(image_path: Union[str, Path], vision_enabled: bool = T
      # Convert to Path object if string
      image_file = Path(image_path) if isinstance(image_path, str) else image_path

-     # Determine the segmentation approach to use
-     approach = determine_segmentation_approach(image_file)
-
      # Log start of processing
-     logger.info(f"Preparing image for Mistral OCR: {image_file.name} (using {approach} approach)")
+     logger.info(f"Preparing image for Mistral OCR: {image_file.name}")

      try:
          # Open original image with PIL
@@ -88,80 +62,29 @@ def segment_image_for_ocr(image_path: Union[str, Path], vision_enabled: bool = T
          img_np = np.array(pil_img)
          img_width, img_height = pil_img.size

-         # Apply the appropriate segmentation approach based on the document type
-         if approach == "simplified":
-             # SIMPLIFIED APPROACH for most documents:
-             # Let Mistral OCR handle the entire document understanding process
-
-             # For visualization, mark the entire image as a text region
-             full_image_region = [(0, 0, img_width, img_height)]
-
-             # Create visualization with a simple border
-             vis_img = img_np.copy()
-             cv2.rectangle(vis_img, (5, 5), (img_width-5, img_height-5), (0, 255, 0), 5)
-
-             # Add text to indicate this is using Mistral's native processing
-             font = cv2.FONT_HERSHEY_SIMPLEX
-             cv2.putText(vis_img, "Processed by Mistral OCR", (30, 60), font, 1, (0, 255, 0), 2)
-
-             # Create visualizations and masks
-             text_regions_vis = Image.fromarray(vis_img)
-             image_regions_vis = text_regions_vis.copy()
-
-             # Create a mask of the entire image (just for visualization)
-             text_mask = np.ones((img_height, img_width), dtype=np.uint8) * 255
-             _, buffer = cv2.imencode('.png', text_mask)
-             text_mask_base64 = base64.b64encode(buffer).decode('utf-8')
-
-             # Return the original image as the combined result
-             return {
-                 'text_regions': text_regions_vis,
-                 'image_regions': image_regions_vis,
-                 'text_mask_base64': f"data:image/png;base64,{text_mask_base64}",
-                 'combined_result': pil_img,
-                 'text_regions_coordinates': full_image_region,
-                 'region_images': [{
-                     'image': img_np,
-                     'pil_image': pil_img,
-                     'coordinates': (0, 0, img_width, img_height),
-                     'padded_coordinates': (0, 0, img_width, img_height),
-                     'order': 0
-                 }]
-             }
-
-         else:
-             # DOCUMENT-SPECIFIC APPROACH for baldwin-north and similar documents
-             # Use more structured segmentation with customized region detection
-             # This approach is preferred for documents that showed better results in testing
-
-             # Create a visualization with green borders around the text regions
-             vis_img = img_np.copy()
-
-             # For baldwin-north type documents, create a more granular segmentation
-             # Define regions with more detailed segmentation for better text capture
-             # Use 3 overlapping regions instead of 2 distinct ones
-
-             # Define header, middle, and body sections with overlap
-             header_height = int(img_height * 0.3)  # Top 30% as header (increased from 25%)
-             middle_start = int(img_height * 0.2)   # Start middle section with overlap
-             middle_height = int(img_height * 0.4)  # Middle 40%
-             body_start = int(img_height * 0.5)     # Start body with overlap
-             body_height = img_height - body_start  # Remaining height
-
-             # Define regions with overlap to ensure no text is missed
-             regions = [
-                 (0, 0, img_width, header_height),             # Header region
-                 (0, middle_start, img_width, middle_height),  # Middle region with overlap
-                 (0, body_start, img_width, body_height)       # Body region with overlap
-             ]
+         # Analyze text density to determine if advanced segmentation is needed
+         # This replaces document-specific logic with content-aware analysis
+         from utils.image_utils import estimate_text_density
+         text_density = estimate_text_density(img_np)
+
+         # Use adaptive approach for documents with unusual text distribution
+         if text_density['pattern'] == 'varied' or text_density['uppercase_sections'] > 0:
+             logger.info(f"Using adaptive segmentation for document with varied text density pattern={text_density['pattern']}, uppercase_sections={text_density['uppercase_sections']}")
+
+             # Detect content regions based on text density
+             from utils.text_utils import detect_content_regions
+             regions = detect_content_regions(img_np)
+
+             # Create visualization with green borders around the text regions
+             vis_img = img_np.copy()

              # Draw regions on visualization
              for x, y, w, h in regions:
                  cv2.rectangle(vis_img, (x, y), (x+w, y+h), (0, 255, 0), 3)

-             # Add text to indicate we're using the document-specific approach
+             # Add text to indicate we're using adaptive processing
              font = cv2.FONT_HERSHEY_SIMPLEX
-             cv2.putText(vis_img, "Document-specific processing", (30, 60), font, 1, (0, 255, 0), 2)
+             cv2.putText(vis_img, "Adaptive region processing", (30, 60), font, 1, (0, 255, 0), 2)

              # Create visualization images
              text_regions_vis = Image.fromarray(vis_img)
@@ -190,14 +113,56 @@ def segment_image_for_ocr(image_path: Union[str, Path], vision_enabled: bool = T
                  }
                  region_images.append(region_info)

-             # Return the structured segmentation results
+             # Return the adaptive segmentation results
              return {
                  'text_regions': text_regions_vis,
                  'image_regions': image_regions_vis,
                  'text_mask_base64': f"data:image/png;base64,{text_mask_base64}",
                  'combined_result': pil_img,
                  'text_regions_coordinates': regions,
-                 'region_images': region_images
+                 'region_images': region_images,
+                 'segmentation_type': 'adaptive'
+             }
+         else:
+             # SIMPLIFIED APPROACH for most documents
+             # Let Mistral OCR handle the entire document understanding process
+             logger.info(f"Using standard approach for document with uniform text density")
+
+             # For visualization, mark the entire image as a text region
+             full_image_region = [(0, 0, img_width, img_height)]
+
+             # Create visualization with a simple border
+             vis_img = img_np.copy()
+             cv2.rectangle(vis_img, (5, 5), (img_width-5, img_height-5), (0, 255, 0), 5)
+
+             # Add text to indicate this is using Mistral's native processing
+             font = cv2.FONT_HERSHEY_SIMPLEX
+             cv2.putText(vis_img, "Processed by Mistral OCR", (30, 60), font, 1, (0, 255, 0), 2)
+
+             # Create visualizations and masks
+             text_regions_vis = Image.fromarray(vis_img)
+             image_regions_vis = text_regions_vis.copy()
+
+             # Create a mask of the entire image (just for visualization)
+             text_mask = np.ones((img_height, img_width), dtype=np.uint8) * 255
+             _, buffer = cv2.imencode('.png', text_mask)
+             text_mask_base64 = base64.b64encode(buffer).decode('utf-8')
+
+             # Return the original image as the combined result
+             return {
+                 'text_regions': text_regions_vis,
+                 'image_regions': image_regions_vis,
+                 'text_mask_base64': f"data:image/png;base64,{text_mask_base64}",
+                 'combined_result': pil_img,
+                 'text_regions_coordinates': full_image_region,
+                 'region_images': [{
+                     'image': img_np,
+                     'pil_image': pil_img,
+                     'coordinates': (0, 0, img_width, img_height),
+                     'padded_coordinates': (0, 0, img_width, img_height),
+                     'order': 0
+                 }],
+                 'segmentation_type': 'simplified'
              }

      except Exception as e:
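For context, a minimal sketch of calling the updated function (the input path is hypothetical; the returned keys are those in the diff above):

from image_segmentation import segment_image_for_ocr

results = segment_image_for_ocr("input/sample-document.jpg")   # hypothetical path
print(results['segmentation_type'])                  # 'adaptive' or 'simplified'
print(results['text_regions_coordinates'])           # list of (x, y, w, h) tuples
results['text_regions'].save("regions_preview.png")  # PIL visualization with region borders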
letterhead_handler.py ADDED
@@ -0,0 +1,197 @@
+ """
+ Specialized handler for letterhead and marginalia documents.
+ Enhances OCR quality by providing document-specific prompts for common layouts.
+ """
+
+ import re
+ import logging
+ from pathlib import Path
+ from typing import Union, Dict, Any, Optional, List
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO,
+                     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger(__name__)
+
+ def is_likely_letterhead(image_path: Union[str, Path], features: Optional[Dict[str, Any]] = None) -> bool:
+     """
+     Detect if an image is likely a letterhead document with marginalia.
+     Uses path/filename patterns and optional image features (if provided).
+
+     Args:
+         image_path: Path to the image file
+         features: Optional dict of image features from preprocessing
+
+     Returns:
+         bool: True if likely a letterhead document
+     """
+     # Convert to string path for pattern matching
+     path_str = str(image_path).lower()
+
+     # Check for common letterhead filename patterns
+     letterhead_patterns = [
+         r'letter(head)?[^/]*\.jpg',
+         r'hotel[^/]*\.jpg',
+         r'baldwin.*\.jpg',
+         r'business.*letter.*\.jpg',
+         r'correspondence.*\.jpg'
+     ]
+
+     for pattern in letterhead_patterns:
+         if re.search(pattern, path_str):
+             logger.info(f"Detected likely letterhead document: {Path(image_path).name}")
+             return True
+
+     # If features are provided, use them for additional detection
+     if features:
+         # Check for ALL CAPS sections that might be marginalia
+         if features.get('uppercase_sections', 0) > 1:
+             logger.info(f"Detected likely letterhead document with marginalia by features: {Path(image_path).name}")
+             return True
+
+     return False
+
+ def get_letterhead_prompt(image_path: Union[str, Path], features: Optional[Dict[str, Any]] = None) -> Optional[str]:
+     """
+     Generate a specialized prompt for letterhead documents to improve OCR quality.
+
+     Args:
+         image_path: Path to the image file
+         features: Optional dict of image features from preprocessing
+
+     Returns:
+         str: Custom prompt for letterhead OCR or None if not applicable
+     """
+     if not is_likely_letterhead(image_path, features):
+         return None
+
+     # Path-specific customizations for known problematic documents
+     path_str = str(image_path).lower()
+
+     # Most specialized prompt for baldwin documents
+     if "baldwin" in path_str:
+         return """
+ This image shows a hotel letterhead with a handwritten letter. Please extract the text with the following guidelines:
+
+ 1. Identify and separate the letterhead elements:
+    - Header: The hotel name, address, and contact information at the top
+    - Marginalia: The amenities description in ALL CAPS along the margins
+
+ 2. Extract the main handwritten letter content separately
+
+ 3. Note any image captions separately
+
+ 4. Format the output as follows:
+    - HEADER: [header text]
+    - MARGINS: [marginalia text]
+    - LETTER: [handwritten letter text]
+    - CAPTIONS: [any image captions]
+
+ Be careful not to duplicate content between sections, especially with margin text.
+ """
+
+     # General letterhead prompt
+     return """
+ This appears to be a letterhead document. Please extract the text with the following guidelines:
+
+ 1. Identify the header/letterhead section with company name, logo, address, etc.
+ 2. Identify any margin text or notes that appear separate from the main content
+ 3. Extract the main letter/document body separately
+ 4. Format the output as follows:
+    - LETTERHEAD: [letterhead text]
+    - MARGIN_NOTES: [any text in margins]
+    - BODY: [main document body]
+
+ Be careful not to duplicate content between sections.
+ """
+
+ def clean_letterhead_ocr_output(text: str) -> str:
+     """
+     Clean OCR output from letterhead documents by handling section markers
+     and reducing duplication.
+
+     Args:
+         text: OCR text from letterhead document
+
+     Returns:
+         str: Cleaned text with proper section formatting
+     """
+     if not text:
+         return ""
+
+     # Find any section markers added by the specialized prompt
+     section_markers = [
+         "HEADER:", "LETTERHEAD:", "MARGINS:", "MARGIN_NOTES:",
+         "LETTER:", "BODY:", "CAPTIONS:"
+     ]
+
+     # Check if the text has any section markers
+     has_sections = any(marker in text for marker in section_markers)
+
+     if has_sections:
+         # Split text into sections while preserving section headers
+         sections = {}
+         current_section = "UNKNOWN"
+         current_text = []
+
+         for line in text.split('\n'):
+             # Check if this line is a section marker
+             is_marker = False
+             for marker in section_markers:
+                 if marker in line:
+                     # Save previous section
+                     if current_text:
+                         sections[current_section] = '\n'.join(current_text).strip()
+                         current_text = []
+
+                     # Start new section
+                     current_section = marker.replace(':', '')
+                     # Keep any text after the marker on this line
+                     remainder = line.split(marker, 1)[1].strip()
+                     if remainder:
+                         current_text.append(remainder)
+                     is_marker = True
+                     break
+
+             # If not a marker, add to current section
+             if not is_marker:
+                 current_text.append(line)
+
+         # Save the last section
+         if current_text:
+             sections[current_section] = '\n'.join(current_text).strip()
+
+         # Format with standard order and clear section headers
+         formatted_sections = []
+
+         # First add letterhead/header info
+         if "LETTERHEAD" in sections:
+             formatted_sections.append(f"--- LETTERHEAD ---\n{sections['LETTERHEAD']}")
+         elif "HEADER" in sections:
+             formatted_sections.append(f"--- LETTERHEAD ---\n{sections['HEADER']}")
+
+         # Add margins/notes
+         if "MARGIN_NOTES" in sections:
+             formatted_sections.append(f"--- MARGIN NOTES ---\n{sections['MARGIN_NOTES']}")
+         elif "MARGINS" in sections:
+             formatted_sections.append(f"--- MARGIN NOTES ---\n{sections['MARGINS']}")
+
+         # Add main content
+         if "BODY" in sections:
+             formatted_sections.append(f"--- DOCUMENT BODY ---\n{sections['BODY']}")
+         elif "LETTER" in sections:
+             formatted_sections.append(f"--- DOCUMENT BODY ---\n{sections['LETTER']}")
+
+         # Add captions if present
+         if "CAPTIONS" in sections:
+             formatted_sections.append(f"--- IMAGE CAPTIONS ---\n{sections['CAPTIONS']}")
+
+         # Add unknown sections
+         if "UNKNOWN" in sections and sections["UNKNOWN"]:
+             formatted_sections.append(f"--- ADDITIONAL CONTENT ---\n{sections['UNKNOWN']}")
+
+         # Join everything with clear separation
+         return "\n\n".join(formatted_sections)
+     else:
+         # If no section markers were found, return the original text
+         return text
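A short usage sketch of the new module (path and OCR text are hypothetical):

from letterhead_handler import is_likely_letterhead, get_letterhead_prompt, clean_letterhead_ocr_output

path = "input/baldwin-hotel-letter.jpg"      # hypothetical file
if is_likely_letterhead(path):
    prompt = get_letterhead_prompt(path)     # specialized prompt, or None
    # ... run OCR with `prompt`, then normalize the sectioned output:
    raw = "HEADER: Hotel Baldwin\nLETTER: Dear Sir, ..."
    print(clean_letterhead_ocr_output(raw))  # emits "--- LETTERHEAD ---" / "--- DOCUMENT BODY ---" sections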
ocr_processing.py CHANGED
@@ -21,7 +21,7 @@ from structured_ocr import StructuredOCR
  from utils.image_utils import clean_ocr_result
  # Temporarily retain old utils imports until they are fully migrated
  from utils import generate_cache_key, timing, format_timestamp, create_descriptive_filename, extract_subject_tags
- from preprocessing import apply_preprocessing_to_file
+ import preprocessing
  from error_handler import handle_ocr_error, check_file_size
  from image_segmentation import segment_image_for_ocr, process_segmented_image

@@ -182,6 +182,27 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
      doc_type = preprocessing_options.get("document_type", "standard")
      modified_custom_prompt = custom_prompt

+     # Check for letterhead/marginalia document types with specialized handling
+     try:
+         from letterhead_handler import get_letterhead_prompt, is_likely_letterhead
+         # Extract text density features if available
+         features = None
+         if 'text_density' in preprocessing_options:
+             features = preprocessing_options['text_density']
+
+         # Check if this looks like a letterhead document
+         if is_likely_letterhead(temp_path, features):
+             # Get specialized letterhead prompt
+             letterhead_prompt = get_letterhead_prompt(temp_path, features)
+             if letterhead_prompt:
+                 logger.info(f"Using specialized letterhead prompt for document")
+                 modified_custom_prompt = letterhead_prompt
+                 # Set document type for tracking
+                 preprocessing_options["document_type"] = "letterhead"
+                 doc_type = "letterhead"
+     except ImportError:
+         logger.debug("Letterhead handler not available")
+
      # Add document-type specific instructions based on preprocessing options
      if doc_type == "handwritten" and not modified_custom_prompt:
          modified_custom_prompt = "This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
@@ -214,7 +235,7 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
      progress_reporter.update(20, "Preparing image for processing...")

      # Apply preprocessing if needed
-     temp_path, preprocessing_applied = apply_preprocessing_to_file(
+     temp_path, preprocessing_applied = preprocessing.apply_preprocessing_to_file(
          file_bytes,
          file_ext,
          preprocessing_options,
@@ -367,6 +388,27 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
      doc_type = preprocessing_options.get("document_type", "standard")
      modified_custom_prompt = custom_prompt

+     # Check for letterhead/marginalia document types with specialized handling
+     try:
+         from letterhead_handler import get_letterhead_prompt, is_likely_letterhead
+         # Extract text density features if available
+         features = None
+         if 'text_density' in preprocessing_options:
+             features = preprocessing_options['text_density']
+
+         # Check if this looks like a letterhead document
+         if is_likely_letterhead(temp_path, features):
+             # Get specialized letterhead prompt
+             letterhead_prompt = get_letterhead_prompt(temp_path, features)
+             if letterhead_prompt:
+                 logger.info(f"Using specialized letterhead prompt for document")
+                 modified_custom_prompt = letterhead_prompt
+                 # Set document type for tracking
+                 preprocessing_options["document_type"] = "letterhead"
+                 doc_type = "letterhead"
+     except ImportError:
+         logger.debug("Letterhead handler not available")
+
      # Add document-type specific instructions based on preprocessing options
      if doc_type == "handwritten" and not modified_custom_prompt:
          modified_custom_prompt = "This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
@@ -409,6 +451,27 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
      doc_type = preprocessing_options.get("document_type", "standard")
      modified_custom_prompt = custom_prompt

+     # Check for letterhead/marginalia document types with specialized handling
+     try:
+         from letterhead_handler import get_letterhead_prompt, is_likely_letterhead
+         # Extract text density features if available
+         features = None
+         if 'text_density' in preprocessing_options:
+             features = preprocessing_options['text_density']
+
+         # Check if this looks like a letterhead document
+         if is_likely_letterhead(temp_path, features):
+             # Get specialized letterhead prompt
+             letterhead_prompt = get_letterhead_prompt(temp_path, features)
+             if letterhead_prompt:
+                 logger.info(f"Using specialized letterhead prompt for document")
+                 modified_custom_prompt = letterhead_prompt
+                 # Set document type for tracking
+                 preprocessing_options["document_type"] = "letterhead"
+                 doc_type = "letterhead"
+     except ImportError:
+         logger.debug("Letterhead handler not available")
+
      # Add document-type specific instructions based on preprocessing options
      if doc_type == "handwritten" and not modified_custom_prompt:
          modified_custom_prompt = "This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
@@ -437,6 +500,85 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
      # Make sure file_type is explicitly set for PDFs
      if file_type == "pdf":
          result['file_type'] = "pdf"
+
+     # Check for duplicated text patterns that indicate handwritten text issues
+     try:
+         from ocr_text_repair import detect_duplicate_text_issues, get_enhanced_preprocessing_options, get_handwritten_specific_prompt, clean_duplicated_text
+
+         # Check OCR output for duplication issues
+         if result and 'ocr_contents' in result and 'raw_text' in result['ocr_contents']:
+             ocr_text = result['ocr_contents']['raw_text']
+             has_duplication, duplication_details = detect_duplicate_text_issues(ocr_text)
+
+             # If we detect significant duplication in the output
+             if has_duplication and duplication_details.get('duplication_rate', 0) > 0.1:
+                 logger.info(f"Detected text duplication issues. Reprocessing as handwritten document with enhanced settings...")
+                 progress_reporter.update(75, "Detected duplication issues. Reprocessing with enhanced settings...")
+
+                 # Save original result before reprocessing
+                 original_result = result
+
+                 # Get enhanced preprocessing options for handwritten text
+                 enhanced_options = get_enhanced_preprocessing_options(preprocessing_options)
+
+                 # Reprocess with enhanced settings and specialized prompt
+                 handwritten_prompt = get_handwritten_specific_prompt(custom_prompt)
+
+                 # Process the image with the enhanced settings
+                 try:
+                     # Apply enhanced preprocessing to the original image
+                     enhanced_temp_path, _ = preprocessing.apply_preprocessing_to_file(
+                         open(temp_path, 'rb').read(),
+                         Path(temp_path).suffix.lower(),
+                         enhanced_options,
+                         temp_file_paths
+                     )
+
+                     # Process with enhanced settings
+                     processor = StructuredOCR()
+                     enhanced_result = processor.process_file(
+                         file_path=enhanced_temp_path,
+                         file_type="image",
+                         use_vision=use_vision,
+                         custom_prompt=handwritten_prompt,
+                         file_size_mb=file_size_mb
+                     )
+
+                     # Check if the enhanced result is better (less duplication)
+                     if 'ocr_contents' in enhanced_result and 'raw_text' in enhanced_result['ocr_contents']:
+                         enhanced_text = enhanced_result['ocr_contents']['raw_text']
+                         _, enhanced_issues = detect_duplicate_text_issues(enhanced_text)
+
+                         # Use the enhanced result if it's better
+                         if enhanced_issues.get('duplication_rate', 1.0) < duplication_details.get('duplication_rate', 1.0):
+                             logger.info("Enhanced processing improved OCR quality. Using enhanced result.")
+                             result = enhanced_result
+                             # Preserve document type and preprocessing info
+                             result['document_type'] = 'handwritten'
+                             result['preprocessing'] = enhanced_options
+                         else:
+                             # If enhancement didn't help, clean up the original result
+                             logger.info("Enhanced processing did not improve OCR quality. Cleaning original result.")
+                             result = original_result
+                             # Clean up duplication in the text
+                             if 'ocr_contents' in result and 'raw_text' in result['ocr_contents']:
+                                 result['ocr_contents']['raw_text'] = clean_duplicated_text(result['ocr_contents']['raw_text'])
+                     else:
+                         # Fallback to original with cleaning
+                         logger.info("Enhanced processing failed. Cleaning original result.")
+                         result = original_result
+                         # Clean up duplication in the text
+                         if 'ocr_contents' in result and 'raw_text' in result['ocr_contents']:
+                             result['ocr_contents']['raw_text'] = clean_duplicated_text(result['ocr_contents']['raw_text'])
+                 except Exception as enh_error:
+                     logger.warning(f"Enhanced processing failed: {str(enh_error)}. Using cleaned original.")
+                     # Fallback to original with cleaning
+                     result = original_result
+                     # Clean up duplication in the text
+                     if 'ocr_contents' in result and 'raw_text' in result['ocr_contents']:
+                         result['ocr_contents']['raw_text'] = clean_duplicated_text(result['ocr_contents']['raw_text'])
+     except ImportError:
+         logger.debug("OCR text repair module not available")

      # 🔧 ALWAYS normalize result before returning
      result = clean_ocr_result(
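The retry logic above reduces to a duplication-rate comparison. A condensed sketch using the ocr_text_repair functions exactly as imported in the diff (the wrapper function itself is illustrative, not part of the commit; the reprocessing step that produces enhanced_text is elided):

from ocr_text_repair import detect_duplicate_text_issues, clean_duplicated_text

def choose_text(raw_text: str, enhanced_text: str) -> str:
    # Keep whichever output shows the lower duplication rate; if the
    # enhanced pass is no better, clean the original instead.
    has_dup, details = detect_duplicate_text_issues(raw_text)
    if not (has_dup and details.get('duplication_rate', 0) > 0.1):
        return raw_text
    _, enhanced = detect_duplicate_text_issues(enhanced_text)
    if enhanced.get('duplication_rate', 1.0) < details.get('duplication_rate', 1.0):
        return enhanced_text
    return clean_duplicated_text(raw_text)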
test_adaptive_segmentation.py ADDED
@@ -0,0 +1,98 @@
+ #!/usr/bin/env python3
+ """
+ Test script for adaptive content-aware segmentation.
+ Processes sample documents to validate the improved segmentation approach.
+ """
+
+ import os
+ import sys
+ import logging
+ from pathlib import Path
+ import cv2
+ import numpy as np
+ from PIL import Image
+ import json
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO,
+                     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger(__name__)
+
+ # Import segmentation module
+ from image_segmentation import segment_image_for_ocr, process_segmented_image
+
+ # Test documents
+ TEST_DOCUMENTS = [
+     "input/baldwin-15th-north.jpg",  # Document with varied text density and uppercase sections
+     "input/americae-retectio.jpg",   # Historical document
+     "input/handwritten-letter.jpg",  # Handwritten document
+ ]
+
+ def test_adaptive_segmentation():
+     """
+     Run the adaptive segmentation on test documents and visualize the results.
+     """
+     # Create output directory
+     output_dir = Path("output") / "adaptive_test"
+     output_dir.mkdir(parents=True, exist_ok=True)
+
+     results = {}
+
+     # Process each test document
+     for document_path in TEST_DOCUMENTS:
+         document_file = Path(document_path)
+         if not document_file.exists():
+             logger.warning(f"Test document not found: {document_path}")
+             continue
+
+         logger.info(f"Processing test document: {document_file.name}")
+
+         # Process the document
+         segmentation_results = process_segmented_image(document_file, output_dir)
+
+         # Create a combined visualization
+         if segmentation_results.get('text_regions_coordinates'):
+             # Print analysis
+             logger.info(f"Document: {document_file.name}")
+             logger.info(f"Found {len(segmentation_results['text_regions_coordinates'])} text regions")
+             logger.info(f"Output files: {segmentation_results.get('output_files', {})}")
+
+             # Store results
+             results[document_file.name] = {
+                 "regions_count": len(segmentation_results['text_regions_coordinates']),
+                 "output_files": segmentation_results.get('output_files', {}),
+                 "regions": segmentation_results.get('text_regions_coordinates', [])
+             }
+
+     # Save summary report
+     with open(output_dir / "adaptive_segmentation_results.json", "w") as f:
+         json.dump(results, f, indent=2)
+
+     # Create a summary report
+     with open(output_dir / "adaptive_segmentation_report.md", "w") as f:
+         f.write("# Adaptive Segmentation Test Results\n\n")
+         f.write("This report summarizes the results of testing the adaptive content-aware segmentation approach.\n\n")
+
+         for document_name, result in results.items():
+             f.write(f"## {document_name}\n\n")
+             f.write(f"- Regions detected: {result['regions_count']}\n")
+             f.write(f"- Output files:\n")
+             for file_type, file_path in result.get('output_files', {}).items():
+                 f.write(f"  - {file_type}: {file_path}\n")
+             f.write("\n")
+
+             # Add region analysis
+             if result.get('regions'):
+                 f.write("### Region Analysis\n\n")
+                 f.write("| Region | X | Y | Width | Height |\n")
+                 f.write("|--------|---|---|-------|--------|\n")
+                 for i, region in enumerate(result['regions']):
+                     x, y, w, h = region
+                     f.write(f"| {i+1} | {x} | {y} | {w} | {h} |\n")
+                 f.write("\n")
+
+     logger.info(f"Test completed. Results saved to {output_dir}")
+     logger.info(f"Summary report: {output_dir / 'adaptive_segmentation_report.md'}")
+
+ if __name__ == "__main__":
+     test_adaptive_segmentation()
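Once the script has run, its JSON summary can be inspected directly; a small sketch (the path is the one the script writes above):

import json
from pathlib import Path

summary = json.loads(Path("output/adaptive_test/adaptive_segmentation_results.json").read_text())
for name, info in summary.items():
    print(f"{name}: {info['regions_count']} regions")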
utils/image_utils.py CHANGED
@@ -327,6 +327,65 @@ def calculate_image_entropy(pil_img: Image.Image) -> float:
      entropy = -np.sum(hist * np.log2(hist))
      return float(entropy)

+ def estimate_text_density(image_np):
+     """
+     Estimate text density patterns in an image.
+     Returns metrics on text distribution and special cases.
+
+     Args:
+         image_np: Numpy array of the image
+
+     Returns:
+         dict: Text density metrics
+     """
+     # Convert to grayscale
+     if len(image_np.shape) > 2 and image_np.shape[2] == 3:
+         gray = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)
+     else:
+         gray = image_np
+
+     # Binarize image
+     _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
+
+     # Analyze vertical text density profile (important for headers/footers)
+     height, width = gray.shape
+     vertical_profile = np.sum(binary, axis=1) / width
+
+     # Analyze horizontal text density profile
+     horizontal_profile = np.sum(binary, axis=0) / height
+
+     # Calculate statistics
+     v_mean = np.mean(vertical_profile)
+     v_std = np.std(vertical_profile)
+     v_max = np.max(vertical_profile)
+
+     # Detect uppercase text regions (common in headers of Baldwin document)
+     # Uppercase text tends to have more consistent height and uniform vertical density
+     section_height = height // 10  # Divide into 10 vertical sections
+     uppercase_sections = 0
+
+     for i in range(0, height, section_height):
+         section = binary[i:min(i+section_height, height), :]
+         section_profile = np.sum(section, axis=1) / width
+
+         # Uppercase characteristics: high density with low variation
+         if np.mean(section_profile) > v_mean * 1.5 and np.std(section_profile) < v_std * 0.7:
+             uppercase_sections += 1
+
+     # Determine overall pattern
+     if v_std / v_mean > 0.8:
+         pattern = 'varied'   # High variance indicates sections with different text densities
+     else:
+         pattern = 'uniform'  # Low variance indicates uniform text distribution
+
+     return {
+         'mean_density': float(v_mean),
+         'density_variation': float(v_std),
+         'pattern': pattern,
+         'uppercase_sections': uppercase_sections,
+         'max_density': float(v_max)
+     }
+
  def serialize_ocr_object(obj):
      """
      Serialize OCR response objects to JSON serializable format.
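A usage sketch for the new metric (the input path is hypothetical; the image is converted to RGB to match the PIL-based pipeline, and the condition is the one segment_image_for_ocr uses):

import cv2
from utils.image_utils import estimate_text_density

bgr = cv2.imread("input/sample-document.jpg")  # hypothetical path
rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
density = estimate_text_density(rgb)
if density['pattern'] == 'varied' or density['uppercase_sections'] > 0:
    print("adaptive segmentation would be selected")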
utils/text_utils.py CHANGED
@@ -119,6 +119,76 @@ def clean_raw_text(text: str) -> str:

      return text.strip()

+ def detect_content_regions(image_np):
+     """
+     Detect content regions based on text density analysis.
+     Returns regions with adaptive overlapping.
+
+     Args:
+         image_np: Numpy array image
+
+     Returns:
+         list: List of region tuples (x, y, width, height)
+     """
+     # Import necessary modules
+     import numpy as np
+     import cv2
+
+     # Convert to grayscale for text detection
+     if len(image_np.shape) > 2 and image_np.shape[2] == 3:
+         gray = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)
+     else:
+         gray = image_np
+
+     # Create text density profile
+     # Sum pixel values horizontally to get vertical text density
+     v_profile = np.sum(255 - gray, axis=1)
+
+     # Normalize the profile
+     v_profile = v_profile / np.max(v_profile) if np.max(v_profile) > 0 else v_profile
+
+     # Find significant density changes
+     changes = []
+     threshold = 0.2
+     for i in range(1, len(v_profile)):
+         if abs(v_profile[i] - v_profile[i-1]) > threshold:
+             changes.append(i)
+
+     # Create adaptive regions based on density changes
+     img_height, img_width = gray.shape
+
+     # Default to at least 3 regions with overlap
+     if len(changes) < 2:
+         # If no significant changes, use default division with overlapping regions
+         header_height = int(img_height * 0.3)
+         middle_start = int(img_height * 0.2)
+         middle_height = int(img_height * 0.4)
+         body_start = int(img_height * 0.5)
+         body_height = img_height - body_start
+     else:
+         # Use detected density changes for more precise regions
+         changes = sorted(changes)
+         header_height = changes[0] + int(img_height * 0.05)  # Add overlap
+         middle_start = max(0, changes[0] - int(img_height * 0.05))
+
+         if len(changes) > 1:
+             middle_height = (changes[1] - middle_start) + int(img_height * 0.05)
+             body_start = max(0, changes[1] - int(img_height * 0.05))
+         else:
+             middle_height = int(img_height * 0.4)
+             body_start = int(img_height * 0.5)
+
+         body_height = img_height - body_start
+
+     # Define regions with adaptive overlap
+     regions = [
+         (0, 0, img_width, header_height),             # Header region
+         (0, middle_start, img_width, middle_height),  # Middle region with overlap
+         (0, body_start, img_width, body_height)       # Body region with overlap
+     ]
+
+     return regions
+
  def merge_region_texts(regions: List[Dict[str, Any]], min_similarity_threshold: float = 0.7) -> str:
      """
      Intelligently merge text from multiple document regions, handling overlapping content.
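A self-contained check of detect_content_regions on a synthetic page (a white canvas with one dark band standing in for a dense header), showing the three overlapping regions it returns:

import numpy as np
from utils.text_utils import detect_content_regions

# Synthetic 1000x800 page: white background with a dark "header" band
page = np.full((1000, 800), 255, dtype=np.uint8)
page[80:160, :] = 0

for x, y, w, h in detect_content_regions(page):
    print(f"region at ({x}, {y}), {w}x{h}")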