Spaces:
Running
Running
Fix OCR processing variable scope issue by using explicit module reference for apply_preprocessing_to_file
Browse files- image_segmentation.py +62 -97
- letterhead_handler.py +197 -0
- ocr_processing.py +144 -2
- test_adaptive_segmentation.py +98 -0
- utils/image_utils.py +59 -0
- utils/text_utils.py +70 -0
image_segmentation.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
"""
|
2 |
Image segmentation utility for OCR preprocessing.
|
3 |
Separates text regions from image regions to improve OCR accuracy on mixed-content documents.
|
4 |
-
|
5 |
"""
|
6 |
|
7 |
import cv2
|
@@ -18,33 +18,10 @@ logging.basicConfig(level=logging.INFO,
|
|
18 |
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
19 |
logger = logging.getLogger(__name__)
|
20 |
|
21 |
-
def determine_segmentation_approach(image_path: Union[str, Path]) -> str:
|
22 |
-
"""
|
23 |
-
Determine which segmentation approach to use based on the document type.
|
24 |
-
|
25 |
-
Args:
|
26 |
-
image_path: Path to the image file
|
27 |
-
|
28 |
-
Returns:
|
29 |
-
str: Segmentation approach to use ('simplified' or 'original')
|
30 |
-
"""
|
31 |
-
# Convert to string for easier pattern matching
|
32 |
-
filename = str(image_path).lower()
|
33 |
-
|
34 |
-
# Document-specific rules based on testing results
|
35 |
-
if "baldwin" in filename and "north" in filename:
|
36 |
-
# Baldwin documents showed better results with original approach
|
37 |
-
return "original"
|
38 |
-
|
39 |
-
# Default to our simplified approach for most documents
|
40 |
-
return "simplified"
|
41 |
-
|
42 |
def segment_image_for_ocr(image_path: Union[str, Path], vision_enabled: bool = True, preserve_content: bool = True) -> Dict[str, Union[Image.Image, str]]:
|
43 |
"""
|
44 |
-
Prepare image for OCR processing using
|
45 |
-
|
46 |
-
to handle document understanding and layout analysis. For specific document types
|
47 |
-
that benefit from custom segmentation, a document-specific approach is used.
|
48 |
|
49 |
Args:
|
50 |
image_path: Path to the image file
|
@@ -57,11 +34,8 @@ def segment_image_for_ocr(image_path: Union[str, Path], vision_enabled: bool = T
|
|
57 |
# Convert to Path object if string
|
58 |
image_file = Path(image_path) if isinstance(image_path, str) else image_path
|
59 |
|
60 |
-
# Determine the segmentation approach to use
|
61 |
-
approach = determine_segmentation_approach(image_file)
|
62 |
-
|
63 |
# Log start of processing
|
64 |
-
logger.info(f"Preparing image for Mistral OCR: {image_file.name}
|
65 |
|
66 |
try:
|
67 |
# Open original image with PIL
|
@@ -88,80 +62,29 @@ def segment_image_for_ocr(image_path: Union[str, Path], vision_enabled: bool = T
|
|
88 |
img_np = np.array(pil_img)
|
89 |
img_width, img_height = pil_img.size
|
90 |
|
91 |
-
#
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
# For visualization, mark the entire image as a text region
|
97 |
-
full_image_region = [(0, 0, img_width, img_height)]
|
98 |
-
|
99 |
-
# Create visualization with a simple border
|
100 |
-
vis_img = img_np.copy()
|
101 |
-
cv2.rectangle(vis_img, (5, 5), (img_width-5, img_height-5), (0, 255, 0), 5)
|
102 |
-
|
103 |
-
# Add text to indicate this is using Mistral's native processing
|
104 |
-
font = cv2.FONT_HERSHEY_SIMPLEX
|
105 |
-
cv2.putText(vis_img, "Processed by Mistral OCR", (30, 60), font, 1, (0, 255, 0), 2)
|
106 |
-
|
107 |
-
# Create visualizations and masks
|
108 |
-
text_regions_vis = Image.fromarray(vis_img)
|
109 |
-
image_regions_vis = text_regions_vis.copy()
|
110 |
-
|
111 |
-
# Create a mask of the entire image (just for visualization)
|
112 |
-
text_mask = np.ones((img_height, img_width), dtype=np.uint8) * 255
|
113 |
-
_, buffer = cv2.imencode('.png', text_mask)
|
114 |
-
text_mask_base64 = base64.b64encode(buffer).decode('utf-8')
|
115 |
-
|
116 |
-
# Return the original image as the combined result
|
117 |
-
return {
|
118 |
-
'text_regions': text_regions_vis,
|
119 |
-
'image_regions': image_regions_vis,
|
120 |
-
'text_mask_base64': f"data:image/png;base64,{text_mask_base64}",
|
121 |
-
'combined_result': pil_img,
|
122 |
-
'text_regions_coordinates': full_image_region,
|
123 |
-
'region_images': [{
|
124 |
-
'image': img_np,
|
125 |
-
'pil_image': pil_img,
|
126 |
-
'coordinates': (0, 0, img_width, img_height),
|
127 |
-
'padded_coordinates': (0, 0, img_width, img_height),
|
128 |
-
'order': 0
|
129 |
-
}]
|
130 |
-
}
|
131 |
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
# This approach is preferred for documents that showed better results in testing
|
136 |
-
|
137 |
-
# Create a visualization with green borders around the text regions
|
138 |
-
vis_img = img_np.copy()
|
139 |
-
|
140 |
-
# For baldwin-north type documents, create a more granular segmentation
|
141 |
-
# Define regions with more detailed segmentation for better text capture
|
142 |
-
# Use 3 overlapping regions instead of 2 distinct ones
|
143 |
|
144 |
-
#
|
145 |
-
|
146 |
-
|
147 |
-
middle_height = int(img_height * 0.4) # Middle 40%
|
148 |
-
body_start = int(img_height * 0.5) # Start body with overlap
|
149 |
-
body_height = img_height - body_start # Remaining height
|
150 |
|
151 |
-
#
|
152 |
-
|
153 |
-
(0, 0, img_width, header_height), # Header region
|
154 |
-
(0, middle_start, img_width, middle_height), # Middle region with overlap
|
155 |
-
(0, body_start, img_width, body_height) # Body region with overlap
|
156 |
-
]
|
157 |
|
158 |
# Draw regions on visualization
|
159 |
for x, y, w, h in regions:
|
160 |
cv2.rectangle(vis_img, (x, y), (x+w, y+h), (0, 255, 0), 3)
|
161 |
|
162 |
-
# Add text to indicate we're using
|
163 |
font = cv2.FONT_HERSHEY_SIMPLEX
|
164 |
-
cv2.putText(vis_img, "
|
165 |
|
166 |
# Create visualization images
|
167 |
text_regions_vis = Image.fromarray(vis_img)
|
@@ -190,14 +113,56 @@ def segment_image_for_ocr(image_path: Union[str, Path], vision_enabled: bool = T
|
|
190 |
}
|
191 |
region_images.append(region_info)
|
192 |
|
193 |
-
# Return the
|
194 |
return {
|
195 |
'text_regions': text_regions_vis,
|
196 |
'image_regions': image_regions_vis,
|
197 |
'text_mask_base64': f"data:image/png;base64,{text_mask_base64}",
|
198 |
'combined_result': pil_img,
|
199 |
'text_regions_coordinates': regions,
|
200 |
-
'region_images': region_images
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
201 |
}
|
202 |
|
203 |
except Exception as e:
|
|
|
1 |
"""
|
2 |
Image segmentation utility for OCR preprocessing.
|
3 |
Separates text regions from image regions to improve OCR accuracy on mixed-content documents.
|
4 |
+
Uses content-aware adaptive segmentation for improved results across document types.
|
5 |
"""
|
6 |
|
7 |
import cv2
|
|
|
18 |
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
19 |
logger = logging.getLogger(__name__)
|
20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
def segment_image_for_ocr(image_path: Union[str, Path], vision_enabled: bool = True, preserve_content: bool = True) -> Dict[str, Union[Image.Image, str]]:
|
22 |
"""
|
23 |
+
Prepare image for OCR processing using content-aware segmentation.
|
24 |
+
Uses adaptive region detection based on text density analysis.
|
|
|
|
|
25 |
|
26 |
Args:
|
27 |
image_path: Path to the image file
|
|
|
34 |
# Convert to Path object if string
|
35 |
image_file = Path(image_path) if isinstance(image_path, str) else image_path
|
36 |
|
|
|
|
|
|
|
37 |
# Log start of processing
|
38 |
+
logger.info(f"Preparing image for Mistral OCR: {image_file.name}")
|
39 |
|
40 |
try:
|
41 |
# Open original image with PIL
|
|
|
62 |
img_np = np.array(pil_img)
|
63 |
img_width, img_height = pil_img.size
|
64 |
|
65 |
+
# Analyze text density to determine if advanced segmentation is needed
|
66 |
+
# This replaces document-specific logic with content-aware analysis
|
67 |
+
from utils.image_utils import estimate_text_density
|
68 |
+
text_density = estimate_text_density(img_np)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
|
70 |
+
# Use adaptive approach for documents with unusual text distribution
|
71 |
+
if text_density['pattern'] == 'varied' or text_density['uppercase_sections'] > 0:
|
72 |
+
logger.info(f"Using adaptive segmentation for document with varied text density pattern={text_density['pattern']}, uppercase_sections={text_density['uppercase_sections']}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
|
74 |
+
# Detect content regions based on text density
|
75 |
+
from utils.text_utils import detect_content_regions
|
76 |
+
regions = detect_content_regions(img_np)
|
|
|
|
|
|
|
77 |
|
78 |
+
# Create visualization with green borders around the text regions
|
79 |
+
vis_img = img_np.copy()
|
|
|
|
|
|
|
|
|
80 |
|
81 |
# Draw regions on visualization
|
82 |
for x, y, w, h in regions:
|
83 |
cv2.rectangle(vis_img, (x, y), (x+w, y+h), (0, 255, 0), 3)
|
84 |
|
85 |
+
# Add text to indicate we're using adaptive processing
|
86 |
font = cv2.FONT_HERSHEY_SIMPLEX
|
87 |
+
cv2.putText(vis_img, "Adaptive region processing", (30, 60), font, 1, (0, 255, 0), 2)
|
88 |
|
89 |
# Create visualization images
|
90 |
text_regions_vis = Image.fromarray(vis_img)
|
|
|
113 |
}
|
114 |
region_images.append(region_info)
|
115 |
|
116 |
+
# Return the adaptive segmentation results
|
117 |
return {
|
118 |
'text_regions': text_regions_vis,
|
119 |
'image_regions': image_regions_vis,
|
120 |
'text_mask_base64': f"data:image/png;base64,{text_mask_base64}",
|
121 |
'combined_result': pil_img,
|
122 |
'text_regions_coordinates': regions,
|
123 |
+
'region_images': region_images,
|
124 |
+
'segmentation_type': 'adaptive'
|
125 |
+
}
|
126 |
+
else:
|
127 |
+
# SIMPLIFIED APPROACH for most documents
|
128 |
+
# Let Mistral OCR handle the entire document understanding process
|
129 |
+
logger.info(f"Using standard approach for document with uniform text density")
|
130 |
+
|
131 |
+
# For visualization, mark the entire image as a text region
|
132 |
+
full_image_region = [(0, 0, img_width, img_height)]
|
133 |
+
|
134 |
+
# Create visualization with a simple border
|
135 |
+
vis_img = img_np.copy()
|
136 |
+
cv2.rectangle(vis_img, (5, 5), (img_width-5, img_height-5), (0, 255, 0), 5)
|
137 |
+
|
138 |
+
# Add text to indicate this is using Mistral's native processing
|
139 |
+
font = cv2.FONT_HERSHEY_SIMPLEX
|
140 |
+
cv2.putText(vis_img, "Processed by Mistral OCR", (30, 60), font, 1, (0, 255, 0), 2)
|
141 |
+
|
142 |
+
# Create visualizations and masks
|
143 |
+
text_regions_vis = Image.fromarray(vis_img)
|
144 |
+
image_regions_vis = text_regions_vis.copy()
|
145 |
+
|
146 |
+
# Create a mask of the entire image (just for visualization)
|
147 |
+
text_mask = np.ones((img_height, img_width), dtype=np.uint8) * 255
|
148 |
+
_, buffer = cv2.imencode('.png', text_mask)
|
149 |
+
text_mask_base64 = base64.b64encode(buffer).decode('utf-8')
|
150 |
+
|
151 |
+
# Return the original image as the combined result
|
152 |
+
return {
|
153 |
+
'text_regions': text_regions_vis,
|
154 |
+
'image_regions': image_regions_vis,
|
155 |
+
'text_mask_base64': f"data:image/png;base64,{text_mask_base64}",
|
156 |
+
'combined_result': pil_img,
|
157 |
+
'text_regions_coordinates': full_image_region,
|
158 |
+
'region_images': [{
|
159 |
+
'image': img_np,
|
160 |
+
'pil_image': pil_img,
|
161 |
+
'coordinates': (0, 0, img_width, img_height),
|
162 |
+
'padded_coordinates': (0, 0, img_width, img_height),
|
163 |
+
'order': 0
|
164 |
+
}],
|
165 |
+
'segmentation_type': 'simplified'
|
166 |
}
|
167 |
|
168 |
except Exception as e:
|
letterhead_handler.py
ADDED
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Specialized handler for letterhead and marginalia documents.
|
3 |
+
Enhances OCR quality by providing document-specific prompts for common layouts.
|
4 |
+
"""
|
5 |
+
|
6 |
+
import re
|
7 |
+
import logging
|
8 |
+
from pathlib import Path
|
9 |
+
from typing import Union, Dict, Any, Optional, List
|
10 |
+
|
11 |
+
# Configure logging
|
12 |
+
logging.basicConfig(level=logging.INFO,
|
13 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
14 |
+
logger = logging.getLogger(__name__)
|
15 |
+
|
16 |
+
def is_likely_letterhead(image_path: Union[str, Path], features: Optional[Dict[str, Any]] = None) -> bool:
|
17 |
+
"""
|
18 |
+
Detect if an image is likely a letterhead document with marginalia.
|
19 |
+
Uses path/filename patterns and optional image features (if provided).
|
20 |
+
|
21 |
+
Args:
|
22 |
+
image_path: Path to the image file
|
23 |
+
features: Optional dict of image features from preprocessing
|
24 |
+
|
25 |
+
Returns:
|
26 |
+
bool: True if likely a letterhead document
|
27 |
+
"""
|
28 |
+
# Convert to string path for pattern matching
|
29 |
+
path_str = str(image_path).lower()
|
30 |
+
|
31 |
+
# Check for common letterhead filename patterns
|
32 |
+
letterhead_patterns = [
|
33 |
+
r'letter(head)?[^/]*\.jpg',
|
34 |
+
r'hotel[^/]*\.jpg',
|
35 |
+
r'baldwin.*\.jpg',
|
36 |
+
r'business.*letter.*\.jpg',
|
37 |
+
r'correspondence.*\.jpg'
|
38 |
+
]
|
39 |
+
|
40 |
+
for pattern in letterhead_patterns:
|
41 |
+
if re.search(pattern, path_str):
|
42 |
+
logger.info(f"Detected likely letterhead document: {Path(image_path).name}")
|
43 |
+
return True
|
44 |
+
|
45 |
+
# If features are provided, use them for additional detection
|
46 |
+
if features:
|
47 |
+
# Check for ALL CAPS sections that might be marginalia
|
48 |
+
if features.get('uppercase_sections', 0) > 1:
|
49 |
+
logger.info(f"Detected likely letterhead document with marginalia by features: {Path(image_path).name}")
|
50 |
+
return True
|
51 |
+
|
52 |
+
return False
|
53 |
+
|
54 |
+
def get_letterhead_prompt(image_path: Union[str, Path], features: Optional[Dict[str, Any]] = None) -> Optional[str]:
|
55 |
+
"""
|
56 |
+
Generate a specialized prompt for letterhead documents to improve OCR quality.
|
57 |
+
|
58 |
+
Args:
|
59 |
+
image_path: Path to the image file
|
60 |
+
features: Optional dict of image features from preprocessing
|
61 |
+
|
62 |
+
Returns:
|
63 |
+
str: Custom prompt for letterhead OCR or None if not applicable
|
64 |
+
"""
|
65 |
+
if not is_likely_letterhead(image_path, features):
|
66 |
+
return None
|
67 |
+
|
68 |
+
# Path-specific customizations for known problematic documents
|
69 |
+
path_str = str(image_path).lower()
|
70 |
+
|
71 |
+
# Most specialized prompt for baldwin documents
|
72 |
+
if "baldwin" in path_str:
|
73 |
+
return """
|
74 |
+
This image shows a hotel letterhead with a handwritten letter. Please extract the text with the following guidelines:
|
75 |
+
|
76 |
+
1. Identify and separate the letterhead elements:
|
77 |
+
- Header: The hotel name, address, and contact information at the top
|
78 |
+
- Marginalia: The amenities description in ALL CAPS along the margins
|
79 |
+
|
80 |
+
2. Extract the main handwritten letter content separately
|
81 |
+
|
82 |
+
3. Note any image captions separately
|
83 |
+
|
84 |
+
4. Format the output as follows:
|
85 |
+
- HEADER: [header text]
|
86 |
+
- MARGINS: [marginalia text]
|
87 |
+
- LETTER: [handwritten letter text]
|
88 |
+
- CAPTIONS: [any image captions]
|
89 |
+
|
90 |
+
Be careful not to duplicate content between sections, especially with margin text.
|
91 |
+
"""
|
92 |
+
|
93 |
+
# General letterhead prompt
|
94 |
+
return """
|
95 |
+
This appears to be a letterhead document. Please extract the text with the following guidelines:
|
96 |
+
|
97 |
+
1. Identify the header/letterhead section with company name, logo, address, etc.
|
98 |
+
2. Identify any margin text or notes that appear separate from the main content
|
99 |
+
3. Extract the main letter/document body separately
|
100 |
+
4. Format the output as follows:
|
101 |
+
- LETTERHEAD: [letterhead text]
|
102 |
+
- MARGIN_NOTES: [any text in margins]
|
103 |
+
- BODY: [main document body]
|
104 |
+
|
105 |
+
Be careful not to duplicate content between sections.
|
106 |
+
"""
|
107 |
+
|
108 |
+
def clean_letterhead_ocr_output(text: str) -> str:
|
109 |
+
"""
|
110 |
+
Clean OCR output from letterhead documents by handling section markers
|
111 |
+
and reducing duplication.
|
112 |
+
|
113 |
+
Args:
|
114 |
+
text: OCR text from letterhead document
|
115 |
+
|
116 |
+
Returns:
|
117 |
+
str: Cleaned text with proper section formatting
|
118 |
+
"""
|
119 |
+
if not text:
|
120 |
+
return ""
|
121 |
+
|
122 |
+
# Find any section markers added by the specialized prompt
|
123 |
+
section_markers = [
|
124 |
+
"HEADER:", "LETTERHEAD:", "MARGINS:", "MARGIN_NOTES:",
|
125 |
+
"LETTER:", "BODY:", "CAPTIONS:"
|
126 |
+
]
|
127 |
+
|
128 |
+
# Check if the text has any section markers
|
129 |
+
has_sections = any(marker in text for marker in section_markers)
|
130 |
+
|
131 |
+
if has_sections:
|
132 |
+
# Split text into sections while preserving section headers
|
133 |
+
sections = {}
|
134 |
+
current_section = "UNKNOWN"
|
135 |
+
current_text = []
|
136 |
+
|
137 |
+
for line in text.split('\n'):
|
138 |
+
# Check if this line is a section marker
|
139 |
+
is_marker = False
|
140 |
+
for marker in section_markers:
|
141 |
+
if marker in line:
|
142 |
+
# Save previous section
|
143 |
+
if current_text:
|
144 |
+
sections[current_section] = '\n'.join(current_text).strip()
|
145 |
+
current_text = []
|
146 |
+
|
147 |
+
# Start new section
|
148 |
+
current_section = marker.replace(':', '')
|
149 |
+
# Keep any text after the marker on this line
|
150 |
+
remainder = line.split(marker, 1)[1].strip()
|
151 |
+
if remainder:
|
152 |
+
current_text.append(remainder)
|
153 |
+
is_marker = True
|
154 |
+
break
|
155 |
+
|
156 |
+
# If not a marker, add to current section
|
157 |
+
if not is_marker:
|
158 |
+
current_text.append(line)
|
159 |
+
|
160 |
+
# Save the last section
|
161 |
+
if current_text:
|
162 |
+
sections[current_section] = '\n'.join(current_text).strip()
|
163 |
+
|
164 |
+
# Format with standard order and clear section headers
|
165 |
+
formatted_sections = []
|
166 |
+
|
167 |
+
# First add letterhead/header info
|
168 |
+
if "LETTERHEAD" in sections:
|
169 |
+
formatted_sections.append(f"--- LETTERHEAD ---\n{sections['LETTERHEAD']}")
|
170 |
+
elif "HEADER" in sections:
|
171 |
+
formatted_sections.append(f"--- LETTERHEAD ---\n{sections['HEADER']}")
|
172 |
+
|
173 |
+
# Add margins/notes
|
174 |
+
if "MARGIN_NOTES" in sections:
|
175 |
+
formatted_sections.append(f"--- MARGIN NOTES ---\n{sections['MARGIN_NOTES']}")
|
176 |
+
elif "MARGINS" in sections:
|
177 |
+
formatted_sections.append(f"--- MARGIN NOTES ---\n{sections['MARGINS']}")
|
178 |
+
|
179 |
+
# Add main content
|
180 |
+
if "BODY" in sections:
|
181 |
+
formatted_sections.append(f"--- DOCUMENT BODY ---\n{sections['BODY']}")
|
182 |
+
elif "LETTER" in sections:
|
183 |
+
formatted_sections.append(f"--- DOCUMENT BODY ---\n{sections['LETTER']}")
|
184 |
+
|
185 |
+
# Add captions if present
|
186 |
+
if "CAPTIONS" in sections:
|
187 |
+
formatted_sections.append(f"--- IMAGE CAPTIONS ---\n{sections['CAPTIONS']}")
|
188 |
+
|
189 |
+
# Add unknown sections
|
190 |
+
if "UNKNOWN" in sections and sections["UNKNOWN"]:
|
191 |
+
formatted_sections.append(f"--- ADDITIONAL CONTENT ---\n{sections['UNKNOWN']}")
|
192 |
+
|
193 |
+
# Join everything with clear separation
|
194 |
+
return "\n\n".join(formatted_sections)
|
195 |
+
else:
|
196 |
+
# If no section markers were found, return the original text
|
197 |
+
return text
|
ocr_processing.py
CHANGED
@@ -21,7 +21,7 @@ from structured_ocr import StructuredOCR
|
|
21 |
from utils.image_utils import clean_ocr_result
|
22 |
# Temporarily retain old utils imports until they are fully migrated
|
23 |
from utils import generate_cache_key, timing, format_timestamp, create_descriptive_filename, extract_subject_tags
|
24 |
-
|
25 |
from error_handler import handle_ocr_error, check_file_size
|
26 |
from image_segmentation import segment_image_for_ocr, process_segmented_image
|
27 |
|
@@ -182,6 +182,27 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
|
|
182 |
doc_type = preprocessing_options.get("document_type", "standard")
|
183 |
modified_custom_prompt = custom_prompt
|
184 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
185 |
# Add document-type specific instructions based on preprocessing options
|
186 |
if doc_type == "handwritten" and not modified_custom_prompt:
|
187 |
modified_custom_prompt = "This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
|
@@ -214,7 +235,7 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
|
|
214 |
progress_reporter.update(20, "Preparing image for processing...")
|
215 |
|
216 |
# Apply preprocessing if needed
|
217 |
-
temp_path, preprocessing_applied = apply_preprocessing_to_file(
|
218 |
file_bytes,
|
219 |
file_ext,
|
220 |
preprocessing_options,
|
@@ -367,6 +388,27 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
|
|
367 |
doc_type = preprocessing_options.get("document_type", "standard")
|
368 |
modified_custom_prompt = custom_prompt
|
369 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
370 |
# Add document-type specific instructions based on preprocessing options
|
371 |
if doc_type == "handwritten" and not modified_custom_prompt:
|
372 |
modified_custom_prompt = "This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
|
@@ -409,6 +451,27 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
|
|
409 |
doc_type = preprocessing_options.get("document_type", "standard")
|
410 |
modified_custom_prompt = custom_prompt
|
411 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
412 |
# Add document-type specific instructions based on preprocessing options
|
413 |
if doc_type == "handwritten" and not modified_custom_prompt:
|
414 |
modified_custom_prompt = "This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
|
@@ -437,6 +500,85 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
|
|
437 |
# Make sure file_type is explicitly set for PDFs
|
438 |
if file_type == "pdf":
|
439 |
result['file_type'] = "pdf"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
440 |
|
441 |
# 🔧 ALWAYS normalize result before returning
|
442 |
result = clean_ocr_result(
|
|
|
21 |
from utils.image_utils import clean_ocr_result
|
22 |
# Temporarily retain old utils imports until they are fully migrated
|
23 |
from utils import generate_cache_key, timing, format_timestamp, create_descriptive_filename, extract_subject_tags
|
24 |
+
import preprocessing
|
25 |
from error_handler import handle_ocr_error, check_file_size
|
26 |
from image_segmentation import segment_image_for_ocr, process_segmented_image
|
27 |
|
|
|
182 |
doc_type = preprocessing_options.get("document_type", "standard")
|
183 |
modified_custom_prompt = custom_prompt
|
184 |
|
185 |
+
# Check for letterhead/marginalia document types with specialized handling
|
186 |
+
try:
|
187 |
+
from letterhead_handler import get_letterhead_prompt, is_likely_letterhead
|
188 |
+
# Extract text density features if available
|
189 |
+
features = None
|
190 |
+
if 'text_density' in preprocessing_options:
|
191 |
+
features = preprocessing_options['text_density']
|
192 |
+
|
193 |
+
# Check if this looks like a letterhead document
|
194 |
+
if is_likely_letterhead(temp_path, features):
|
195 |
+
# Get specialized letterhead prompt
|
196 |
+
letterhead_prompt = get_letterhead_prompt(temp_path, features)
|
197 |
+
if letterhead_prompt:
|
198 |
+
logger.info(f"Using specialized letterhead prompt for document")
|
199 |
+
modified_custom_prompt = letterhead_prompt
|
200 |
+
# Set document type for tracking
|
201 |
+
preprocessing_options["document_type"] = "letterhead"
|
202 |
+
doc_type = "letterhead"
|
203 |
+
except ImportError:
|
204 |
+
logger.debug("Letterhead handler not available")
|
205 |
+
|
206 |
# Add document-type specific instructions based on preprocessing options
|
207 |
if doc_type == "handwritten" and not modified_custom_prompt:
|
208 |
modified_custom_prompt = "This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
|
|
|
235 |
progress_reporter.update(20, "Preparing image for processing...")
|
236 |
|
237 |
# Apply preprocessing if needed
|
238 |
+
temp_path, preprocessing_applied = preprocessing.apply_preprocessing_to_file(
|
239 |
file_bytes,
|
240 |
file_ext,
|
241 |
preprocessing_options,
|
|
|
388 |
doc_type = preprocessing_options.get("document_type", "standard")
|
389 |
modified_custom_prompt = custom_prompt
|
390 |
|
391 |
+
# Check for letterhead/marginalia document types with specialized handling
|
392 |
+
try:
|
393 |
+
from letterhead_handler import get_letterhead_prompt, is_likely_letterhead
|
394 |
+
# Extract text density features if available
|
395 |
+
features = None
|
396 |
+
if 'text_density' in preprocessing_options:
|
397 |
+
features = preprocessing_options['text_density']
|
398 |
+
|
399 |
+
# Check if this looks like a letterhead document
|
400 |
+
if is_likely_letterhead(temp_path, features):
|
401 |
+
# Get specialized letterhead prompt
|
402 |
+
letterhead_prompt = get_letterhead_prompt(temp_path, features)
|
403 |
+
if letterhead_prompt:
|
404 |
+
logger.info(f"Using specialized letterhead prompt for document")
|
405 |
+
modified_custom_prompt = letterhead_prompt
|
406 |
+
# Set document type for tracking
|
407 |
+
preprocessing_options["document_type"] = "letterhead"
|
408 |
+
doc_type = "letterhead"
|
409 |
+
except ImportError:
|
410 |
+
logger.debug("Letterhead handler not available")
|
411 |
+
|
412 |
# Add document-type specific instructions based on preprocessing options
|
413 |
if doc_type == "handwritten" and not modified_custom_prompt:
|
414 |
modified_custom_prompt = "This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
|
|
|
451 |
doc_type = preprocessing_options.get("document_type", "standard")
|
452 |
modified_custom_prompt = custom_prompt
|
453 |
|
454 |
+
# Check for letterhead/marginalia document types with specialized handling
|
455 |
+
try:
|
456 |
+
from letterhead_handler import get_letterhead_prompt, is_likely_letterhead
|
457 |
+
# Extract text density features if available
|
458 |
+
features = None
|
459 |
+
if 'text_density' in preprocessing_options:
|
460 |
+
features = preprocessing_options['text_density']
|
461 |
+
|
462 |
+
# Check if this looks like a letterhead document
|
463 |
+
if is_likely_letterhead(temp_path, features):
|
464 |
+
# Get specialized letterhead prompt
|
465 |
+
letterhead_prompt = get_letterhead_prompt(temp_path, features)
|
466 |
+
if letterhead_prompt:
|
467 |
+
logger.info(f"Using specialized letterhead prompt for document")
|
468 |
+
modified_custom_prompt = letterhead_prompt
|
469 |
+
# Set document type for tracking
|
470 |
+
preprocessing_options["document_type"] = "letterhead"
|
471 |
+
doc_type = "letterhead"
|
472 |
+
except ImportError:
|
473 |
+
logger.debug("Letterhead handler not available")
|
474 |
+
|
475 |
# Add document-type specific instructions based on preprocessing options
|
476 |
if doc_type == "handwritten" and not modified_custom_prompt:
|
477 |
modified_custom_prompt = "This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
|
|
|
500 |
# Make sure file_type is explicitly set for PDFs
|
501 |
if file_type == "pdf":
|
502 |
result['file_type'] = "pdf"
|
503 |
+
|
504 |
+
# Check for duplicated text patterns that indicate handwritten text issues
|
505 |
+
try:
|
506 |
+
from ocr_text_repair import detect_duplicate_text_issues, get_enhanced_preprocessing_options, get_handwritten_specific_prompt, clean_duplicated_text
|
507 |
+
|
508 |
+
# Check OCR output for duplication issues
|
509 |
+
if result and 'ocr_contents' in result and 'raw_text' in result['ocr_contents']:
|
510 |
+
ocr_text = result['ocr_contents']['raw_text']
|
511 |
+
has_duplication, duplication_details = detect_duplicate_text_issues(ocr_text)
|
512 |
+
|
513 |
+
# If we detect significant duplication in the output
|
514 |
+
if has_duplication and duplication_details.get('duplication_rate', 0) > 0.1:
|
515 |
+
logger.info(f"Detected text duplication issues. Reprocessing as handwritten document with enhanced settings...")
|
516 |
+
progress_reporter.update(75, "Detected duplication issues. Reprocessing with enhanced settings...")
|
517 |
+
|
518 |
+
# Save original result before reprocessing
|
519 |
+
original_result = result
|
520 |
+
|
521 |
+
# Get enhanced preprocessing options for handwritten text
|
522 |
+
enhanced_options = get_enhanced_preprocessing_options(preprocessing_options)
|
523 |
+
|
524 |
+
# Reprocess with enhanced settings and specialized prompt
|
525 |
+
handwritten_prompt = get_handwritten_specific_prompt(custom_prompt)
|
526 |
+
|
527 |
+
# Process the image with the enhanced settings
|
528 |
+
try:
|
529 |
+
# Apply enhanced preprocessing to the original image
|
530 |
+
enhanced_temp_path, _ = preprocessing.apply_preprocessing_to_file(
|
531 |
+
open(temp_path, 'rb').read(),
|
532 |
+
Path(temp_path).suffix.lower(),
|
533 |
+
enhanced_options,
|
534 |
+
temp_file_paths
|
535 |
+
)
|
536 |
+
|
537 |
+
# Process with enhanced settings
|
538 |
+
processor = StructuredOCR()
|
539 |
+
enhanced_result = processor.process_file(
|
540 |
+
file_path=enhanced_temp_path,
|
541 |
+
file_type="image",
|
542 |
+
use_vision=use_vision,
|
543 |
+
custom_prompt=handwritten_prompt,
|
544 |
+
file_size_mb=file_size_mb
|
545 |
+
)
|
546 |
+
|
547 |
+
# Check if the enhanced result is better (less duplication)
|
548 |
+
if 'ocr_contents' in enhanced_result and 'raw_text' in enhanced_result['ocr_contents']:
|
549 |
+
enhanced_text = enhanced_result['ocr_contents']['raw_text']
|
550 |
+
_, enhanced_issues = detect_duplicate_text_issues(enhanced_text)
|
551 |
+
|
552 |
+
# Use the enhanced result if it's better
|
553 |
+
if enhanced_issues.get('duplication_rate', 1.0) < duplication_details.get('duplication_rate', 1.0):
|
554 |
+
logger.info("Enhanced processing improved OCR quality. Using enhanced result.")
|
555 |
+
result = enhanced_result
|
556 |
+
# Preserve document type and preprocessing info
|
557 |
+
result['document_type'] = 'handwritten'
|
558 |
+
result['preprocessing'] = enhanced_options
|
559 |
+
else:
|
560 |
+
# If enhancement didn't help, clean up the original result
|
561 |
+
logger.info("Enhanced processing did not improve OCR quality. Cleaning original result.")
|
562 |
+
result = original_result
|
563 |
+
# Clean up duplication in the text
|
564 |
+
if 'ocr_contents' in result and 'raw_text' in result['ocr_contents']:
|
565 |
+
result['ocr_contents']['raw_text'] = clean_duplicated_text(result['ocr_contents']['raw_text'])
|
566 |
+
else:
|
567 |
+
# Fallback to original with cleaning
|
568 |
+
logger.info("Enhanced processing failed. Cleaning original result.")
|
569 |
+
result = original_result
|
570 |
+
# Clean up duplication in the text
|
571 |
+
if 'ocr_contents' in result and 'raw_text' in result['ocr_contents']:
|
572 |
+
result['ocr_contents']['raw_text'] = clean_duplicated_text(result['ocr_contents']['raw_text'])
|
573 |
+
except Exception as enh_error:
|
574 |
+
logger.warning(f"Enhanced processing failed: {str(enh_error)}. Using cleaned original.")
|
575 |
+
# Fallback to original with cleaning
|
576 |
+
result = original_result
|
577 |
+
# Clean up duplication in the text
|
578 |
+
if 'ocr_contents' in result and 'raw_text' in result['ocr_contents']:
|
579 |
+
result['ocr_contents']['raw_text'] = clean_duplicated_text(result['ocr_contents']['raw_text'])
|
580 |
+
except ImportError:
|
581 |
+
logger.debug("OCR text repair module not available")
|
582 |
|
583 |
# 🔧 ALWAYS normalize result before returning
|
584 |
result = clean_ocr_result(
|
test_adaptive_segmentation.py
ADDED
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Test script for adaptive content-aware segmentation.
|
4 |
+
Processes sample documents to validate the improved segmentation approach.
|
5 |
+
"""
|
6 |
+
|
7 |
+
import os
|
8 |
+
import sys
|
9 |
+
import logging
|
10 |
+
from pathlib import Path
|
11 |
+
import cv2
|
12 |
+
import numpy as np
|
13 |
+
from PIL import Image
|
14 |
+
import json
|
15 |
+
|
16 |
+
# Configure logging
|
17 |
+
logging.basicConfig(level=logging.INFO,
|
18 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
19 |
+
logger = logging.getLogger(__name__)
|
20 |
+
|
21 |
+
# Import segmentation module
|
22 |
+
from image_segmentation import segment_image_for_ocr, process_segmented_image
|
23 |
+
|
24 |
+
# Test documents
|
25 |
+
TEST_DOCUMENTS = [
|
26 |
+
"input/baldwin-15th-north.jpg", # Document with varied text density and uppercase sections
|
27 |
+
"input/americae-retectio.jpg", # Historical document
|
28 |
+
"input/handwritten-letter.jpg", # Handwritten document
|
29 |
+
]
|
30 |
+
|
31 |
+
def test_adaptive_segmentation():
|
32 |
+
"""
|
33 |
+
Run the adaptive segmentation on test documents and visualize the results.
|
34 |
+
"""
|
35 |
+
# Create output directory
|
36 |
+
output_dir = Path("output") / "adaptive_test"
|
37 |
+
output_dir.mkdir(parents=True, exist_ok=True)
|
38 |
+
|
39 |
+
results = {}
|
40 |
+
|
41 |
+
# Process each test document
|
42 |
+
for document_path in TEST_DOCUMENTS:
|
43 |
+
document_file = Path(document_path)
|
44 |
+
if not document_file.exists():
|
45 |
+
logger.warning(f"Test document not found: {document_path}")
|
46 |
+
continue
|
47 |
+
|
48 |
+
logger.info(f"Processing test document: {document_file.name}")
|
49 |
+
|
50 |
+
# Process the document
|
51 |
+
segmentation_results = process_segmented_image(document_file, output_dir)
|
52 |
+
|
53 |
+
# Create a combined visualization
|
54 |
+
if segmentation_results.get('text_regions_coordinates'):
|
55 |
+
# Print analysis
|
56 |
+
logger.info(f"Document: {document_file.name}")
|
57 |
+
logger.info(f"Found {len(segmentation_results['text_regions_coordinates'])} text regions")
|
58 |
+
logger.info(f"Output files: {segmentation_results.get('output_files', {})}")
|
59 |
+
|
60 |
+
# Store results
|
61 |
+
results[document_file.name] = {
|
62 |
+
"regions_count": len(segmentation_results['text_regions_coordinates']),
|
63 |
+
"output_files": segmentation_results.get('output_files', {}),
|
64 |
+
"regions": segmentation_results.get('text_regions_coordinates', [])
|
65 |
+
}
|
66 |
+
|
67 |
+
# Save summary report
|
68 |
+
with open(output_dir / "adaptive_segmentation_results.json", "w") as f:
|
69 |
+
json.dump(results, f, indent=2)
|
70 |
+
|
71 |
+
# Create a summary report
|
72 |
+
with open(output_dir / "adaptive_segmentation_report.md", "w") as f:
|
73 |
+
f.write("# Adaptive Segmentation Test Results\n\n")
|
74 |
+
f.write("This report summarizes the results of testing the adaptive content-aware segmentation approach.\n\n")
|
75 |
+
|
76 |
+
for document_name, result in results.items():
|
77 |
+
f.write(f"## {document_name}\n\n")
|
78 |
+
f.write(f"- Regions detected: {result['regions_count']}\n")
|
79 |
+
f.write(f"- Output files:\n")
|
80 |
+
for file_type, file_path in result.get('output_files', {}).items():
|
81 |
+
f.write(f" - {file_type}: {file_path}\n")
|
82 |
+
f.write("\n")
|
83 |
+
|
84 |
+
# Add region analysis
|
85 |
+
if result.get('regions'):
|
86 |
+
f.write("### Region Analysis\n\n")
|
87 |
+
f.write("| Region | X | Y | Width | Height |\n")
|
88 |
+
f.write("|--------|---|---|-------|--------|\n")
|
89 |
+
for i, region in enumerate(result['regions']):
|
90 |
+
x, y, w, h = region
|
91 |
+
f.write(f"| {i+1} | {x} | {y} | {w} | {h} |\n")
|
92 |
+
f.write("\n")
|
93 |
+
|
94 |
+
logger.info(f"Test completed. Results saved to {output_dir}")
|
95 |
+
logger.info(f"Summary report: {output_dir / 'adaptive_segmentation_report.md'}")
|
96 |
+
|
97 |
+
if __name__ == "__main__":
|
98 |
+
test_adaptive_segmentation()
|
utils/image_utils.py
CHANGED
@@ -327,6 +327,65 @@ def calculate_image_entropy(pil_img: Image.Image) -> float:
|
|
327 |
entropy = -np.sum(hist * np.log2(hist))
|
328 |
return float(entropy)
|
329 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
330 |
def serialize_ocr_object(obj):
|
331 |
"""
|
332 |
Serialize OCR response objects to JSON serializable format.
|
|
|
327 |
entropy = -np.sum(hist * np.log2(hist))
|
328 |
return float(entropy)
|
329 |
|
330 |
+
def estimate_text_density(image_np):
|
331 |
+
"""
|
332 |
+
Estimate text density patterns in an image.
|
333 |
+
Returns metrics on text distribution and special cases.
|
334 |
+
|
335 |
+
Args:
|
336 |
+
image_np: Numpy array of the image
|
337 |
+
|
338 |
+
Returns:
|
339 |
+
dict: Text density metrics
|
340 |
+
"""
|
341 |
+
# Convert to grayscale
|
342 |
+
if len(image_np.shape) > 2 and image_np.shape[2] == 3:
|
343 |
+
gray = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)
|
344 |
+
else:
|
345 |
+
gray = image_np
|
346 |
+
|
347 |
+
# Binarize image
|
348 |
+
_, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
|
349 |
+
|
350 |
+
# Analyze vertical text density profile (important for headers/footers)
|
351 |
+
height, width = gray.shape
|
352 |
+
vertical_profile = np.sum(binary, axis=1) / width
|
353 |
+
|
354 |
+
# Analyze horizontal text density profile
|
355 |
+
horizontal_profile = np.sum(binary, axis=0) / height
|
356 |
+
|
357 |
+
# Calculate statistics
|
358 |
+
v_mean = np.mean(vertical_profile)
|
359 |
+
v_std = np.std(vertical_profile)
|
360 |
+
v_max = np.max(vertical_profile)
|
361 |
+
|
362 |
+
# Detect uppercase text regions (common in headers of Baldwin document)
|
363 |
+
# Uppercase text tends to have more consistent height and uniform vertical density
|
364 |
+
section_height = height // 10 # Divide into 10 vertical sections
|
365 |
+
uppercase_sections = 0
|
366 |
+
|
367 |
+
for i in range(0, height, section_height):
|
368 |
+
section = binary[i:min(i+section_height, height), :]
|
369 |
+
section_profile = np.sum(section, axis=1) / width
|
370 |
+
|
371 |
+
# Uppercase characteristics: high density with low variation
|
372 |
+
if np.mean(section_profile) > v_mean * 1.5 and np.std(section_profile) < v_std * 0.7:
|
373 |
+
uppercase_sections += 1
|
374 |
+
|
375 |
+
# Determine overall pattern
|
376 |
+
if v_std / v_mean > 0.8:
|
377 |
+
pattern = 'varied' # High variance indicates sections with different text densities
|
378 |
+
else:
|
379 |
+
pattern = 'uniform' # Low variance indicates uniform text distribution
|
380 |
+
|
381 |
+
return {
|
382 |
+
'mean_density': float(v_mean),
|
383 |
+
'density_variation': float(v_std),
|
384 |
+
'pattern': pattern,
|
385 |
+
'uppercase_sections': uppercase_sections,
|
386 |
+
'max_density': float(v_max)
|
387 |
+
}
|
388 |
+
|
389 |
def serialize_ocr_object(obj):
|
390 |
"""
|
391 |
Serialize OCR response objects to JSON serializable format.
|
utils/text_utils.py
CHANGED
@@ -119,6 +119,76 @@ def clean_raw_text(text: str) -> str:
|
|
119 |
|
120 |
return text.strip()
|
121 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
122 |
def merge_region_texts(regions: List[Dict[str, Any]], min_similarity_threshold: float = 0.7) -> str:
|
123 |
"""
|
124 |
Intelligently merge text from multiple document regions, handling overlapping content.
|
|
|
119 |
|
120 |
return text.strip()
|
121 |
|
122 |
+
def detect_content_regions(image_np):
|
123 |
+
"""
|
124 |
+
Detect content regions based on text density analysis.
|
125 |
+
Returns regions with adaptive overlapping.
|
126 |
+
|
127 |
+
Args:
|
128 |
+
image_np: Numpy array image
|
129 |
+
|
130 |
+
Returns:
|
131 |
+
list: List of region tuples (x, y, width, height)
|
132 |
+
"""
|
133 |
+
# Import necessary modules
|
134 |
+
import numpy as np
|
135 |
+
import cv2
|
136 |
+
|
137 |
+
# Convert to grayscale for text detection
|
138 |
+
if len(image_np.shape) > 2 and image_np.shape[2] == 3:
|
139 |
+
gray = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)
|
140 |
+
else:
|
141 |
+
gray = image_np
|
142 |
+
|
143 |
+
# Create text density profile
|
144 |
+
# Sum pixel values horizontally to get vertical text density
|
145 |
+
v_profile = np.sum(255 - gray, axis=1)
|
146 |
+
|
147 |
+
# Normalize the profile
|
148 |
+
v_profile = v_profile / np.max(v_profile) if np.max(v_profile) > 0 else v_profile
|
149 |
+
|
150 |
+
# Find significant density changes
|
151 |
+
changes = []
|
152 |
+
threshold = 0.2
|
153 |
+
for i in range(1, len(v_profile)):
|
154 |
+
if abs(v_profile[i] - v_profile[i-1]) > threshold:
|
155 |
+
changes.append(i)
|
156 |
+
|
157 |
+
# Create adaptive regions based on density changes
|
158 |
+
img_height, img_width = gray.shape
|
159 |
+
|
160 |
+
# Default to at least 3 regions with overlap
|
161 |
+
if len(changes) < 2:
|
162 |
+
# If no significant changes, use default division with overlapping regions
|
163 |
+
header_height = int(img_height * 0.3)
|
164 |
+
middle_start = int(img_height * 0.2)
|
165 |
+
middle_height = int(img_height * 0.4)
|
166 |
+
body_start = int(img_height * 0.5)
|
167 |
+
body_height = img_height - body_start
|
168 |
+
else:
|
169 |
+
# Use detected density changes for more precise regions
|
170 |
+
changes = sorted(changes)
|
171 |
+
header_height = changes[0] + int(img_height * 0.05) # Add overlap
|
172 |
+
middle_start = max(0, changes[0] - int(img_height * 0.05))
|
173 |
+
|
174 |
+
if len(changes) > 1:
|
175 |
+
middle_height = (changes[1] - middle_start) + int(img_height * 0.05)
|
176 |
+
body_start = max(0, changes[1] - int(img_height * 0.05))
|
177 |
+
else:
|
178 |
+
middle_height = int(img_height * 0.4)
|
179 |
+
body_start = int(img_height * 0.5)
|
180 |
+
|
181 |
+
body_height = img_height - body_start
|
182 |
+
|
183 |
+
# Define regions with adaptive overlap
|
184 |
+
regions = [
|
185 |
+
(0, 0, img_width, header_height), # Header region
|
186 |
+
(0, middle_start, img_width, middle_height), # Middle region with overlap
|
187 |
+
(0, body_start, img_width, body_height) # Body region with overlap
|
188 |
+
]
|
189 |
+
|
190 |
+
return regions
|
191 |
+
|
192 |
def merge_region_texts(regions: List[Dict[str, Any]], min_similarity_threshold: float = 0.7) -> str:
|
193 |
"""
|
194 |
Intelligently merge text from multiple document regions, handling overlapping content.
|