Spaces:
Running
Running
Enhance handwritten document processing for improved OCR accuracy
Browse files
- config.py +12 -1
- ocr_utils.py +239 -50
config.py
CHANGED
@@ -45,7 +45,18 @@ IMAGE_PREPROCESSING = {
|
|
45 |
"denoise": os.environ.get("DENOISE", "True").lower() in ("true", "1", "yes"),
|
46 |
"max_size_mb": float(os.environ.get("MAX_IMAGE_SIZE_MB", "12.0")), # Increased size limit for better quality
|
47 |
"target_dpi": int(os.environ.get("TARGET_DPI", "300")), # Target DPI for scaling
|
48 |
-
"compression_quality": int(os.environ.get("COMPRESSION_QUALITY", "95")) # Higher quality for better OCR results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
}
|
50 |
|
51 |
# OCR settings optimized for single-page performance
|
|
|
45 |
"denoise": os.environ.get("DENOISE", "True").lower() in ("true", "1", "yes"),
|
46 |
"max_size_mb": float(os.environ.get("MAX_IMAGE_SIZE_MB", "12.0")), # Increased size limit for better quality
|
47 |
"target_dpi": int(os.environ.get("TARGET_DPI", "300")), # Target DPI for scaling
|
48 |
+
"compression_quality": int(os.environ.get("COMPRESSION_QUALITY", "95")), # Higher quality for better OCR results
|
49 |
+
# Enhanced settings for handwritten documents
|
50 |
+
"handwritten": {
|
51 |
+
"contrast": float(os.environ.get("HANDWRITTEN_CONTRAST", "1.2")), # Lower contrast for handwritten text
|
52 |
+
"block_size": int(os.environ.get("HANDWRITTEN_BLOCK_SIZE", "21")), # Larger block size for adaptive thresholding
|
53 |
+
"constant": int(os.environ.get("HANDWRITTEN_CONSTANT", "5")), # Lower constant for adaptive thresholding
|
54 |
+
"use_dilation": os.environ.get("HANDWRITTEN_DILATION", "True").lower() in ("true", "1", "yes"), # Connect broken strokes
|
55 |
+
"clahe_limit": float(os.environ.get("HANDWRITTEN_CLAHE_LIMIT", "2.0")), # CLAHE limit for local contrast
|
56 |
+
"bilateral_d": int(os.environ.get("HANDWRITTEN_BILATERAL_D", "5")), # Bilateral filter window size
|
57 |
+
"bilateral_sigma1": int(os.environ.get("HANDWRITTEN_BILATERAL_SIGMA1", "25")), # Color sigma
|
58 |
+
"bilateral_sigma2": int(os.environ.get("HANDWRITTEN_BILATERAL_SIGMA2", "45")) # Space sigma
|
59 |
+
}
|
60 |
}
|
61 |
|
62 |
# OCR settings optimized for single-page performance
|
ocr_utils.py
CHANGED
@@ -565,6 +565,26 @@ def preprocess_image_for_ocr(image_path: Union[str, Path]) -> Tuple[Image.Image,
|
|
565 |
logger.debug(f"Document type detection for {image_file.name}: " +
|
566 |
f"{'newspaper' if is_newspaper else 'document' if is_document else 'photo'}")
|
567 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
568 |
# Special processing for very large images (newspapers and large documents)
|
569 |
if is_newspaper:
|
570 |
# For newspaper format, we need more specialized processing
|
@@ -601,6 +621,34 @@ def preprocess_image_for_ocr(image_path: Union[str, Path]) -> Tuple[Image.Image,
|
|
601 |
# Also enhance saturation to make colored text more visible
|
602 |
enhancer_sat = ImageEnhance.Color(processed_img)
|
603 |
processed_img = enhancer_sat.enhance(1.2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
604 |
|
605 |
# Standard processing for other large images
|
606 |
elif file_size_mb > IMAGE_PREPROCESSING["max_size_mb"] or max(width, height) > 3000:
|
@@ -778,7 +826,7 @@ def _detect_document_type_impl(img_hash=None) -> bool:
|
|
778 |
# Documents typically have high variance (text on background)
|
779 |
# Handwritten documents may have less contrast than printed text
|
780 |
std_dev = np.std(img_np)
|
781 |
-
if std_dev >
|
782 |
return True
|
783 |
|
784 |
# 2. Quick check using downsampled image for edges
|
@@ -789,38 +837,63 @@ def _detect_document_type_impl(img_hash=None) -> bool:
|
|
789 |
else:
|
790 |
small_img = img_np
|
791 |
|
792 |
-
#
|
793 |
-
#
|
794 |
-
|
|
|
|
|
|
|
|
|
795 |
edge_ratio = np.count_nonzero(edges) / edges.size
|
796 |
|
797 |
-
#
|
798 |
-
|
799 |
-
|
800 |
-
|
801 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
802 |
|
803 |
dark_ratio = np.count_nonzero(dark_mask) / img_np.size
|
|
|
804 |
light_ratio = np.count_nonzero(light_mask) / img_np.size
|
805 |
|
|
|
|
|
|
|
|
|
|
|
806 |
# Special analysis for handwritten documents
|
807 |
-
#
|
808 |
-
if
|
809 |
-
|
810 |
-
lines = cv2.HoughLinesP(edges, 1, np.pi/180,
|
811 |
-
threshold=50, # Lower threshold for detection
|
812 |
-
minLineLength=30, # Shorter lines for handwriting
|
813 |
-
maxLineGap=20) # Larger gap for discontinuous handwriting
|
814 |
-
|
815 |
-
# If we find enough line segments, it's likely a document with text
|
816 |
-
if lines is not None and len(lines) > 10:
|
817 |
-
return True
|
818 |
|
819 |
-
# Combine heuristics for final decision
|
820 |
-
# Documents typically have both dark (text) and light (background) regions,
|
821 |
-
# and/or well-defined edges
|
822 |
# Lower thresholds for handwritten documents
|
823 |
-
return (dark_ratio > 0.
|
824 |
|
825 |
# Removed caching to fix unhashable type error
|
826 |
def preprocess_document_image(img: Image.Image) -> Image.Image:
|
@@ -1010,26 +1083,53 @@ def _preprocess_document_image_impl() -> Image.Image:
|
|
1010 |
img_np = np.array(enhanced)
|
1011 |
|
1012 |
if is_handwritten:
|
1013 |
-
#
|
1014 |
-
#
|
1015 |
-
# Guided filter works well for handwriting by preserving stroke details
|
1016 |
if img_size > 3000000: # Large images - downsample first
|
1017 |
scale_factor = 0.5
|
1018 |
small_img = cv2.resize(img_np, None, fx=scale_factor, fy=scale_factor,
|
1019 |
interpolation=cv2.INTER_AREA)
|
1020 |
-
|
1021 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1022 |
# Resize back
|
1023 |
filtered = cv2.resize(filtered, (width, height), interpolation=cv2.INTER_LINEAR)
|
1024 |
else:
|
1025 |
-
#
|
1026 |
-
|
1027 |
-
|
1028 |
-
|
1029 |
-
|
|
|
|
|
1030 |
|
1031 |
-
#
|
1032 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1033 |
|
1034 |
else:
|
1035 |
# Standard document processing - optimized for printed text
|
@@ -1557,6 +1657,7 @@ def serialize_ocr_object(obj):
|
|
1557 |
def try_local_ocr_fallback(image_path: Union[str, Path], base64_data_url: str = None) -> str:
|
1558 |
"""
|
1559 |
Attempt to use local pytesseract OCR as a fallback when API fails
|
|
|
1560 |
|
1561 |
Args:
|
1562 |
image_path: Path to the image file
|
@@ -1582,27 +1683,115 @@ def try_local_ocr_fallback(image_path: Union[str, Path], base64_data_url: str =
|
|
1582 |
image_path = Path(image_path) if isinstance(image_path, str) else image_path
|
1583 |
image = Image.open(image_path)
|
1584 |
|
1585 |
-
#
|
1586 |
-
|
1587 |
-
image = image.convert('RGB')
|
1588 |
-
|
1589 |
-
# Apply image enhancements for better OCR
|
1590 |
-
# Convert to grayscale for better text recognition
|
1591 |
-
image = image.convert('L')
|
1592 |
|
1593 |
-
#
|
1594 |
-
|
1595 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1596 |
|
1597 |
-
# Run OCR
|
1598 |
-
ocr_text = pytesseract.image_to_string(image,
|
1599 |
|
1600 |
if ocr_text and len(ocr_text.strip()) > 50:
|
1601 |
logger.info(f"Local OCR successful: extracted {len(ocr_text)} characters")
|
1602 |
return ocr_text
|
1603 |
else:
|
1604 |
-
|
1605 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1606 |
except ImportError:
|
1607 |
logger.warning("Pytesseract not installed - local OCR not available")
|
1608 |
return None
|
|
|
565 |
logger.debug(f"Document type detection for {image_file.name}: " +
|
566 |
f"{'newspaper' if is_newspaper else 'document' if is_document else 'photo'}")
|
567 |
|
568 |
+
# Check for handwritten document characteristics
|
569 |
+
is_handwritten = False
|
570 |
+
if CV2_AVAILABLE and not is_newspaper:
|
571 |
+
# Use more advanced detection for handwritten content
|
572 |
+
try:
|
573 |
+
gray_np = np.array(img.convert('L'))
|
574 |
+
# Higher variance in edge strengths can indicate handwriting
|
575 |
+
edges = cv2.Canny(gray_np, 30, 100)
|
576 |
+
if np.count_nonzero(edges) / edges.size > 0.02: # Low edge threshold for handwriting
|
577 |
+
# Additional check with gradient magnitudes
|
578 |
+
sobelx = cv2.Sobel(gray_np, cv2.CV_64F, 1, 0, ksize=3)
|
579 |
+
sobely = cv2.Sobel(gray_np, cv2.CV_64F, 0, 1, ksize=3)
|
580 |
+
magnitude = np.sqrt(sobelx**2 + sobely**2)
|
581 |
+
# Handwriting typically has more variation in gradient magnitudes
|
582 |
+
if np.std(magnitude) > 20:
|
583 |
+
is_handwritten = True
|
584 |
+
logger.info(f"Handwritten document detected: {image_file.name}")
|
585 |
+
except Exception as e:
|
586 |
+
logger.debug(f"Handwriting detection error: {str(e)}")
|
587 |
+
|
588 |
# Special processing for very large images (newspapers and large documents)
|
589 |
if is_newspaper:
|
590 |
# For newspaper format, we need more specialized processing
|
|
|
621 |
# Also enhance saturation to make colored text more visible
|
622 |
enhancer_sat = ImageEnhance.Color(processed_img)
|
623 |
processed_img = enhancer_sat.enhance(1.2)
|
624 |
+
# Special processing for handwritten documents
|
625 |
+
elif is_handwritten:
|
626 |
+
logger.info(f"Processing handwritten document: {width}x{height}")
|
627 |
+
|
628 |
+
# For handwritten text, we need to preserve stroke details
|
629 |
+
# Use gentle scaling to maintain handwriting characteristics
|
630 |
+
max_dimension = max(width, height)
|
631 |
+
|
632 |
+
if max_dimension > 4000: # Large handwritten document
|
633 |
+
scale_factor = 0.6 # Less aggressive reduction for handwriting
|
634 |
+
else:
|
635 |
+
scale_factor = 0.8 # Minimal reduction for moderate size
|
636 |
+
|
637 |
+
# Calculate new dimensions
|
638 |
+
new_width = int(width * scale_factor)
|
639 |
+
new_height = int(height * scale_factor)
|
640 |
+
|
641 |
+
# Use high-quality resampling to preserve handwriting details
|
642 |
+
processed_img = img.resize((new_width, new_height), Image.LANCZOS)
|
643 |
+
|
644 |
+
# Lower contrast enhancement for handwriting to preserve stroke details
|
645 |
+
if img.mode in ('RGB', 'RGBA'):
|
646 |
+
# Convert to grayscale for better text processing
|
647 |
+
processed_img = processed_img.convert('L')
|
648 |
+
|
649 |
+
# Use reduced contrast enhancement to preserve subtle strokes
|
650 |
+
enhancer = ImageEnhance.Contrast(processed_img)
|
651 |
+
processed_img = enhancer.enhance(1.2) # Lower contrast value for handwriting
|
652 |
|
653 |
# Standard processing for other large images
|
654 |
elif file_size_mb > IMAGE_PREPROCESSING["max_size_mb"] or max(width, height) > 3000:
|
|
|
826 |
# Documents typically have high variance (text on background)
|
827 |
# Handwritten documents may have less contrast than printed text
|
828 |
std_dev = np.std(img_np)
|
829 |
+
if std_dev > 40: # Further lowered threshold to better detect handwritten documents with low contrast
|
830 |
return True
|
831 |
|
832 |
# 2. Quick check using downsampled image for edges
|
|
|
837 |
else:
|
838 |
small_img = img_np
|
839 |
|
840 |
+
# Enhanced edge detection for handwritten documents
|
841 |
+
# Use multiple Canny thresholds to better capture both faint and bold strokes
|
842 |
+
edges_low = cv2.Canny(small_img, 20, 110, L2gradient=False) # For faint handwriting
|
843 |
+
edges_high = cv2.Canny(small_img, 30, 150, L2gradient=False) # For standard text
|
844 |
+
|
845 |
+
# Combine edge detection results
|
846 |
+
edges = cv2.bitwise_or(edges_low, edges_high)
|
847 |
edge_ratio = np.count_nonzero(edges) / edges.size
|
848 |
|
849 |
+
# Special handling for potential handwritten content - more sensitive detection
|
850 |
+
handwritten_indicator = False
|
851 |
+
if edge_ratio > 0.015: # Lower threshold specifically for handwritten content
|
852 |
+
try:
|
853 |
+
# Look for handwriting stroke characteristics using gradient analysis
|
854 |
+
# Compute gradient magnitudes and directions
|
855 |
+
sobelx = cv2.Sobel(small_img, cv2.CV_64F, 1, 0, ksize=3)
|
856 |
+
sobely = cv2.Sobel(small_img, cv2.CV_64F, 0, 1, ksize=3)
|
857 |
+
magnitude = np.sqrt(sobelx**2 + sobely**2)
|
858 |
+
|
859 |
+
# Handwriting typically has higher variation in gradient magnitudes
|
860 |
+
if np.std(magnitude) > 18: # Lower threshold for more sensitivity
|
861 |
+
# Handwriting is indicated if we also have some line structure
|
862 |
+
# Try to find line segments that could indicate text lines
|
863 |
+
lines = cv2.HoughLinesP(edges, 1, np.pi/180,
|
864 |
+
threshold=45, # Lower threshold for handwriting
|
865 |
+
minLineLength=25, # Shorter minimum line length
|
866 |
+
maxLineGap=25) # Larger gap for disconnected handwriting
|
867 |
+
|
868 |
+
if lines is not None and len(lines) > 8: # Fewer line segments needed
|
869 |
+
handwritten_indicator = True
|
870 |
+
except Exception:
|
871 |
+
# If analysis fails, continue with other checks
|
872 |
+
pass
|
873 |
+
|
874 |
+
# 3. Enhanced histogram analysis for handwritten content
|
875 |
+
# Use more granular bins for better detection of varying stroke densities
|
876 |
+
dark_mask = img_np < 65 # Increased threshold to capture lighter handwritten text
|
877 |
+
medium_mask = (img_np >= 65) & (img_np < 170) # Medium gray range for handwriting
|
878 |
+
light_mask = img_np > 175 # Slightly adjusted for aged paper
|
879 |
|
880 |
dark_ratio = np.count_nonzero(dark_mask) / img_np.size
|
881 |
+
medium_ratio = np.count_nonzero(medium_mask) / img_np.size
|
882 |
light_ratio = np.count_nonzero(light_mask) / img_np.size
|
883 |
|
884 |
+
# Handwritten documents often have more medium-gray content than printed text
|
885 |
+
# This helps detect pencil or faded ink handwriting
|
886 |
+
if medium_ratio > 0.3 and edge_ratio > 0.015:
|
887 |
+
return True
|
888 |
+
|
889 |
# Special analysis for handwritten documents
|
890 |
+
# Return true immediately if handwriting characteristics detected
|
891 |
+
if handwritten_indicator:
|
892 |
+
return True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
893 |
|
894 |
+
# Combine heuristics for final decision with improved sensitivity
|
|
|
|
|
895 |
# Lower thresholds for handwritten documents
|
896 |
+
return (dark_ratio > 0.025 and light_ratio > 0.2) or edge_ratio > 0.025
|
897 |
|
898 |
# Removed caching to fix unhashable type error
|
899 |
def preprocess_document_image(img: Image.Image) -> Image.Image:
|
|
|
1083 |
img_np = np.array(enhanced)
|
1084 |
|
1085 |
if is_handwritten:
|
1086 |
+
# Enhanced processing for handwritten documents
|
1087 |
+
# Optimized for better stroke preservation and readability
|
|
|
1088 |
if img_size > 3000000: # Large images - downsample first
|
1089 |
scale_factor = 0.5
|
1090 |
small_img = cv2.resize(img_np, None, fx=scale_factor, fy=scale_factor,
|
1091 |
interpolation=cv2.INTER_AREA)
|
1092 |
+
|
1093 |
+
# Apply CLAHE for better local contrast in handwriting
|
1094 |
+
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
|
1095 |
+
enhanced_img = clahe.apply(small_img)
|
1096 |
+
|
1097 |
+
# Apply bilateral filter with parameters optimized for handwriting
|
1098 |
+
# Lower sigma values to preserve more detail
|
1099 |
+
filtered = cv2.bilateralFilter(enhanced_img, 7, 30, 50)
|
1100 |
+
|
1101 |
# Resize back
|
1102 |
filtered = cv2.resize(filtered, (width, height), interpolation=cv2.INTER_LINEAR)
|
1103 |
else:
|
1104 |
+
# For smaller handwritten images
|
1105 |
+
# Apply CLAHE for better local contrast
|
1106 |
+
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
|
1107 |
+
enhanced_img = clahe.apply(img_np)
|
1108 |
+
|
1109 |
+
# Apply bilateral filter with parameters optimized for handwriting
|
1110 |
+
filtered = cv2.bilateralFilter(enhanced_img, 5, 25, 45)
|
1111 |
|
1112 |
+
# Adaptive thresholding specific to handwriting
|
1113 |
+
try:
|
1114 |
+
# Use larger block size and lower constant for better stroke preservation
|
1115 |
+
binary = cv2.adaptiveThreshold(
|
1116 |
+
filtered, 255,
|
1117 |
+
cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
|
1118 |
+
cv2.THRESH_BINARY,
|
1119 |
+
21, # Larger block size for handwriting
|
1120 |
+
5 # Lower constant for better stroke preservation
|
1121 |
+
)
|
1122 |
+
|
1123 |
+
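# Intended to connect broken strokes. NOTE(review): after THRESH_BINARY the ink is black (0) on white (255), so cv2.dilate grows the white background and thins the strokes; cv2.erode (or THRESH_BINARY_INV before dilating) would thicken them - confirm intent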
|
1124 |
+
kernel = np.ones((2, 2), np.uint8)
|
1125 |
+
binary = cv2.dilate(binary, kernel, iterations=1)
|
1126 |
+
|
1127 |
+
# Convert back to PIL Image
|
1128 |
+
return Image.fromarray(binary)
|
1129 |
+
except Exception as e:
|
1130 |
+
logger.debug(f"Adaptive threshold for handwriting failed: {str(e)}")
|
1131 |
+
# Convert filtered image to PIL and return as fallback
|
1132 |
+
return Image.fromarray(filtered)
|
1133 |
|
1134 |
else:
|
1135 |
# Standard document processing - optimized for printed text
|
|
|
1657 |
def try_local_ocr_fallback(image_path: Union[str, Path], base64_data_url: str = None) -> str:
|
1658 |
"""
|
1659 |
Attempt to use local pytesseract OCR as a fallback when API fails
|
1660 |
+
With enhanced processing optimized for handwritten content
|
1661 |
|
1662 |
Args:
|
1663 |
image_path: Path to the image file
|
|
|
1683 |
image_path = Path(image_path) if isinstance(image_path, str) else image_path
|
1684 |
image = Image.open(image_path)
|
1685 |
|
1686 |
+
# Auto-detect if this appears to be handwritten
|
1687 |
+
is_handwritten = False
|
|
|
|
|
|
|
|
|
|
|
1688 |
|
1689 |
+
# Use OpenCV for better detection and preprocessing if available
|
1690 |
+
if CV2_AVAILABLE:
|
1691 |
+
try:
|
1692 |
+
# Convert image to numpy array
|
1693 |
+
img_np = np.array(image.convert('L'))
|
1694 |
+
|
1695 |
+
# Check for handwritten characteristics
|
1696 |
+
edges = cv2.Canny(img_np, 30, 100)
|
1697 |
+
edge_ratio = np.count_nonzero(edges) / edges.size
|
1698 |
+
|
1699 |
+
# Typical handwritten documents have more varied edge patterns
|
1700 |
+
if edge_ratio > 0.02:
|
1701 |
+
# Additional check with gradient magnitudes
|
1702 |
+
sobelx = cv2.Sobel(img_np, cv2.CV_64F, 1, 0, ksize=3)
|
1703 |
+
sobely = cv2.Sobel(img_np, cv2.CV_64F, 0, 1, ksize=3)
|
1704 |
+
magnitude = np.sqrt(sobelx**2 + sobely**2)
|
1705 |
+
# Handwriting typically has more variation in gradient magnitudes
|
1706 |
+
if np.std(magnitude) > 20:
|
1707 |
+
is_handwritten = True
|
1708 |
+
logger.info("Detected handwritten content for local OCR")
|
1709 |
+
|
1710 |
+
# Enhanced preprocessing based on document type
|
1711 |
+
if is_handwritten:
|
1712 |
+
# Process for handwritten content
|
1713 |
+
# Apply CLAHE for better local contrast
|
1714 |
+
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
|
1715 |
+
img_np = clahe.apply(img_np)
|
1716 |
+
|
1717 |
+
# Apply adaptive thresholding with optimized parameters for handwriting
|
1718 |
+
binary = cv2.adaptiveThreshold(
|
1719 |
+
img_np, 255,
|
1720 |
+
cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
|
1721 |
+
cv2.THRESH_BINARY,
|
1722 |
+
21, # Larger block size for handwriting
|
1723 |
+
5 # Lower constant for better stroke preservation
|
1724 |
+
)
|
1725 |
+
|
1726 |
+
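# Meant to thicken strokes slightly (always applied, not optional). NOTE(review): after THRESH_BINARY the ink is black (0) on white (255), so cv2.dilate grows the white background and thins the strokes; cv2.erode would thicken them - confirm intent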
|
1727 |
+
kernel = np.ones((2, 2), np.uint8)
|
1728 |
+
binary = cv2.dilate(binary, kernel, iterations=1)
|
1729 |
+
|
1730 |
+
# Convert back to PIL Image for tesseract
|
1731 |
+
image = Image.fromarray(binary)
|
1732 |
+
|
1733 |
+
# Set tesseract options for handwritten content
|
1734 |
+
custom_config = r'--oem 1 --psm 6 -l eng'
|
1735 |
+
else:
|
1736 |
+
# Process for printed content
|
1737 |
+
# Apply CLAHE for better contrast
|
1738 |
+
clahe = cv2.createCLAHE(clipLimit=2.5, tileGridSize=(8, 8))
|
1739 |
+
img_np = clahe.apply(img_np)
|
1740 |
+
|
1741 |
+
# Apply bilateral filter to reduce noise while preserving edges
|
1742 |
+
img_np = cv2.bilateralFilter(img_np, 9, 75, 75)
|
1743 |
+
|
1744 |
+
# Apply Otsu's thresholding for printed text
|
1745 |
+
_, binary = cv2.threshold(img_np, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
|
1746 |
+
|
1747 |
+
# Convert back to PIL Image for tesseract
|
1748 |
+
image = Image.fromarray(binary)
|
1749 |
+
|
1750 |
+
# Set tesseract options for printed content
|
1751 |
+
custom_config = r'--oem 3 --psm 6 -l eng'
|
1752 |
+
except Exception as e:
|
1753 |
+
logger.warning(f"OpenCV preprocessing failed: {str(e)}. Using PIL fallback.")
|
1754 |
+
|
1755 |
+
# Normalize to RGB if not already; the image is converted to grayscale ('L') immediately below
|
1756 |
+
if image.mode != 'RGB':
|
1757 |
+
image = image.convert('RGB')
|
1758 |
+
|
1759 |
+
# Apply basic image enhancements
|
1760 |
+
image = image.convert('L')
|
1761 |
+
enhancer = ImageEnhance.Contrast(image)
|
1762 |
+
image = enhancer.enhance(2.0)
|
1763 |
+
custom_config = r'--oem 3 --psm 6 -l eng'
|
1764 |
+
else:
|
1765 |
+
# PIL-only path without OpenCV
|
1766 |
+
# Normalize to RGB if not already; the image is converted to grayscale ('L') immediately below
|
1767 |
+
if image.mode != 'RGB':
|
1768 |
+
image = image.convert('RGB')
|
1769 |
+
|
1770 |
+
# Apply basic image enhancements
|
1771 |
+
image = image.convert('L')
|
1772 |
+
enhancer = ImageEnhance.Contrast(image)
|
1773 |
+
image = enhancer.enhance(2.0)
|
1774 |
+
custom_config = r'--oem 3 --psm 6 -l eng'
|
1775 |
|
1776 |
+
# Run OCR with appropriate config
|
1777 |
+
ocr_text = pytesseract.image_to_string(image, config=custom_config)
|
1778 |
|
1779 |
if ocr_text and len(ocr_text.strip()) > 50:
|
1780 |
logger.info(f"Local OCR successful: extracted {len(ocr_text)} characters")
|
1781 |
return ocr_text
|
1782 |
else:
|
1783 |
+
# Try another psm mode as fallback
|
1784 |
+
logger.warning("First OCR attempt produced minimal text, trying another mode")
|
1785 |
+
# Try PSM mode 4 (assume single column of text)
|
1786 |
+
fallback_config = r'--oem 3 --psm 4 -l eng'
|
1787 |
+
ocr_text = pytesseract.image_to_string(image, config=fallback_config)
|
1788 |
+
|
1789 |
+
if ocr_text and len(ocr_text.strip()) > 50:
|
1790 |
+
logger.info(f"Local OCR fallback successful: extracted {len(ocr_text)} characters")
|
1791 |
+
return ocr_text
|
1792 |
+
else:
|
1793 |
+
logger.warning("Local OCR produced minimal or no text")
|
1794 |
+
return None
|
1795 |
except ImportError:
|
1796 |
logger.warning("Pytesseract not installed - local OCR not available")
|
1797 |
return None
|