milwright committed
Commit 88d3e04 · 1 Parent(s): 70727c4

Enhance handwritten document processing for improved OCR accuracy

Files changed (2)
  1. config.py +12 -1
  2. ocr_utils.py +239 -50
config.py CHANGED
@@ -45,7 +45,18 @@ IMAGE_PREPROCESSING = {
     "denoise": os.environ.get("DENOISE", "True").lower() in ("true", "1", "yes"),
     "max_size_mb": float(os.environ.get("MAX_IMAGE_SIZE_MB", "12.0")),  # Increased size limit for better quality
     "target_dpi": int(os.environ.get("TARGET_DPI", "300")),  # Target DPI for scaling
-    "compression_quality": int(os.environ.get("COMPRESSION_QUALITY", "95"))  # Higher quality for better OCR results
+    "compression_quality": int(os.environ.get("COMPRESSION_QUALITY", "95")),  # Higher quality for better OCR results
+    # Enhanced settings for handwritten documents
+    "handwritten": {
+        "contrast": float(os.environ.get("HANDWRITTEN_CONTRAST", "1.2")),  # Lower contrast for handwritten text
+        "block_size": int(os.environ.get("HANDWRITTEN_BLOCK_SIZE", "21")),  # Larger block size for adaptive thresholding
+        "constant": int(os.environ.get("HANDWRITTEN_CONSTANT", "5")),  # Lower constant for adaptive thresholding
+        "use_dilation": os.environ.get("HANDWRITTEN_DILATION", "True").lower() in ("true", "1", "yes"),  # Connect broken strokes
+        "clahe_limit": float(os.environ.get("HANDWRITTEN_CLAHE_LIMIT", "2.0")),  # CLAHE limit for local contrast
+        "bilateral_d": int(os.environ.get("HANDWRITTEN_BILATERAL_D", "5")),  # Bilateral filter window size
+        "bilateral_sigma1": int(os.environ.get("HANDWRITTEN_BILATERAL_SIGMA1", "25")),  # Color sigma
+        "bilateral_sigma2": int(os.environ.get("HANDWRITTEN_BILATERAL_SIGMA2", "45"))  # Space sigma
+    }
 }
 
 # OCR settings optimized for single-page performance
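Note that the ocr_utils.py hunks below hard-code matching values (block size 21, constant 5, CLAHE limit 2.0) rather than reading this new block back from config. A minimal sketch of how the keys could be consumed, assuming config.py is importable and a grayscale uint8 array is at hand; the helper name apply_handwritten_settings is hypothetical and not part of this commit:

import cv2
import numpy as np
from config import IMAGE_PREPROCESSING  # assumes config.py is on the import path

def apply_handwritten_settings(gray: np.ndarray) -> np.ndarray:
    # Hypothetical helper: drive the OpenCV calls from the new config block
    hw = IMAGE_PREPROCESSING["handwritten"]
    clahe = cv2.createCLAHE(clipLimit=hw["clahe_limit"], tileGridSize=(8, 8))
    enhanced = clahe.apply(gray)  # local contrast boost
    filtered = cv2.bilateralFilter(enhanced, hw["bilateral_d"],
                                   hw["bilateral_sigma1"], hw["bilateral_sigma2"])
    binary = cv2.adaptiveThreshold(filtered, 255,
                                   cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY,
                                   hw["block_size"], hw["constant"])
    if hw["use_dilation"]:
        # Connect broken pen strokes with a light dilation
        binary = cv2.dilate(binary, np.ones((2, 2), np.uint8), iterations=1)
    return binary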
ocr_utils.py CHANGED
@@ -565,6 +565,26 @@ def preprocess_image_for_ocr(image_path: Union[str, Path]) -> Tuple[Image.Image,
     logger.debug(f"Document type detection for {image_file.name}: " +
                  f"{'newspaper' if is_newspaper else 'document' if is_document else 'photo'}")
 
+    # Check for handwritten document characteristics
+    is_handwritten = False
+    if CV2_AVAILABLE and not is_newspaper:
+        # Use more advanced detection for handwritten content
+        try:
+            gray_np = np.array(img.convert('L'))
+            # Higher variance in edge strengths can indicate handwriting
+            edges = cv2.Canny(gray_np, 30, 100)
+            if np.count_nonzero(edges) / edges.size > 0.02:  # Low edge threshold for handwriting
+                # Additional check with gradient magnitudes
+                sobelx = cv2.Sobel(gray_np, cv2.CV_64F, 1, 0, ksize=3)
+                sobely = cv2.Sobel(gray_np, cv2.CV_64F, 0, 1, ksize=3)
+                magnitude = np.sqrt(sobelx**2 + sobely**2)
+                # Handwriting typically has more variation in gradient magnitudes
+                if np.std(magnitude) > 20:
+                    is_handwritten = True
+                    logger.info(f"Handwritten document detected: {image_file.name}")
+        except Exception as e:
+            logger.debug(f"Handwriting detection error: {str(e)}")
+
     # Special processing for very large images (newspapers and large documents)
     if is_newspaper:
         # For newspaper format, we need more specialized processing
@@ -601,6 +621,34 @@ def preprocess_image_for_ocr(image_path: Union[str, Path]) -> Tuple[Image.Image,
         # Also enhance saturation to make colored text more visible
         enhancer_sat = ImageEnhance.Color(processed_img)
         processed_img = enhancer_sat.enhance(1.2)
+    # Special processing for handwritten documents
+    elif is_handwritten:
+        logger.info(f"Processing handwritten document: {width}x{height}")
+
+        # For handwritten text, we need to preserve stroke details
+        # Use gentle scaling to maintain handwriting characteristics
+        max_dimension = max(width, height)
+
+        if max_dimension > 4000:  # Large handwritten document
+            scale_factor = 0.6  # Less aggressive reduction for handwriting
+        else:
+            scale_factor = 0.8  # Minimal reduction for moderate size
+
+        # Calculate new dimensions
+        new_width = int(width * scale_factor)
+        new_height = int(height * scale_factor)
+
+        # Use high-quality resampling to preserve handwriting details
+        processed_img = img.resize((new_width, new_height), Image.LANCZOS)
+
+        # Lower contrast enhancement for handwriting to preserve stroke details
+        if img.mode in ('RGB', 'RGBA'):
+            # Convert to grayscale for better text processing
+            processed_img = processed_img.convert('L')
+
+        # Use reduced contrast enhancement to preserve subtle strokes
+        enhancer = ImageEnhance.Contrast(processed_img)
+        processed_img = enhancer.enhance(1.2)  # Lower contrast value for handwriting
 
     # Standard processing for other large images
     elif file_size_mb > IMAGE_PREPROCESSING["max_size_mb"] or max(width, height) > 3000:
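The detection added in the first hunk keys off two signals: Canny edge density above 0.02 and a standard deviation of Sobel gradient magnitudes above 20. A self-contained sketch of the same heuristic, useful for trying those thresholds on sample scans; the function name looks_handwritten is hypothetical and not part of the module:

import cv2
import numpy as np
from PIL import Image

def looks_handwritten(path: str) -> bool:
    # Hypothetical standalone version of the check added above
    gray = np.array(Image.open(path).convert('L'))
    edges = cv2.Canny(gray, 30, 100)
    if np.count_nonzero(edges) / edges.size <= 0.02:  # low edge density: photo or blank page
        return False
    sobelx = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3)
    sobely = cv2.Sobel(gray, cv2.CV_64F, 0, 1, ksize=3)
    magnitude = np.sqrt(sobelx**2 + sobely**2)
    return float(np.std(magnitude)) > 20  # uneven stroke gradients suggest handwriting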
@@ -778,7 +826,7 @@ def _detect_document_type_impl(img_hash=None) -> bool:
     # Documents typically have high variance (text on background)
     # Handwritten documents may have less contrast than printed text
     std_dev = np.std(img_np)
-    if std_dev > 45:  # Lowered threshold to better detect handwritten documents
+    if std_dev > 40:  # Further lowered threshold to better detect handwritten documents with low contrast
         return True
 
     # 2. Quick check using downsampled image for edges
@@ -789,38 +837,63 @@ def _detect_document_type_impl(img_hash=None) -> bool:
     else:
         small_img = img_np
 
-    # Use adaptive edge detection parameters for handwritten documents
-    # Lowered threshold to better detect fainter handwritten text
-    edges = cv2.Canny(small_img, 30, 130, L2gradient=False)
+    # Enhanced edge detection for handwritten documents
+    # Use multiple Canny thresholds to better capture both faint and bold strokes
+    edges_low = cv2.Canny(small_img, 20, 110, L2gradient=False)  # For faint handwriting
+    edges_high = cv2.Canny(small_img, 30, 150, L2gradient=False)  # For standard text
+
+    # Combine edge detection results
+    edges = cv2.bitwise_or(edges_low, edges_high)
     edge_ratio = np.count_nonzero(edges) / edges.size
 
-    # 3. Fast histogram approximation using bins
-    # Instead of calculating full histogram, use bins for dark and light regions
-    # Adjusted for handwritten documents which may have more gray values
-    dark_mask = img_np < 60  # Increased threshold to capture lighter handwritten text
-    light_mask = img_np > 180  # Lowered threshold to account for aged paper
+    # Special handling for potential handwritten content - more sensitive detection
+    handwritten_indicator = False
+    if edge_ratio > 0.015:  # Lower threshold specifically for handwritten content
+        try:
+            # Look for handwriting stroke characteristics using gradient analysis
+            # Compute gradient magnitudes and directions
+            sobelx = cv2.Sobel(small_img, cv2.CV_64F, 1, 0, ksize=3)
+            sobely = cv2.Sobel(small_img, cv2.CV_64F, 0, 1, ksize=3)
+            magnitude = np.sqrt(sobelx**2 + sobely**2)
+
+            # Handwriting typically has higher variation in gradient magnitudes
+            if np.std(magnitude) > 18:  # Lower threshold for more sensitivity
+                # Handwriting is indicated if we also have some line structure
+                # Try to find line segments that could indicate text lines
+                lines = cv2.HoughLinesP(edges, 1, np.pi/180,
+                                        threshold=45,  # Lower threshold for handwriting
+                                        minLineLength=25,  # Shorter minimum line length
+                                        maxLineGap=25)  # Larger gap for disconnected handwriting
+
+                if lines is not None and len(lines) > 8:  # Fewer line segments needed
+                    handwritten_indicator = True
+        except Exception:
+            # If analysis fails, continue with other checks
+            pass
+
+    # 3. Enhanced histogram analysis for handwritten content
+    # Use more granular bins for better detection of varying stroke densities
+    dark_mask = img_np < 65  # Increased threshold to capture lighter handwritten text
+    medium_mask = (img_np >= 65) & (img_np < 170)  # Medium gray range for handwriting
+    light_mask = img_np > 175  # Slightly adjusted for aged paper
 
     dark_ratio = np.count_nonzero(dark_mask) / img_np.size
+    medium_ratio = np.count_nonzero(medium_mask) / img_np.size
     light_ratio = np.count_nonzero(light_mask) / img_np.size
 
+    # Handwritten documents often have more medium-gray content than printed text
+    # This helps detect pencil or faded ink handwriting
+    if medium_ratio > 0.3 and edge_ratio > 0.015:
+        return True
+
     # Special analysis for handwritten documents
-    # Check for line-like structures typical in handwritten text
-    if CV2_AVAILABLE and edge_ratio > 0.02:  # Lower threshold to capture handwritten documents
-        # Try to find line segments that could indicate text lines
-        lines = cv2.HoughLinesP(edges, 1, np.pi/180,
-                                threshold=50,  # Lower threshold for detection
-                                minLineLength=30,  # Shorter lines for handwriting
-                                maxLineGap=20)  # Larger gap for discontinuous handwriting
-
-        # If we find enough line segments, it's likely a document with text
-        if lines is not None and len(lines) > 10:
-            return True
+    # Return true immediately if handwriting characteristics detected
+    if handwritten_indicator:
+        return True
 
-    # Combine heuristics for final decision
-    # Documents typically have both dark (text) and light (background) regions,
-    # and/or well-defined edges
+    # Combine heuristics for final decision with improved sensitivity
     # Lower thresholds for handwritten documents
-    return (dark_ratio > 0.03 and light_ratio > 0.25) or edge_ratio > 0.03
+    return (dark_ratio > 0.025 and light_ratio > 0.2) or edge_ratio > 0.025
 
 # Removed caching to fix unhashable type error
 def preprocess_document_image(img: Image.Image) -> Image.Image:
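Taken together, the two hunks above leave _detect_document_type_impl checking its signals in roughly this order. The following is a condensed paraphrase of the decision flow with the thresholds copied from the diff, not the literal function body:

def _document_decision(std_dev, medium_ratio, edge_ratio,
                       handwritten_indicator, dark_ratio, light_ratio) -> bool:
    # Condensed paraphrase of the updated heuristics (thresholds from the hunks above)
    if std_dev > 40:                               # high variance: text on background
        return True
    if medium_ratio > 0.3 and edge_ratio > 0.015:  # pencil or faded ink
        return True
    if handwritten_indicator:                      # Hough-line evidence of strokes
        return True
    return (dark_ratio > 0.025 and light_ratio > 0.2) or edge_ratio > 0.025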
@@ -1010,26 +1083,53 @@ def _preprocess_document_image_impl() -> Image.Image:
     img_np = np.array(enhanced)
 
     if is_handwritten:
-        # Special treatment for handwritten documents
-        # Use guided filter which preserves edges better than NLMeans
-        # Guided filter works well for handwriting by preserving stroke details
+        # Enhanced processing for handwritten documents
+        # Optimized for better stroke preservation and readability
         if img_size > 3000000:  # Large images - downsample first
             scale_factor = 0.5
             small_img = cv2.resize(img_np, None, fx=scale_factor, fy=scale_factor,
                                    interpolation=cv2.INTER_AREA)
-            # Apply bilateral filter which preserves edges while smoothing
-            filtered = cv2.bilateralFilter(small_img, 9, 75, 75)
+
+            # Apply CLAHE for better local contrast in handwriting
+            clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
+            enhanced_img = clahe.apply(small_img)
+
+            # Apply bilateral filter with parameters optimized for handwriting
+            # Lower sigma values to preserve more detail
+            filtered = cv2.bilateralFilter(enhanced_img, 7, 30, 50)
+
             # Resize back
             filtered = cv2.resize(filtered, (width, height), interpolation=cv2.INTER_LINEAR)
         else:
-            # Use bilateral filter directly for smaller images
-            filtered = cv2.bilateralFilter(img_np, 7, 50, 50)
-
-        # Convert back to PIL Image
-        enhanced = Image.fromarray(filtered)
+            # For smaller handwritten images
+            # Apply CLAHE for better local contrast
+            clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
+            enhanced_img = clahe.apply(img_np)
+
+            # Apply bilateral filter with parameters optimized for handwriting
+            filtered = cv2.bilateralFilter(enhanced_img, 5, 25, 45)
 
-        # For handwritten docs, avoid binary thresholding which can destroy subtle strokes
-        return enhanced
+        # Adaptive thresholding specific to handwriting
+        try:
+            # Use larger block size and lower constant for better stroke preservation
+            binary = cv2.adaptiveThreshold(
+                filtered, 255,
+                cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+                cv2.THRESH_BINARY,
+                21,  # Larger block size for handwriting
+                5  # Lower constant for better stroke preservation
+            )
+
+            # Apply slight dilation to connect broken strokes
+            kernel = np.ones((2, 2), np.uint8)
+            binary = cv2.dilate(binary, kernel, iterations=1)
+
+            # Convert back to PIL Image
+            return Image.fromarray(binary)
+        except Exception as e:
+            logger.debug(f"Adaptive threshold for handwriting failed: {str(e)}")
+            # Convert filtered image to PIL and return as fallback
+            return Image.fromarray(filtered)
 
     else:
         # Standard document processing - optimized for printed text
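A quick way to exercise the new handwritten branch end to end, assuming ocr_utils is importable and a local test scan is available; the file names are placeholders:

from PIL import Image
from ocr_utils import preprocess_document_image

img = Image.open("samples/handwritten_letter.jpg")  # placeholder path
processed = preprocess_document_image(img)
processed.save("samples/handwritten_letter_processed.png")
print(processed.mode, processed.size)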
@@ -1557,6 +1657,7 @@ def serialize_ocr_object(obj):
 def try_local_ocr_fallback(image_path: Union[str, Path], base64_data_url: str = None) -> str:
     """
     Attempt to use local pytesseract OCR as a fallback when API fails
+    With enhanced processing optimized for handwritten content
 
     Args:
         image_path: Path to the image file
@@ -1582,27 +1683,115 @@ def try_local_ocr_fallback(image_path: Union[str, Path], base64_data_url: str =
         image_path = Path(image_path) if isinstance(image_path, str) else image_path
         image = Image.open(image_path)
 
-        # Convert to RGB if not already (pytesseract works best with RGB)
-        if image.mode != 'RGB':
-            image = image.convert('RGB')
-
-        # Apply image enhancements for better OCR
-        # Convert to grayscale for better text recognition
-        image = image.convert('L')
+        # Auto-detect if this appears to be handwritten
+        is_handwritten = False
 
-        # Enhance contrast
-        enhancer = ImageEnhance.Contrast(image)
-        image = enhancer.enhance(2.0)  # Higher contrast for better OCR
+        # Use OpenCV for better detection and preprocessing if available
+        if CV2_AVAILABLE:
+            try:
+                # Convert image to numpy array
+                img_np = np.array(image.convert('L'))
+
+                # Check for handwritten characteristics
+                edges = cv2.Canny(img_np, 30, 100)
+                edge_ratio = np.count_nonzero(edges) / edges.size
+
+                # Typical handwritten documents have more varied edge patterns
+                if edge_ratio > 0.02:
+                    # Additional check with gradient magnitudes
+                    sobelx = cv2.Sobel(img_np, cv2.CV_64F, 1, 0, ksize=3)
+                    sobely = cv2.Sobel(img_np, cv2.CV_64F, 0, 1, ksize=3)
+                    magnitude = np.sqrt(sobelx**2 + sobely**2)
+                    # Handwriting typically has more variation in gradient magnitudes
+                    if np.std(magnitude) > 20:
+                        is_handwritten = True
+                        logger.info("Detected handwritten content for local OCR")
+
+                # Enhanced preprocessing based on document type
+                if is_handwritten:
+                    # Process for handwritten content
+                    # Apply CLAHE for better local contrast
+                    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
+                    img_np = clahe.apply(img_np)
+
+                    # Apply adaptive thresholding with optimized parameters for handwriting
+                    binary = cv2.adaptiveThreshold(
+                        img_np, 255,
+                        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+                        cv2.THRESH_BINARY,
+                        21,  # Larger block size for handwriting
+                        5  # Lower constant for better stroke preservation
+                    )
+
+                    # Optional: apply dilation to thicken strokes slightly
+                    kernel = np.ones((2, 2), np.uint8)
+                    binary = cv2.dilate(binary, kernel, iterations=1)
+
+                    # Convert back to PIL Image for tesseract
+                    image = Image.fromarray(binary)
+
+                    # Set tesseract options for handwritten content
+                    custom_config = r'--oem 1 --psm 6 -l eng'
+                else:
+                    # Process for printed content
+                    # Apply CLAHE for better contrast
+                    clahe = cv2.createCLAHE(clipLimit=2.5, tileGridSize=(8, 8))
+                    img_np = clahe.apply(img_np)
+
+                    # Apply bilateral filter to reduce noise while preserving edges
+                    img_np = cv2.bilateralFilter(img_np, 9, 75, 75)
+
+                    # Apply Otsu's thresholding for printed text
+                    _, binary = cv2.threshold(img_np, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+
+                    # Convert back to PIL Image for tesseract
+                    image = Image.fromarray(binary)
+
+                    # Set tesseract options for printed content
+                    custom_config = r'--oem 3 --psm 6 -l eng'
+            except Exception as e:
+                logger.warning(f"OpenCV preprocessing failed: {str(e)}. Using PIL fallback.")
+
+                # Convert to RGB if not already (pytesseract works best with RGB)
+                if image.mode != 'RGB':
+                    image = image.convert('RGB')
+
+                # Apply basic image enhancements
+                image = image.convert('L')
+                enhancer = ImageEnhance.Contrast(image)
+                image = enhancer.enhance(2.0)
+                custom_config = r'--oem 3 --psm 6 -l eng'
+        else:
+            # PIL-only path without OpenCV
+            # Convert to RGB if not already (pytesseract works best with RGB)
+            if image.mode != 'RGB':
+                image = image.convert('RGB')
+
+            # Apply basic image enhancements
+            image = image.convert('L')
+            enhancer = ImageEnhance.Contrast(image)
+            image = enhancer.enhance(2.0)
+            custom_config = r'--oem 3 --psm 6 -l eng'
 
-        # Run OCR
-        ocr_text = pytesseract.image_to_string(image, lang='eng')
+        # Run OCR with appropriate config
+        ocr_text = pytesseract.image_to_string(image, config=custom_config)
 
         if ocr_text and len(ocr_text.strip()) > 50:
             logger.info(f"Local OCR successful: extracted {len(ocr_text)} characters")
             return ocr_text
         else:
-            logger.warning("Local OCR produced minimal or no text")
-            return None
+            # Try another psm mode as fallback
+            logger.warning("First OCR attempt produced minimal text, trying another mode")
+            # Try PSM mode 4 (assume single column of text)
+            fallback_config = r'--oem 3 --psm 4 -l eng'
+            ocr_text = pytesseract.image_to_string(image, config=fallback_config)
+
+            if ocr_text and len(ocr_text.strip()) > 50:
+                logger.info(f"Local OCR fallback successful: extracted {len(ocr_text)} characters")
+                return ocr_text
+            else:
+                logger.warning("Local OCR produced minimal or no text")
+                return None
     except ImportError:
         logger.warning("Pytesseract not installed - local OCR not available")
        return None
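For reference on the tesseract flags chosen in the last hunk: --oem 1 restricts tesseract to the LSTM engine, --oem 3 lets it pick automatically, --psm 6 assumes a single uniform block of text, and --psm 4 assumes a single column of variably sized text. The two-pass strategy can be condensed as below; the helper ocr_with_fallback is a hypothetical summary, not part of the module:

from typing import Optional
import pytesseract
from PIL import Image

def ocr_with_fallback(image: Image.Image, handwritten: bool) -> Optional[str]:
    # First pass: block-of-text segmentation; LSTM-only engine for handwriting
    primary = r'--oem 1 --psm 6 -l eng' if handwritten else r'--oem 3 --psm 6 -l eng'
    text = pytesseract.image_to_string(image, config=primary)
    if text and len(text.strip()) > 50:
        return text
    # Second pass: single-column segmentation, mirroring the diff's fallback
    text = pytesseract.image_to_string(image, config=r'--oem 3 --psm 4 -l eng')
    return text if text and len(text.strip()) > 50 else None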