milwright committed on
Commit
3f83f08
·
verified ·
1 Parent(s): 59aaeae

Update historical-ocr application with enhanced features

Browse files
Files changed (8) hide show
  1. README.md +1 -12
  2. app.py +7 -7
  3. config.py +2 -1
  4. input/baldwin-letters-combined.jpg +3 -0
  5. input/revere.jpg +3 -0
  6. ocr_utils.py +151 -84
  7. requirements.txt +1 -1
  8. ui/custom.css +32 -0
README.md CHANGED
@@ -1,14 +1,3 @@
1
- ---
2
- title: Historical OCR with Contextual Intelligence
3
- emoji: 📜
4
- colorFrom: indigo
5
- colorTo: purple
6
- sdk: streamlit
7
- sdk_version: "1.28.0"
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
  # Historical OCR with Contextual Intelligence
13
 
14
  An advanced OCR application for historical document analysis using Mistral AI.
@@ -43,4 +32,4 @@ Built with Streamlit and Mistral AI's OCR and large language model capabilities.
43
 
44
  ---
45
 
46
- Created by Zach Muhlbauer, CUNY Graduate Center
 
 
 
 
 
 
 
 
 
 
 
 
1
  # Historical OCR with Contextual Intelligence
2
 
3
  An advanced OCR application for historical document analysis using Mistral AI.
 
32
 
33
  ---
34
 
35
+ Created by [Add your name/organization]
app.py CHANGED
@@ -827,7 +827,7 @@ with main_tab2:
827
  images = page.get('images', [])
828
  for img in images:
829
  if 'image_base64' in img:
830
- st.image(img['image_base64'], width=600)
831
 
832
  # Display text content if available
833
  text_content = page.get('markdown', '')
@@ -925,7 +925,7 @@ with main_tab1:
925
 
926
  # Process button - flush left with similar padding as file browser
927
  with left_col:
928
- process_button = st.button("Process Document")
929
 
930
  # Image preprocessing preview in upload column, right after the process button
931
  if any(preprocessing_options.values()) and uploaded_file.type.startswith('image/'):
@@ -934,13 +934,13 @@ with main_tab1:
934
 
935
  with preview_cols[0]:
936
  st.markdown("**Original Image**")
937
- st.image(uploaded_file, width=600)
938
 
939
  with preview_cols[1]:
940
  st.markdown("**Preprocessed Image**")
941
  try:
942
  processed_bytes = preprocess_image(uploaded_file.getvalue(), preprocessing_options)
943
- st.image(io.BytesIO(processed_bytes), width=600)
944
  except Exception as e:
945
  st.error(f"Error in preprocessing: {str(e)}")
946
  st.info("Try using grayscale preprocessing for PNG images with transparency")
@@ -1636,7 +1636,7 @@ with main_tab1:
1636
  with columns1[i]:
1637
  if img_path.suffix.lower() in ['.jpg', '.jpeg', '.png', '.tif']:
1638
  try:
1639
- st.image(str(img_path), caption=img_path.name, width=300)
1640
  except Exception:
1641
  st.info(f"Example: {img_path.name}")
1642
  else:
@@ -1649,7 +1649,7 @@ with main_tab1:
1649
  with columns2[i]:
1650
  if img_path.suffix.lower() in ['.jpg', '.jpeg', '.png', '.tif']:
1651
  try:
1652
- st.image(str(img_path), caption=img_path.name, width=300)
1653
  except Exception:
1654
  st.info(f"Example: {img_path.name}")
1655
  else:
@@ -1662,7 +1662,7 @@ with main_tab1:
1662
  with columns[i % len(columns)]:
1663
  if img_path.suffix.lower() in ['.jpg', '.jpeg', '.png', '.tif']:
1664
  try:
1665
- st.image(str(img_path), caption=img_path.name, width=300)
1666
  except Exception:
1667
  st.info(f"Example: {img_path.name}")
1668
  else:
 
827
  images = page.get('images', [])
828
  for img in images:
829
  if 'image_base64' in img:
830
+ st.image(img['image_base64'], use_container_width=True)
831
 
832
  # Display text content if available
833
  text_content = page.get('markdown', '')
 
925
 
926
  # Process button - flush left with similar padding as file browser
927
  with left_col:
928
+ process_button = st.button("Process Document", use_container_width=True)
929
 
930
  # Image preprocessing preview in upload column, right after the process button
931
  if any(preprocessing_options.values()) and uploaded_file.type.startswith('image/'):
 
934
 
935
  with preview_cols[0]:
936
  st.markdown("**Original Image**")
937
+ st.image(uploaded_file, use_container_width=True)
938
 
939
  with preview_cols[1]:
940
  st.markdown("**Preprocessed Image**")
941
  try:
942
  processed_bytes = preprocess_image(uploaded_file.getvalue(), preprocessing_options)
943
+ st.image(io.BytesIO(processed_bytes), use_container_width=True)
944
  except Exception as e:
945
  st.error(f"Error in preprocessing: {str(e)}")
946
  st.info("Try using grayscale preprocessing for PNG images with transparency")
 
1636
  with columns1[i]:
1637
  if img_path.suffix.lower() in ['.jpg', '.jpeg', '.png', '.tif']:
1638
  try:
1639
+ st.image(str(img_path), caption=img_path.name, use_container_width=True)
1640
  except Exception:
1641
  st.info(f"Example: {img_path.name}")
1642
  else:
 
1649
  with columns2[i]:
1650
  if img_path.suffix.lower() in ['.jpg', '.jpeg', '.png', '.tif']:
1651
  try:
1652
+ st.image(str(img_path), caption=img_path.name, use_container_width=True)
1653
  except Exception:
1654
  st.info(f"Example: {img_path.name}")
1655
  else:
 
1662
  with columns[i % len(columns)]:
1663
  if img_path.suffix.lower() in ['.jpg', '.jpeg', '.png', '.tif']:
1664
  try:
1665
+ st.image(str(img_path), caption=img_path.name, use_container_width=True)
1666
  except Exception:
1667
  st.info(f"Example: {img_path.name}")
1668
  else:
config.py CHANGED
@@ -22,7 +22,8 @@ MISTRAL_API_KEY = os.environ.get("HF_MISTRAL_API_KEY",
22
  os.environ.get("MISTRAL_API_KEY", "")).strip()
23
 
24
  # Check if we're in test mode (allows operation without valid API key)
25
- TEST_MODE = False # Disable test mode for production use
 
26
 
27
  # Just check if API key exists
28
  if not MISTRAL_API_KEY and not TEST_MODE:
 
22
  os.environ.get("MISTRAL_API_KEY", "")).strip()
23
 
24
  # Check if we're in test mode (allows operation without valid API key)
25
+ # Enable test mode for diagnosing OCR issues
26
+ TEST_MODE = True
27
 
28
  # Just check if API key exists
29
  if not MISTRAL_API_KEY and not TEST_MODE:
input/baldwin-letters-combined.jpg ADDED

Git LFS Details

  • SHA256: e43d067402153ca1a9c3d0f04c8072b079b6536e30f0b663e6ea27f81cc308d5
  • Pointer size: 131 Bytes
  • Size of remote file: 400 kB
input/revere.jpg ADDED

Git LFS Details

  • SHA256: a3b69e20a222e60187cb1492a45ffe2233e8bfecdec02a25395ac0a699316826
  • Pointer size: 130 Bytes
  • Size of remote file: 22.8 kB
ocr_utils.py CHANGED
@@ -9,6 +9,7 @@ import io
9
  import zipfile
10
  import logging
11
  import numpy as np
 
12
  from datetime import datetime
13
  from pathlib import Path
14
  from typing import Dict, List, Optional, Union, Any, Tuple
@@ -554,6 +555,8 @@ def _detect_document_type_impl(img_hash=None) -> bool:
554
  """
555
  Optimized implementation of document type detection for faster processing.
556
  The img_hash parameter is unused but kept for backward compatibility.
 
 
557
  """
558
  # Fast path: Get the image from thread-local storage
559
  if not hasattr(_detect_document_type_impl, "_current_img"):
@@ -566,26 +569,6 @@ def _detect_document_type_impl(img_hash=None) -> bool:
566
  if width * height < 100000: # Approx 300x300 or smaller
567
  return False
568
 
569
- # Quick check: If image has many colors, it's likely not a document
570
- # Sample a subset of pixels for color analysis (faster than full histogram)
571
- try:
572
- # Sample pixels in a grid pattern
573
- color_samples = []
574
- for x in range(0, width, max(1, width // 10)):
575
- for y in range(0, height, max(1, height // 10)):
576
- try:
577
- color_samples.append(img.getpixel((x, y)))
578
- except:
579
- pass
580
-
581
- # Count unique colors in the sample
582
- if img.mode == 'RGB':
583
- unique_colors = len(set(color_samples))
584
- if unique_colors > 1000: # Many unique colors suggest a photo, not a document
585
- return False
586
- except:
587
- pass # If sampling fails, continue with regular analysis
588
-
589
  # Convert to grayscale for analysis (using faster conversion)
590
  gray_img = img.convert('L')
591
 
@@ -609,7 +592,7 @@ def _detect_document_type_impl(img_hash=None) -> bool:
609
 
610
  # Count edge pixels using threshold (faster than summing individual pixels)
611
  edge_data = edges.getdata()
612
- edge_threshold = 50
613
 
614
  # Use list comprehension for better performance
615
  edge_count = sum(1 for p in edge_data if p > edge_threshold)
@@ -621,18 +604,17 @@ def _detect_document_type_impl(img_hash=None) -> bool:
621
  bright_ratio = bright_count / (width * height)
622
 
623
  # Documents typically have more edges (text boundaries) and bright areas (background)
624
- return edge_ratio > 0.05 or bright_ratio > 0.4
 
625
 
626
- # OpenCV path - optimized for speed
627
  img_np = np.array(gray_img)
628
 
629
- # Fast document detection heuristics
630
-
631
  # 1. Fast check: Variance of pixel values
632
- # Documents typically have high variance (black text on white background)
633
- # Use numpy's fast statistical functions
634
  std_dev = np.std(img_np)
635
- if std_dev > 60: # High standard deviation suggests document
636
  return True
637
 
638
  # 2. Quick check using downsampled image for edges
@@ -643,22 +625,38 @@ def _detect_document_type_impl(img_hash=None) -> bool:
643
  else:
644
  small_img = img_np
645
 
646
- # Use faster edge detection
647
- edges = cv2.Canny(small_img, 50, 150, L2gradient=False)
 
648
  edge_ratio = np.count_nonzero(edges) / edges.size
649
 
650
  # 3. Fast histogram approximation using bins
651
  # Instead of calculating full histogram, use bins for dark and light regions
652
- dark_mask = img_np < 50
653
- light_mask = img_np > 200
 
654
 
655
  dark_ratio = np.count_nonzero(dark_mask) / img_np.size
656
  light_ratio = np.count_nonzero(light_mask) / img_np.size
657
 
 
 
 
 
 
 
 
 
 
 
 
 
 
658
  # Combine heuristics for final decision
659
  # Documents typically have both dark (text) and light (background) regions,
660
  # and/or well-defined edges
661
- return (dark_ratio > 0.05 and light_ratio > 0.3) or edge_ratio > 0.04
 
662
 
663
  # Removed caching to fix unhashable type error
664
  def preprocess_document_image(img: Image.Image) -> Image.Image:
@@ -678,7 +676,8 @@ def preprocess_document_image(img: Image.Image) -> Image.Image:
678
 
679
  def _preprocess_document_image_impl() -> Image.Image:
680
  """
681
- Optimized implementation of document preprocessing with adaptive processing based on image size
 
682
  """
683
  # Fast path: Get image from thread-local storage
684
  if not hasattr(preprocess_document_image, "_current_img"):
@@ -690,94 +689,162 @@ def _preprocess_document_image_impl() -> Image.Image:
690
  width, height = img.size
691
  img_size = width * height
692
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
693
  # Ultra-fast path for tiny images - just convert to grayscale with contrast enhancement
694
  if img_size < 300000: # ~500x600 or smaller
695
  gray = img.convert('L')
 
 
696
  enhancer = ImageEnhance.Contrast(gray)
697
- return enhancer.enhance(IMAGE_PREPROCESSING["enhance_contrast"])
698
 
699
  # Fast path for small images - minimal processing
700
  if img_size < 1000000: # ~1000x1000 or smaller
701
  gray = img.convert('L')
 
 
702
  enhancer = ImageEnhance.Contrast(gray)
703
- enhanced = enhancer.enhance(IMAGE_PREPROCESSING["enhance_contrast"])
 
704
  # Light sharpening only if sharpen is enabled
 
705
  if IMAGE_PREPROCESSING["sharpen"]:
706
- enhanced = enhanced.filter(ImageFilter.SHARPEN)
 
 
 
 
707
  return enhanced
708
 
709
  # Standard path for medium images
710
  # Convert to grayscale (faster processing)
711
  gray = img.convert('L')
712
 
713
- # Improve contrast - key for text visibility
 
714
  enhancer = ImageEnhance.Contrast(gray)
715
- enhanced = enhancer.enhance(IMAGE_PREPROCESSING["enhance_contrast"])
716
 
717
- # Apply light sharpening for text clarity
718
  if IMAGE_PREPROCESSING["sharpen"]:
719
- enhanced = enhanced.filter(ImageFilter.SHARPEN)
 
 
 
 
720
 
721
- # Advanced processing for larger images or when OpenCV is available
722
- # The following optimizations improve OCR accuracy significantly for complex documents
723
- if img_size > 1500000 and CV2_AVAILABLE and IMAGE_PREPROCESSING["denoise"]:
724
  try:
725
  # Convert to numpy array for OpenCV processing
726
  img_np = np.array(enhanced)
727
 
728
- # Optimize denoising parameters based on image size
729
- if img_size > 4000000: # Very large images (~2000x2000 or larger)
730
- # More aggressive downsampling for very large images
731
- scale_factor = 0.5
732
- downsample = cv2.resize(img_np, None, fx=scale_factor, fy=scale_factor,
733
- interpolation=cv2.INTER_AREA)
 
 
 
 
 
 
 
 
 
734
 
735
- # Lighter denoising for downsampled image
736
- h_value = 7 # Strength parameter
737
- template_window = 5
738
- search_window = 13
739
 
740
- # Apply denoising on smaller image
741
- denoised_np = cv2.fastNlMeansDenoising(downsample, None, h_value, template_window, search_window)
742
 
743
- # Resize back to original size
744
- denoised_np = cv2.resize(denoised_np, (width, height), interpolation=cv2.INTER_LINEAR)
745
  else:
746
- # Direct denoising for medium-large images
747
- h_value = 8 # Balanced for speed and quality
748
- template_window = 5
749
- search_window = 15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
750
 
751
- # Apply denoising
752
- denoised_np = cv2.fastNlMeansDenoising(img_np, None, h_value, template_window, search_window)
753
-
754
- # Convert back to PIL Image
755
- enhanced = Image.fromarray(denoised_np)
756
-
757
- # Apply adaptive thresholding only if it improves text visibility
758
- # Create a binarized version of the image
759
- if img_size < 8000000: # Skip for extremely large images to save processing time
760
- binary = cv2.adaptiveThreshold(denoised_np, 255,
761
- cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
762
- cv2.THRESH_BINARY, 11, 2)
763
 
764
- # Quick verification that binarization preserves text information
765
- # Use simplified check that works well for document images
766
- white_pixels_binary = np.count_nonzero(binary > 200)
767
- white_pixels_orig = np.count_nonzero(denoised_np > 200)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
768
 
769
- # Check if binary preserves reasonable amount of white pixels (background)
770
- if white_pixels_binary > white_pixels_orig * 0.8:
771
- # Binarization looks good, use it
772
- return Image.fromarray(binary)
773
  except Exception as e:
774
  # If OpenCV processing fails, continue with PIL-enhanced image
775
  pass
776
 
777
  elif IMAGE_PREPROCESSING["denoise"]:
778
  # Fallback PIL denoising for systems without OpenCV
779
- # Use lighter median filter
780
- enhanced = enhanced.filter(ImageFilter.MedianFilter(3))
 
 
 
 
 
781
 
782
  # Return enhanced grayscale image
783
  return enhanced
 
9
  import zipfile
10
  import logging
11
  import numpy as np
12
+ import time
13
  from datetime import datetime
14
  from pathlib import Path
15
  from typing import Dict, List, Optional, Union, Any, Tuple
 
555
  """
556
  Optimized implementation of document type detection for faster processing.
557
  The img_hash parameter is unused but kept for backward compatibility.
558
+
559
+ Enhanced to better detect handwritten documents.
560
  """
561
  # Fast path: Get the image from thread-local storage
562
  if not hasattr(_detect_document_type_impl, "_current_img"):
 
569
  if width * height < 100000: # Approx 300x300 or smaller
570
  return False
571
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
572
  # Convert to grayscale for analysis (using faster conversion)
573
  gray_img = img.convert('L')
574
 
 
592
 
593
  # Count edge pixels using threshold (faster than summing individual pixels)
594
  edge_data = edges.getdata()
595
+ edge_threshold = 40 # Lowered threshold to better detect handwritten texts
596
 
597
  # Use list comprehension for better performance
598
  edge_count = sum(1 for p in edge_data if p > edge_threshold)
 
604
  bright_ratio = bright_count / (width * height)
605
 
606
  # Documents typically have more edges (text boundaries) and bright areas (background)
607
+ # Lowered edge threshold to better detect handwritten documents
608
+ return edge_ratio > 0.035 or bright_ratio > 0.4
609
 
610
+ # OpenCV path - optimized for speed and enhanced for handwritten documents
611
  img_np = np.array(gray_img)
612
 
 
 
613
  # 1. Fast check: Variance of pixel values
614
+ # Documents typically have high variance (text on background)
615
+ # Handwritten documents may have less contrast than printed text
616
  std_dev = np.std(img_np)
617
+ if std_dev > 45: # Lowered threshold to better detect handwritten documents
618
  return True
619
 
620
  # 2. Quick check using downsampled image for edges
 
625
  else:
626
  small_img = img_np
627
 
628
+ # Use adaptive edge detection parameters for handwritten documents
629
+ # Lowered threshold to better detect fainter handwritten text
630
+ edges = cv2.Canny(small_img, 30, 130, L2gradient=False)
631
  edge_ratio = np.count_nonzero(edges) / edges.size
632
 
633
  # 3. Fast histogram approximation using bins
634
  # Instead of calculating full histogram, use bins for dark and light regions
635
+ # Adjusted for handwritten documents which may have more gray values
636
+ dark_mask = img_np < 60 # Increased threshold to capture lighter handwritten text
637
+ light_mask = img_np > 180 # Lowered threshold to account for aged paper
638
 
639
  dark_ratio = np.count_nonzero(dark_mask) / img_np.size
640
  light_ratio = np.count_nonzero(light_mask) / img_np.size
641
 
642
+ # Special analysis for handwritten documents
643
+ # Check for line-like structures typical in handwritten text
644
+ if CV2_AVAILABLE and edge_ratio > 0.02: # Lower threshold to capture handwritten documents
645
+ # Try to find line segments that could indicate text lines
646
+ lines = cv2.HoughLinesP(edges, 1, np.pi/180,
647
+ threshold=50, # Lower threshold for detection
648
+ minLineLength=30, # Shorter lines for handwriting
649
+ maxLineGap=20) # Larger gap for discontinuous handwriting
650
+
651
+ # If we find enough line segments, it's likely a document with text
652
+ if lines is not None and len(lines) > 10:
653
+ return True
654
+
655
  # Combine heuristics for final decision
656
  # Documents typically have both dark (text) and light (background) regions,
657
  # and/or well-defined edges
658
+ # Lower thresholds for handwritten documents
659
+ return (dark_ratio > 0.03 and light_ratio > 0.25) or edge_ratio > 0.03
660
 
661
  # Removed caching to fix unhashable type error
662
  def preprocess_document_image(img: Image.Image) -> Image.Image:
 
676
 
677
  def _preprocess_document_image_impl() -> Image.Image:
678
  """
679
+ Optimized implementation of document preprocessing with adaptive processing based on image size.
680
+ Enhanced for better handwritten document processing.
681
  """
682
  # Fast path: Get image from thread-local storage
683
  if not hasattr(preprocess_document_image, "_current_img"):
 
689
  width, height = img.size
690
  img_size = width * height
691
 
692
+ # Check if the image might be a handwritten document - use special processing
693
+ is_handwritten = False
694
+ try:
695
+ # Simple check for handwritten document characteristics
696
+ # Handwritten documents often have more varied strokes and less stark contrast
697
+ if CV2_AVAILABLE:
698
+ # Convert to grayscale and calculate local variance
699
+ gray_np = np.array(img.convert('L'))
700
+ # Higher variance in edge strengths can indicate handwriting
701
+ edges = cv2.Canny(gray_np, 30, 100)
702
+ if np.count_nonzero(edges) / edges.size > 0.02: # Low edge threshold for handwriting
703
+ # Additional check with gradient magnitudes
704
+ sobelx = cv2.Sobel(gray_np, cv2.CV_64F, 1, 0, ksize=3)
705
+ sobely = cv2.Sobel(gray_np, cv2.CV_64F, 0, 1, ksize=3)
706
+ magnitude = np.sqrt(sobelx**2 + sobely**2)
707
+ # Handwriting typically has more variation in gradient magnitudes
708
+ if np.std(magnitude) > 20:
709
+ is_handwritten = True
710
+ except:
711
+ # If detection fails, assume it's not handwritten
712
+ pass
713
+
714
  # Ultra-fast path for tiny images - just convert to grayscale with contrast enhancement
715
  if img_size < 300000: # ~500x600 or smaller
716
  gray = img.convert('L')
717
+ # Lower contrast enhancement for handwritten documents
718
+ contrast_level = 1.4 if is_handwritten else IMAGE_PREPROCESSING["enhance_contrast"]
719
  enhancer = ImageEnhance.Contrast(gray)
720
+ return enhancer.enhance(contrast_level)
721
 
722
  # Fast path for small images - minimal processing
723
  if img_size < 1000000: # ~1000x1000 or smaller
724
  gray = img.convert('L')
725
+ # Use gentler contrast enhancement for handwritten documents
726
+ contrast_level = 1.4 if is_handwritten else IMAGE_PREPROCESSING["enhance_contrast"]
727
  enhancer = ImageEnhance.Contrast(gray)
728
+ enhanced = enhancer.enhance(contrast_level)
729
+
730
  # Light sharpening only if sharpen is enabled
731
+ # Use milder sharpening for handwritten documents to preserve stroke detail
732
  if IMAGE_PREPROCESSING["sharpen"]:
733
+ if is_handwritten:
734
+ # Use edge enhancement which is gentler than SHARPEN for handwriting
735
+ enhanced = enhanced.filter(ImageFilter.EDGE_ENHANCE)
736
+ else:
737
+ enhanced = enhanced.filter(ImageFilter.SHARPEN)
738
  return enhanced
739
 
740
  # Standard path for medium images
741
  # Convert to grayscale (faster processing)
742
  gray = img.convert('L')
743
 
744
+ # Adaptive contrast enhancement based on document type
745
+ contrast_level = 1.4 if is_handwritten else IMAGE_PREPROCESSING["enhance_contrast"]
746
  enhancer = ImageEnhance.Contrast(gray)
747
+ enhanced = enhancer.enhance(contrast_level)
748
 
749
+ # Apply light sharpening for text clarity - adapt based on document type
750
  if IMAGE_PREPROCESSING["sharpen"]:
751
+ if is_handwritten:
752
+ # Use edge enhancement which is gentler than SHARPEN for handwriting
753
+ enhanced = enhanced.filter(ImageFilter.EDGE_ENHANCE)
754
+ else:
755
+ enhanced = enhanced.filter(ImageFilter.SHARPEN)
756
 
757
+ # Advanced processing with OpenCV if available
758
+ if CV2_AVAILABLE and IMAGE_PREPROCESSING["denoise"]:
 
759
  try:
760
  # Convert to numpy array for OpenCV processing
761
  img_np = np.array(enhanced)
762
 
763
+ if is_handwritten:
764
+ # Special treatment for handwritten documents
765
+ # Use a bilateral filter, which preserves edges better than NLMeans
766
+ # Bilateral filtering works well for handwriting by preserving stroke details
767
+ if img_size > 3000000: # Large images - downsample first
768
+ scale_factor = 0.5
769
+ small_img = cv2.resize(img_np, None, fx=scale_factor, fy=scale_factor,
770
+ interpolation=cv2.INTER_AREA)
771
+ # Apply bilateral filter which preserves edges while smoothing
772
+ filtered = cv2.bilateralFilter(small_img, 9, 75, 75)
773
+ # Resize back
774
+ filtered = cv2.resize(filtered, (width, height), interpolation=cv2.INTER_LINEAR)
775
+ else:
776
+ # Use bilateral filter directly for smaller images
777
+ filtered = cv2.bilateralFilter(img_np, 7, 50, 50)
778
 
779
+ # Convert back to PIL Image
780
+ enhanced = Image.fromarray(filtered)
 
 
781
 
782
+ # For handwritten docs, avoid binary thresholding which can destroy subtle strokes
783
+ return enhanced
784
 
 
 
785
  else:
786
+ # Standard document processing - optimized for printed text
787
+ # Optimize denoising parameters based on image size
788
+ if img_size > 4000000: # Very large images
789
+ # More aggressive downsampling for very large images
790
+ scale_factor = 0.5
791
+ downsample = cv2.resize(img_np, None, fx=scale_factor, fy=scale_factor,
792
+ interpolation=cv2.INTER_AREA)
793
+
794
+ # Lighter denoising for downsampled image
795
+ h_value = 7 # Strength parameter
796
+ template_window = 5
797
+ search_window = 13
798
+
799
+ # Apply denoising on smaller image
800
+ denoised_np = cv2.fastNlMeansDenoising(downsample, None, h_value, template_window, search_window)
801
+
802
+ # Resize back to original size
803
+ denoised_np = cv2.resize(denoised_np, (width, height), interpolation=cv2.INTER_LINEAR)
804
+ else:
805
+ # Direct denoising for medium-large images
806
+ h_value = 8 # Balanced for speed and quality
807
+ template_window = 5
808
+ search_window = 15
809
+
810
+ # Apply denoising
811
+ denoised_np = cv2.fastNlMeansDenoising(img_np, None, h_value, template_window, search_window)
812
 
813
+ # Convert back to PIL Image
814
+ enhanced = Image.fromarray(denoised_np)
 
 
 
 
 
 
 
 
 
 
815
 
816
+ # Apply adaptive thresholding only if it improves text visibility
817
+ # Create a binarized version of the image
818
+ if img_size < 8000000: # Skip for extremely large images to save processing time
819
+ binary = cv2.adaptiveThreshold(denoised_np, 255,
820
+ cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
821
+ cv2.THRESH_BINARY, 11, 2)
822
+
823
+ # Quick verification that binarization preserves text information
824
+ # Use simplified check that works well for document images
825
+ white_pixels_binary = np.count_nonzero(binary > 200)
826
+ white_pixels_orig = np.count_nonzero(denoised_np > 200)
827
+
828
+ # Check if binary preserves reasonable amount of white pixels (background)
829
+ if white_pixels_binary > white_pixels_orig * 0.8:
830
+ # Binarization looks good, use it
831
+ return Image.fromarray(binary)
832
+
833
+ return enhanced
834
 
 
 
 
 
835
  except Exception as e:
836
  # If OpenCV processing fails, continue with PIL-enhanced image
837
  pass
838
 
839
  elif IMAGE_PREPROCESSING["denoise"]:
840
  # Fallback PIL denoising for systems without OpenCV
841
+ if is_handwritten:
842
+ # Lighter filtering for handwritten text to preserve details
843
+ # Use a smaller median filter for handwritten documents
844
+ enhanced = enhanced.filter(ImageFilter.MedianFilter(1))
845
+ else:
846
+ # Standard filtering for printed documents
847
+ enhanced = enhanced.filter(ImageFilter.MedianFilter(3))
848
 
849
  # Return enhanced grayscale image
850
  return enhanced
requirements.txt CHANGED
@@ -1,6 +1,6 @@
1
  # Generated requirements for Hugging Face Spaces deployment
2
 
3
- streamlit>=1.28.0
4
  mistralai>=0.0.3
5
  Pillow>=9.0.0
6
  opencv-python-headless>=4.5.0
 
1
  # Generated requirements for Hugging Face Spaces deployment
2
 
3
+ streamlit>=1.20.0
4
  mistralai>=0.0.3
5
  Pillow>=9.0.0
6
  opencv-python-headless>=4.5.0
ui/custom.css CHANGED
@@ -64,4 +64,36 @@
64
  font-size: 1.3rem;
65
  font-weight: bold;
66
  margin-bottom: 15px;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  }
 
64
  font-size: 1.3rem;
65
  font-weight: bold;
66
  margin-bottom: 15px;
67
+ }
68
+
69
+ /* Fix for image preprocessing preview */
70
+ .stExpander {
71
+ overflow: hidden !important;
72
+ }
73
+
74
+ .stExpander img {
75
+ max-width: 100% !important;
76
+ height: auto !important;
77
+ object-fit: contain !important;
78
+ }
79
+
80
+ /* Additional image fixes for all containers */
81
+ .document-content img,
82
+ .markdown-text-container img,
83
+ .page-text-content img,
84
+ .image-container img {
85
+ max-width: 100% !important;
86
+ height: auto !important;
87
+ object-fit: contain !important;
88
+ }
89
+
90
+ /* Responsive design rules */
91
+ @media (max-width: 768px) {
92
+ .stExpander img,
93
+ .document-content img,
94
+ .markdown-text-container img,
95
+ .page-text-content img,
96
+ .image-container img {
97
+ max-width: 95% !important;
98
+ }
99
  }