import io
import logging
import os
import tempfile

import cv2
import numpy as np
import streamlit as st
from pdf2image import convert_from_bytes
from PIL import Image, ImageEnhance, ImageFilter

# Configure logging
logger = logging.getLogger("preprocessing")
logger.setLevel(logging.INFO)

# Longest edge (in pixels) an image may keep before it is downscaled.
MAX_IMAGE_DIMENSION = 2500


@st.cache_data(ttl=24*3600, show_spinner=False)  # Cache for 24 hours
def convert_pdf_to_images(pdf_bytes, dpi=150, rotation=0):
    """Convert PDF bytes to a list of page images, optionally rotated.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        dpi: Rendering resolution handed to ``convert_from_bytes``.
        rotation: Angle in degrees passed to ``Image.rotate`` for every page.
            ``0`` (or ``None``) leaves the pages untouched.

    Returns:
        A list with one image per page. If anything fails, the error is
        reported through ``st.error`` and the logger, and an empty list is
        returned instead of raising.
    """
    # NOTE(review): the function is wrapped in st.cache_data, so the empty
    # list returned on failure is presumably cached for the TTL as well --
    # confirm that is acceptable for transient conversion errors.
    try:
        images = convert_from_bytes(pdf_bytes, dpi=dpi)

        # Apply rotation if specified
        if rotation and images:
            return [
                page.rotate(rotation, expand=True, resample=Image.BICUBIC)
                for page in images
            ]

        return images
    except Exception as e:
        st.error(f"Error converting PDF: {str(e)}")
        logger.error("PDF conversion error: %s", e)
        return []


def _flatten_to_rgb(image, log):
    """Return ``image`` in 'RGB' or 'L' mode, compositing alpha onto white."""
    source_mode = image.mode
    has_alpha = source_mode in ("RGBA", "LA") or (
        source_mode == "P" and "transparency" in image.info
    )

    if has_alpha:
        # Composite onto a white background so transparent regions become
        # paper-white instead of whatever colour lies under the alpha channel.
        rgba = image if source_mode == "RGBA" else image.convert("RGBA")
        background = Image.new('RGB', rgba.size, (255, 255, 255))
        background.paste(rgba, mask=rgba.split()[3])  # 3 is the alpha channel
        log.info("Converted %s image to RGB", source_mode)
        return background

    if source_mode not in ('RGB', 'L'):
        # Convert other modes to RGB as well. The mode is captured before the
        # conversion so the log reports the real source mode.
        converted = image.convert('RGB')
        log.info("Converted %s image to RGB", source_mode)
        return converted

    return image


def _denoise(img_array, document_type):
    """Apply non-local-means denoising tuned for the given document type."""
    is_color = len(img_array.shape) == 3 and img_array.shape[2] == 3

    if document_type == "handwritten":
        # Very light denoising for handwritten documents to preserve pen strokes
        if is_color:
            return cv2.fastNlMeansDenoisingColored(img_array, None, 3, 3, 5, 9)
        return cv2.fastNlMeansDenoising(img_array, None, 3, 7, 21)

    # Standard denoising for printed documents
    if is_color:
        return cv2.fastNlMeansDenoisingColored(img_array, None, 5, 5, 7, 21)
    return cv2.fastNlMeansDenoising(img_array, None, 5, 7, 21)


@st.cache_data(ttl=24*3600, show_spinner=False, hash_funcs={dict: lambda x: str(sorted(x.items()))})
def preprocess_image(image_bytes, preprocessing_options):
    """Preprocess an image with the selected options and return JPEG bytes.

    Steps, in order: flatten transparency / unusual modes to RGB, rotate,
    downscale so the longest edge is at most ``MAX_IMAGE_DIMENSION``, then
    apply the optional grayscale, contrast and denoise stages.

    Args:
        image_bytes: Encoded image data readable by ``PIL.Image.open``.
        preprocessing_options: Dict of options. Keys read here:
            ``rotation`` (degrees, default 0), ``document_type``
            (default ``"standard"``; ``"handwritten"`` selects gentler
            settings), ``grayscale`` (bool), ``contrast`` (number, mapped to
            an enhancement factor of ``1 + contrast / 100``), ``denoise``
            (bool).

    Returns:
        The processed image encoded as JPEG bytes. If encoding the processed
        image fails, the most recent PIL-level image (after rotation, resize
        and contrast) is encoded instead.
    """
    # Dedicated logger for this function; named `log` so it does not shadow
    # the module-level `logger`.
    log = logging.getLogger("image_preprocessor")
    log.setLevel(logging.INFO)

    # Log which preprocessing options are being applied
    log.info("Preprocessing image with options: %s", preprocessing_options)

    # Convert bytes to PIL Image
    image = Image.open(io.BytesIO(image_bytes))
    original_mode = image.mode  # remembered for the completion log below
    image = _flatten_to_rgb(image, log)

    # Apply rotation if specified (an explicit None counts as "no rotation")
    rotation_degrees = preprocessing_options.get("rotation", 0) or 0
    if rotation_degrees != 0:
        image = image.rotate(rotation_degrees, expand=True, resample=Image.BICUBIC)

    # Resize large images while preserving details important for OCR
    width, height = image.size
    max_dimension = max(width, height)
    if max_dimension > MAX_IMAGE_DIMENSION:
        scale_factor = MAX_IMAGE_DIMENSION / max_dimension
        # Clamp to 1px so extreme aspect ratios cannot produce a 0-sized edge.
        new_width = max(1, int(width * scale_factor))
        new_height = max(1, int(height * scale_factor))
        # Use LANCZOS for better quality preservation
        image = image.resize((new_width, new_height), Image.LANCZOS)

    img_array = np.array(image)

    # Apply preprocessing based on selected options with settings optimized
    # for historical documents
    document_type = preprocessing_options.get("document_type", "standard")

    # Process grayscale option first as it's a common foundation
    if preprocessing_options.get("grayscale", False):
        if len(img_array.shape) == 3:  # Only convert if it's not already grayscale
            img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
            if document_type == "handwritten":
                # Apply adaptive histogram equalization to enhance handwriting
                clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
                img_array = clahe.apply(img_array)

            # Convert back to RGB for further processing
            img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)

    contrast = preprocessing_options.get("contrast", 0) or 0
    if contrast != 0:
        contrast_factor = 1 + (contrast / 100)
        image = ImageEnhance.Contrast(Image.fromarray(img_array)).enhance(contrast_factor)
        img_array = np.array(image)

    if preprocessing_options.get("denoise", False):
        try:
            img_array = _denoise(img_array, document_type)
        except Exception as e:
            # Best effort: keep the un-denoised array and carry on.
            log.error("Denoising error: %s, falling back to standard processing", e)

    # Convert back to PIL Image
    processed_image = Image.fromarray(img_array)

    # Higher quality for OCR processing
    byte_io = io.BytesIO()
    try:
        # Make sure the image is in RGB mode before saving as JPEG
        if processed_image.mode not in ('RGB', 'L'):
            processed_image = processed_image.convert('RGB')

        processed_image.save(byte_io, format='JPEG', quality=92, optimize=True)
        processed_bytes = byte_io.getvalue()

        log.info(
            "Preprocessing complete. Original image mode: %s, processed mode: %s",
            original_mode,
            processed_image.mode,
        )
        log.info(
            "Original size: %.1fKB, processed size: %.1fKB",
            len(image_bytes) / 1024,
            len(processed_bytes) / 1024,
        )

        return processed_bytes
    except Exception as e:
        log.error("Error saving processed image: %s", e)
        # Fallback to original image
        log.info("Using original image as fallback")
        image_io = io.BytesIO()
        image.save(image_io, format='JPEG', quality=92)
        return image_io.getvalue()


def create_temp_file(content, suffix, temp_file_paths):
    """Write ``content`` to a new temporary file and track it for cleanup.

    Args:
        content: Bytes to write to the file.
        suffix: File name suffix (e.g. an extension) for the temporary file.
        temp_file_paths: List that the new path is appended to so the caller
            can delete the file later; the file is created with
            ``delete=False`` and is not removed automatically.

    Returns:
        The path of the temporary file.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        temp_path = tmp.name
        # Track the file before writing so it is still cleaned up if the
        # write itself fails.
        temp_file_paths.append(temp_path)
        tmp.write(content)

    logger.info("Created temporary file: %s", temp_path)
    return temp_path


def apply_preprocessing_to_file(file_bytes, file_ext, preprocessing_options, temp_file_paths):
    """Save a file to a temp path, preprocessing it first when requested.

    Preprocessing runs when any boolean option is set or any non-boolean
    option differs from its default.

    Args:
        file_bytes: Raw bytes of the uploaded file.
        file_ext: Suffix used for the temporary file.
        preprocessing_options: Options dict forwarded to ``preprocess_image``.
        temp_file_paths: List that collects temp paths for later cleanup.

    Returns:
        A ``(temp_path, was_preprocessed)`` tuple.
    """
    # Check if any preprocessing options with boolean values are True, or if
    # any non-boolean values are non-default
    has_preprocessing = (
        preprocessing_options.get("grayscale", False) or
        preprocessing_options.get("denoise", False) or
        preprocessing_options.get("contrast", 0) != 0 or
        preprocessing_options.get("rotation", 0) != 0 or
        preprocessing_options.get("document_type", "standard") != "standard"
    )

    if has_preprocessing:
        # NOTE(review): preprocess_image always returns JPEG-encoded bytes,
        # yet the temp file keeps `file_ext` -- confirm downstream consumers
        # do not infer the format from the extension.
        payload = preprocess_image(file_bytes, preprocessing_options)
        was_preprocessed = True
    else:
        # No preprocessing needed, just save the original file
        payload = file_bytes
        was_preprocessed = False

    temp_path = create_temp_file(payload, file_ext, temp_file_paths)
    return temp_path, was_preprocessed