Spaces:

milwright
/

historical-ocr

Running

App Files Files Community

historical-ocr / preprocessing.py

milwright

Reconcile main with preview-improvements branch, implementing modular structure, raw text editing, and enhanced framing

7647e70 about 1 month ago

raw

history blame

8.23 kB

	import os
	import io
	import cv2
	import numpy as np
	import tempfile
	from PIL import Image, ImageEnhance, ImageFilter
	from pdf2image import convert_from_bytes
	import streamlit as st
	import logging

	# Module-wide logger, registered under the name "preprocessing" at INFO level
	logger = logging.getLogger("preprocessing")
	logger.setLevel(level=logging.INFO)

	@st.cache_data(ttl=24*3600, show_spinner=False) # Cache for 24 hours
	def convert_pdf_to_images(pdf_bytes, dpi=150, rotation=0):
	    """Render the pages of a PDF, supplied as bytes, into a list of images.

	    Args:
	        pdf_bytes: Raw PDF content handed to ``convert_from_bytes``.
	        dpi: Rendering resolution passed through to ``convert_from_bytes``.
	        rotation: Degrees to rotate every page by; 0 leaves pages untouched.

	    Returns:
	        list: The rendered (and, if requested, rotated) page images. An
	        empty list is returned when conversion or rotation raises; the
	        error is shown with ``st.error`` and logged.
	    """
	    try:
	        pages = convert_from_bytes(pdf_bytes, dpi=dpi)

	        # Nothing to rotate: hand the rendered pages back unchanged
	        if rotation == 0 or not pages:
	            return pages

	        # expand=True grows the canvas so rotated corners are not cropped
	        return [
	            page.rotate(rotation, expand=True, resample=Image.BICUBIC)
	            for page in pages
	        ]
	    except Exception as e:
	        st.error(f"Error converting PDF: {str(e)}")
	        logger.error(f"PDF conversion error: {str(e)}")
	        return []

	@st.cache_data(ttl=24*3600, show_spinner=False, hash_funcs={dict: lambda x: str(sorted(x.items()))})
	def preprocess_image(image_bytes, preprocessing_options):
	    """Preprocess an encoded image for historical-document OCR and return JPEG bytes.

	    Steps, in order: convert to RGB (8-bit grayscale input is kept as is),
	    optional rotation, downscale so the longest side is at most 2500 px,
	    optional grayscale conversion (plus CLAHE for handwritten documents),
	    optional contrast adjustment, optional non-local-means denoising, and
	    JPEG encoding at quality 92.

	    Args:
	        image_bytes: Encoded image data readable by ``PIL.Image.open``.
	        preprocessing_options: Mapping with the optional keys ``rotation``
	            (degrees, default 0), ``document_type`` (default ``"standard"``;
	            ``"handwritten"`` selects gentler settings), ``grayscale``
	            (bool), ``contrast`` (percent offset, 0 = unchanged) and
	            ``denoise`` (bool).

	    Returns:
	        bytes: The JPEG-encoded result. If encoding the processed array
	        fails, the most recent intermediate PIL image is encoded instead.
	    """
	    # Setup basic console logging
	    logger = logging.getLogger("image_preprocessor")
	    logger.setLevel(logging.INFO)

	    # Log which preprocessing options are being applied
	    logger.info(f"Preprocessing image with options: {preprocessing_options}")

	    # Convert bytes to PIL Image
	    image = Image.open(io.BytesIO(image_bytes))
	    # Remember the decoded mode now: `image` is rebound by the steps below,
	    # so reading image.mode later would report the converted mode instead
	    original_mode = image.mode

	    if image.mode == 'RGBA':
	        # Flatten transparency by compositing onto a white background
	        background = Image.new('RGB', image.size, (255, 255, 255))
	        background.paste(image, mask=image.split()[3])  # 3 is the alpha channel
	        image = background
	        logger.info("Converted RGBA image to RGB")
	    elif image.mode not in ('RGB', 'L'):
	        # Convert other modes to RGB as well
	        image = image.convert('RGB')
	        logger.info(f"Converted {original_mode} image to RGB")

	    # Apply rotation if specified
	    rotation_degrees = preprocessing_options.get("rotation", 0)
	    if rotation_degrees != 0:
	        image = image.rotate(rotation_degrees, expand=True, resample=Image.BICUBIC)

	    # Resize large images while preserving details important for OCR
	    max_side = 2500
	    width, height = image.size
	    max_dimension = max(width, height)
	    if max_dimension > max_side:
	        scale_factor = max_side / max_dimension
	        # Clamp to 1 px: on very elongated images int() could otherwise
	        # round the short side down to 0, which resize() rejects
	        new_width = max(1, int(width * scale_factor))
	        new_height = max(1, int(height * scale_factor))
	        # Use LANCZOS for better quality preservation
	        image = image.resize((new_width, new_height), Image.LANCZOS)

	    img_array = np.array(image)

	    # Settings below are chosen per document type
	    document_type = preprocessing_options.get("document_type", "standard")
	    is_handwritten = document_type == "handwritten"

	    # Grayscale first; skipped when the array is already single-channel
	    if preprocessing_options.get("grayscale", False) and len(img_array.shape) == 3:
	        img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
	        if is_handwritten:
	            # Adaptive histogram equalization to enhance handwriting
	            clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
	            img_array = clahe.apply(img_array)
	        # Convert back to RGB for further processing
	        img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)

	    contrast = preprocessing_options.get("contrast", 0)
	    if contrast != 0:
	        # The option is a percent offset around the neutral factor 1.0
	        contrast_factor = 1 + (contrast / 100)
	        image = ImageEnhance.Contrast(Image.fromarray(img_array)).enhance(contrast_factor)
	        img_array = np.array(image)

	    if preprocessing_options.get("denoise", False):
	        is_color = len(img_array.shape) == 3 and img_array.shape[2] == 3
	        try:
	            if is_color:
	                # (h, hColor, templateWindowSize, searchWindowSize);
	                # lighter values for handwriting to preserve pen strokes
	                params = (3, 3, 5, 9) if is_handwritten else (5, 5, 7, 21)
	                img_array = cv2.fastNlMeansDenoisingColored(img_array, None, *params)
	            else:
	                # (h, templateWindowSize, searchWindowSize)
	                params = (3, 7, 21) if is_handwritten else (5, 7, 21)
	                img_array = cv2.fastNlMeansDenoising(img_array, None, *params)
	        except Exception as e:
	            # Best effort: continue with the un-denoised array
	            logger.error(f"Denoising error: {str(e)}, falling back to standard processing")

	    # Convert back to PIL Image
	    processed_image = Image.fromarray(img_array)

	    # Higher quality for OCR processing
	    byte_io = io.BytesIO()
	    try:
	        # Make sure the image is in a mode JPEG can store before saving
	        if processed_image.mode not in ('RGB', 'L'):
	            processed_image = processed_image.convert('RGB')

	        processed_image.save(byte_io, format='JPEG', quality=92, optimize=True)
	        processed_bytes = byte_io.getvalue()

	        logger.info(f"Preprocessing complete. Original image mode: {original_mode}, processed mode: {processed_image.mode}")
	        logger.info(f"Original size: {len(image_bytes)/1024:.1f}KB, processed size: {len(processed_bytes)/1024:.1f}KB")

	        return processed_bytes
	    except Exception as e:
	        logger.error(f"Error saving processed image: {str(e)}")
	        # Fall back to the last PIL image held in `image` (already converted,
	        # rotated and resized, and contrast-adjusted if that step ran)
	        logger.info("Using original image as fallback")
	        image_io = io.BytesIO()
	        image.save(image_io, format='JPEG', quality=92)
	        return image_io.getvalue()

	def create_temp_file(content, suffix, temp_file_paths):
	    """Write ``content`` to a new temporary file and track it for cleanup.

	    Args:
	        content: Bytes to write; the temporary file is opened in binary mode.
	        suffix: File-name suffix given to the temporary file.
	        temp_file_paths: List that the new file's path is appended to so the
	            file can be removed later.

	    Returns:
	        str: Path of the created temporary file.

	    Raises:
	        Whatever ``write`` raises (e.g. ``TypeError`` for non-bytes content).
	        The path is still present in ``temp_file_paths`` in that case.
	    """
	    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
	        temp_path = tmp.name
	        # Track the path before writing: delete=False leaves the file on
	        # disk, so a failed write must not leave an untracked file behind
	        temp_file_paths.append(temp_path)
	        tmp.write(content)
	    # Same logger object as the module-level "preprocessing" logger
	    logging.getLogger("preprocessing").info(f"Created temporary file: {temp_path}")
	    return temp_path

	def apply_preprocessing_to_file(file_bytes, file_ext, preprocessing_options, temp_file_paths):
	    """Write the file, preprocessed if requested, to a temp file and return its path.

	    Preprocessing runs when ``grayscale`` or ``denoise`` is truthy, when
	    ``contrast`` or ``rotation`` is non-zero, or when ``document_type`` is
	    anything other than ``"standard"``.

	    Args:
	        file_bytes: Raw content of the input file.
	        file_ext: Suffix used for the temporary file when the content is
	            written unchanged.
	        preprocessing_options: Mapping of preprocessing settings (see above).
	        temp_file_paths: List collecting created temp-file paths for cleanup.

	    Returns:
	        tuple: ``(temp_path, was_preprocessed)``.
	    """
	    has_preprocessing = (
	        preprocessing_options.get("grayscale", False) or
	        preprocessing_options.get("denoise", False) or
	        preprocessing_options.get("contrast", 0) != 0 or
	        preprocessing_options.get("rotation", 0) != 0 or
	        preprocessing_options.get("document_type", "standard") != "standard"
	    )

	    if not has_preprocessing:
	        # No preprocessing needed, just save the original file
	        return create_temp_file(file_bytes, file_ext, temp_file_paths), False

	    processed_bytes = preprocess_image(file_bytes, preprocessing_options)

	    # preprocess_image always returns JPEG data, so the temp file must not
	    # keep a non-JPEG extension such as ".png"; an existing JPEG extension
	    # is passed through untouched
	    if str(file_ext).lower().lstrip(".") in ("jpg", "jpeg"):
	        suffix = file_ext
	    else:
	        suffix = ".jpg"

	    temp_path = create_temp_file(processed_bytes, suffix, temp_file_paths)
	    return temp_path, True