historical-ocr / ocr_utils.py
milwright's picture
Rolling out modular v2
c04ffe5
"""
OCR utility functions for image processing and OCR operations.
This module provides helper functions used across the Historical OCR application.
"""
import base64
import logging
import mimetypes
import os
from pathlib import Path
from typing import Union, Optional
# Set up root logging (INFO level, timestamped format) and create the
# logger used throughout this module.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Probe the optional dependencies once at import time. Each probe records its
# outcome in a module-level flag so callers can check availability up front
# instead of handling ImportError themselves.
try:
    import pytesseract
except ImportError:
    logger.warning("pytesseract not available - local OCR fallback will not work")
    TESSERACT_AVAILABLE = False
else:
    TESSERACT_AVAILABLE = True

try:
    from PIL import Image
except ImportError:
    logger.warning("PIL not available - image preprocessing will be limited")
    PILLOW_AVAILABLE = False
else:
    PILLOW_AVAILABLE = True
# Explicit suffix -> MIME type table. It is consulted before the system MIME
# database so the common formats resolve identically on every platform.
_MIME_TYPES_BY_SUFFIX = {
    '.png': 'image/png',
    '.gif': 'image/gif',
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.tif': 'image/tiff',
    '.tiff': 'image/tiff',
    '.bmp': 'image/bmp',
    '.webp': 'image/webp',
    '.pdf': 'application/pdf',
}

# Used when the extension is missing or cannot be mapped to an image type.
_DEFAULT_MIME_TYPE = 'image/jpeg'


def encode_image_for_api(image_path: Union[str, Path]) -> str:
    """
    Encode an image as base64 data URL for API submission with proper MIME type.

    The MIME type is derived from the file extension (case-insensitive):
    first from a fixed table of common formats, then from the standard
    ``mimetypes`` database (image types only), and finally defaulting to
    ``image/jpeg`` when the extension is missing or unrecognised.

    Args:
        image_path: Path to the image file (string or path-like object)
    Returns:
        Base64 data URL for the image, in the form
        ``data:<mime type>;base64,<payload>``
    Raises:
        FileNotFoundError: If image_path does not point to an existing file
    """
    # Path() accepts str and any os.PathLike, and is a no-op copy for Path.
    image_file = Path(image_path)

    # Verify image exists
    if not image_file.is_file():
        raise FileNotFoundError(f"Image file not found: {image_file}")

    # Determine mime type based on file extension
    suffix = image_file.suffix.lower()
    mime_type = _MIME_TYPES_BY_SUFFIX.get(suffix)
    if mime_type is None:
        guessed, _ = mimetypes.guess_type(f"file{suffix}")
        # Only trust image/* guesses; anything else keeps the historical
        # default so non-image extensions are not relabelled unexpectedly.
        if guessed and guessed.startswith('image/'):
            mime_type = guessed
        else:
            mime_type = _DEFAULT_MIME_TYPE

    # Encode image as base64 (the base64 alphabet is pure ASCII)
    encoded = base64.b64encode(image_file.read_bytes()).decode('ascii')
    return f"data:{mime_type};base64,{encoded}"
def try_local_ocr_fallback(file_path: Union[str, Path], base64_data_url: Optional[str] = None) -> Optional[str]:
    """
    Try to perform OCR using local Tesseract as a fallback when the API is unavailable.

    This is a best-effort helper: it never raises. Any failure is logged and
    reported to the caller as ``None``.

    Args:
        file_path: Path to the image file
        base64_data_url: Optional base64 data URL if already available.
            Not used by this function; it is accepted so existing callers
            that pass it keep working.
    Returns:
        Extracted text, or None if the optional dependencies are missing,
        OCR raised an error, or the result contained no non-whitespace text
    """
    if not (TESSERACT_AVAILABLE and PILLOW_AVAILABLE):
        logger.warning("Local OCR fallback is not available (missing dependencies)")
        return None

    try:
        logger.info("Using local Tesseract OCR as fallback")
        # Open via context manager so the underlying file handle is released
        # as soon as OCR finishes, even if Tesseract raises.
        with Image.open(file_path) as img:
            text = pytesseract.image_to_string(img)
    except Exception as e:
        # Deliberately broad: this is a last-resort fallback and must not
        # propagate errors from image decoding or the Tesseract binary.
        logger.error("Error using local OCR fallback: %s", e)
        return None

    # Tesseract can return only whitespace (e.g. a trailing form-feed page
    # separator) for a blank page; treat that the same as no text.
    if not text or not text.strip():
        logger.warning("Tesseract extracted no text")
        return None

    logger.info("Successfully extracted text using local Tesseract OCR")
    return text