Spaces:

milwright
/

historical-ocr

Running

File size: 3,108 Bytes

59aaeae
c04ffe5
 
59aaeae
 
c04ffe5
59aaeae
 
 
c04ffe5
59aaeae
 
aabc02c
c04ffe5
aabc02c
 
c04ffe5
 
 
 
 
 
 
59aaeae
 
c04ffe5
59aaeae
aabc02c
 
 
 
59aaeae
 
 
c04ffe5
59aaeae
 
 
 
 
 
 
 
 
 
 
 
 
 
aabc02c
 
 
 
 
 
 
 
 
 
 
 
59aaeae
 
aabc02c
 
59aaeae
c04ffe5
59aaeae
c04ffe5
59aaeae
 
c04ffe5
59aaeae
 
 
c04ffe5
59aaeae
c04ffe5
 
 
59aaeae
 
c04ffe5
59aaeae
c04ffe5
 
59aaeae
c04ffe5
 
59aaeae
c04ffe5
 
 
59aaeae
c04ffe5
 
59aaeae
c04ffe5

"""
OCR utility functions for image processing and OCR operations.
This module provides helper functions used across the Historical OCR application.
"""

import os
import base64
import logging
from pathlib import Path
from typing import Union, Optional

# Logging setup: timestamped records at INFO level, plus a module-scoped logger.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)
logger = logging.getLogger(__name__)

# Optional dependency: pytesseract drives the local OCR fallback.
try:
    import pytesseract
except ImportError:
    TESSERACT_AVAILABLE = False
    logger.warning("pytesseract not available - local OCR fallback will not work")
else:
    TESSERACT_AVAILABLE = True

# Optional dependency: Pillow supplies the Image class used to open files.
try:
    from PIL import Image
except ImportError:
    PILLOW_AVAILABLE = False
    logger.warning("PIL not available - image preprocessing will be limited")
else:
    PILLOW_AVAILABLE = True


# Lower-cased file suffix -> MIME type advertised in the generated data URL.
_MIME_TYPES_BY_SUFFIX = {
    '.png': 'image/png',
    '.gif': 'image/gif',
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.pdf': 'application/pdf',
    '.webp': 'image/webp',
    '.bmp': 'image/bmp',
    '.tif': 'image/tiff',
    '.tiff': 'image/tiff',
}


def encode_image_for_api(image_path: Union[str, Path]) -> str:
    """
    Encode a file as a base64 data URL for API submission with proper MIME type.

    The MIME type is chosen from the file suffix (case-insensitive). Suffixes
    that are not recognised fall back to 'image/jpeg'.

    Args:
        image_path: Path to the image (or PDF) file, as a string or path object

    Returns:
        Data URL of the form "data:<mime type>;base64,<payload>"

    Raises:
        FileNotFoundError: If image_path does not point to an existing file
    """
    # Path() accepts str as well as any os.PathLike, so normalise unconditionally
    image_file = Path(image_path)

    # Verify image exists
    if not image_file.is_file():
        raise FileNotFoundError(f"Image file not found: {image_file}")

    # Determine mime type based on file extension, defaulting to JPEG
    mime_type = _MIME_TYPES_BY_SUFFIX.get(image_file.suffix.lower(), 'image/jpeg')

    # Base64 output is pure ASCII, so decoding to str is lossless
    encoded = base64.b64encode(image_file.read_bytes()).decode('ascii')
    return f"data:{mime_type};base64,{encoded}"


def try_local_ocr_fallback(file_path: Union[str, Path], base64_data_url: Optional[str] = None) -> Optional[str]:
    """
    Try to perform OCR using local Tesseract as a fallback when the API is unavailable.

    This is a best-effort helper: any failure is logged and reported as None
    rather than raised.

    Args:
        file_path: Path to the image file
        base64_data_url: Optional base64 data URL if already available.
            Currently unused; the image is always read from file_path.

    Returns:
        Extracted text, or None if dependencies are missing, OCR failed,
        or the result contained no non-whitespace characters
    """
    if not TESSERACT_AVAILABLE or not PILLOW_AVAILABLE:
        logger.warning("Local OCR fallback is not available (missing dependencies)")
        return None

    try:
        logger.info("Using local Tesseract OCR as fallback")

        # Open via a context manager so the underlying file handle is
        # released even when Tesseract raises.
        with Image.open(file_path) as img:
            text = pytesseract.image_to_string(img)
    except Exception as e:
        # Deliberate catch-all: the fallback must never crash the caller.
        logger.error("Error using local OCR fallback: %s", e)
        return None

    # Whitespace-only output (e.g. a lone newline or form feed) is not usable text.
    if text and text.strip():
        logger.info("Successfully extracted text using local Tesseract OCR")
        return text

    logger.warning("Tesseract extracted no text")
    return None