""" File utility functions for historical OCR processing. """ import base64 import logging from pathlib import Path # Configure logging logger = logging.getLogger("utils") logger.setLevel(logging.INFO) def get_base64_from_image(image_path): """ Get base64 data URL from image file with proper MIME type. Args: image_path: Path to the image file Returns: Base64 data URL with appropriate MIME type prefix """ try: # Convert to Path object for better handling path_obj = Path(image_path) # Determine mime type based on file extension mime_type = 'image/jpeg' # Default mime type suffix = path_obj.suffix.lower() if suffix == '.png': mime_type = 'image/png' elif suffix == '.gif': mime_type = 'image/gif' elif suffix in ['.jpg', '.jpeg']: mime_type = 'image/jpeg' elif suffix == '.pdf': mime_type = 'application/pdf' # Read and encode file with open(path_obj, "rb") as file: encoded = base64.b64encode(file.read()).decode('utf-8') return f"data:{mime_type};base64,{encoded}" except Exception as e: logger.error(f"Error encoding file to base64: {str(e)}") return "" def get_base64_from_bytes(file_bytes, mime_type=None, file_name=None): """ Get base64 data URL from file bytes with proper MIME type. Args: file_bytes: Binary file data mime_type: MIME type of the file (optional) file_name: Original file name for MIME type detection (optional) Returns: Base64 data URL with appropriate MIME type prefix """ try: # Determine mime type if not provided if mime_type is None and file_name is not None: # Get file extension suffix = Path(file_name).suffix.lower() if suffix == '.png': mime_type = 'image/png' elif suffix == '.gif': mime_type = 'image/gif' elif suffix in ['.jpg', '.jpeg']: mime_type = 'image/jpeg' elif suffix == '.pdf': mime_type = 'application/pdf' else: # Default to image/jpeg for unknown types when processing images mime_type = 'image/jpeg' elif mime_type is None: # Default MIME type if we can't determine it - use image/jpeg instead of application/octet-stream # to ensure compatibility with Mistral AI OCR API mime_type = 'image/jpeg' # Encode and create data URL encoded = base64.b64encode(file_bytes).decode('utf-8') return f"data:{mime_type};base64,{encoded}" except Exception as e: logger.error(f"Error encoding bytes to base64: {str(e)}") return "" def handle_temp_files(temp_file_paths): """ Clean up temporary files Args: temp_file_paths: List of temporary file paths to clean up """ import os for temp_path in temp_file_paths: try: if os.path.exists(temp_path): os.unlink(temp_path) logger.info(f"Removed temporary file: {temp_path}") except Exception as e: logger.warning(f"Failed to remove temporary file {temp_path}: {str(e)}")