historical-ocr / utils /file_utils.py
milwright's picture
Rolling out modular v2
c04ffe5
"""
File utility functions for historical OCR processing.
"""
import base64
import logging
from pathlib import Path
# Module-level logger, registered under the fixed name "utils".
# Its threshold is INFO, so DEBUG records sent to this logger are ignored.
logger = logging.getLogger("utils")
logger.setLevel(logging.INFO)
def get_base64_from_image(image_path):
    """
    Encode a file on disk as a base64 data URL.

    The MIME type is chosen from the file extension (case-insensitive).
    PNG, GIF, JPEG and PDF are recognised; every other extension, or no
    extension at all, is labelled ``image/jpeg``.

    Args:
        image_path: Path to the file to encode

    Returns:
        A ``data:<mime type>;base64,<payload>`` string, or an empty string
        if anything goes wrong while resolving, reading or encoding the file
        (the error is logged)
    """
    # Extension -> MIME type for the formats given an explicit label.
    known_types = {
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.pdf': 'application/pdf',
    }
    try:
        source = Path(image_path)
        # Anything not in the table falls back to JPEG.
        mime_type = known_types.get(source.suffix.lower(), 'image/jpeg')
        with open(source, "rb") as handle:
            raw = handle.read()
        payload = base64.b64encode(raw).decode('utf-8')
        return f"data:{mime_type};base64,{payload}"
    except Exception as e:
        logger.error(f"Error encoding file to base64: {str(e)}")
        return ""
def get_base64_from_bytes(file_bytes, mime_type=None, file_name=None):
    """
    Encode in-memory file data as a base64 data URL.

    An explicit ``mime_type`` is used as given. Otherwise the type is taken
    from the extension of ``file_name`` (case-insensitive; PNG, GIF, JPEG and
    PDF are recognised). When neither yields a type, ``image/jpeg`` is used.

    Args:
        file_bytes: Binary file data
        mime_type: MIME type of the data (optional)
        file_name: Original file name, consulted only when ``mime_type`` is
            not given (optional)

    Returns:
        A ``data:<mime type>;base64,<payload>`` string, or an empty string
        if the type lookup or the encoding fails (the error is logged)
    """
    try:
        if mime_type is None:
            if file_name is None:
                # Nothing to go on: use image/jpeg rather than
                # application/octet-stream to ensure compatibility with the
                # Mistral AI OCR API.
                mime_type = 'image/jpeg'
            else:
                extension = Path(file_name).suffix.lower()
                by_extension = {
                    '.png': 'image/png',
                    '.gif': 'image/gif',
                    '.jpg': 'image/jpeg',
                    '.jpeg': 'image/jpeg',
                    '.pdf': 'application/pdf',
                }
                # Unknown extensions are treated as JPEG images.
                mime_type = by_extension.get(extension, 'image/jpeg')
        payload = base64.b64encode(file_bytes).decode('utf-8')
        return f"data:{mime_type};base64,{payload}"
    except Exception as e:
        logger.error(f"Error encoding bytes to base64: {str(e)}")
        return ""
def handle_temp_files(temp_file_paths):
    """
    Clean up temporary files.

    Removal is best-effort: a path that cannot be deleted is logged as a
    warning and the remaining paths are still processed. Paths that do not
    exist are skipped without logging.

    Args:
        temp_file_paths: Iterable of temporary file paths to clean up.
            ``None`` means there is nothing to clean up. A single path given
            as ``str``, ``bytes`` or ``os.PathLike`` is treated as a
            one-element list.
    """
    import os

    if temp_file_paths is None:
        return

    # A lone string would otherwise be iterated character by character, with
    # every character treated as a relative path to delete; a lone Path is
    # not iterable at all.
    if isinstance(temp_file_paths, (str, bytes, os.PathLike)):
        temp_file_paths = [temp_file_paths]

    for temp_path in temp_file_paths:
        try:
            if os.path.exists(temp_path):
                os.unlink(temp_path)
                logger.info(f"Removed temporary file: {temp_path}")
        except Exception as e:
            # Keep going: one undeletable file must not block the rest.
            logger.warning(f"Failed to remove temporary file {temp_path}: {str(e)}")