Spaces:
Running
Running
""" | |
File utility functions for historical OCR processing. | |
""" | |
import base64 | |
import logging | |
from pathlib import Path | |
# Module-level logger shared by the helpers in this file. It is registered
# under the fixed name "utils" and its threshold is set to INFO.
logger = logging.getLogger("utils")
logger.setLevel(logging.INFO)
def get_base64_from_image(image_path):
    """
    Encode a file on disk as a base64 data URL.

    The MIME type is picked from the file extension (case-insensitive):
    .png, .gif, .jpg/.jpeg and .pdf are recognised; every other extension
    is labelled image/jpeg.

    Args:
        image_path: Path to the file to encode

    Returns:
        A "data:<mime>;base64,<payload>" string, or an empty string when the
        file could not be read or encoded (the error is logged)
    """
    known_types = {
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.pdf': 'application/pdf',
    }
    try:
        source = Path(image_path)
        # Unrecognised extensions fall back to JPEG
        content_type = known_types.get(source.suffix.lower(), 'image/jpeg')

        # Read the raw bytes, then build the data URL from them
        with open(source, "rb") as handle:
            raw = handle.read()
        payload = base64.b64encode(raw).decode('utf-8')
        return f"data:{content_type};base64,{payload}"
    except Exception as e:
        logger.error(f"Error encoding file to base64: {str(e)}")
        return ""
def get_base64_from_bytes(file_bytes, mime_type=None, file_name=None):
    """
    Get base64 data URL from file bytes with proper MIME type.

    When mime_type is missing (None or an empty string) the type is derived
    from the extension of file_name (case-insensitive; .png, .gif,
    .jpg/.jpeg and .pdf are recognised). If that is not possible either,
    image/jpeg is used.

    Args:
        file_bytes: Binary file data
        mime_type: MIME type of the file (optional)
        file_name: Original file name for MIME type detection (optional)

    Returns:
        Base64 data URL with appropriate MIME type prefix, or an empty string
        when encoding fails (the error is logged)
    """
    extension_types = {
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.pdf': 'application/pdf',
    }
    try:
        # An empty mime_type would produce "data:;base64,...", which data-URL
        # consumers interpret as text/plain, so treat it like a missing value.
        if not mime_type:
            suffix = ''
            if file_name is not None:
                suffix = Path(file_name).suffix.lower()
            # Default to image/jpeg instead of application/octet-stream
            # to ensure compatibility with Mistral AI OCR API
            mime_type = extension_types.get(suffix, 'image/jpeg')

        # Encode and create data URL
        encoded = base64.b64encode(file_bytes).decode('utf-8')
        return f"data:{mime_type};base64,{encoded}"
    except Exception as e:
        logger.error(f"Error encoding bytes to base64: {str(e)}")
        return ""
def handle_temp_files(temp_file_paths):
    """
    Clean up temporary files.

    Removal is best-effort: a path that no longer exists is skipped silently,
    and any other failure is logged as a warning without interrupting the
    remaining removals.

    Args:
        temp_file_paths: List of temporary file paths to clean up. None is
            accepted and means nothing to do; a single path (str, bytes or
            os.PathLike) is treated as a one-element list.
    """
    import os

    if temp_file_paths is None:
        return

    # A lone path string is itself iterable, character by character. Wrap it
    # so the loop cannot try to delete "/", "t", "m", "p", ... individually.
    if isinstance(temp_file_paths, (str, bytes, os.PathLike)):
        temp_file_paths = [temp_file_paths]

    for temp_path in temp_file_paths:
        try:
            # Attempt the removal directly instead of checking exists() first,
            # which leaves a window for the file to vanish in between.
            os.unlink(temp_path)
        except FileNotFoundError:
            # Already gone: nothing to clean up and nothing worth reporting
            continue
        except Exception as e:
            logger.warning(f"Failed to remove temporary file {temp_path}: {str(e)}")
        else:
            logger.info(f"Removed temporary file: {temp_path}")