# historical-ocr / process_file.py
# milwright's picture
# Rolling out modular v2
# c04ffe5
"""
Utility function for processing files with OCR in the Historical OCR Workshop app.
"""
import os
import tempfile
from pathlib import Path
from datetime import datetime
def process_file(uploaded_file, use_vision=True, processor=None, custom_prompt=None):
    """Run OCR on an uploaded file and return the results with metadata.

    The upload is staged in a temporary file on disk (keeping the original
    file extension), handed to ``processor.process_file`` by path, and the
    temporary file is removed afterwards regardless of the outcome.

    Args:
        uploaded_file: Upload object exposing a ``name`` attribute and a
            ``getvalue()`` method that returns the file content as bytes.
        use_vision: Forwarded unchanged to the processor as ``use_vision``.
        processor: Object with a ``process_file(path, file_type=...,
            use_vision=..., file_size_mb=..., custom_prompt=...)`` method.
            If None, ``structured_ocr.StructuredOCR()`` is imported and
            instantiated lazily.
        custom_prompt: Optional additional instructions, forwarded unchanged
            to the processor as ``custom_prompt``.

    Returns:
        dict: On success, the processor's result updated in place with
        ``file_name``, ``processed_at`` (local ISO timestamp),
        ``file_size_mb`` (rounded to 2 decimals) and ``use_vision``.
        If processing raises, ``{"error": <message>, "file_name": <name>}``.

    Raises:
        Exception: Errors raised while reading the upload or writing the
            temporary file are not converted to an error dict; they
            propagate to the caller (the temporary file is still removed).
    """
    # Import lazily so callers that inject a processor do not need the
    # structured_ocr module to be importable.
    if processor is None:
        from structured_ocr import StructuredOCR
        processor = StructuredOCR()

    upload_name = uploaded_file.name
    suffix = Path(upload_name).suffix

    temp_path = None
    try:
        # delete=False because the processor reopens the file by path after
        # this handle is closed; removal is handled in the finally below.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            # Record the path before writing so a failed read/write of the
            # upload still gets cleaned up instead of leaking the file.
            temp_path = tmp.name
            tmp.write(uploaded_file.getvalue())

        try:
            # Anything that is not a .pdf (case-insensitive) is treated as
            # an image.
            file_type = "pdf" if suffix.lower() == ".pdf" else "image"
            file_size_mb = os.path.getsize(temp_path) / (1024 * 1024)

            # The size is passed along so the processor can take it into
            # account (the original comment mentions automatic page
            # limiting; that logic lives in the processor, not here).
            result = processor.process_file(
                temp_path,
                file_type=file_type,
                use_vision=use_vision,
                file_size_mb=file_size_mb,
                custom_prompt=custom_prompt,
            )

            # Attach processing metadata to the processor's own result.
            result.update({
                "file_name": upload_name,
                "processed_at": datetime.now().isoformat(),
                "file_size_mb": round(file_size_mb, 2),
                "use_vision": use_vision,
            })
            return result
        except Exception as e:
            # Processing failures are reported to the caller as data rather
            # than raised, so the UI can display them.
            return {
                "error": str(e),
                "file_name": upload_name,
            }
    finally:
        # temp_path stays None only if the temporary file was never created.
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except FileNotFoundError:
                # Already removed (e.g. by the processor); nothing to do.
                pass