Spaces:

milwright
/

historical-ocr

Running

App Files Files Community

historical-ocr / process_file.py

milwright

Rolling out modular v2

c04ffe5 8 months ago

raw

history blame contribute delete

2.17 kB

	"""
	Utility function for processing files with OCR in the Historical OCR Workshop app.
	"""

	import os
	import tempfile
	from pathlib import Path
	from datetime import datetime

	def process_file(uploaded_file, use_vision=True, processor=None, custom_prompt=None) -> dict:
	    """Run OCR on an uploaded file and return the results with metadata.

	    The upload's bytes are copied to a temporary file (keeping the original
	    extension) so the processor can read it from disk. The temporary file is
	    removed before returning, whether processing succeeds or fails.

	    Args:
	        uploaded_file: Object exposing a ``name`` attribute and a
	            ``getvalue()`` method returning the file's bytes.
	        use_vision: Whether to use vision model; forwarded to the processor.
	        processor: Object with a ``process_file`` method. If None, a
	            ``StructuredOCR`` instance is imported and created.
	        custom_prompt: Optional additional instructions for the model.

	    Returns:
	        dict: On success, the processor's result updated with the keys
	        ``file_name``, ``processed_at`` (ISO-format local timestamp),
	        ``file_size_mb`` (rounded to 2 decimals) and ``use_vision``.
	        If processing raises, ``{"error": <message>, "file_name": <name>}``.

	    Raises:
	        Any exception raised while reading the upload or writing the
	        temporary file propagates to the caller (after cleanup).
	    """
	    # Import lazily so callers that inject a processor do not need the
	    # structured_ocr module to be importable.
	    if processor is None:
	        from structured_ocr import StructuredOCR
	        processor = StructuredOCR()

	    file_name = uploaded_file.name
	    suffix = Path(file_name).suffix

	    # delete=False: the file must outlive the handle so the processor can
	    # reopen it by path; removal is handled explicitly in ``finally`` below.
	    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
	    temp_path = tmp.name
	    try:
	        # Writing happens inside the try/finally so a failing getvalue() or
	        # write() does not leave an orphaned temporary file behind.
	        with tmp:
	            tmp.write(uploaded_file.getvalue())

	        try:
	            # Determine file type from extension
	            file_type = "pdf" if suffix.lower() == ".pdf" else "image"

	            # Get file size in MB
	            file_size_mb = os.path.getsize(temp_path) / (1024 * 1024)

	            # Process the file with file size information for automatic page limiting
	            result = processor.process_file(
	                temp_path,
	                file_type=file_type,
	                use_vision=use_vision,
	                file_size_mb=file_size_mb,
	                custom_prompt=custom_prompt,
	            )

	            # Add processing metadata
	            result.update({
	                "file_name": file_name,
	                "processed_at": datetime.now().isoformat(),
	                "file_size_mb": round(file_size_mb, 2),
	                "use_vision": use_vision,
	            })
	            return result
	        except Exception as e:
	            # Processing failures are reported to the caller as data rather
	            # than raised, so the caller can display the message.
	            return {
	                "error": str(e),
	                "file_name": file_name,
	            }
	    finally:
	        # Clean up the temporary file; it may already be gone, which is fine.
	        try:
	            os.unlink(temp_path)
	        except FileNotFoundError:
	            pass