Spaces:

milwright
/

historical-ocr

Running

App Files Files Community

historical-ocr / utils /general_utils.py

milwright

Consolidate segmentation improvements and code cleanup

42dc069 2 months ago

raw

history blame contribute delete

7.02 kB

	"""
	General utility functions for historical OCR processing.
	"""
	import os
	import base64
	import hashlib
	import time
	import logging
	from datetime import datetime
	from pathlib import Path
	from functools import wraps

	# Module-level logger registered under the fixed name "utils"; timing()
	# reports its measurements through it. The level is pinned to INFO on this
	# logger itself, so DEBUG records sent to it are discarded.
	logger = logging.getLogger("utils")
	logger.setLevel(logging.INFO)

	def generate_cache_key(file_bytes, file_type, use_vision, preprocessing_options=None, pdf_rotation=0, custom_prompt=None):
	    """
	    Build the string key under which an OCR result is cached.

	    The key is a "_"-joined sequence of: the MD5 of the file content, the
	    file type, the vision flag, a digest of the preprocessing settings
	    (empty when there are none) and, only when a custom prompt is given,
	    the MD5 of that prompt.

	    Args:
	        file_bytes: File content as bytes
	        file_type: Type of file (pdf or image)
	        use_vision: Whether to use vision model
	        preprocessing_options: Dictionary of preprocessing options
	        pdf_rotation: PDF rotation value; only influences the key when non-zero
	        custom_prompt: Custom prompt for OCR

	    Returns:
	        str: Cache key
	    """
	    def _md5_hex(text):
	        # Digest of a text fragment, encoded with the default codec (UTF-8).
	        return hashlib.md5(text.encode()).hexdigest()

	    content_digest = hashlib.md5(file_bytes).hexdigest()
	    rotated = pdf_rotation != 0

	    if preprocessing_options:
	        effective_options = preprocessing_options
	        if rotated:
	            # Work on a copy so the caller's dictionary is never modified.
	            effective_options = preprocessing_options.copy()
	            effective_options['pdf_rotation'] = pdf_rotation
	        # Sorting the items makes the digest independent of insertion order.
	        options_digest = _md5_hex(str(sorted(effective_options.items())))
	    elif rotated:
	        # No options at all, but the rotation still has to distinguish keys.
	        options_digest = _md5_hex(f"pdf_rotation_{pdf_rotation}")
	    else:
	        options_digest = ""

	    key_parts = [content_digest, f"{file_type}", f"{use_vision}", options_digest]

	    # An empty or missing prompt leaves the key without a prompt component.
	    if custom_prompt:
	        key_parts.append(_md5_hex(str(custom_prompt)))

	    return "_".join(key_parts)

	def timing(description):
	    """
	    Return a context manager that logs how long its ``with`` body took.

	    On exit an INFO record of the form "<description> took <N.NN> seconds"
	    is sent to the module logger. Exceptions raised inside the ``with``
	    body are never suppressed.

	    Args:
	        description: Label placed at the start of the log message

	    Returns:
	        TimingContext: Context manager; ``start_time`` holds the wall-clock
	        start (``time.time()``) and, after exit, ``execution_time`` holds
	        the measured duration in seconds
	    """
	    class TimingContext:
	        def __init__(self, description):
	            self.description = description

	        def __enter__(self):
	            # Wall-clock start stays available for callers that read it.
	            self.start_time = time.time()
	            # The duration itself is taken from the monotonic high-resolution
	            # counter, so a system clock adjustment cannot distort it.
	            self._start_counter = time.perf_counter()
	            return self

	        def __exit__(self, exc_type, exc_val, exc_tb):
	            self.execution_time = time.perf_counter() - self._start_counter
	            logger.info("%s took %.2f seconds", self.description, self.execution_time)
	            # False: let any exception from the with-body propagate.
	            return False

	    return TimingContext(description)

	def format_timestamp(timestamp=None, for_filename=False):
	    """
	    Format timestamp for display or filenames

	    String input is parsed with the following layouts, tried in order:
	    "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M" (the layout this function itself
	    emits for display) and "%Y-%m-%dT%H:%M:%S". A string matching none of
	    them is replaced by the current time rather than raising.

	    Args:
	        timestamp: Datetime object or string to format (defaults to current time)
	        for_filename: Whether to format for use in a filename (defaults to False)

	    Returns:
	        str: Formatted timestamp, "Apr 30, 2025" style when for_filename is
	        true, otherwise "2025-04-30 12:34" style
	    """
	    if timestamp is None:
	        timestamp = datetime.now()
	    elif isinstance(timestamp, str):
	        text = timestamp.strip()
	        parsed = None
	        for layout in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M", "%Y-%m-%dT%H:%M:%S"):
	            try:
	                parsed = datetime.strptime(text, layout)
	            except ValueError:
	                continue
	            break
	        # Best-effort fallback: an unparseable string yields "now".
	        timestamp = parsed if parsed is not None else datetime.now()

	    if for_filename:
	        # Format suitable for filenames: "Apr 30, 2025"
	        return timestamp.strftime("%b %d, %Y")

	    # Standard format for display
	    return timestamp.strftime("%Y-%m-%d %H:%M")

	def _is_period_tag(tag):
	    """Return True when a topic tag names a historical period or era."""
	    if not isinstance(tag, str):
	        return False
	    lowered = tag.lower()
	    if "century" in lowered or "pre-" in lowered:
	        return True
	    # "era" must be a whole word: a plain substring test also fires on
	    # unrelated tags such as "General", "Literature" or "Opera".
	    words = ''.join(ch if ch.isalnum() else ' ' for ch in lowered).split()
	    return "era" in words or "eras" in words


	def create_descriptive_filename(original_filename, result, file_ext, preprocessing_options=None):
	    """
	    Create a user-friendly descriptive filename for the result

	    The name is assembled as
	    "<Readable Name> (<doc type>, <period>) - <date><file_ext>", where the
	    parenthesised part is omitted when neither piece of metadata is known.

	    Args:
	        original_filename: Original filename
	        result: OCR result dictionary; the 'detected_document_type' and
	            'topics' entries are read when present
	        file_ext: File extension, appended verbatim
	        preprocessing_options: Dictionary of preprocessing options
	            (accepted but not used by this function)

	    Returns:
	        str: Human-readable descriptive filename
	    """
	    # Drop the extension, turn dashes/underscores into spaces and
	    # capitalize every word of the base name.
	    stem = Path(original_filename).stem
	    readable_name = ' '.join(
	        word.capitalize() for word in stem.replace('-', ' ').replace('_', ' ').split()
	    )

	    topics = result.get('topics') or []
	    detected_type = result.get('detected_document_type')

	    # Document type: the explicit detection wins, the first topic is the fallback.
	    if detected_type:
	        doc_type = detected_type.capitalize()
	    elif topics:
	        doc_type = topics[0]
	    else:
	        doc_type = None

	    # Period/era: the first topic that looks like one.
	    period_info = next((tag for tag in topics if _is_period_tag(tag)), None)

	    metadata = []
	    if doc_type:
	        metadata.append(doc_type)
	    # Skip the period when it is the very tag already used as document
	    # type, so the same text is not listed twice.
	    if period_info and period_info != doc_type:
	        metadata.append(period_info)

	    metadata_str = f" ({', '.join(metadata)})" if metadata else ""

	    # Add current date for uniqueness and sorting
	    current_date = format_timestamp(for_filename=True)

	    return f"{readable_name}{metadata_str} - {current_date}{file_ext}"

	def extract_subject_tags(result, raw_text, preprocessing_options=None):
	    """
	    Extract subject tags from OCR result

	    Tags are collected from the result's 'topics', then the capitalized
	    'detected_document_type' is added, then "Letter" and/or "Newspaper"
	    when keywords appear near the start of the text. A tag is never added
	    twice. When neither topics nor a document type are available, the
	    list starts from ["Document", "Historical Document"].

	    Args:
	        result: OCR result dictionary (None is treated as empty)
	        raw_text: Raw text from OCR (None or non-string is treated as empty)
	        preprocessing_options: Dictionary of preprocessing options
	            (accepted but not used by this function)

	    Returns:
	        list: Subject tags; always a new list, the input topics are not modified
	    """
	    result = result or {}

	    # Use existing topics as starting point if available
	    subject_tags = list(result.get('topics') or [])

	    # Add document type if detected; a missing, None or blank value is
	    # skipped instead of crashing or producing an empty tag.
	    doc_type = result.get('detected_document_type')
	    if isinstance(doc_type, str) and doc_type.strip():
	        doc_type = doc_type.capitalize()
	        if doc_type not in subject_tags:
	            subject_tags.append(doc_type)

	    # If no tags were found, add some defaults
	    if not subject_tags:
	        subject_tags = ["Document", "Historical Document"]

	    # Only the first 1000 characters are ever inspected, so lowercase
	    # just that prefix rather than the whole document.
	    text = raw_text if isinstance(raw_text, str) else ""
	    head = text[:1000].lower()

	    # Try to infer content type
	    looks_like_letter = "letter" in head or "dear" in text[:200].lower()
	    if looks_like_letter and "Letter" not in subject_tags:
	        subject_tags.append("Letter")

	    # Check if it might be a newspaper
	    looks_like_newspaper = "newspaper" in head or "editor" in text[:500].lower()
	    if looks_like_newspaper and "Newspaper" not in subject_tags:
	        subject_tags.append("Newspaper")

	    return subject_tags