Spaces:

milwright
/

historical-ocr

Running

App Files Community

historical-ocr / constants.py

milwright

Reconcile main with preview-improvements branch, implementing modular structure, raw text editing, and enhanced framing

7647e70 about 1 month ago

raw

history blame

5.05 kB

	"""
	Constants for the Historical OCR application.

	This module contains all the constants used throughout the application,
	making it easier to maintain and update values in one place.
	"""

	# --- API limits ---
	MAX_FILE_SIZE_MB = 50  # size ceiling; megabytes, going by the name
	MAX_PAGES = 20  # page-count ceiling

	# --- Caching ---
	CACHE_TTL_SECONDS = 24 * 60 * 60  # 86400 seconds, i.e. 24 hours
	MAX_CACHE_ENTRIES = 20  # entry-count ceiling

	# --- Image processing ---
	MAX_IMAGE_DIMENSION = 2500  # NOTE(review): presumably pixels - confirm at the call site
	IMAGE_QUALITY = 92  # NOTE(review): presumably an encoder quality setting - confirm

	# Document types: the labels below are also the keys of the prompt-template
	# mapping, except for the auto-detect entry, which has no template.
	DOCUMENT_TYPES = [
	    "Auto-detect (standard processing)",
	    "Newspaper or Magazine",
	    "Letter or Correspondence",
	    "Book or Publication",
	    "Form or Legal Document",
	    "Recipe",
	    "Handwritten Document",
	    "Map or Illustration",
	    "Table or Spreadsheet",
	    "Other (specify in instructions)",
	]

	# Document layouts: every label except "Standard layout" also appears as a
	# key in the layout prompt additions.
	DOCUMENT_LAYOUTS = [
	    "Standard layout",
	    "Multiple columns",
	    "Table/grid format",
	    "Mixed layout with images",
	]

	# Preprocessing document types
	PREPROCESSING_DOC_TYPES = [
	    "standard",
	    "handwritten",
	    "typed",
	    "printed",
	]

	# Rotation options: each quarter turn from 0 up to, but excluding, 360
	ROTATION_OPTIONS = list(range(0, 360, 90))

	# PDF settings
	MIN_PDF_DPI = 72
	MAX_PDF_DPI = 300
	DEFAULT_PDF_DPI = 100  # lies between MIN_PDF_DPI and MAX_PDF_DPI
	DEFAULT_MAX_PAGES = 3

	# Performance modes
	PERFORMANCE_MODES = [
	    "Quality",
	    "Speed",
	]

	# Custom prompt templates, keyed by document-type label.
	# Each value is one string; adjacent literals are joined at compile time, so
	# the trailing space inside each fragment is significant.
	CUSTOM_PROMPT_TEMPLATES = {
	    "Newspaper or Magazine": (
	        "This is a newspaper/magazine. "
	        "Process columns from top to bottom, capture headlines, bylines, article text and captions."
	    ),
	    "Letter or Correspondence": (
	        "This is a letter/correspondence. "
	        "Capture letterhead, date, greeting, body, closing and signature. "
	        "Note any handwritten annotations."
	    ),
	    "Book or Publication": (
	        "This is a book/publication. "
	        "Extract titles, headers, footnotes, page numbers and body text. "
	        "Preserve paragraph structure and any special formatting."
	    ),
	    "Form or Legal Document": (
	        "This is a form/legal document. "
	        "Extract all field labels and values, preserving the structure. "
	        "Pay special attention to signature lines, dates, and any official markings."
	    ),
	    "Recipe": (
	        "This is a recipe. "
	        "Extract title, ingredients list with measurements, and preparation instructions. "
	        "Maintain the distinction between ingredients and preparation steps."
	    ),
	    "Handwritten Document": (
	        "This is a handwritten document. "
	        "Carefully transcribe all handwritten text, preserving line breaks. "
	        "Note any unclear sections or annotations."
	    ),
	    "Map or Illustration": (
	        "This is a map or illustration. "
	        "Transcribe all labels, legends, captions, and annotations. "
	        "Note any scale indicators or directional markings."
	    ),
	    "Table or Spreadsheet": (
	        "This is a table/spreadsheet. "
	        "Preserve row and column structure, maintaining alignment of data. "
	        "Extract headers and all cell values."
	    ),
	    "Other (specify in instructions)": (
	        "Please describe the document type and any special processing requirements here."
	    ),
	}

	# Layout prompt additions, keyed by layout label.
	# Adjacent literals are joined at compile time into a single string.
	LAYOUT_PROMPT_ADDITIONS = {
	    "Multiple columns": (
	        "Document has multiple columns. "
	        "Read each column from top to bottom, then move to the next column."
	    ),
	    "Table/grid format": (
	        "Document contains table data. "
	        "Preserve row and column structure during extraction."
	    ),
	    "Mixed layout with images": (
	        "Document has mixed text layout with images. "
	        "Extract text in proper reading order."
	    ),
	}

	# Content themes for subject tag extraction.
	# Maps a theme name to the lowercase keywords listed for it.
	CONTENT_THEMES = {
	    "Historical": ["century", "ancient", "historical", "history", "vintage", "archive", "heritage"],
	    "Travel": ["travel", "journey", "expedition", "exploration", "voyage", "map", "location"],
	    "Science": ["experiment", "research", "study", "analysis", "scientific", "laboratory"],
	    "Literature": ["book", "novel", "poetry", "author", "literary", "chapter", "story"],
	    "Art": ["painting", "illustration", "drawing", "artist", "exhibit", "gallery", "portrait"],
	    "Education": ["education", "school", "university", "college", "learning", "student", "teach"],
	    "Politics": ["government", "political", "policy", "administration", "election", "legislature"],
	    "Business": ["business", "company", "corporation", "market", "industry", "commercial", "trade"],
	    "Social": ["society", "community", "social", "culture", "tradition", "customs"],
	    "Technology": ["technology", "invention", "device", "mechanical", "machine", "technical"],
	    "Military": ["military", "army", "navy", "war", "battle", "soldier", "weapon"],
	    "Religion": ["religion", "church", "temple", "spiritual", "sacred", "ritual"],
	    "Medicine": ["medical", "medicine", "health", "hospital", "treatment", "disease", "doctor"],
	    "Legal": ["legal", "law", "court", "justice", "attorney", "judicial", "statute"],
	    "Correspondence": ["letter", "mail", "correspondence", "message", "communication"],
	}

	# Period tags based on year ranges.
	# Keys are (first_year, last_year) pairs. Consecutive ranges share no year,
	# which suggests both bounds are inclusive - confirm against the lookup code.
	PERIOD_TAGS = {
	    (0, 1799): "Pre-1800s",
	    (1800, 1849): "Early 19th Century",
	    (1850, 1899): "Late 19th Century",
	    (1900, 1949): "Early 20th Century",
	    (1950, 2099): "Modern Era",
	}

	# Default fallback tags
	DEFAULT_TAGS = [
	    "Document",
	    "Historical",
	    "Text",
	]
	GENERIC_TAGS = [
	    "Archive",
	    "Content",
	    "Record",
	]

	# UI constants
	PROGRESS_DELAY = 0.8  # Seconds to show completion message