""" Constants for the Historical OCR application. This module contains all the constants used throughout the application, making it easier to maintain and update values in one place. """ # API limits MAX_FILE_SIZE_MB = 50 MAX_PAGES = 20 # Caching CACHE_TTL_SECONDS = 24 * 3600 # 24 hours MAX_CACHE_ENTRIES = 20 # Image processing MAX_IMAGE_DIMENSION = 2500 IMAGE_QUALITY = 92 # Document types DOCUMENT_TYPES = [ "Auto-detect (standard processing)", "Newspaper or Magazine", "Letter or Correspondence", "Book or Publication", "Form or Legal Document", "Recipe", "Handwritten Document", "Map or Illustration", "Table or Spreadsheet", "Other (specify in instructions)" ] # Document layouts DOCUMENT_LAYOUTS = [ "Standard layout", "Multiple columns", "Table/grid format", "Mixed layout with images" ] # Preprocessing document types PREPROCESSING_DOC_TYPES = ["standard", "handwritten", "typed", "printed"] # Rotation options ROTATION_OPTIONS = [0, 90, 180, 270] # PDF settings DEFAULT_PDF_DPI = 100 MIN_PDF_DPI = 72 MAX_PDF_DPI = 300 DEFAULT_MAX_PAGES = 3 # Performance modes PERFORMANCE_MODES = ["Quality", "Speed"] # Custom prompt templates CUSTOM_PROMPT_TEMPLATES = { "Newspaper or Magazine": "This is a newspaper/magazine. Process columns from top to bottom, capture headlines, bylines, article text and captions.", "Letter or Correspondence": "This is a letter/correspondence. Capture letterhead, date, greeting, body, closing and signature. Note any handwritten annotations.", "Book or Publication": "This is a book/publication. Extract titles, headers, footnotes, page numbers and body text. Preserve paragraph structure and any special formatting.", "Form or Legal Document": "This is a form/legal document. Extract all field labels and values, preserving the structure. Pay special attention to signature lines, dates, and any official markings.", "Recipe": "This is a recipe. Extract title, ingredients list with measurements, and preparation instructions. Maintain the distinction between ingredients and preparation steps.", "Handwritten Document": "This is a handwritten document. Carefully transcribe all handwritten text, preserving line breaks. Note any unclear sections or annotations.", "Map or Illustration": "This is a map or illustration. Transcribe all labels, legends, captions, and annotations. Note any scale indicators or directional markings.", "Table or Spreadsheet": "This is a table/spreadsheet. Preserve row and column structure, maintaining alignment of data. Extract headers and all cell values.", "Other (specify in instructions)": "Please describe the document type and any special processing requirements here." } # Layout prompt additions LAYOUT_PROMPT_ADDITIONS = { "Multiple columns": "Document has multiple columns. Read each column from top to bottom, then move to the next column.", "Table/grid format": "Document contains table data. Preserve row and column structure during extraction.", "Mixed layout with images": "Document has mixed text layout with images. Extract text in proper reading order." } # Content themes for subject tag extraction CONTENT_THEMES = { "Historical": ["century", "ancient", "historical", "history", "vintage", "archive", "heritage"], "Travel": ["travel", "journey", "expedition", "exploration", "voyage", "map", "location"], "Science": ["experiment", "research", "study", "analysis", "scientific", "laboratory"], "Literature": ["book", "novel", "poetry", "author", "literary", "chapter", "story"], "Art": ["painting", "illustration", "drawing", "artist", "exhibit", "gallery", "portrait"], "Education": ["education", "school", "university", "college", "learning", "student", "teach"], "Politics": ["government", "political", "policy", "administration", "election", "legislature"], "Business": ["business", "company", "corporation", "market", "industry", "commercial", "trade"], "Social": ["society", "community", "social", "culture", "tradition", "customs"], "Technology": ["technology", "invention", "device", "mechanical", "machine", "technical"], "Military": ["military", "army", "navy", "war", "battle", "soldier", "weapon"], "Religion": ["religion", "church", "temple", "spiritual", "sacred", "ritual"], "Medicine": ["medical", "medicine", "health", "hospital", "treatment", "disease", "doctor"], "Legal": ["legal", "law", "court", "justice", "attorney", "judicial", "statute"], "Correspondence": ["letter", "mail", "correspondence", "message", "communication"] } # Period tags based on year ranges PERIOD_TAGS = { (0, 1799): "Pre-1800s", (1800, 1849): "Early 19th Century", (1850, 1899): "Late 19th Century", (1900, 1949): "Early 20th Century", (1950, 2099): "Modern Era" } # Default fallback tags DEFAULT_TAGS = ["Document", "Historical", "Text"] GENERIC_TAGS = ["Archive", "Content", "Record"] # UI constants PROGRESS_DELAY = 0.8 # Seconds to show completion message