"""
Constants for the Historical OCR application.

This module contains all the constants used throughout the application,
making it easier to maintain and update values in one place.
"""
# API limits
# Upper bounds on accepted input. The names give the units (megabytes, page
# count); the code that enforces them lives outside this module.
MAX_FILE_SIZE_MB = 50
MAX_PAGES = 20

# Caching
# TTL is written as hours * seconds-per-hour so the intent stays readable.
CACHE_TTL_SECONDS = 24 * 3600  # 24 hours
MAX_CACHE_ENTRIES = 20

# Image processing
# NOTE(review): MAX_IMAGE_DIMENSION is presumably in pixels and IMAGE_QUALITY
# presumably a 0-100 encoder quality setting -- confirm against the call sites.
MAX_IMAGE_DIMENSION = 2500
IMAGE_QUALITY = 92
# Document types
# Ordered list of document-type labels. The strings are runtime data (they
# also appear as keys in CUSTOM_PROMPT_TEMPLATES), so they must not be
# reworded casually.
DOCUMENT_TYPES = [
    "Auto-detect (standard processing)",
    "Newspaper or Magazine",
    "Letter or Correspondence",
    "Book or Publication",
    "Form or Legal Document",
    "Recipe",
    "Handwritten Document",
    "Map or Illustration",
    "Table or Spreadsheet",
    "Other (specify in instructions)",
]
# Document layouts
# Ordered list of layout labels. Every entry except "Standard layout" is also
# a key of LAYOUT_PROMPT_ADDITIONS.
DOCUMENT_LAYOUTS = [
    "Standard layout",
    "Multiple columns",
    "Table/grid format",
    "Mixed layout with images",
]
# Preprocessing document types
# Lower-case identifiers, distinct from the human-readable DOCUMENT_TYPES
# labels.
PREPROCESSING_DOC_TYPES = ["standard", "handwritten", "typed", "printed"]

# Rotation options
# Quarter-turn steps. NOTE(review): presumably degrees; direction
# (clockwise vs. counter-clockwise) is decided by the consumer -- confirm.
ROTATION_OPTIONS = [0, 90, 180, 270]
# PDF settings
# The default DPI sits inside the [MIN_PDF_DPI, MAX_PDF_DPI] range.
DEFAULT_PDF_DPI = 100
MIN_PDF_DPI = 72
MAX_PDF_DPI = 300
# Default number of pages (3); lower than the MAX_PAGES limit (20) defined
# under "API limits" in this module.
DEFAULT_MAX_PAGES = 3

# Performance modes
PERFORMANCE_MODES = ["Quality", "Speed"]
# Custom prompt templates
# Maps a document-type label to a prompt template string for that type.
# Keys match the DOCUMENT_TYPES labels, except that
# "Auto-detect (standard processing)" has no entry here, so lookups for that
# label need a default (e.g. dict.get).
CUSTOM_PROMPT_TEMPLATES = {
    "Newspaper or Magazine": "This is a newspaper/magazine. Process columns from top to bottom, capture headlines, bylines, article text and captions.",
    "Letter or Correspondence": "This is a letter/correspondence. Capture letterhead, date, greeting, body, closing and signature. Note any handwritten annotations.",
    "Book or Publication": "This is a book/publication. Extract titles, headers, footnotes, page numbers and body text. Preserve paragraph structure and any special formatting.",
    "Form or Legal Document": "This is a form/legal document. Extract all field labels and values, preserving the structure. Pay special attention to signature lines, dates, and any official markings.",
    "Recipe": "This is a recipe. Extract title, ingredients list with measurements, and preparation instructions. Maintain the distinction between ingredients and preparation steps.",
    "Handwritten Document": "This is a handwritten document. Carefully transcribe all handwritten text, preserving line breaks. Note any unclear sections or annotations.",
    "Map or Illustration": "This is a map or illustration. Transcribe all labels, legends, captions, and annotations. Note any scale indicators or directional markings.",
    "Table or Spreadsheet": "This is a table/spreadsheet. Preserve row and column structure, maintaining alignment of data. Extract headers and all cell values.",
    "Other (specify in instructions)": "Please describe the document type and any special processing requirements here.",
}
# Layout prompt additions
# Maps a layout label to an extra prompt sentence for that layout.
# "Standard layout" (listed in DOCUMENT_LAYOUTS) deliberately or otherwise has
# no entry, so lookups for it need a default (e.g. dict.get).
LAYOUT_PROMPT_ADDITIONS = {
    "Multiple columns": "Document has multiple columns. Read each column from top to bottom, then move to the next column.",
    "Table/grid format": "Document contains table data. Preserve row and column structure during extraction.",
    "Mixed layout with images": "Document has mixed text layout with images. Extract text in proper reading order.",
}
# Content themes for subject tag extraction
# Maps a theme name to a list of lower-case keywords associated with it.
# NOTE(review): how keywords are matched (substring vs. whole word, case
# handling) is decided by the consumer, which is not in this module.
CONTENT_THEMES = {
    "Historical": ["century", "ancient", "historical", "history", "vintage", "archive", "heritage"],
    "Travel": ["travel", "journey", "expedition", "exploration", "voyage", "map", "location"],
    "Science": ["experiment", "research", "study", "analysis", "scientific", "laboratory"],
    "Literature": ["book", "novel", "poetry", "author", "literary", "chapter", "story"],
    "Art": ["painting", "illustration", "drawing", "artist", "exhibit", "gallery", "portrait"],
    "Education": ["education", "school", "university", "college", "learning", "student", "teach"],
    "Politics": ["government", "political", "policy", "administration", "election", "legislature"],
    "Business": ["business", "company", "corporation", "market", "industry", "commercial", "trade"],
    "Social": ["society", "community", "social", "culture", "tradition", "customs"],
    "Technology": ["technology", "invention", "device", "mechanical", "machine", "technical"],
    "Military": ["military", "army", "navy", "war", "battle", "soldier", "weapon"],
    "Religion": ["religion", "church", "temple", "spiritual", "sacred", "ritual"],
    "Medicine": ["medical", "medicine", "health", "hospital", "treatment", "disease", "doctor"],
    "Legal": ["legal", "law", "court", "justice", "attorney", "judicial", "statute"],
    "Correspondence": ["letter", "mail", "correspondence", "message", "communication"],
}
# Period tags based on year ranges
# Keys are (start_year, end_year) tuples; values are the period label.
# Adjacent ranges meet at consecutive years (e.g. 1799 -> 1800), so the bounds
# read as inclusive on both ends -- confirm against the lookup code.
# Years below 0 or above 2099 are not covered by any range.
PERIOD_TAGS = {
    (0, 1799): "Pre-1800s",
    (1800, 1849): "Early 19th Century",
    (1850, 1899): "Late 19th Century",
    (1900, 1949): "Early 20th Century",
    (1950, 2099): "Modern Era",
}
# Default fallback tags
# NOTE(review): when each list is applied (DEFAULT_TAGS vs. GENERIC_TAGS) is
# decided by the tagging code, which is not in this module.
DEFAULT_TAGS = ["Document", "Historical", "Text"]
GENERIC_TAGS = ["Archive", "Content", "Record"]

# UI constants
PROGRESS_DELAY = 0.8  # Seconds to show completion message