# Page header captured along with this file (kept as comments so the module parses):
# Spaces: milwright / historical-ocr (Running)
# File size: 5,045 Bytes
# 7647e70

"""
Constants for the Historical OCR application.

This module contains all the constants used throughout the application,
making it easier to maintain and update values in one place.
"""

# --- API limits -------------------------------------------------------------
# File-size cap (megabytes, per the name) and page-count cap.
MAX_FILE_SIZE_MB = 50
MAX_PAGES = 20

# --- Caching ----------------------------------------------------------------
# Time-to-live of 24 hours, spelled out as hours * minutes * seconds.
CACHE_TTL_SECONDS = 24 * 60 * 60
MAX_CACHE_ENTRIES = 20

# --- Image processing -------------------------------------------------------
MAX_IMAGE_DIMENSION = 2500
IMAGE_QUALITY = 92

# Document type labels. The list opens with the auto-detect entry and closes
# with the free-form "Other" entry; order is significant to anything that
# indexes into it.
DOCUMENT_TYPES = [
    "Auto-detect (standard processing)",
    "Newspaper or Magazine",
    "Letter or Correspondence",
    "Book or Publication",
    "Form or Legal Document",
    "Recipe",
    "Handwritten Document",
    "Map or Illustration",
    "Table or Spreadsheet",
    "Other (specify in instructions)",
]

# Document layout labels, with the standard layout listed first.
DOCUMENT_LAYOUTS = [
    "Standard layout",
    "Multiple columns",
    "Table/grid format",
    "Mixed layout with images",
]

# Document type identifiers used for preprocessing.
PREPROCESSING_DOC_TYPES = [
    "standard",
    "handwritten",
    "typed",
    "printed",
]

# Rotation options: every multiple of 90 from 0 up to (not including) 360.
ROTATION_OPTIONS = list(range(0, 360, 90))

# --- PDF settings -----------------------------------------------------------
# DPI bounds first, then the default, which sits between them.
MIN_PDF_DPI = 72
MAX_PDF_DPI = 300
DEFAULT_PDF_DPI = 100
DEFAULT_MAX_PAGES = 3

# --- Performance modes ------------------------------------------------------
PERFORMANCE_MODES = [
    "Quality",
    "Speed",
]

# Prompt template text keyed by document type label.
# Each value is one string; adjacent literals inside the parentheses are
# concatenated by the parser, so the trailing space on every piece except the
# last is part of the text and must be kept.
CUSTOM_PROMPT_TEMPLATES = {
    "Newspaper or Magazine": (
        "This is a newspaper/magazine. "
        "Process columns from top to bottom, capture headlines, bylines, article text and captions."
    ),
    "Letter or Correspondence": (
        "This is a letter/correspondence. "
        "Capture letterhead, date, greeting, body, closing and signature. "
        "Note any handwritten annotations."
    ),
    "Book or Publication": (
        "This is a book/publication. "
        "Extract titles, headers, footnotes, page numbers and body text. "
        "Preserve paragraph structure and any special formatting."
    ),
    "Form or Legal Document": (
        "This is a form/legal document. "
        "Extract all field labels and values, preserving the structure. "
        "Pay special attention to signature lines, dates, and any official markings."
    ),
    "Recipe": (
        "This is a recipe. "
        "Extract title, ingredients list with measurements, and preparation instructions. "
        "Maintain the distinction between ingredients and preparation steps."
    ),
    "Handwritten Document": (
        "This is a handwritten document. "
        "Carefully transcribe all handwritten text, preserving line breaks. "
        "Note any unclear sections or annotations."
    ),
    "Map or Illustration": (
        "This is a map or illustration. "
        "Transcribe all labels, legends, captions, and annotations. "
        "Note any scale indicators or directional markings."
    ),
    "Table or Spreadsheet": (
        "This is a table/spreadsheet. "
        "Preserve row and column structure, maintaining alignment of data. "
        "Extract headers and all cell values."
    ),
    "Other (specify in instructions)": (
        "Please describe the document type and any special processing requirements here."
    ),
}

# Extra prompt text keyed by layout label.
# Values are single strings assembled from adjacent literals; the trailing
# space on the first piece of each pair is part of the text.
LAYOUT_PROMPT_ADDITIONS = {
    "Multiple columns": (
        "Document has multiple columns. "
        "Read each column from top to bottom, then move to the next column."
    ),
    "Table/grid format": (
        "Document contains table data. "
        "Preserve row and column structure during extraction."
    ),
    "Mixed layout with images": (
        "Document has mixed text layout with images. "
        "Extract text in proper reading order."
    ),
}

# Theme name -> list of keywords for that theme, used for subject tag
# extraction. Every theme name is a plain identifier, so the mapping is built
# with keyword arguments; insertion order follows the order written here.
CONTENT_THEMES = dict(
    Historical=["century", "ancient", "historical", "history", "vintage", "archive", "heritage"],
    Travel=["travel", "journey", "expedition", "exploration", "voyage", "map", "location"],
    Science=["experiment", "research", "study", "analysis", "scientific", "laboratory"],
    Literature=["book", "novel", "poetry", "author", "literary", "chapter", "story"],
    Art=["painting", "illustration", "drawing", "artist", "exhibit", "gallery", "portrait"],
    Education=["education", "school", "university", "college", "learning", "student", "teach"],
    Politics=["government", "political", "policy", "administration", "election", "legislature"],
    Business=["business", "company", "corporation", "market", "industry", "commercial", "trade"],
    Social=["society", "community", "social", "culture", "tradition", "customs"],
    Technology=["technology", "invention", "device", "mechanical", "machine", "technical"],
    Military=["military", "army", "navy", "war", "battle", "soldier", "weapon"],
    Religion=["religion", "church", "temple", "spiritual", "sacred", "ritual"],
    Medicine=["medical", "medicine", "health", "hospital", "treatment", "disease", "doctor"],
    Legal=["legal", "law", "court", "justice", "attorney", "judicial", "statute"],
    Correspondence=["letter", "mail", "correspondence", "message", "communication"],
)

# Period tag keyed by a (first_year, last_year) tuple.
# NOTE(review): adjacent ranges do not overlap (…1799 / 1800…), which suggests
# both bounds are inclusive — confirm against the code that does the lookup.
PERIOD_TAGS = {
    (first_year, last_year): label
    for first_year, last_year, label in (
        (0, 1799, "Pre-1800s"),
        (1800, 1849, "Early 19th Century"),
        (1850, 1899, "Late 19th Century"),
        (1900, 1949, "Early 20th Century"),
        (1950, 2099, "Modern Era"),
    )
}

# --- Fallback tags ----------------------------------------------------------
DEFAULT_TAGS = [
    "Document",
    "Historical",
    "Text",
]
GENERIC_TAGS = [
    "Archive",
    "Content",
    "Record",
]

# --- UI constants -----------------------------------------------------------
# How long, in seconds, the completion message is shown.
PROGRESS_DELAY = 0.8