Spaces:
Running
Running
File size: 5,045 Bytes
7647e70 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 |
"""
Constants for the Historical OCR application.
This module contains all the constants used throughout the application,
making it easier to maintain and update values in one place.
"""
# --- API limits ---
# Largest accepted file size, in megabytes (unit taken from the name).
MAX_FILE_SIZE_MB = 50
# Upper bound on the number of pages.
MAX_PAGES = 20

# --- Caching ---
# Cache time-to-live in seconds: one day (24 h * 60 min * 60 s).
CACHE_TTL_SECONDS = 24 * 60 * 60
# Upper bound on the number of cache entries.
MAX_CACHE_ENTRIES = 20

# --- Image processing ---
# Largest image dimension; presumably pixels -- TODO confirm against the caller.
MAX_IMAGE_DIMENSION = 2500
# Image quality setting; presumably an encoder quality value -- TODO confirm.
IMAGE_QUALITY = 92
# Document type labels, in order. The first entry is the auto-detect option
# and the last one is the free-form "Other" option.
DOCUMENT_TYPES = [
    "Auto-detect (standard processing)",
    "Newspaper or Magazine",
    "Letter or Correspondence",
    "Book or Publication",
    "Form or Legal Document",
    "Recipe",
    "Handwritten Document",
    "Map or Illustration",
    "Table or Spreadsheet",
    "Other (specify in instructions)",
]
# Document layout labels, in order; "Standard layout" comes first.
DOCUMENT_LAYOUTS = [
    "Standard layout",
    "Multiple columns",
    "Table/grid format",
    "Mixed layout with images",
]
# Document type identifiers used for preprocessing (lower-case keys).
PREPROCESSING_DOC_TYPES = [
    "standard",
    "handwritten",
    "typed",
    "printed",
]

# Rotation options: the four quarter turns 0, 90, 180 and 270
# (presumably degrees -- TODO confirm against the caller).
ROTATION_OPTIONS = list(range(0, 360, 90))
# --- PDF settings ---
# DPI values for PDFs: lowest, default and highest, assigned together so
# the ordering MIN <= DEFAULT <= MAX is visible at a glance.
MIN_PDF_DPI, DEFAULT_PDF_DPI, MAX_PDF_DPI = 72, 100, 300
# Default page limit.
DEFAULT_MAX_PAGES = 3

# --- Performance modes ---
# Performance mode labels, in order.
PERFORMANCE_MODES = [
    "Quality",
    "Speed",
]
# Prompt template text keyed by document type label. Every key here also
# appears in DOCUMENT_TYPES; the auto-detect label has no template.
# Long strings are split with implicit literal concatenation, so each
# value is still one single-line string.
CUSTOM_PROMPT_TEMPLATES = {
    "Newspaper or Magazine": (
        "This is a newspaper/magazine. "
        "Process columns from top to bottom, capture headlines, bylines, article text and captions."
    ),
    "Letter or Correspondence": (
        "This is a letter/correspondence. "
        "Capture letterhead, date, greeting, body, closing and signature. "
        "Note any handwritten annotations."
    ),
    "Book or Publication": (
        "This is a book/publication. "
        "Extract titles, headers, footnotes, page numbers and body text. "
        "Preserve paragraph structure and any special formatting."
    ),
    "Form or Legal Document": (
        "This is a form/legal document. "
        "Extract all field labels and values, preserving the structure. "
        "Pay special attention to signature lines, dates, and any official markings."
    ),
    "Recipe": (
        "This is a recipe. "
        "Extract title, ingredients list with measurements, and preparation instructions. "
        "Maintain the distinction between ingredients and preparation steps."
    ),
    "Handwritten Document": (
        "This is a handwritten document. "
        "Carefully transcribe all handwritten text, preserving line breaks. "
        "Note any unclear sections or annotations."
    ),
    "Map or Illustration": (
        "This is a map or illustration. "
        "Transcribe all labels, legends, captions, and annotations. "
        "Note any scale indicators or directional markings."
    ),
    "Table or Spreadsheet": (
        "This is a table/spreadsheet. "
        "Preserve row and column structure, maintaining alignment of data. "
        "Extract headers and all cell values."
    ),
    "Other (specify in instructions)": (
        "Please describe the document type and any special processing requirements here."
    ),
}
# Extra prompt text keyed by layout label. Every key here also appears in
# DOCUMENT_LAYOUTS; "Standard layout" has no addition.
LAYOUT_PROMPT_ADDITIONS = {
    "Multiple columns": (
        "Document has multiple columns. "
        "Read each column from top to bottom, then move to the next column."
    ),
    "Table/grid format": (
        "Document contains table data. "
        "Preserve row and column structure during extraction."
    ),
    "Mixed layout with images": (
        "Document has mixed text layout with images. "
        "Extract text in proper reading order."
    ),
}
# Theme name -> keywords associated with that theme, for subject tag
# extraction. Written in dict() keyword form because every theme name is
# a plain identifier; insertion order matches the listing order.
CONTENT_THEMES = dict(
    Historical=["century", "ancient", "historical", "history", "vintage", "archive", "heritage"],
    Travel=["travel", "journey", "expedition", "exploration", "voyage", "map", "location"],
    Science=["experiment", "research", "study", "analysis", "scientific", "laboratory"],
    Literature=["book", "novel", "poetry", "author", "literary", "chapter", "story"],
    Art=["painting", "illustration", "drawing", "artist", "exhibit", "gallery", "portrait"],
    Education=["education", "school", "university", "college", "learning", "student", "teach"],
    Politics=["government", "political", "policy", "administration", "election", "legislature"],
    Business=["business", "company", "corporation", "market", "industry", "commercial", "trade"],
    Social=["society", "community", "social", "culture", "tradition", "customs"],
    Technology=["technology", "invention", "device", "mechanical", "machine", "technical"],
    Military=["military", "army", "navy", "war", "battle", "soldier", "weapon"],
    Religion=["religion", "church", "temple", "spiritual", "sacred", "ritual"],
    Medicine=["medical", "medicine", "health", "hospital", "treatment", "disease", "doctor"],
    Legal=["legal", "law", "court", "justice", "attorney", "judicial", "statute"],
    Correspondence=["letter", "mail", "correspondence", "message", "communication"],
)
# Period label for each year range. Keys are (first_year, last_year)
# tuples; the listed ranges are contiguous and do not overlap, running
# from year 0 through 2099.
PERIOD_TAGS = dict([
    ((0, 1799), "Pre-1800s"),
    ((1800, 1849), "Early 19th Century"),
    ((1850, 1899), "Late 19th Century"),
    ((1900, 1949), "Early 20th Century"),
    ((1950, 2099), "Modern Era"),
])
# --- Fallback tags ---
# Default tag set.
DEFAULT_TAGS = [
    "Document",
    "Historical",
    "Text",
]
# Generic tag set.
GENERIC_TAGS = [
    "Archive",
    "Content",
    "Record",
]

# --- UI constants ---
# Seconds to show the completion message.
PROGRESS_DELAY = 0.8
|