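# NOTE(review): the three lines below look like web-page residue rather than
# code; they are commented out so that the module can be parsed.
# Spaces:
# Running
# Running
"""
Constants for the Historical OCR application.
This module contains all the constants used throughout the application,
making it easier to maintain and update values in one place.
"""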
# API limits
# Upper bounds on accepted input; the file-size limit is in megabytes (per its name).
MAX_FILE_SIZE_MB = 200
MAX_PAGES = 20

# Caching
CACHE_TTL_SECONDS = 24 * 3600  # 24 hours, expressed in seconds
MAX_CACHE_ENTRIES = 20

# Image processing
# NOTE(review): presumably a pixel limit and an encoder quality on a 0-100
# scale -- confirm against the image-processing code that reads these.
MAX_IMAGE_DIMENSION = 2500
IMAGE_QUALITY = 100
# Document types
# Display labels for the supported document categories. The first entry is the
# auto-detect option and the last is a free-form "other" choice.
DOCUMENT_TYPES = [
    "Auto-detect (standard processing)",
    "Newspaper or Magazine",
    "Letter or Correspondence",
    "Book or Publication",
    "Form or Legal Document",
    "Recipe",
    "Handwritten Document",
    "Map or Illustration",
    "Table or Spreadsheet",
    "Other (specify in instructions)",
]
# Document layouts
# Display labels for the supported page layouts.
DOCUMENT_LAYOUTS = [
    "Standard layout",
    "Multiple columns",
    "Table/grid format",
    "Mixed layout with images",
]
# Preprocessing document types
PREPROCESSING_DOC_TYPES = ["standard", "handwritten", "typed", "printed"]

# Rotation options
# Quarter-turn steps; NOTE(review): presumably degrees -- confirm direction
# (clockwise vs. counter-clockwise) against the code that applies them.
ROTATION_OPTIONS = [0, 90, 180, 270]

# PDF settings
# The default DPI lies between the minimum and maximum defined here.
DEFAULT_PDF_DPI = 100
MIN_PDF_DPI = 72
MAX_PDF_DPI = 300
DEFAULT_MAX_PAGES = 3

# Performance modes
PERFORMANCE_MODES = ["Quality", "Speed"]
# Custom prompt templates
# Maps a document-type label to the instruction text used for that type.
# There is one entry per document type except the auto-detect option, which
# has no template here.
CUSTOM_PROMPT_TEMPLATES = {
    "Newspaper or Magazine": "This is a newspaper/magazine. Process columns from top to bottom, capture headlines, bylines, article text and captions.",
    "Letter or Correspondence": "This is a letter/correspondence. Capture letterhead, date, greeting, body, closing and signature. Note any handwritten annotations.",
    "Book or Publication": "This is a book/publication. Extract titles, headers, footnotes, page numbers and body text. Preserve paragraph structure and any special formatting.",
    "Form or Legal Document": "This is a form/legal document. Extract all field labels and values, preserving the structure. Pay special attention to signature lines, dates, and any official markings.",
    "Recipe": "This is a recipe. Extract title, ingredients list with measurements, and preparation instructions. Maintain the distinction between ingredients and preparation steps.",
    "Handwritten Document": "This is a handwritten document. Carefully transcribe all handwritten text, preserving line breaks. Note any unclear sections or annotations.",
    "Map or Illustration": "This is a map or illustration. Transcribe all labels, legends, captions, and annotations. Note any scale indicators or directional markings.",
    "Table or Spreadsheet": "This is a table/spreadsheet. Preserve row and column structure, maintaining alignment of data. Extract headers and all cell values.",
    "Other (specify in instructions)": "Please describe the document type and any special processing requirements here.",
}
# Layout prompt additions
# Maps a layout label to extra instruction text for that layout.
# "Standard layout" has no entry, so it contributes no extra text from this table.
LAYOUT_PROMPT_ADDITIONS = {
    "Multiple columns": "Document has multiple columns. Read each column from top to bottom, then move to the next column.",
    "Table/grid format": "Document contains table data. Preserve row and column structure during extraction.",
    "Mixed layout with images": "Document has mixed text layout with images. Extract text in proper reading order.",
}
# Content themes for subject tag extraction
# Maps a theme label to a list of lowercase keywords associated with that theme.
# The same keyword may appear under several themes (e.g. "colonial",
# "indigenous", "revolution"), so a keyword does not identify a single theme.
# NOTE(review): how keywords are matched (substring vs. whole word, case
# handling) is decided by the consuming code -- confirm there.
CONTENT_THEMES = {
    # Historical Periods
    "Prehistoric": ["paleolithic", "neolithic", "stone age", "bronze age", "iron age", "prehistoric", "ancient", "archaeology", "artifact", "primitive"],
    "Ancient World": ["mesopotamia", "egypt", "greek", "roman", "persia", "babylonian", "assyrian", "pharaoh", "hieroglyphics", "cuneiform", "classical", "antiquity", "hellenistic", "republic", "empire"],
    "Medieval": ["middle ages", "medieval", "feudal", "crusades", "byzantine", "carolingian", "holy roman empire", "dark ages", "castle", "knights", "chivalry", "monastery", "plague", "viking", "norse"],
    "Renaissance": ["renaissance", "humanism", "reformation", "counter-reformation", "medici", "tudor", "elizabethan", "shakespeare", "machiavelli", "gutenberg", "printing press"],
    "Early Modern": ["early modern", "enlightenment", "age of reason", "scientific revolution", "colonial", "colonization", "imperialism", "revolution", "baroque", "bourbon", "habsburg", "stuart"],
    "18th Century": ["18th century", "1700s", "revolution", "american revolution", "french revolution", "enlightenment", "rococo", "neoclassical", "voltaire", "rousseau", "industrial"],
    "19th Century": ["19th century", "1800s", "victorian", "romantic", "napoleonic", "civil war", "industrial revolution", "manifest destiny", "colonial", "imperialism", "belle epoque", "fin de siecle"],
    "20th Century": ["20th century", "1900s", "world war", "great depression", "cold war", "interwar", "postwar", "modernism", "atomic", "post-colonial", "totalitarian", "fascism", "soviet", "civil rights"],
    "Contemporary": ["contemporary", "modern", "postmodern", "digital age", "globalization", "information age", "post-industrial", "post-colonial", "post-soviet", "post-war", "21st century"],
    # Geographic Contexts
    "European History": ["europe", "western europe", "eastern europe", "central europe", "mediterranean", "nordic", "iberian", "british", "habsburg", "bourbon", "prussia", "holy roman empire"],
    "Asian History": ["asia", "east asia", "south asia", "central asia", "southeast asia", "china", "japan", "india", "persia", "ottoman", "mongolian", "dynasty", "shogunate", "mughal", "silk road"],
    "African History": ["africa", "north africa", "west africa", "east africa", "sub-saharan", "sahel", "swahili", "maghreb", "nubian", "ethiopian", "zulu", "colonial africa", "apartheid"],
    "American History": ["america", "colonial america", "revolutionary", "antebellum", "civil war", "reconstruction", "frontier", "westward expansion", "manifest destiny", "native american", "indigenous"],
    "Latin American": ["latin america", "mesoamerica", "caribbean", "aztec", "mayan", "inca", "colonial", "viceroyalty", "independence", "revolution", "hispanic", "creole", "mestizo", "indigenous"],
    "Oceanic History": ["oceania", "pacific", "australian", "aboriginal", "indigenous", "polynesian", "melanesian", "micronesian", "maori", "maritime", "exploration", "settlement", "colonial"],
    # Historical Methodologies & Approaches
    "Archival Research": ["archive", "manuscript", "primary source", "provenance", "document", "preservation", "cataloging", "repository", "collection", "papers", "fonds", "records", "registry"],
    "Oral History": ["oral history", "testimony", "interview", "narrative", "memory", "ethnography", "storytelling", "tradition", "folklore", "witness", "account", "recording", "indigenous knowledge"],
    "Historical Archaeology": ["archaeology", "excavation", "artifact", "material culture", "stratigraphy", "conservation", "field work", "site", "ruins", "preservation", "heritage", "restoration"],
    "Digital History": ["digital", "database", "digitization", "computational", "network analysis", "gis", "mapping", "visualization", "data mining", "text analysis", "digital humanities", "encoding"],
    "Historiography": ["historiography", "revisionism", "interpretation", "narrative", "discourse", "bias", "perspective", "theory", "methodology", "framework", "historical thinking", "meta-history"],
    # Historical Document Types
    "Administrative Records": ["record", "registry", "account", "ledger", "census", "tax roll", "inventory", "charter", "deed", "grant", "patent", "minutes", "docket", "survey", "assessment", "register"],
    "Diplomatic Documents": ["treaty", "agreement", "proclamation", "declaration", "diplomatic", "embassy", "consul", "dispatch", "communique", "protocol", "convention", "alliance", "international"],
    "Personal Papers": ["diary", "journal", "memoir", "autobiography", "correspondence", "letter", "personal", "private", "papers", "notes", "scrapbook", "commonplace book", "sketchbook"],
    "Media History": ["newspaper", "gazette", "periodical", "pamphlet", "broadside", "print culture", "press", "editorial", "journalism", "reporter", "editor", "circulation", "readership", "subscriber"],
    "Visual Materials": ["photograph", "illustration", "print", "map", "atlas", "cartography", "engraving", "woodcut", "lithograph", "panorama", "portrait", "landscape", "sketch", "drawing", "plate"],
    "Legal Documents": ["legal", "law", "statute", "code", "constitution", "legislation", "decree", "ordinance", "bylaw", "regulation", "case", "trial", "testimony", "deposition", "verdict", "judgment"],
    # Historical Themes & Movements
    "Economic History": ["economic", "commerce", "trade", "market", "merchant", "finance", "banking", "currency", "coin", "inflation", "recession", "depression", "exchange", "capital", "labor", "guild"],
    "Social History": ["social", "society", "class", "status", "hierarchy", "everyday life", "community", "neighborhood", "urban", "rural", "poverty", "wealth", "leisure", "entertainment", "customs"],
    "Political History": ["political", "politics", "government", "state", "monarchy", "republic", "democracy", "aristocracy", "parliament", "congress", "election", "regime", "policy", "reform", "revolution"],
    "Intellectual History": ["intellectual", "idea", "philosophy", "theory", "concept", "movement", "thought", "discourse", "debate", "enlightenment", "rationalism", "empiricism", "ideology"],
    "Cultural History": ["cultural", "culture", "custom", "tradition", "ritual", "ceremony", "festival", "celebration", "holiday", "folklore", "music", "art", "literature", "fashion", "consumption"],
    "Religious History": ["religious", "religion", "church", "theology", "belief", "faith", "worship", "ritual", "sacred", "clergy", "monastery", "temple", "mosque", "synagogue", "pilgrimage", "sect"],
    "Military History": ["military", "war", "conflict", "battle", "campaign", "siege", "army", "navy", "soldier", "officer", "regiment", "battalion", "artillery", "cavalry", "infantry", "strategy", "tactics"],
    "Science History": ["scientific", "science", "experiment", "discovery", "theory", "hypothesis", "observation", "laboratory", "academy", "research", "natural philosophy", "medicine", "technology"],
    "Environmental History": ["environmental", "ecology", "climate", "weather", "landscape", "agriculture", "farming", "forestry", "conservation", "pollution", "resource", "sustainability", "natural"],
    # Specialized Historical Topics
    "Migration History": ["migration", "immigration", "emigration", "diaspora", "exile", "refugee", "settlement", "colonization", "population movement", "forced migration", "displacement", "resettlement"],
    "Maritime History": ["maritime", "naval", "shipping", "navigation", "sailor", "piracy", "privateering", "admiralty", "port", "harbor", "shipyard", "vessel", "sail", "trade route", "exploration"],
    "Gender History": ["gender", "women", "feminist", "sexuality", "masculinity", "femininity", "patriarchy", "suffrage", "domestic", "family", "marriage", "emancipation", "rights", "equality"],
    "Labor History": ["labor", "worker", "union", "strike", "apprentice", "guild", "factory", "workshop", "wage", "hours", "working conditions", "industrialization", "mechanization", "automation"],
    "Urban History": ["urban", "city", "town", "metropolitan", "municipal", "civic", "suburb", "neighborhood", "planning", "infrastructure", "utilities", "housing", "development", "gentrification"],
    "Rural History": ["rural", "countryside", "village", "agricultural", "farming", "peasant", "yeoman", "tenant", "sharecropper", "enclosure", "common land", "manor", "estate", "plantation"],
    "Colonial History": ["colonial", "colony", "settlement", "frontier", "borderland", "territory", "dominion", "province", "governance", "administration", "native", "indigenous", "contact zone"],
    "Indigenous History": ["indigenous", "native", "aboriginal", "first nations", "tribal", "reservation", "sovereignty", "land rights", "treaty rights", "cultural preservation", "oral tradition"],
    # General Historical Terms
    "Historical": ["history", "historical", "historiography", "heritage", "legacy", "tradition", "memory", "commemoration", "preservation", "conservation", "restoration", "interpretation", "significance"],
    "Chronology": ["chronology", "timeline", "periodization", "era", "epoch", "age", "century", "decade", "millennium", "year", "date", "dating", "chronological", "contemporary", "synchronic", "diachronic"],
    "Heritage": ["heritage", "preservation", "conservation", "landmark", "monument", "historic site", "museum", "archive", "collection", "artifact", "relic", "antiquity", "cultural heritage", "patrimony"],
}
# Period tags based on year ranges
# These ranges are used to assign historical period tags to documents based on their year.
# Keys are (start_year, end_year) tuples. Consecutive ranges abut without
# overlap and together span 0 through 2099, so the bounds read as inclusive.
# NOTE(review): years outside 0-2099 have no entry -- confirm the lookup code
# handles that case.
# The year-range labels use an en dash (U+2013); the mis-decoded character
# that previously stood in its place has been repaired.
PERIOD_TAGS = {
    (0, 499): "Ancient Era (to 500 CE)",
    (500, 999): "Early Medieval (500–1000)",
    (1000, 1299): "High Medieval (1000–1300)",
    (1300, 1499): "Late Medieval (1300–1500)",
    (1500, 1599): "Renaissance (1500–1600)",
    (1600, 1699): "Early Modern (1600–1700)",
    (1700, 1775): "Enlightenment (1700–1775)",
    (1776, 1799): "Age of Revolutions (1776–1800)",
    (1800, 1849): "Early 19th Century (1800–1850)",
    (1850, 1899): "Late 19th Century (1850–1900)",
    (1900, 1918): "Early 20th Century & WWI (1900–1918)",
    (1919, 1938): "Interwar Period (1919–1938)",
    (1939, 1945): "World War II (1939–1945)",
    (1946, 1968): "Postwar & Mid-20th Century (1946–1968)",
    (1969, 1989): "Late 20th Century (1969–1989)",
    (1990, 2000): "Turn of the 21st Century (1990–2000)",
    (2001, 2099): "Contemporary (21st Century)",
}
# Default fallback tags for documents when no specific tags are detected.
DEFAULT_TAGS = [
    "Document",
    "Historical",
    "Text",
    "Primary Source",
    "Archival Material",
    "Record",
    "Manuscript",
    "Printed Material",
    "Correspondence",
    "Publication",
]
# Generic tags that can be used for broad categorization or as supplemental tags.
GENERIC_TAGS = [
    "Archive",
    "Content",
    "Record",
    "Source",
    "Material",
    "Page",
    "Scan",
    "Image",
    "Transcription",
    "Uncategorized",
    "General",
    "Miscellaneous",
]
# UI constants
PROGRESS_DELAY = 0.8  # Seconds to show completion message