Spaces:
Running
Running
| """ | |
| Constants for the Historical OCR application. | |
| This module contains all the constants used throughout the application, | |
| making it easier to maintain and update values in one place. | |
| """ | |
| # API limits | |
| MAX_FILE_SIZE_MB = 200 | |
| MAX_PAGES = 20 | |
| # Caching | |
| CACHE_TTL_SECONDS = 24 * 3600 # 24 hours | |
| MAX_CACHE_ENTRIES = 20 | |
| # Image processing | |
| MAX_IMAGE_DIMENSION = 2500 | |
| IMAGE_QUALITY = 100 | |
| # Document types | |
| DOCUMENT_TYPES = [ | |
| "Auto-detect (standard processing)", | |
| "Newspaper or Magazine", | |
| "Letter or Correspondence", | |
| "Book or Publication", | |
| "Form or Legal Document", | |
| "Recipe", | |
| "Handwritten Document", | |
| "Map or Illustration", | |
| "Table or Spreadsheet", | |
| "Other (specify in instructions)" | |
| ] | |
| # Document layouts | |
| DOCUMENT_LAYOUTS = [ | |
| "Standard layout", | |
| "Multiple columns", | |
| "Table/grid format", | |
| "Mixed layout with images" | |
| ] | |
| # Preprocessing document types | |
| PREPROCESSING_DOC_TYPES = ["standard", "handwritten", "typed", "printed"] | |
| # Rotation options | |
| ROTATION_OPTIONS = [0, 90, 180, 270] | |
| # PDF settings | |
| DEFAULT_PDF_DPI = 100 | |
| MIN_PDF_DPI = 72 | |
| MAX_PDF_DPI = 300 | |
| DEFAULT_MAX_PAGES = 3 | |
| # Performance modes | |
| PERFORMANCE_MODES = ["Quality", "Speed"] | |
| # Custom prompt templates | |
| CUSTOM_PROMPT_TEMPLATES = { | |
| "Newspaper or Magazine": "This is a newspaper/magazine. Process columns from top to bottom, capture headlines, bylines, article text and captions.", | |
| "Letter or Correspondence": "This is a letter/correspondence. Capture letterhead, date, greeting, body, closing and signature. Note any handwritten annotations.", | |
| "Book or Publication": "This is a book/publication. Extract titles, headers, footnotes, page numbers and body text. Preserve paragraph structure and any special formatting.", | |
| "Form or Legal Document": "This is a form/legal document. Extract all field labels and values, preserving the structure. Pay special attention to signature lines, dates, and any official markings.", | |
| "Recipe": "This is a recipe. Extract title, ingredients list with measurements, and preparation instructions. Maintain the distinction between ingredients and preparation steps.", | |
| "Handwritten Document": "This is a handwritten document. Carefully transcribe all handwritten text, preserving line breaks. Note any unclear sections or annotations.", | |
| "Map or Illustration": "This is a map or illustration. Transcribe all labels, legends, captions, and annotations. Note any scale indicators or directional markings.", | |
| "Table or Spreadsheet": "This is a table/spreadsheet. Preserve row and column structure, maintaining alignment of data. Extract headers and all cell values.", | |
| "Other (specify in instructions)": "Please describe the document type and any special processing requirements here." | |
| } | |
| # Layout prompt additions | |
| LAYOUT_PROMPT_ADDITIONS = { | |
| "Multiple columns": "Document has multiple columns. Read each column from top to bottom, then move to the next column.", | |
| "Table/grid format": "Document contains table data. Preserve row and column structure during extraction.", | |
| "Mixed layout with images": "Document has mixed text layout with images. Extract text in proper reading order." | |
| } | |
| # Content themes for subject tag extraction | |
| CONTENT_THEMES = { | |
| # Historical Periods | |
| "Prehistoric": ["paleolithic", "neolithic", "stone age", "bronze age", "iron age", "prehistoric", "ancient", "archaeology", "artifact", "primitive"], | |
| "Ancient World": ["mesopotamia", "egypt", "greek", "roman", "persia", "babylonian", "assyrian", "pharaoh", "hieroglyphics", "cuneiform", "classical", "antiquity", "hellenistic", "republic", "empire"], | |
| "Medieval": ["middle ages", "medieval", "feudal", "crusades", "byzantine", "carolingian", "holy roman empire", "dark ages", "castle", "knights", "chivalry", "monastery", "plague", "viking", "norse"], | |
| "Renaissance": ["renaissance", "humanism", "reformation", "counter-reformation", "medici", "tudor", "elizabethan", "shakespeare", "machiavelli", "gutenberg", "printing press"], | |
| "Early Modern": ["early modern", "enlightenment", "age of reason", "scientific revolution", "colonial", "colonization", "imperialism", "revolution", "baroque", "bourbon", "habsburg", "stuart"], | |
| "18th Century": ["18th century", "1700s", "revolution", "american revolution", "french revolution", "enlightenment", "rococo", "neoclassical", "voltaire", "rousseau", "industrial"], | |
| "19th Century": ["19th century", "1800s", "victorian", "romantic", "napoleonic", "civil war", "industrial revolution", "manifest destiny", "colonial", "imperialism", "belle epoque", "fin de siecle"], | |
| "20th Century": ["20th century", "1900s", "world war", "great depression", "cold war", "interwar", "postwar", "modernism", "atomic", "post-colonial", "totalitarian", "fascism", "soviet", "civil rights"], | |
| "Contemporary": ["contemporary", "modern", "postmodern", "digital age", "globalization", "information age", "post-industrial", "post-colonial", "post-soviet", "post-war", "21st century"], | |
| # Geographic Contexts | |
| "European History": ["europe", "western europe", "eastern europe", "central europe", "mediterranean", "nordic", "iberian", "british", "habsburg", "bourbon", "prussia", "holy roman empire"], | |
| "Asian History": ["asia", "east asia", "south asia", "central asia", "southeast asia", "china", "japan", "india", "persia", "ottoman", "mongolian", "dynasty", "shogunate", "mughal", "silk road"], | |
| "African History": ["africa", "north africa", "west africa", "east africa", "sub-saharan", "sahel", "swahili", "maghreb", "nubian", "ethiopian", "zulu", "colonial africa", "apartheid"], | |
| "American History": ["america", "colonial america", "revolutionary", "antebellum", "civil war", "reconstruction", "frontier", "westward expansion", "manifest destiny", "native american", "indigenous"], | |
| "Latin American": ["latin america", "mesoamerica", "caribbean", "aztec", "mayan", "inca", "colonial", "viceroyalty", "independence", "revolution", "hispanic", "creole", "mestizo", "indigenous"], | |
| "Oceanic History": ["oceania", "pacific", "australian", "aboriginal", "indigenous", "polynesian", "melanesian", "micronesian", "maori", "maritime", "exploration", "settlement", "colonial"], | |
| # Historical Methodologies & Approaches | |
| "Archival Research": ["archive", "manuscript", "primary source", "provenance", "document", "preservation", "cataloging", "repository", "collection", "papers", "fonds", "records", "registry"], | |
| "Oral History": ["oral history", "testimony", "interview", "narrative", "memory", "ethnography", "storytelling", "tradition", "folklore", "witness", "account", "recording", "indigenous knowledge"], | |
| "Historical Archaeology": ["archaeology", "excavation", "artifact", "material culture", "stratigraphy", "conservation", "field work", "site", "ruins", "preservation", "heritage", "restoration"], | |
| "Digital History": ["digital", "database", "digitization", "computational", "network analysis", "gis", "mapping", "visualization", "data mining", "text analysis", "digital humanities", "encoding"], | |
| "Historiography": ["historiography", "revisionism", "interpretation", "narrative", "discourse", "bias", "perspective", "theory", "methodology", "framework", "historical thinking", "meta-history"], | |
| # Historical Document Types | |
| "Administrative Records": ["record", "registry", "account", "ledger", "census", "tax roll", "inventory", "charter", "deed", "grant", "patent", "minutes", "docket", "survey", "assessment", "register"], | |
| "Diplomatic Documents": ["treaty", "agreement", "proclamation", "declaration", "diplomatic", "embassy", "consul", "dispatch", "communique", "protocol", "convention", "alliance", "international"], | |
| "Personal Papers": ["diary", "journal", "memoir", "autobiography", "correspondence", "letter", "personal", "private", "papers", "notes", "scrapbook", "commonplace book", "sketchbook"], | |
| "Media History": ["newspaper", "gazette", "periodical", "pamphlet", "broadside", "print culture", "press", "editorial", "journalism", "reporter", "editor", "circulation", "readership", "subscriber"], | |
| "Visual Materials": ["photograph", "illustration", "print", "map", "atlas", "cartography", "engraving", "woodcut", "lithograph", "panorama", "portrait", "landscape", "sketch", "drawing", "plate"], | |
| "Legal Documents": ["legal", "law", "statute", "code", "constitution", "legislation", "decree", "ordinance", "bylaw", "regulation", "case", "trial", "testimony", "deposition", "verdict", "judgment"], | |
| # Historical Themes & Movements | |
| "Economic History": ["economic", "commerce", "trade", "market", "merchant", "finance", "banking", "currency", "coin", "inflation", "recession", "depression", "exchange", "capital", "labor", "guild"], | |
| "Social History": ["social", "society", "class", "status", "hierarchy", "everyday life", "community", "neighborhood", "urban", "rural", "poverty", "wealth", "leisure", "entertainment", "customs"], | |
| "Political History": ["political", "politics", "government", "state", "monarchy", "republic", "democracy", "aristocracy", "parliament", "congress", "election", "regime", "policy", "reform", "revolution"], | |
| "Intellectual History": ["intellectual", "idea", "philosophy", "theory", "concept", "movement", "thought", "discourse", "debate", "enlightenment", "rationalism", "empiricism", "ideology"], | |
| "Cultural History": ["cultural", "culture", "custom", "tradition", "ritual", "ceremony", "festival", "celebration", "holiday", "folklore", "music", "art", "literature", "fashion", "consumption"], | |
| "Religious History": ["religious", "religion", "church", "theology", "belief", "faith", "worship", "ritual", "sacred", "clergy", "monastery", "temple", "mosque", "synagogue", "pilgrimage", "sect"], | |
| "Military History": ["military", "war", "conflict", "battle", "campaign", "siege", "army", "navy", "soldier", "officer", "regiment", "battalion", "artillery", "cavalry", "infantry", "strategy", "tactics"], | |
| "Science History": ["scientific", "science", "experiment", "discovery", "theory", "hypothesis", "observation", "laboratory", "academy", "research", "natural philosophy", "medicine", "technology"], | |
| "Environmental History": ["environmental", "ecology", "climate", "weather", "landscape", "agriculture", "farming", "forestry", "conservation", "pollution", "resource", "sustainability", "natural"], | |
| # Specialized Historical Topics | |
| "Migration History": ["migration", "immigration", "emigration", "diaspora", "exile", "refugee", "settlement", "colonization", "population movement", "forced migration", "displacement", "resettlement"], | |
| "Maritime History": ["maritime", "naval", "shipping", "navigation", "sailor", "piracy", "privateering", "admiralty", "port", "harbor", "shipyard", "vessel", "sail", "trade route", "exploration"], | |
| "Gender History": ["gender", "women", "feminist", "sexuality", "masculinity", "femininity", "patriarchy", "suffrage", "domestic", "family", "marriage", "emancipation", "rights", "equality"], | |
| "Labor History": ["labor", "worker", "union", "strike", "apprentice", "guild", "factory", "workshop", "wage", "hours", "working conditions", "industrialization", "mechanization", "automation"], | |
| "Urban History": ["urban", "city", "town", "metropolitan", "municipal", "civic", "suburb", "neighborhood", "planning", "infrastructure", "utilities", "housing", "development", "gentrification"], | |
| "Rural History": ["rural", "countryside", "village", "agricultural", "farming", "peasant", "yeoman", "tenant", "sharecropper", "enclosure", "common land", "manor", "estate", "plantation"], | |
| "Colonial History": ["colonial", "colony", "settlement", "frontier", "borderland", "territory", "dominion", "province", "governance", "administration", "native", "indigenous", "contact zone"], | |
| "Indigenous History": ["indigenous", "native", "aboriginal", "first nations", "tribal", "reservation", "sovereignty", "land rights", "treaty rights", "cultural preservation", "oral tradition"], | |
| # General Historical Terms | |
| "Historical": ["history", "historical", "historiography", "heritage", "legacy", "tradition", "memory", "commemoration", "preservation", "conservation", "restoration", "interpretation", "significance"], | |
| "Chronology": ["chronology", "timeline", "periodization", "era", "epoch", "age", "century", "decade", "millennium", "year", "date", "dating", "chronological", "contemporary", "synchronic", "diachronic"], | |
| "Heritage": ["heritage", "preservation", "conservation", "landmark", "monument", "historic site", "museum", "archive", "collection", "artifact", "relic", "antiquity", "cultural heritage", "patrimony"] | |
| } | |
| # Period tags based on year ranges | |
| # These ranges are used to assign historical period tags to documents based on their year. | |
| PERIOD_TAGS = { | |
| (0, 499): "Ancient Era (to 500 CE)", | |
| (500, 999): "Early Medieval (500–1000)", | |
| (1000, 1299): "High Medieval (1000–1300)", | |
| (1300, 1499): "Late Medieval (1300–1500)", | |
| (1500, 1599): "Renaissance (1500–1600)", | |
| (1600, 1699): "Early Modern (1600–1700)", | |
| (1700, 1775): "Enlightenment (1700–1775)", | |
| (1776, 1799): "Age of Revolutions (1776–1800)", | |
| (1800, 1849): "Early 19th Century (1800–1850)", | |
| (1850, 1899): "Late 19th Century (1850–1900)", | |
| (1900, 1918): "Early 20th Century & WWI (1900–1918)", | |
| (1919, 1938): "Interwar Period (1919–1938)", | |
| (1939, 1945): "World War II (1939–1945)", | |
| (1946, 1968): "Postwar & Mid-20th Century (1946–1968)", | |
| (1969, 1989): "Late 20th Century (1969–1989)", | |
| (1990, 2000): "Turn of the 21st Century (1990–2000)", | |
| (2001, 2099): "Contemporary (21st Century)" | |
| } | |
| # Default fallback tags for documents when no specific tags are detected. | |
| DEFAULT_TAGS = [ | |
| "Document", | |
| "Historical", | |
| "Text", | |
| "Primary Source", | |
| "Archival Material", | |
| "Record", | |
| "Manuscript", | |
| "Printed Material", | |
| "Correspondence", | |
| "Publication" | |
| ] | |
| # Generic tags that can be used for broad categorization or as supplemental tags. | |
| GENERIC_TAGS = [ | |
| "Archive", | |
| "Content", | |
| "Record", | |
| "Source", | |
| "Material", | |
| "Page", | |
| "Scan", | |
| "Image", | |
| "Transcription", | |
| "Uncategorized", | |
| "General", | |
| "Miscellaneous" | |
| ] | |
| # UI constants | |
| PROGRESS_DELAY = 0.8 # Seconds to show completion message | |