Spaces:
Running
Running
Integrate image segmentation and language detection modules
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- .DS_Store +0 -0
- .gitignore +0 -0
- CLAUDE.md +9 -13
- __pycache__/config.cpython-312.pyc +0 -0
- __pycache__/constants.cpython-312.pyc +0 -0
- __pycache__/error_handler.cpython-312.pyc +0 -0
- __pycache__/image_segmentation.cpython-312.pyc +0 -0
- __pycache__/language_detection.cpython-312.pyc +0 -0
- __pycache__/ocr_processing.cpython-312.pyc +0 -0
- __pycache__/ocr_utils.cpython-312.pyc +0 -0
- __pycache__/preprocessing.cpython-312.pyc +0 -0
- __pycache__/structured_ocr.cpython-312.pyc +0 -0
- __pycache__/ui_components.cpython-312.pyc +0 -0
- __pycache__/utils.cpython-312.pyc +0 -0
- app.py +10 -1
- config.py +1 -1
- constants.py +61 -17
- image_segmentation.py +246 -0
- language_detection.py +374 -0
- ocr_processing.py +39 -1
- output/magellan_test_result.json +64 -0
- output/segmentation_test/comparison_report.md +25 -0
- output/segmentation_test/magician-or-bottle-cungerer_combined.jpg +3 -0
- output/segmentation_test/magician-or-bottle-cungerer_image_regions.jpg +3 -0
- output/segmentation_test/magician-or-bottle-cungerer_text_mask.png +3 -0
- output/segmentation_test/magician-or-bottle-cungerer_text_regions.jpg +3 -0
- output/segmentation_test/result_with_segmentation.json +0 -0
- output/segmentation_test/result_without_segmentation.json +0 -0
- output/segmentation_test/segmentation_results.json +11 -0
- output/segmentation_test/text_with_segmentation.txt +0 -0
- output/segmentation_test/text_without_segmentation.txt +0 -0
- preprocessing.py +10 -8
- requirements.txt +19 -13
- structured_ocr.py +209 -18
- test_magellan_language.py +39 -0
- test_magician.py +57 -0
- testing/magician_app_investigation_plan.md +58 -0
- testing/magician_app_result.json +16 -0
- testing/magician_image_final_report.md +58 -0
- testing/magician_image_findings.md +84 -0
- testing/magician_ocr_text.txt +9 -0
- testing/magician_test/branch_comparison.txt +20 -0
- testing/magician_test/processed_magician.jpg +3 -0
- testing/magician_test/test_report.txt +16 -0
- testing/newspaper_test/newspaper_comparison.jpg +3 -0
- testing/newspaper_test/newspaper_test_report.txt +18 -0
- testing/newspaper_test/processed_newspaper.jpg +3 -0
- testing/output/processed_magician.jpg +3 -0
- testing/output/test_report.txt +16 -0
- testing/test_app_direct.py +180 -0
.DS_Store
ADDED
Binary file (8.2 kB). View file
|
|
.gitignore
ADDED
File without changes
|
CLAUDE.md
CHANGED
@@ -5,17 +5,15 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
|
|
5 |
## Commands
|
6 |
- Run app: `streamlit run app.py`
|
7 |
- Test OCR functionality: `python structured_ocr.py <file_path>`
|
8 |
-
- Process PDF files: `python pdf_ocr.py <file_path>`
|
9 |
- Process single file with logging: `python process_file.py <file_path>`
|
10 |
-
- Run
|
11 |
-
- Run notebook demo: `jupyter notebook notebook_demo.ipynb`
|
12 |
- Run typechecking: `mypy .`
|
13 |
- Lint code: `ruff check .` or `flake8`
|
14 |
|
15 |
## Environment Setup
|
16 |
- API key: Set `MISTRAL_API_KEY` in `.env` file or environment variable
|
17 |
- Install dependencies: `pip install -r requirements.txt`
|
18 |
-
- System requirements: Install `poppler-utils` and `tesseract-ocr` for PDF processing
|
19 |
|
20 |
## Code Style Guidelines
|
21 |
- **Imports**: Standard library first, third-party next, local modules last
|
@@ -23,14 +21,12 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
|
|
23 |
- **Error handling**: Use specific exceptions with informative messages
|
24 |
- **Naming**: snake_case for variables/functions, PascalCase for classes
|
25 |
- **Documentation**: Google-style docstrings for all functions/classes
|
26 |
-
- **
|
27 |
-
- **Exception handling**: Implement graceful fallbacks for API errors
|
28 |
- **Line length**: ≤100 characters
|
29 |
|
30 |
-
##
|
31 |
-
-
|
32 |
-
-
|
33 |
-
-
|
34 |
-
-
|
35 |
-
-
|
36 |
-
- Demo: `notebook_demo.ipynb` - Interactive notebook with educational examples
|
|
|
5 |
## Commands
|
6 |
- Run app: `streamlit run app.py`
|
7 |
- Test OCR functionality: `python structured_ocr.py <file_path>`
|
|
|
8 |
- Process single file with logging: `python process_file.py <file_path>`
|
9 |
+
- Run specific test: `python testing/test_magician_image.py`
|
|
|
10 |
- Run typechecking: `mypy .`
|
11 |
- Lint code: `ruff check .` or `flake8`
|
12 |
|
13 |
## Environment Setup
|
14 |
- API key: Set `MISTRAL_API_KEY` in `.env` file or environment variable
|
15 |
- Install dependencies: `pip install -r requirements.txt`
|
16 |
+
- System requirements: Install `poppler-utils` and `tesseract-ocr` for PDF processing
|
17 |
|
18 |
## Code Style Guidelines
|
19 |
- **Imports**: Standard library first, third-party next, local modules last
|
|
|
21 |
- **Error handling**: Use specific exceptions with informative messages
|
22 |
- **Naming**: snake_case for variables/functions, PascalCase for classes
|
23 |
- **Documentation**: Google-style docstrings for all functions/classes
|
24 |
+
- **Preprocessing**: Support handwritten documents via document_type parameter
|
|
|
25 |
- **Line length**: ≤100 characters
|
26 |
|
27 |
+
## Base64 Encoding
|
28 |
+
- Always include MIME type in data URLs: `data:image/jpeg;base64,...`
|
29 |
+
- Use the appropriate MIME type for different file formats: jpeg, png, pdf, etc.
|
30 |
+
- For encoded bytes, use `encode_bytes_for_api` with correct MIME type
|
31 |
+
- For file paths, use `encode_image_for_api` which auto-detects MIME type
|
32 |
+
- In utils.py, use `get_base64_from_bytes` for raw bytes or `get_base64_from_image` for files
|
|
__pycache__/config.cpython-312.pyc
CHANGED
Binary files a/__pycache__/config.cpython-312.pyc and b/__pycache__/config.cpython-312.pyc differ
|
|
__pycache__/constants.cpython-312.pyc
ADDED
Binary file (11.6 kB). View file
|
|
__pycache__/error_handler.cpython-312.pyc
ADDED
Binary file (3.2 kB). View file
|
|
__pycache__/image_segmentation.cpython-312.pyc
ADDED
Binary file (10.6 kB). View file
|
|
__pycache__/language_detection.cpython-312.pyc
ADDED
Binary file (18 kB). View file
|
|
__pycache__/ocr_processing.cpython-312.pyc
ADDED
Binary file (15.5 kB). View file
|
|
__pycache__/ocr_utils.cpython-312.pyc
CHANGED
Binary files a/__pycache__/ocr_utils.cpython-312.pyc and b/__pycache__/ocr_utils.cpython-312.pyc differ
|
|
__pycache__/preprocessing.cpython-312.pyc
ADDED
Binary file (9.21 kB). View file
|
|
__pycache__/structured_ocr.cpython-312.pyc
CHANGED
Binary files a/__pycache__/structured_ocr.cpython-312.pyc and b/__pycache__/structured_ocr.cpython-312.pyc differ
|
|
__pycache__/ui_components.cpython-312.pyc
ADDED
Binary file (44.1 kB). View file
|
|
__pycache__/utils.cpython-312.pyc
ADDED
Binary file (14.2 kB). View file
|
|
app.py
CHANGED
@@ -365,7 +365,16 @@ def process_document(uploaded_file, left_col, right_col, sidebar_options):
|
|
365 |
|
366 |
# Show preprocessing metadata in a well-formatted caption
|
367 |
meta_items = []
|
368 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
369 |
meta_items.append(f"Document type ({sidebar_options['preprocessing_options']['document_type']})")
|
370 |
if sidebar_options["preprocessing_options"].get("grayscale", False):
|
371 |
meta_items.append("Grayscale")
|
|
|
365 |
|
366 |
# Show preprocessing metadata in a well-formatted caption
|
367 |
meta_items = []
|
368 |
+
# Only include document type in the list if actual preprocessing is applied
|
369 |
+
has_active_preprocessing = (
|
370 |
+
sidebar_options["preprocessing_options"].get("grayscale", False) or
|
371 |
+
sidebar_options["preprocessing_options"].get("denoise", False) or
|
372 |
+
sidebar_options["preprocessing_options"].get("contrast", 0) != 0 or
|
373 |
+
sidebar_options["preprocessing_options"].get("rotation", 0) != 0
|
374 |
+
)
|
375 |
+
|
376 |
+
# Only show document type if there's actual preprocessing being applied
|
377 |
+
if has_active_preprocessing and sidebar_options["preprocessing_options"].get("document_type", "standard") != "standard":
|
378 |
meta_items.append(f"Document type ({sidebar_options['preprocessing_options']['document_type']})")
|
379 |
if sidebar_options["preprocessing_options"].get("grayscale", False):
|
380 |
meta_items.append("Grayscale")
|
config.py
CHANGED
@@ -40,7 +40,7 @@ VISION_MODEL = os.environ.get("MISTRAL_VISION_MODEL", "mistral-small-latest") #
|
|
40 |
# Image preprocessing settings optimized for historical documents
|
41 |
# These can be customized from environment variables
|
42 |
IMAGE_PREPROCESSING = {
|
43 |
-
"enhance_contrast": float(os.environ.get("ENHANCE_CONTRAST", "1.
|
44 |
"sharpen": os.environ.get("SHARPEN", "True").lower() in ("true", "1", "yes"),
|
45 |
"denoise": os.environ.get("DENOISE", "True").lower() in ("true", "1", "yes"),
|
46 |
"max_size_mb": float(os.environ.get("MAX_IMAGE_SIZE_MB", "12.0")), # Increased size limit for better quality
|
|
|
40 |
# Image preprocessing settings optimized for historical documents
|
41 |
# These can be customized from environment variables
|
42 |
IMAGE_PREPROCESSING = {
|
43 |
+
"enhance_contrast": float(os.environ.get("ENHANCE_CONTRAST", "1.2")), # Reduced contrast for more natural image appearance
|
44 |
"sharpen": os.environ.get("SHARPEN", "True").lower() in ("true", "1", "yes"),
|
45 |
"denoise": os.environ.get("DENOISE", "True").lower() in ("true", "1", "yes"),
|
46 |
"max_size_mb": float(os.environ.get("MAX_IMAGE_SIZE_MB", "12.0")), # Increased size limit for better quality
|
constants.py
CHANGED
@@ -6,7 +6,7 @@ making it easier to maintain and update values in one place.
|
|
6 |
"""
|
7 |
|
8 |
# API limits
|
9 |
-
MAX_FILE_SIZE_MB =
|
10 |
MAX_PAGES = 20
|
11 |
|
12 |
# Caching
|
@@ -15,7 +15,7 @@ MAX_CACHE_ENTRIES = 20
|
|
15 |
|
16 |
# Image processing
|
17 |
MAX_IMAGE_DIMENSION = 2500
|
18 |
-
IMAGE_QUALITY =
|
19 |
|
20 |
# Document types
|
21 |
DOCUMENT_TYPES = [
|
@@ -76,21 +76,65 @@ LAYOUT_PROMPT_ADDITIONS = {
|
|
76 |
|
77 |
# Content themes for subject tag extraction
|
78 |
CONTENT_THEMES = {
|
79 |
-
|
80 |
-
"
|
81 |
-
"
|
82 |
-
"
|
83 |
-
"
|
84 |
-
"
|
85 |
-
"
|
86 |
-
"
|
87 |
-
"
|
88 |
-
"
|
89 |
-
|
90 |
-
|
91 |
-
"
|
92 |
-
"
|
93 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
}
|
95 |
|
96 |
# Period tags based on year ranges
|
|
|
6 |
"""
|
7 |
|
8 |
# API limits
|
9 |
+
MAX_FILE_SIZE_MB = 200
|
10 |
MAX_PAGES = 20
|
11 |
|
12 |
# Caching
|
|
|
15 |
|
16 |
# Image processing
|
17 |
MAX_IMAGE_DIMENSION = 2500
|
18 |
+
IMAGE_QUALITY = 100
|
19 |
|
20 |
# Document types
|
21 |
DOCUMENT_TYPES = [
|
|
|
76 |
|
77 |
# Content themes for subject tag extraction
|
78 |
CONTENT_THEMES = {
|
79 |
+
# Historical Periods
|
80 |
+
"Prehistoric": ["paleolithic", "neolithic", "stone age", "bronze age", "iron age", "prehistoric", "ancient", "archaeology", "artifact", "primitive"],
|
81 |
+
"Ancient World": ["mesopotamia", "egypt", "greek", "roman", "persia", "babylonian", "assyrian", "pharaoh", "hieroglyphics", "cuneiform", "classical", "antiquity", "hellenistic", "republic", "empire"],
|
82 |
+
"Medieval": ["middle ages", "medieval", "feudal", "crusades", "byzantine", "carolingian", "holy roman empire", "dark ages", "castle", "knights", "chivalry", "monastery", "plague", "viking", "norse"],
|
83 |
+
"Renaissance": ["renaissance", "humanism", "reformation", "counter-reformation", "medici", "tudor", "elizabethan", "shakespeare", "machiavelli", "gutenberg", "printing press"],
|
84 |
+
"Early Modern": ["early modern", "enlightenment", "age of reason", "scientific revolution", "colonial", "colonization", "imperialism", "revolution", "baroque", "bourbon", "habsburg", "stuart"],
|
85 |
+
"18th Century": ["18th century", "1700s", "revolution", "american revolution", "french revolution", "enlightenment", "rococo", "neoclassical", "voltaire", "rousseau", "industrial"],
|
86 |
+
"19th Century": ["19th century", "1800s", "victorian", "romantic", "napoleonic", "civil war", "industrial revolution", "manifest destiny", "colonial", "imperialism", "belle epoque", "fin de siecle"],
|
87 |
+
"20th Century": ["20th century", "1900s", "world war", "great depression", "cold war", "interwar", "postwar", "modernism", "atomic", "post-colonial", "totalitarian", "fascism", "soviet", "civil rights"],
|
88 |
+
"Contemporary": ["contemporary", "modern", "postmodern", "digital age", "globalization", "information age", "post-industrial", "post-colonial", "post-soviet", "post-war", "21st century"],
|
89 |
+
|
90 |
+
# Geographic Contexts
|
91 |
+
"European History": ["europe", "western europe", "eastern europe", "central europe", "mediterranean", "nordic", "iberian", "british", "habsburg", "bourbon", "prussia", "holy roman empire"],
|
92 |
+
"Asian History": ["asia", "east asia", "south asia", "central asia", "southeast asia", "china", "japan", "india", "persia", "ottoman", "mongolian", "dynasty", "shogunate", "mughal", "silk road"],
|
93 |
+
"African History": ["africa", "north africa", "west africa", "east africa", "sub-saharan", "sahel", "swahili", "maghreb", "nubian", "ethiopian", "zulu", "colonial africa", "apartheid"],
|
94 |
+
"American History": ["america", "colonial america", "revolutionary", "antebellum", "civil war", "reconstruction", "frontier", "westward expansion", "manifest destiny", "native american", "indigenous"],
|
95 |
+
"Latin American": ["latin america", "mesoamerica", "caribbean", "aztec", "mayan", "inca", "colonial", "viceroyalty", "independence", "revolution", "hispanic", "creole", "mestizo", "indigenous"],
|
96 |
+
"Oceanic History": ["oceania", "pacific", "australian", "aboriginal", "indigenous", "polynesian", "melanesian", "micronesian", "maori", "maritime", "exploration", "settlement", "colonial"],
|
97 |
+
|
98 |
+
# Historical Methodologies & Approaches
|
99 |
+
"Archival Research": ["archive", "manuscript", "primary source", "provenance", "document", "preservation", "cataloging", "repository", "collection", "papers", "fonds", "records", "registry"],
|
100 |
+
"Oral History": ["oral history", "testimony", "interview", "narrative", "memory", "ethnography", "storytelling", "tradition", "folklore", "witness", "account", "recording", "indigenous knowledge"],
|
101 |
+
"Historical Archaeology": ["archaeology", "excavation", "artifact", "material culture", "stratigraphy", "conservation", "field work", "site", "ruins", "preservation", "heritage", "restoration"],
|
102 |
+
"Digital History": ["digital", "database", "digitization", "computational", "network analysis", "gis", "mapping", "visualization", "data mining", "text analysis", "digital humanities", "encoding"],
|
103 |
+
"Historiography": ["historiography", "revisionism", "interpretation", "narrative", "discourse", "bias", "perspective", "theory", "methodology", "framework", "historical thinking", "meta-history"],
|
104 |
+
|
105 |
+
# Historical Document Types
|
106 |
+
"Administrative Records": ["record", "registry", "account", "ledger", "census", "tax roll", "inventory", "charter", "deed", "grant", "patent", "minutes", "docket", "survey", "assessment", "register"],
|
107 |
+
"Diplomatic Documents": ["treaty", "agreement", "proclamation", "declaration", "diplomatic", "embassy", "consul", "dispatch", "communique", "protocol", "convention", "alliance", "international"],
|
108 |
+
"Personal Papers": ["diary", "journal", "memoir", "autobiography", "correspondence", "letter", "personal", "private", "papers", "notes", "scrapbook", "commonplace book", "sketchbook"],
|
109 |
+
"Media History": ["newspaper", "gazette", "periodical", "pamphlet", "broadside", "print culture", "press", "editorial", "journalism", "reporter", "editor", "circulation", "readership", "subscriber"],
|
110 |
+
"Visual Materials": ["photograph", "illustration", "print", "map", "atlas", "cartography", "engraving", "woodcut", "lithograph", "panorama", "portrait", "landscape", "sketch", "drawing", "plate"],
|
111 |
+
"Legal Documents": ["legal", "law", "statute", "code", "constitution", "legislation", "decree", "ordinance", "bylaw", "regulation", "case", "trial", "testimony", "deposition", "verdict", "judgment"],
|
112 |
+
|
113 |
+
# Historical Themes & Movements
|
114 |
+
"Economic History": ["economic", "commerce", "trade", "market", "merchant", "finance", "banking", "currency", "coin", "inflation", "recession", "depression", "exchange", "capital", "labor", "guild"],
|
115 |
+
"Social History": ["social", "society", "class", "status", "hierarchy", "everyday life", "community", "neighborhood", "urban", "rural", "poverty", "wealth", "leisure", "entertainment", "customs"],
|
116 |
+
"Political History": ["political", "politics", "government", "state", "monarchy", "republic", "democracy", "aristocracy", "parliament", "congress", "election", "regime", "policy", "reform", "revolution"],
|
117 |
+
"Intellectual History": ["intellectual", "idea", "philosophy", "theory", "concept", "movement", "thought", "discourse", "debate", "enlightenment", "rationalism", "empiricism", "ideology"],
|
118 |
+
"Cultural History": ["cultural", "culture", "custom", "tradition", "ritual", "ceremony", "festival", "celebration", "holiday", "folklore", "music", "art", "literature", "fashion", "consumption"],
|
119 |
+
"Religious History": ["religious", "religion", "church", "theology", "belief", "faith", "worship", "ritual", "sacred", "clergy", "monastery", "temple", "mosque", "synagogue", "pilgrimage", "sect"],
|
120 |
+
"Military History": ["military", "war", "conflict", "battle", "campaign", "siege", "army", "navy", "soldier", "officer", "regiment", "battalion", "artillery", "cavalry", "infantry", "strategy", "tactics"],
|
121 |
+
"Science History": ["scientific", "science", "experiment", "discovery", "theory", "hypothesis", "observation", "laboratory", "academy", "research", "natural philosophy", "medicine", "technology"],
|
122 |
+
"Environmental History": ["environmental", "ecology", "climate", "weather", "landscape", "agriculture", "farming", "forestry", "conservation", "pollution", "resource", "sustainability", "natural"],
|
123 |
+
|
124 |
+
# Specialized Historical Topics
|
125 |
+
"Migration History": ["migration", "immigration", "emigration", "diaspora", "exile", "refugee", "settlement", "colonization", "population movement", "forced migration", "displacement", "resettlement"],
|
126 |
+
"Maritime History": ["maritime", "naval", "shipping", "navigation", "sailor", "piracy", "privateering", "admiralty", "port", "harbor", "shipyard", "vessel", "sail", "trade route", "exploration"],
|
127 |
+
"Gender History": ["gender", "women", "feminist", "sexuality", "masculinity", "femininity", "patriarchy", "suffrage", "domestic", "family", "marriage", "emancipation", "rights", "equality"],
|
128 |
+
"Labor History": ["labor", "worker", "union", "strike", "apprentice", "guild", "factory", "workshop", "wage", "hours", "working conditions", "industrialization", "mechanization", "automation"],
|
129 |
+
"Urban History": ["urban", "city", "town", "metropolitan", "municipal", "civic", "suburb", "neighborhood", "planning", "infrastructure", "utilities", "housing", "development", "gentrification"],
|
130 |
+
"Rural History": ["rural", "countryside", "village", "agricultural", "farming", "peasant", "yeoman", "tenant", "sharecropper", "enclosure", "common land", "manor", "estate", "plantation"],
|
131 |
+
"Colonial History": ["colonial", "colony", "settlement", "frontier", "borderland", "territory", "dominion", "province", "governance", "administration", "native", "indigenous", "contact zone"],
|
132 |
+
"Indigenous History": ["indigenous", "native", "aboriginal", "first nations", "tribal", "reservation", "sovereignty", "land rights", "treaty rights", "cultural preservation", "oral tradition"],
|
133 |
+
|
134 |
+
# General Historical Terms
|
135 |
+
"Historical": ["history", "historical", "historiography", "heritage", "legacy", "tradition", "memory", "commemoration", "preservation", "conservation", "restoration", "interpretation", "significance"],
|
136 |
+
"Chronology": ["chronology", "timeline", "periodization", "era", "epoch", "age", "century", "decade", "millennium", "year", "date", "dating", "chronological", "contemporary", "synchronic", "diachronic"],
|
137 |
+
"Heritage": ["heritage", "preservation", "conservation", "landmark", "monument", "historic site", "museum", "archive", "collection", "artifact", "relic", "antiquity", "cultural heritage", "patrimony"]
|
138 |
}
|
139 |
|
140 |
# Period tags based on year ranges
|
image_segmentation.py
ADDED
@@ -0,0 +1,246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Image segmentation utility for OCR preprocessing.
|
3 |
+
Separates text regions from image regions to improve OCR accuracy on mixed-content documents.
|
4 |
+
Based on Mistral AI cookbook examples.
|
5 |
+
"""
|
6 |
+
|
7 |
+
import cv2
|
8 |
+
import numpy as np
|
9 |
+
from PIL import Image
|
10 |
+
import io
|
11 |
+
import base64
|
12 |
+
import logging
|
13 |
+
from pathlib import Path
|
14 |
+
from typing import Tuple, List, Dict, Union, Optional
|
15 |
+
|
16 |
+
# Configure logging
|
17 |
+
logging.basicConfig(level=logging.INFO,
|
18 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
19 |
+
logger = logging.getLogger(__name__)
|
20 |
+
|
21 |
+
def segment_image_for_ocr(image_path: Union[str, Path]) -> Dict[str, Union[Image.Image, str]]:
    """
    Segment an image into text and image regions for improved OCR processing.

    Heuristic pipeline: adaptive thresholding -> horizontal dilation to join
    characters into text lines -> contour filtering by area/aspect-ratio/ink
    density -> separate enhancement of text vs. image regions.

    Args:
        image_path: Path to the image file (str or Path).

    Returns:
        Dict containing:
        - 'text_regions': PIL Image with highlighted text regions
        - 'image_regions': PIL Image with highlighted image regions
        - 'text_mask_base64': data-URL (base64 PNG) of the text mask for visualization
        - 'combined_result': PIL Image with combined processing approach
        - 'text_regions_coordinates': list of (x, y, w, h) bounding boxes
        On any failure, all image values are None and the coordinate list is empty
        (errors are logged, never raised).
    """
    # Convert to Path object if string
    image_file = Path(image_path) if isinstance(image_path, str) else image_path

    # Log start of processing
    logger.info(f"Segmenting image for OCR: {image_file.name}")

    try:
        # Open original image with PIL for compatibility
        with Image.open(image_file) as pil_img:
            # Convert to RGB if not already
            if pil_img.mode != 'RGB':
                pil_img = pil_img.convert('RGB')

            # Convert PIL image to OpenCV format.
            # NOTE(review): despite the name, img_rgb holds BGR channel order
            # (OpenCV convention) from here on; all cv2 calls assume BGR.
            img = np.array(pil_img)
            img_rgb = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

            # Create grayscale version for text detection
            gray = cv2.cvtColor(img_rgb, cv2.COLOR_BGR2GRAY)

            # Step 1: Apply adaptive thresholding to identify potential text areas
            # This works well for printed text against contrasting backgrounds
            # (THRESH_BINARY_INV makes ink white on black; 11x11 window, C=2)
            binary = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                           cv2.THRESH_BINARY_INV, 11, 2)

            # Step 2: Perform morphological operations to connect text components
            # Create a rectangular kernel that's wider than tall (for text lines)
            rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 3))
            dilation = cv2.dilate(binary, rect_kernel, iterations=3)

            # Step 3: Find contours which will correspond to text blocks
            contours, _ = cv2.findContours(dilation, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            # Prepare masks to separate text and image regions
            text_mask = np.zeros_like(gray)

            # Step 4: Filter contours based on size to identify text regions
            min_area = 100  # Minimum contour area to be considered text
            max_area = img.shape[0] * img.shape[1] * 0.5  # Max 50% of image

            text_regions = []
            for contour in contours:
                area = cv2.contourArea(contour)
                # Filter by area to avoid noise
                if min_area < area < max_area:
                    # Get the bounding rectangle
                    x, y, w, h = cv2.boundingRect(contour)

                    # Calculate aspect ratio - text regions typically have wider aspect ratio
                    aspect_ratio = w / h

                    # Calculate density of dark pixels in the region (text is typically dense)
                    # binary is inverted, so "ink" pixels are > 0 here
                    roi = binary[y:y+h, x:x+w]
                    dark_pixel_density = np.sum(roi > 0) / (w * h)

                    # Additional check for text-like characteristics
                    # Text typically has aspect ratio > 1 (wider than tall) and reasonable density;
                    # very tall-narrow shapes (< 0.5) are also kept (e.g. vertical captions)
                    if (aspect_ratio > 1.5 or aspect_ratio < 0.5) and dark_pixel_density > 0.2:
                        # Add to text regions list
                        text_regions.append((x, y, w, h))
                        # Add to text mask (filled rectangle, value 255)
                        cv2.rectangle(text_mask, (x, y), (x+w, y+h), 255, -1)

            # Step 5: Create visualization for debugging (green boxes = text)
            text_regions_vis = img_rgb.copy()
            for x, y, w, h in text_regions:
                cv2.rectangle(text_regions_vis, (x, y), (x+w, y+h), (0, 255, 0), 2)

            # Create image regions mask (inverse of text mask)
            image_mask = cv2.bitwise_not(text_mask)

            # Create image regions visualization (red boxes = pictures)
            image_regions_vis = img_rgb.copy()
            # Add detected image regions in red
            for contour in contours:
                area = cv2.contourArea(contour)
                if area > max_area * 0.1:  # Only highlight larger image regions
                    x, y, w, h = cv2.boundingRect(contour)
                    # Mean mask value < 128 means less than ~half the box is
                    # covered by text mask (mask pixels are 0 or 255)
                    if np.sum(text_mask[y:y+h, x:x+w]) / (w * h) < 128:  # Not significantly overlapping with text
                        cv2.rectangle(image_regions_vis, (x, y), (x+w, y+h), (0, 0, 255), 2)

            # Step 6: Create a combined result that enhances text regions
            # Different processing for text vs. image regions
            combined_result = img_rgb.copy()

            # Apply more aggressive contrast enhancement to text regions
            text_enhanced = cv2.bitwise_and(img_rgb, img_rgb, mask=text_mask)
            # Convert to LAB for better contrast enhancement
            text_lab = cv2.cvtColor(text_enhanced, cv2.COLOR_BGR2LAB)
            l, a, b = cv2.split(text_lab)
            # Apply CLAHE to L channel (local contrast boost for faded text)
            clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
            cl = clahe.apply(l)
            # Merge back
            enhanced_lab = cv2.merge((cl, a, b))
            text_enhanced = cv2.cvtColor(enhanced_lab, cv2.COLOR_LAB2BGR)

            # Apply gentler processing to image regions
            image_enhanced = cv2.bitwise_and(img_rgb, img_rgb, mask=image_mask)
            # Just slight sharpening for image regions (unsharp-mask style blend)
            image_enhanced = cv2.GaussianBlur(image_enhanced, (0, 0), 3)
            image_enhanced = cv2.addWeighted(img_rgb, 1.5, image_enhanced, -0.5, 0)
            image_enhanced = cv2.bitwise_and(image_enhanced, image_enhanced, mask=image_mask)

            # Combine the enhanced regions (masks are complementary, so this
            # stitches the two enhanced layers back into one frame)
            combined_result = cv2.add(text_enhanced, image_enhanced)

            # Convert visualization results back to PIL Images (BGR -> RGB)
            text_regions_pil = Image.fromarray(cv2.cvtColor(text_regions_vis, cv2.COLOR_BGR2RGB))
            image_regions_pil = Image.fromarray(cv2.cvtColor(image_regions_vis, cv2.COLOR_BGR2RGB))
            combined_result_pil = Image.fromarray(cv2.cvtColor(combined_result, cv2.COLOR_BGR2RGB))

            # Create base64 representation of text mask for visualization
            _, buffer = cv2.imencode('.png', text_mask)
            text_mask_base64 = base64.b64encode(buffer).decode('utf-8')

            # Return the segmentation results
            return {
                'text_regions': text_regions_pil,
                'image_regions': image_regions_pil,
                'text_mask_base64': f"data:image/png;base64,{text_mask_base64}",
                'combined_result': combined_result_pil,
                'text_regions_coordinates': text_regions
            }

    except Exception as e:
        logger.error(f"Error segmenting image {image_file.name}: {str(e)}")
        # Return None values if processing fails
        return {
            'text_regions': None,
            'image_regions': None,
            'text_mask_base64': None,
            'combined_result': None,
            'text_regions_coordinates': []
        }
|
170 |
+
|
171 |
+
def process_segmented_image(image_path: Union[str, Path], output_dir: Optional[Path] = None) -> Dict:
    """
    Process an image using segmentation for improved OCR, saving visualization outputs.

    Args:
        image_path: Path to the image file (str or Path).
        output_dir: Optional directory to save visualization outputs.
            A str is also accepted; defaults to ``output/segmentation``.
            The directory is created if it does not exist.

    Returns:
        Dictionary with:
        - 'original_image': str path of the input image
        - 'output_files': dict mapping output kind -> saved file path
        - 'text_regions_count': number of detected text regions (0 on failure)
        - 'text_regions_coordinates': list of (x, y, w, h) boxes ([] on failure)
    """
    # Convert to Path object if string
    image_file = Path(image_path) if isinstance(image_path, str) else image_path

    # Resolve the output directory; accept str as well as Path
    if output_dir is None:
        output_dir = Path("output") / "segmentation"
    else:
        output_dir = Path(output_dir)
    # Always ensure the directory exists, including when the caller supplied it
    output_dir.mkdir(parents=True, exist_ok=True)

    # Process the image with segmentation
    segmentation_results = segment_image_for_ocr(image_file)

    # Prepare results dictionary with stable default keys so callers see a
    # consistent shape whether or not segmentation succeeded
    results = {
        'original_image': str(image_file),
        'output_files': {},
        'text_regions_count': 0,
        'text_regions_coordinates': []
    }

    # Save visualization outputs if segmentation was successful
    if segmentation_results['text_regions'] is not None:
        # Save text regions visualization
        text_regions_path = output_dir / f"{image_file.stem}_text_regions.jpg"
        segmentation_results['text_regions'].save(text_regions_path)
        results['output_files']['text_regions'] = str(text_regions_path)

        # Save image regions visualization
        image_regions_path = output_dir / f"{image_file.stem}_image_regions.jpg"
        segmentation_results['image_regions'].save(image_regions_path)
        results['output_files']['image_regions'] = str(image_regions_path)

        # Save combined result
        combined_path = output_dir / f"{image_file.stem}_combined.jpg"
        segmentation_results['combined_result'].save(combined_path)
        results['output_files']['combined_result'] = str(combined_path)

        # Save text mask visualization, decoding it from the data-URL string
        text_mask_path = output_dir / f"{image_file.stem}_text_mask.png"
        if segmentation_results['text_mask_base64']:
            # Strip the "data:image/png;base64," prefix before decoding
            base64_data = segmentation_results['text_mask_base64'].split(',')[1]
            with open(text_mask_path, 'wb') as f:
                f.write(base64.b64decode(base64_data))
            results['output_files']['text_mask'] = str(text_mask_path)

        # Record detected text regions
        results['text_regions_count'] = len(segmentation_results['text_regions_coordinates'])
        results['text_regions_coordinates'] = segmentation_results['text_regions_coordinates']

    return results
|
230 |
+
|
231 |
+
if __name__ == "__main__":
    # Ad-hoc smoke test: segment a sample image when executed as a script.
    import sys

    # The first CLI argument wins; otherwise fall back to the magician sample.
    cli_args = sys.argv[1:]
    if cli_args:
        image_path = cli_args[0]
    else:
        # Default to testing with the magician image
        image_path = "input/magician-or-bottle-cungerer.jpg"

    logger.info(f"Testing image segmentation on {image_path}")
    results = process_segmented_image(image_path)

    # Summarize what was produced
    logger.info(f"Segmentation complete. Found {results.get('text_regions_count', 0)} text regions.")
    logger.info(f"Output files saved to: {[path for path in results.get('output_files', {}).values()]}")
|
language_detection.py
ADDED
@@ -0,0 +1,374 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Standard library imports
|
2 |
+
import logging
|
3 |
+
import re
|
4 |
+
from typing import List, Dict, Set, Tuple, Optional, Union, Any
|
5 |
+
from functools import lru_cache
|
6 |
+
|
7 |
+
# Configure logging
|
8 |
+
logging.basicConfig(level=logging.INFO,
|
9 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
10 |
+
logger = logging.getLogger(__name__)
|
11 |
+
|
12 |
+
class LanguageDetector:
|
13 |
+
"""
|
14 |
+
A language detection system that provides balanced detection across multiple languages
|
15 |
+
using an enhanced statistical approach.
|
16 |
+
"""
|
17 |
+
|
18 |
+
def __init__(self):
|
19 |
+
"""Initialize the language detector with statistical language models"""
|
20 |
+
logger.info("Initializing language detector with statistical models")
|
21 |
+
|
22 |
+
# Initialize language indicators dictionary for statistical detection
|
23 |
+
self._init_language_indicators()
|
24 |
+
# Set thresholds for language detection confidence
|
25 |
+
self.single_lang_confidence = 65 # Minimum score to consider a language detected
|
26 |
+
self.secondary_lang_threshold = 0.75 # Secondary language must be at least this fraction of primary score
|
27 |
+
|
28 |
+
def _init_language_indicators(self):
|
29 |
+
"""Initialize language indicators for statistical detection with historical markers"""
|
30 |
+
# Define indicators for all supported languages with equal detail level
|
31 |
+
# Each language has:
|
32 |
+
# - Distinctive characters
|
33 |
+
# - Common words (including historical forms)
|
34 |
+
# - N-grams (character sequences)
|
35 |
+
# - Historical markers specific to older forms of the language
|
36 |
+
self.language_indicators = {
|
37 |
+
"English": {
|
38 |
+
"chars": [], # English uses basic Latin alphabet without special chars
|
39 |
+
"words": ['the', 'and', 'of', 'to', 'in', 'a', 'is', 'that', 'for', 'it',
|
40 |
+
'with', 'as', 'be', 'on', 'by', 'at', 'this', 'have', 'from', 'or',
|
41 |
+
'an', 'but', 'not', 'what', 'all', 'were', 'when', 'we', 'there', 'can',
|
42 |
+
'would', 'who', 'you', 'been', 'one', 'their', 'has', 'more', 'if', 'no'],
|
43 |
+
"ngrams": ['th', 'he', 'in', 'er', 'an', 're', 'on', 'at', 'en', 'nd', 'ti', 'es', 'or',
|
44 |
+
'ing', 'tion', 'the', 'and', 'tha', 'ent', 'ion'],
|
45 |
+
"historical": {
|
46 |
+
"chars": ['þ', 'ȝ', 'æ', 'ſ'], # Thorn, yogh, ash, long s
|
47 |
+
"words": ['thou', 'thee', 'thy', 'thine', 'hath', 'doth', 'ere', 'whilom', 'betwixt',
|
48 |
+
'ye', 'art', 'wast', 'dost', 'hast', 'shalt', 'mayst', 'verily'],
|
49 |
+
"patterns": ['eth$', '^y[^a-z]', 'ck$', 'aught', 'ought'] # -eth endings, y- prefixes
|
50 |
+
}
|
51 |
+
},
|
52 |
+
"French": {
|
53 |
+
"chars": ['é', 'è', 'ê', 'à', 'ç', 'ù', 'â', 'î', 'ô', 'û', 'ë', 'ï', 'ü'],
|
54 |
+
"words": ['le', 'la', 'les', 'et', 'en', 'de', 'du', 'des', 'un', 'une', 'ce', 'cette',
|
55 |
+
'ces', 'dans', 'par', 'pour', 'sur', 'qui', 'que', 'quoi', 'où', 'quand', 'comment',
|
56 |
+
'est', 'sont', 'ont', 'nous', 'vous', 'ils', 'elles', 'avec', 'sans', 'mais', 'ou'],
|
57 |
+
"ngrams": ['es', 'le', 'de', 'en', 'on', 'nt', 'qu', 'ai', 'an', 'ou', 'ur', 're', 'me',
|
58 |
+
'les', 'ent', 'que', 'des', 'ons', 'ant', 'ion'],
|
59 |
+
"historical": {
|
60 |
+
"chars": ['ſ', 'æ', 'œ'], # Long s and ligatures
|
61 |
+
"words": ['aultre', 'avecq', 'icelluy', 'oncques', 'moult', 'estre', 'mesme', 'ceste',
|
62 |
+
'ledict', 'celuy', 'ceulx', 'aulcun', 'ainſi', 'touſiours', 'eſtre',
|
63 |
+
'eſt', 'meſme', 'felon', 'auec', 'iufques', 'chofe', 'fcience'],
|
64 |
+
"patterns": ['oi[ts]$', 'oi[re]$', 'f[^aeiou]', 'ff', 'ſ', 'auoit', 'eſtoit',
|
65 |
+
'ſi', 'ſur', 'ſa', 'cy', 'ayant', 'oy', 'uſ', 'auſ']
|
66 |
+
},
|
67 |
+
"exclusivity": 2.0 # French indicators have higher weight in historical text detection
|
68 |
+
},
|
69 |
+
"German": {
|
70 |
+
"chars": ['ä', 'ö', 'ü', 'ß'],
|
71 |
+
"words": ['der', 'die', 'das', 'und', 'in', 'zu', 'den', 'ein', 'eine', 'mit', 'ist', 'von',
|
72 |
+
'des', 'sich', 'auf', 'für', 'als', 'auch', 'werden', 'bei', 'durch', 'aus', 'sind',
|
73 |
+
'nicht', 'nur', 'wurde', 'wie', 'wenn', 'aber', 'noch', 'nach', 'so', 'sein', 'über'],
|
74 |
+
"ngrams": ['en', 'er', 'ch', 'de', 'ei', 'in', 'te', 'nd', 'ie', 'ge', 'un', 'sch', 'ich',
|
75 |
+
'den', 'die', 'und', 'der', 'ein', 'ung', 'cht'],
|
76 |
+
"historical": {
|
77 |
+
"chars": ['ſ', 'ů', 'ė', 'ÿ'],
|
78 |
+
"words": ['vnnd', 'vnnd', 'vnter', 'vnd', 'seyn', 'thun', 'auff', 'auß', 'deß', 'diß'],
|
79 |
+
"patterns": ['^v[nd]', 'th', 'vnter', 'ſch']
|
80 |
+
}
|
81 |
+
},
|
82 |
+
"Spanish": {
|
83 |
+
"chars": ['á', 'é', 'í', 'ó', 'ú', 'ñ', 'ü', '¿', '¡'],
|
84 |
+
"words": ['el', 'la', 'los', 'las', 'de', 'en', 'y', 'a', 'que', 'por', 'un', 'una', 'no',
|
85 |
+
'es', 'con', 'para', 'su', 'al', 'se', 'del', 'como', 'más', 'pero', 'lo', 'mi',
|
86 |
+
'si', 'ya', 'todo', 'esta', 'cuando', 'hay', 'muy', 'bien', 'sin', 'así'],
|
87 |
+
"ngrams": ['de', 'en', 'os', 'es', 'la', 'ar', 'el', 'er', 'ra', 'as', 'an', 'do', 'or',
|
88 |
+
'que', 'nte', 'los', 'ado', 'con', 'ent', 'ien'],
|
89 |
+
"historical": {
|
90 |
+
"chars": ['ſ', 'ç', 'ñ'],
|
91 |
+
"words": ['facer', 'fijo', 'fermoso', 'agora', 'asaz', 'aver', 'caſa', 'deſde', 'eſte',
|
92 |
+
'eſta', 'eſto', 'deſto', 'deſta', 'eſſo', 'muger', 'dixo', 'fazer'],
|
93 |
+
"patterns": ['^f[aei]', 'ſſ', 'ſc', '^deſ', 'xo$', 'xe$']
|
94 |
+
},
|
95 |
+
},
|
96 |
+
"Italian": {
|
97 |
+
"chars": ['à', 'è', 'é', 'ì', 'í', 'ò', 'ó', 'ù', 'ú'],
|
98 |
+
"words": ['il', 'la', 'i', 'le', 'e', 'di', 'a', 'in', 'che', 'non', 'per', 'con', 'un',
|
99 |
+
'una', 'del', 'della', 'è', 'sono', 'da', 'si', 'come', 'anche', 'più', 'ma', 'ci',
|
100 |
+
'se', 'ha', 'mi', 'lo', 'ti', 'al', 'tu', 'questo', 'questi'],
|
101 |
+
"ngrams": ['di', 'la', 'er', 'to', 're', 'co', 'de', 'in', 'ra', 'on', 'li', 'no', 'ri',
|
102 |
+
'che', 'ent', 'con', 'per', 'ion', 'ato', 'lla']
|
103 |
+
},
|
104 |
+
"Portuguese": {
|
105 |
+
"chars": ['á', 'â', 'ã', 'à', 'é', 'ê', 'í', 'ó', 'ô', 'õ', 'ú', 'ç'],
|
106 |
+
"words": ['o', 'a', 'os', 'as', 'de', 'em', 'e', 'do', 'da', 'dos', 'das', 'no', 'na',
|
107 |
+
'para', 'que', 'um', 'uma', 'por', 'com', 'se', 'não', 'mais', 'como', 'mas',
|
108 |
+
'você', 'eu', 'este', 'isso', 'ele', 'seu', 'sua', 'ou', 'já', 'me'],
|
109 |
+
"ngrams": ['de', 'os', 'em', 'ar', 'es', 'ra', 'do', 'da', 'en', 'co', 'nt', 'ad', 'to',
|
110 |
+
'que', 'nto', 'ent', 'com', 'ção', 'ado', 'ment']
|
111 |
+
},
|
112 |
+
"Dutch": {
|
113 |
+
"chars": ['ë', 'ï', 'ö', 'ü', 'é', 'è', 'ê', 'ç', 'á', 'à', 'ä', 'ó', 'ô', 'ú', 'ù', 'û', 'ij'],
|
114 |
+
"words": ['de', 'het', 'een', 'en', 'van', 'in', 'is', 'dat', 'op', 'te', 'zijn', 'met',
|
115 |
+
'voor', 'niet', 'aan', 'er', 'die', 'maar', 'dan', 'ik', 'je', 'hij', 'zij', 'we',
|
116 |
+
'kunnen', 'wordt', 'nog', 'door', 'over', 'als', 'uit', 'bij', 'om', 'ook'],
|
117 |
+
"ngrams": ['en', 'de', 'er', 'ee', 'ge', 'an', 'aa', 'in', 'te', 'et', 'ng', 'ee', 'or',
|
118 |
+
'van', 'het', 'een', 'ing', 'ver', 'den', 'sch']
|
119 |
+
},
|
120 |
+
"Russian": {
|
121 |
+
# Russian (Cyrillic alphabet) characters
|
122 |
+
"chars": ['а', 'б', 'в', 'г', 'д', 'е', 'ё', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п',
|
123 |
+
'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я'],
|
124 |
+
"words": ['и', 'в', 'не', 'на', 'что', 'я', 'с', 'а', 'то', 'он', 'как', 'этот', 'по',
|
125 |
+
'но', 'из', 'к', 'у', 'за', 'вы', 'все', 'так', 'же', 'от', 'для', 'о', 'его',
|
126 |
+
'мы', 'было', 'она', 'бы', 'мне', 'еще', 'есть', 'быть', 'был'],
|
127 |
+
"ngrams": ['о', 'е', 'а', 'н', 'и', 'т', 'р', 'с', 'в', 'л', 'к', 'м', 'д',
|
128 |
+
'ст', 'но', 'то', 'ни', 'на', 'по', 'ет']
|
129 |
+
},
|
130 |
+
"Chinese": {
|
131 |
+
"chars": ['的', '是', '不', '了', '在', '和', '有', '我', '们', '人', '这', '上', '中',
|
132 |
+
'个', '大', '来', '到', '国', '时', '要', '地', '出', '会', '可', '也', '就',
|
133 |
+
'年', '生', '对', '能', '自', '那', '都', '得', '说', '过', '子', '家', '后', '多'],
|
134 |
+
# Chinese doesn't have "words" in the same way as alphabetic languages
|
135 |
+
"words": ['的', '是', '不', '了', '在', '和', '有', '我', '们', '人', '这', '上', '中',
|
136 |
+
'个', '大', '来', '到', '国', '时', '要', '地', '出', '会', '可', '也', '就'],
|
137 |
+
"ngrams": ['的', '是', '不', '了', '在', '我', '有', '和', '人', '这', '中', '大', '来', '上',
|
138 |
+
'国', '个', '到', '说', '们', '为']
|
139 |
+
},
|
140 |
+
"Japanese": {
|
141 |
+
# A mix of hiragana, katakana, and common kanji
|
142 |
+
"chars": ['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し', 'す', 'せ', 'そ',
|
143 |
+
'ア', 'イ', 'ウ', 'エ', 'オ', 'カ', 'キ', 'ク', 'ケ', 'コ', 'サ', 'シ', 'ス', 'セ', 'ソ',
|
144 |
+
'日', '本', '人', '大', '小', '中', '山', '川', '田', '子', '女', '男', '月', '火', '水'],
|
145 |
+
"words": ['は', 'を', 'に', 'の', 'が', 'で', 'へ', 'から', 'より', 'まで', 'だ', 'です', 'した',
|
146 |
+
'ます', 'ません', 'です', 'これ', 'それ', 'あれ', 'この', 'その', 'あの', 'わたし'],
|
147 |
+
"ngrams": ['の', 'は', 'た', 'が', 'を', 'に', 'て', 'で', 'と', 'し', 'か', 'ま', 'こ', 'い',
|
148 |
+
'する', 'いる', 'れる', 'なる', 'れて', 'した']
|
149 |
+
},
|
150 |
+
"Korean": {
|
151 |
+
"chars": ['가', '나', '다', '라', '마', '바', '사', '아', '자', '차', '카', '타', '파', '하',
|
152 |
+
'그', '는', '을', '이', '에', '에서', '로', '으로', '와', '과', '또는', '하지만'],
|
153 |
+
"words": ['이', '그', '저', '나', '너', '우리', '그들', '이것', '그것', '저것', '은', '는',
|
154 |
+
'이', '가', '을', '를', '에', '에서', '으로', '로', '와', '과', '의', '하다', '되다'],
|
155 |
+
"ngrams": ['이', '다', '는', '에', '하', '고', '지', '서', '의', '가', '을', '로', '을', '으',
|
156 |
+
'니다', '습니', '하는', '이다', '에서', '하고']
|
157 |
+
},
|
158 |
+
"Arabic": {
|
159 |
+
"chars": ['ا', 'ب', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض',
|
160 |
+
'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ك', 'ل', 'م', 'ن', 'ه', 'و', 'ي', 'ء', 'ة', 'ى'],
|
161 |
+
"words": ['في', 'من', 'على', 'إلى', 'هذا', 'هذه', 'ذلك', 'تلك', 'هو', 'هي', 'هم', 'أنا',
|
162 |
+
'أنت', 'نحن', 'كان', 'كانت', 'يكون', 'لا', 'لم', 'ما', 'أن', 'و', 'أو', 'ثم', 'بعد'],
|
163 |
+
"ngrams": ['ال', 'ان', 'في', 'من', 'ون', 'ين', 'ات', 'ار', 'ور', 'ما', 'لا', 'ها', 'ان',
|
164 |
+
'الم', 'لان', 'علا', 'الح', 'الس', 'الع', 'الت']
|
165 |
+
},
|
166 |
+
"Hindi": {
|
167 |
+
"chars": ['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ',
|
168 |
+
'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न',
|
169 |
+
'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', 'ा', 'ि', 'ी',
|
170 |
+
'ु', 'ू', 'े', 'ै', 'ो', 'ौ', '्', 'ं', 'ः'],
|
171 |
+
"words": ['और', 'का', 'के', 'की', 'एक', 'में', 'है', 'यह', 'हैं', 'से', 'को', 'पर', 'इस',
|
172 |
+
'हो', 'गया', 'कर', 'मैं', 'या', 'हुआ', 'था', 'वह', 'अपने', 'सकता', 'ने', 'बहुत'],
|
173 |
+
"ngrams": ['का', 'के', 'की', 'है', 'ने', 'से', 'मे', 'को', 'पर', 'हा', 'रा', 'ता', 'या',
|
174 |
+
'ार', 'ान', 'कार', 'राज', 'ारा', 'जाए', 'ेजा']
|
175 |
+
},
|
176 |
+
"Latin": {
|
177 |
+
"chars": [], # Latin uses basic Latin alphabet
|
178 |
+
"words": ['et', 'in', 'ad', 'est', 'sunt', 'non', 'cum', 'sed', 'qui', 'quod', 'ut', 'si',
|
179 |
+
'nec', 'ex', 'per', 'quam', 'pro', 'iam', 'hoc', 'aut', 'esse', 'enim', 'de',
|
180 |
+
'atque', 'ac', 'ante', 'post', 'sub', 'ab'],
|
181 |
+
"ngrams": ['us', 'is', 'um', 'er', 'it', 'nt', 'am', 'em', 're', 'at', 'ti', 'es', 'ur',
|
182 |
+
'tur', 'que', 'ere', 'ent', 'ius', 'rum', 'tus']
|
183 |
+
},
|
184 |
+
"Greek": {
|
185 |
+
"chars": ['α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ', 'μ', 'ν', 'ξ', 'ο', 'π',
|
186 |
+
'ρ', 'σ', 'ς', 'τ', 'υ', 'φ', 'χ', 'ψ', 'ω', 'ά', 'έ', 'ή', 'ί', 'ό', 'ύ', 'ώ'],
|
187 |
+
"words": ['και', 'του', 'της', 'των', 'στο', 'στη', 'με', 'από', 'για', 'είναι', 'να',
|
188 |
+
'ότι', 'δεν', 'στον', 'μια', 'που', 'ένα', 'έχει', 'θα', 'το', 'ο', 'η', 'τον'],
|
189 |
+
"ngrams": ['αι', 'τα', 'ου', 'τη', 'οι', 'το', 'ης', 'αν', 'ος', 'ον', 'ις', 'ει', 'ερ',
|
190 |
+
'και', 'την', 'τον', 'ους', 'νου', 'εντ', 'μεν']
|
191 |
+
}
|
192 |
+
}
|
193 |
+
|
194 |
+
def detect_languages(self, text: str, filename: str = None, current_languages: List[str] = None) -> List[str]:
|
195 |
+
"""
|
196 |
+
Detect languages in text using an enhanced statistical approach
|
197 |
+
|
198 |
+
Args:
|
199 |
+
text: Text to analyze
|
200 |
+
filename: Optional filename to provide additional context
|
201 |
+
current_languages: Optional list of languages already detected
|
202 |
+
|
203 |
+
Returns:
|
204 |
+
List of detected languages
|
205 |
+
"""
|
206 |
+
logger = logging.getLogger("language_detector")
|
207 |
+
|
208 |
+
# If no text provided, return current languages or default
|
209 |
+
if not text or len(text.strip()) < 10:
|
210 |
+
return current_languages if current_languages else ["English"]
|
211 |
+
|
212 |
+
# If we already have detected languages, use them
|
213 |
+
if current_languages and len(current_languages) > 0:
|
214 |
+
logger.info(f"Using already detected languages: {current_languages}")
|
215 |
+
return current_languages
|
216 |
+
|
217 |
+
# Use enhanced statistical detection
|
218 |
+
detected_languages = self._detect_statistically(text, filename)
|
219 |
+
logger.info(f"Statistical language detection results: {detected_languages}")
|
220 |
+
return detected_languages
|
221 |
+
|
222 |
+
def _detect_statistically(self, text: str, filename: str = None) -> List[str]:
|
223 |
+
"""
|
224 |
+
Detect languages using enhanced statistical analysis with historical language indicators
|
225 |
+
|
226 |
+
Args:
|
227 |
+
text: Text to analyze
|
228 |
+
filename: Optional filename for additional context
|
229 |
+
|
230 |
+
Returns:
|
231 |
+
List of detected languages
|
232 |
+
"""
|
233 |
+
logger = logging.getLogger("language_detector")
|
234 |
+
|
235 |
+
# Normalize text to lowercase for consistent analysis
|
236 |
+
text_lower = text.lower()
|
237 |
+
words = re.findall(r'\b\w+\b', text_lower) # Extract words
|
238 |
+
|
239 |
+
# Score each language based on characters, words, n-grams, and historical markers
|
240 |
+
language_scores = {}
|
241 |
+
historical_bonus = {}
|
242 |
+
|
243 |
+
# PHASE 1: Special character analysis
|
244 |
+
# Count special characters for each language
|
245 |
+
special_char_counts = {}
|
246 |
+
total_special_chars = 0
|
247 |
+
|
248 |
+
for language, indicators in self.language_indicators.items():
|
249 |
+
chars = indicators["chars"]
|
250 |
+
count = 0
|
251 |
+
for char in chars:
|
252 |
+
if char in text_lower:
|
253 |
+
count += text_lower.count(char)
|
254 |
+
special_char_counts[language] = count
|
255 |
+
total_special_chars += count
|
256 |
+
|
257 |
+
# Normalize character scores (0-30 points)
|
258 |
+
for language, count in special_char_counts.items():
|
259 |
+
if total_special_chars > 0:
|
260 |
+
# Scale score to 0-30 range (reduced from 35 to make room for historical)
|
261 |
+
normalized_score = (count / total_special_chars) * 30
|
262 |
+
language_scores[language] = normalized_score
|
263 |
+
else:
|
264 |
+
language_scores[language] = 0
|
265 |
+
|
266 |
+
# PHASE 2: Word analysis (0-30 points)
|
267 |
+
# Count common words for each language
|
268 |
+
for language, indicators in self.language_indicators.items():
|
269 |
+
word_list = indicators["words"]
|
270 |
+
word_matches = sum(1 for word in words if word in word_list)
|
271 |
+
|
272 |
+
# Normalize word score based on text length and word list size
|
273 |
+
word_score_factor = min(1.0, word_matches / (len(words) * 0.1)) # Max 1.0 if 10% match
|
274 |
+
language_scores[language] = language_scores.get(language, 0) + (word_score_factor * 30)
|
275 |
+
|
276 |
+
# PHASE 3: N-gram analysis (0-20 points)
|
277 |
+
for language, indicators in self.language_indicators.items():
|
278 |
+
ngram_list = indicators["ngrams"]
|
279 |
+
ngram_matches = 0
|
280 |
+
|
281 |
+
# Count ngram occurrences
|
282 |
+
for ngram in ngram_list:
|
283 |
+
ngram_matches += text_lower.count(ngram)
|
284 |
+
|
285 |
+
# Normalize ngram score based on text length
|
286 |
+
if len(text_lower) > 0:
|
287 |
+
ngram_score_factor = min(1.0, ngram_matches / (len(text_lower) * 0.05)) # Max 1.0 if 5% match
|
288 |
+
language_scores[language] = language_scores.get(language, 0) + (ngram_score_factor * 20)
|
289 |
+
|
290 |
+
# PHASE 4: Historical language markers (0-20 points)
|
291 |
+
for language, indicators in self.language_indicators.items():
|
292 |
+
if "historical" in indicators:
|
293 |
+
historical_indicators = indicators["historical"]
|
294 |
+
historical_score = 0
|
295 |
+
|
296 |
+
# Check for historical chars
|
297 |
+
if "chars" in historical_indicators:
|
298 |
+
for char in historical_indicators["chars"]:
|
299 |
+
if char in text_lower:
|
300 |
+
historical_score += text_lower.count(char) * 0.5
|
301 |
+
|
302 |
+
# Check for historical words
|
303 |
+
if "words" in historical_indicators:
|
304 |
+
hist_words = historical_indicators["words"]
|
305 |
+
hist_word_matches = sum(1 for word in words if word in hist_words)
|
306 |
+
if hist_word_matches > 0:
|
307 |
+
# Historical words are strong indicators
|
308 |
+
historical_score += min(10, hist_word_matches * 2)
|
309 |
+
|
310 |
+
# Check for historical patterns
|
311 |
+
if "patterns" in historical_indicators:
|
312 |
+
for pattern in historical_indicators["patterns"]:
|
313 |
+
matches = len(re.findall(pattern, text_lower))
|
314 |
+
if matches > 0:
|
315 |
+
historical_score += min(5, matches * 0.5)
|
316 |
+
|
317 |
+
# Cap historical score at 20 points
|
318 |
+
historical_score = min(20, historical_score)
|
319 |
+
historical_bonus[language] = historical_score
|
320 |
+
|
321 |
+
# Apply historical bonus
|
322 |
+
language_scores[language] += historical_score
|
323 |
+
|
324 |
+
# Apply language-specific exclusivity multiplier if present
|
325 |
+
if "exclusivity" in indicators:
|
326 |
+
exclusivity = indicators["exclusivity"]
|
327 |
+
language_scores[language] *= exclusivity
|
328 |
+
logger.info(f"Applied exclusivity multiplier {exclusivity} to {language}")
|
329 |
+
|
330 |
+
# Print historical bonus for debugging
|
331 |
+
for language, bonus in historical_bonus.items():
|
332 |
+
if bonus > 0:
|
333 |
+
logger.info(f"Historical language bonus for {language}: {bonus} points")
|
334 |
+
|
335 |
+
# Final language selection with more stringent criteria
|
336 |
+
# Get languages with scores above threshold
|
337 |
+
threshold = self.single_lang_confidence # Higher minimum score
|
338 |
+
candidates = [(lang, score) for lang, score in language_scores.items() if score >= threshold]
|
339 |
+
candidates.sort(key=lambda x: x[1], reverse=True)
|
340 |
+
|
341 |
+
logger.info(f"Language candidates: {candidates}")
|
342 |
+
|
343 |
+
# If we have candidate languages, return top 1-2 with higher threshold for secondary
|
344 |
+
if candidates:
|
345 |
+
# Always take top language
|
346 |
+
result = [candidates[0][0]]
|
347 |
+
|
348 |
+
# Add second language only if it's significantly strong compared to primary
|
349 |
+
# and doesn't have a historical/exclusivity conflict
|
350 |
+
if len(candidates) > 1:
|
351 |
+
primary_lang = candidates[0][0]
|
352 |
+
secondary_lang = candidates[1][0]
|
353 |
+
primary_score = candidates[0][1]
|
354 |
+
secondary_score = candidates[1][1]
|
355 |
+
|
356 |
+
# Only add secondary if it meets threshold and doesn't conflict
|
357 |
+
ratio = secondary_score / primary_score
|
358 |
+
|
359 |
+
# Check for French and Spanish conflict (historical French often gets misidentified)
|
360 |
+
historical_conflict = False
|
361 |
+
if (primary_lang == "French" and secondary_lang == "Spanish" and
|
362 |
+
historical_bonus.get("French", 0) > 5):
|
363 |
+
historical_conflict = True
|
364 |
+
logger.info("Historical French markers detected, suppressing Spanish detection")
|
365 |
+
|
366 |
+
if ratio >= self.secondary_lang_threshold and not historical_conflict:
|
367 |
+
result.append(secondary_lang)
|
368 |
+
logger.info(f"Added secondary language {secondary_lang} (score ratio: {ratio:.2f})")
|
369 |
+
else:
|
370 |
+
logger.info(f"Rejected secondary language {secondary_lang} (score ratio: {ratio:.2f})")
|
371 |
+
|
372 |
+
return result
|
373 |
+
|
374 |
+
# Default to English if no clear signals
|
ocr_processing.py
CHANGED
@@ -20,6 +20,7 @@ from structured_ocr import StructuredOCR
|
|
20 |
from utils import generate_cache_key, timing, format_timestamp, create_descriptive_filename, extract_subject_tags
|
21 |
from preprocessing import apply_preprocessing_to_file
|
22 |
from error_handler import handle_ocr_error, check_file_size
|
|
|
23 |
|
24 |
@st.cache_data(ttl=24*3600, max_entries=20, show_spinner=False)
|
25 |
def process_file_cached(file_path, file_type, use_vision, file_size_mb, cache_key, preprocessing_options_hash=None, custom_prompt=None):
|
@@ -54,7 +55,8 @@ def process_file_cached(file_path, file_type, use_vision, file_size_mb, cache_ke
|
|
54 |
return result
|
55 |
|
56 |
def process_file(uploaded_file, use_vision=True, preprocessing_options=None, progress_reporter=None,
|
57 |
-
pdf_dpi=150, max_pages=3, pdf_rotation=0, custom_prompt=None, perf_mode="Quality"
|
|
|
58 |
"""
|
59 |
Process the uploaded file and return the OCR results
|
60 |
|
@@ -147,6 +149,7 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
|
|
147 |
modified_custom_prompt = custom_prompt
|
148 |
|
149 |
# Add handwritten specific instructions if needed
|
|
|
150 |
if handwritten_document and modified_custom_prompt:
|
151 |
if "handwritten" not in modified_custom_prompt.lower():
|
152 |
modified_custom_prompt += " This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
|
@@ -229,6 +232,41 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
|
|
229 |
if preprocessing_applied:
|
230 |
progress_reporter.update(30, "Applied image preprocessing...")
|
231 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
232 |
# Generate cache key
|
233 |
cache_key = generate_cache_key(
|
234 |
open(temp_path, 'rb').read(),
|
|
|
20 |
from utils import generate_cache_key, timing, format_timestamp, create_descriptive_filename, extract_subject_tags
|
21 |
from preprocessing import apply_preprocessing_to_file
|
22 |
from error_handler import handle_ocr_error, check_file_size
|
23 |
+
from image_segmentation import segment_image_for_ocr, process_segmented_image
|
24 |
|
25 |
@st.cache_data(ttl=24*3600, max_entries=20, show_spinner=False)
|
26 |
def process_file_cached(file_path, file_type, use_vision, file_size_mb, cache_key, preprocessing_options_hash=None, custom_prompt=None):
|
|
|
55 |
return result
|
56 |
|
57 |
def process_file(uploaded_file, use_vision=True, preprocessing_options=None, progress_reporter=None,
|
58 |
+
pdf_dpi=150, max_pages=3, pdf_rotation=0, custom_prompt=None, perf_mode="Quality",
|
59 |
+
use_segmentation=False):
|
60 |
"""
|
61 |
Process the uploaded file and return the OCR results
|
62 |
|
|
|
149 |
modified_custom_prompt = custom_prompt
|
150 |
|
151 |
# Add handwritten specific instructions if needed
|
152 |
+
# Note: Document type influences OCR quality through prompting, even when no preprocessing is applied
|
153 |
if handwritten_document and modified_custom_prompt:
|
154 |
if "handwritten" not in modified_custom_prompt.lower():
|
155 |
modified_custom_prompt += " This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
|
|
|
232 |
if preprocessing_applied:
|
233 |
progress_reporter.update(30, "Applied image preprocessing...")
|
234 |
|
235 |
+
# Apply image segmentation if requested
|
236 |
+
# This is especially helpful for complex documents with mixed text and images
|
237 |
+
if use_segmentation:
|
238 |
+
progress_reporter.update(35, "Applying image segmentation to separate text and image regions...")
|
239 |
+
|
240 |
+
try:
|
241 |
+
# Perform image segmentation
|
242 |
+
segmentation_results = segment_image_for_ocr(temp_path)
|
243 |
+
|
244 |
+
if segmentation_results['combined_result'] is not None:
|
245 |
+
# Save the segmented result to a new temporary file
|
246 |
+
segmented_temp_path = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg').name
|
247 |
+
segmentation_results['combined_result'].save(segmented_temp_path)
|
248 |
+
temp_file_paths.append(segmented_temp_path)
|
249 |
+
|
250 |
+
# Use the segmented image instead of the original
|
251 |
+
temp_path = segmented_temp_path
|
252 |
+
|
253 |
+
# Enhanced prompt based on segmentation results
|
254 |
+
if custom_prompt:
|
255 |
+
# Add segmentation info to existing prompt
|
256 |
+
regions_count = len(segmentation_results.get('text_regions_coordinates', []))
|
257 |
+
custom_prompt += f" The document has been segmented and contains approximately {regions_count} text regions that should be carefully extracted. Please focus on extracting all text from these regions."
|
258 |
+
else:
|
259 |
+
# Create new prompt focused on text extraction from segmented regions
|
260 |
+
regions_count = len(segmentation_results.get('text_regions_coordinates', []))
|
261 |
+
custom_prompt = f"This document has been preprocessed to highlight {regions_count} text regions. Please carefully extract all text from these highlighted regions, preserving the reading order and structure."
|
262 |
+
|
263 |
+
logger.info(f"Image segmentation applied. Found {regions_count} text regions.")
|
264 |
+
progress_reporter.update(40, f"Identified {regions_count} text regions for extraction...")
|
265 |
+
else:
|
266 |
+
logger.warning("Image segmentation produced no result, using original image.")
|
267 |
+
except Exception as seg_error:
|
268 |
+
logger.warning(f"Image segmentation failed: {str(seg_error)}. Continuing with standard processing.")
|
269 |
+
|
270 |
# Generate cache key
|
271 |
cache_key = generate_cache_key(
|
272 |
open(temp_path, 'rb').read(),
|
output/magellan_test_result.json
ADDED
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"file_name": "magellan-travels.jpg",
|
3 |
+
"topics": [
|
4 |
+
"Document"
|
5 |
+
],
|
6 |
+
"languages": [
|
7 |
+
"French"
|
8 |
+
],
|
9 |
+
"ocr_contents": {
|
10 |
+
"raw_text": "\n\nVindy troyfiefme do\u0107lobtre audit an a heure de mynuicl nous fifmes yoile tirant a la volte de Authec que les marimiers de leciant appellent. Cyroe nous engouffant en la mer occeane : paffafines le cap verd et les ifles circonuoy fines de quatorje degre e et clemy. et nauigafmes plufieurs par la cofte de Chinea ou Ethiopia ou ily a vne montaigne appellee Siccca Leona : quieft en lac. geur de huyt degrez : felon lart et fcience de CoI mographie et aftrologie. Et auions aulcuneffoys le vent contraire aul. treffoys affez ton/et des pluyes fant vent En ces le maniere nous namgafmes pleuant lefpace de foixante iours iufques a la ligne \u00e9quinoctiale. Quifut chofe fort d'irange et non acouftumee de veoir felon le dicl des vieilles gentj'et de cenlx quity auoyent nanigue plufieucffoys Toutefiuys auant que ioindre acefte ligne equinoctiale en quatorje/ degre; nous eufmes dinerfite de temps et mauluais tant pour les grupades, que pour le vent et les courans dean qui nous vindrent par deuant en telle maniere que ne. potyons allert plus auant. Et affin que noz naurires ne preif. fent ou donnaffent a trauers (Ainf) quil aduient fouuent quand les grupades viennent enfemble nous ainenafmes les voiles en bas. Et en cefte maniere allions par la mer ca) et la iufques a ce quele ton temps firl venu Duvant la bonace il venoit de grandj poiffons au pres des nauires quon appelloit'Tiburon/qui ont les dent; de terrible forte et mangent les gents quand ilj les trouvent vif; ou mort; dedans la mer: Et fe prentent lefdir; poifons auec vng haim\n\nNangasige Aa ap\" et de fon ieme.\n\nTemps diurce que cirt le copilane.\n\nDrifons din Tiburoni"
|
11 |
+
},
|
12 |
+
"raw_response_data": {
|
13 |
+
"pages": [
|
14 |
+
{
|
15 |
+
"index": 0,
|
16 |
+
"markdown": "\n\nVindy troyfiefme do\u0107lobtre audit an a heure de mynuicl nous fifmes yoile tirant a la volte de Authec que les marimiers de leciant appellent. Cyroe nous engouffant en la mer occeane : paffafines le cap verd et les ifles circonuoy fines de quatorje degre e et clemy. et nauigafmes plufieurs par la cofte de Chinea ou Ethiopia ou ily a vne montaigne appellee Siccca Leona : quieft en lac. geur de huyt degrez : felon lart et fcience de CoI mographie et aftrologie. Et auions aulcuneffoys le vent contraire aul. treffoys affez ton/et des pluyes fant vent En ces le maniere nous namgafmes pleuant lefpace de foixante iours iufques a la ligne \u00e9quinoctiale. Quifut chofe fort d'irange et non acouftumee de veoir felon le dicl des vieilles gentj'et de cenlx quity auoyent nanigue plufieucffoys Toutefiuys auant que ioindre acefte ligne equinoctiale en quatorje/ degre; nous eufmes dinerfite de temps et mauluais tant pour les grupades, que pour le vent et les courans dean qui nous vindrent par deuant en telle maniere que ne. potyons allert plus auant. Et affin que noz naurires ne preif. fent ou donnaffent a trauers (Ainf) quil aduient fouuent quand les grupades viennent enfemble nous ainenafmes les voiles en bas. Et en cefte maniere allions par la mer ca) et la iufques a ce quele ton temps firl venu Duvant la bonace il venoit de grandj poiffons au pres des nauires quon appelloit'Tiburon/qui ont les dent; de terrible forte et mangent les gents quand ilj les trouvent vif; ou mort; dedans la mer: Et fe prentent lefdir; poifons auec vng haim\n\nNangasige Aa ap\" et de fon ieme.\n\nTemps diurce que cirt le copilane.\n\nDrifons din Tiburoni",
|
17 |
+
"images": [
|
18 |
+
{
|
19 |
+
"id": "img-0.jpeg",
|
20 |
+
"top_left_x": 74,
|
21 |
+
"top_left_y": 103,
|
22 |
+
"bottom_right_x": 189,
|
23 |
+
"bottom_right_y": 207,
|
24 |
+
"image_base64": "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCABoAHMDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD0HWfiR4O0LVpbDU9bNrexxjzIGtZpNu9Qw5VcdGB/GqD/ABe+Hz7seJVGQcZ0+44/8driPELWq/F3xpPeASRRQ6fIbaKyiubic/6PhI0cjIzjdjtioX8TanDNZ6vaeKTDcwW8tpEup6JcxgRu+4jbEskYYEKNyY4UDBAoA7IfGjwO7pu1mVQ5UndYy4Xvzg5/LNTj4weARsH/AAkYGepGn3GAO+QQevtXF2+t6/fQ3Op2njnQriO3tUjvrmW5lR4txZVliUwYiJDopwG5AzUt+bnVNAmsV1zw/d2tqIrea6vb6W/nZpJHaMO/2Zd538AYwAv40Adn/wALe+HuefEqn62Fx/8AE0D4vfD8dfEsZ7f8g+4/+JrgbnWFu9P0qWXxf/aCaVI2YLTwkzws4Hzq+4orYU5PC4yDwQCItL1Ca4IEl7qDtqVylqZbjTNOsDLtfLKoPmPNyVUKowpPJxxQB6A3xg8CY48Qwseetncjv/1zNSQ/FrwRPIqR+ILYl8geYk8YyOeSYsD6mvOhLaLf2/8AxL/Dob7K2+2utIg86e4EhAUlU2hcEHeqjPIxnBGjJDpy+XFJpvhI2kKx/v8A+zo/tFzKW3MSqxqixqAyleG+Tk80AeiWvj7wnel1TxPpC4BB828EY/AuozVkeLfC44/4Snw8PT/iaRcCvFNQTwm17JbXOl+HXMEUs0pt5GVgEV2Kjy5kV2JUqF915NU7HSdF1TT7e9svC+jmNJJY9RinuLuKS0KIpxg3BLliWAwvbkLzgA95Txh4VySfE3h/rxjVIgAPzpq+LvDCgA+KfD2QOo1WIHrXh1rYeDAbw6jpPh21EVtujjGoXLP5pHGR9o6AkEqPmwCBziq1tZeGDapbReHLDWtQEuDJpMeoyxMoA4GZF3Pzk9APegD3lvF/hcKC3ijw+Qo6f2rF/Tmsg/FTwWiSNJ4hsQqEZ2LMxGemAI8t746d6840vwJZahe211P4B1G3twrRtZfZ5FeVmJxITJeKyBdqjJGDubrwV57V7PStU8Ga1rp0KysZYCkEMsCNGWuPOTcFCyeUVCFlyEGTzxQB9Ire2s0cc0cyyxyxrIkkY3K6sAVIPcEEUVy+hXMreGNDZYbYBtMtGx5S
YGYEOBntRQB5z4xumtPi34+lS5ubd/7NtAslrL5cuTJZDCt/Dnpn0zxVLWpL7THvvDjyS6JfQxwESXjtJuixtLoYVJeQttGcf3jwelzxPex2fx/15CsSrJbqZpJo2kRII7NZX/drglv3alSGG0rnnsa14iudcnstYuLAWd3Z6YY4xcSGZTLArTvHKjgBlkjXGVG4OoBOOoB0Ph3wqdd0JdSimW2uL/QrjQJrKZPMCTxsIxJvQDJLIHJIOATzgcZ8Wj3Ol2LzajrWkxx/2naXheJbu4BWJWQqEVFZjvZWwG6dSBydz4VTQy/DvULqIxRW51GZoFm52Rt5LMpz1wdwz0OPeqHiRJ4NBvY5ZojMvMQjlG1Qdz7QB93AAGO2BxQBxCwWEPhq0sLe4+36UGumjndGtZblAyh3RdzAZ24BfBIGCBivRLD4ZeFLjT7G6t7rWmt7m1E8Bc2xaKOXEmVJXKEZ6jnng1w/hK8tx4S8P79TureKAXaXVhDsKSozNi4k5BADNGgBDbioVcZOfQfh1rEbeF7PR7t0W4jvLmzsVlzi4hjZT5e4ngjf93rtTpxmgBsnw88N6dpf2aK21CbTJrhZGaORZNjAbBvwokUbTj5T7mkbQvDGl3Ia58K6ZeWFvcBHe3jY+UWztJibhhnAJJOBXZXN7aafCM5f92W2OrEsnfjkjKgjpXlfiPVZPN1Pw/pztLKk0l4zCEqbe1iiEys4bhsSyMBnqI8dwKAOTs5otUEa6haWS28sqySCGFYU+y26PdXEcSIPYAMeTvwT3DJXa+1PWPOEEUt8bG+j+0Iyqkn3CWAbCDLsGDZGD2xWpc6LaadBrml2144uAbXQIJ4yBG88kqvdnpnIAVCSeVyDkdINFslvz4jvVs7G+JTTM2uo3n2ZGglXzXVHDKFcFExk9A/3uhAPVNE060vrbTLiz8I6PojSRpLPP9iiaV5PmytvjIwGAw7HGHXArp1vLjUPNitrqc2MZMcl8f8AWTOMZWIIAMdt+OuQOlebeB5YLmw1rQdHkedhqkssZLPGI9PnjDZAI7lNuMYDMD716PbzLFbxwpAIBbbYzAwAMWMBQMcEHPBHegCe3lGn3CxxCFBLKFO5yzEnLsWZjkkIj/jXy9dWmpW/wuj8y2ZrNp1ZJVIITIDYPORnf+eK978ZymLSNUdZpIf7K0y6lB3jieWNo4wM+gLnPqa8R1bTmb4fTX4DbhFZh1jJCKnlQgMQDgkk8n1NAHq0OsJaadpkEOgeJZ4k0602Pb6M06EeQhADhwGI6HgYII7UVqaNdXU3hzRZZNckjkk0y0d1afncYEJJ+bqTz+NFAHl/j/RDrfxe8Wxx3P2SeK2tnt5SxVRIY4F2sQCcFGkGB1O3PGa5TVX8UajA0j6UbBbWOe4e4SWVd8bqquoaSQhhtIGxOcMcjGcdv4xie5+MHju1WOV1n0y3jdIZhE7L/onAcggDIG4kEbd1Zn2iOe4miOo3tnYQ2oR55oIW/wBHh2oMtxksxKhtuXwBjmgDqfBGueFvDfgd9Jn16e3uZhNJLDJp80mxmSPglAyjAQMygnG8g4xVW+YX+hXJt9Z0KR7xmvY4ZbuWxecOHUmNZ1AOWDDOccVP4d8OL4k8DRXK3umW1/a3DR3F9LYLdPeFhFcQnzFw8WxWCbRkDBBHGKo3Phi4toLW2u9T8Mj7Bp/2N4rnVp4llDNO4yHRCNxLKQHyAp5APABxmqW1/wCGfBLWF/ALDWbe+S3KMGScw5kkOCBh4/MWNgyseRxxgn2DwvBpWojxRp7W0mo2E+urdweRuaS3E8QKXIb7wUngMMkfMTxmvPr5LrxLZW/9spYtdQ2V/HbJp88L29vEqQLAitG7KArM2dx3bW5yMVVtNWtpvCc0Et1d/wBqM1vY6lpkd3HbeakCGO3CbgS6HI37TkEE7QDuoA77VPHdnpjtZ6L4t1nUykTQZK2whSTaw+a5ePcSM7htDZxjvWNbXBs9P1SygNtr
F7cabDfwT3d8Va4jcxrOskxdW25j3BSy57joDiy2euabM8aQ20MlnMkNra3sDeZcMi8hH4CqWY7AvoM7RjOpFqFpqeo6HeW6RR2jeHmieNgBtZZlEitu6/NkZ7jmgCj4iXyv7fd4RFHpWuC5fB3llkuC8czHvvjlUA5JIjqr4Ku7WLw9ePPf7J9TvFtbVdh+dLSALzwRnZcA4zyVwOtbniCzfWLfVBaWcaa7aWn2UxfaI5U1OxfcECAKD5qPGCoxuBVRlvlAwfD3iI23w9tLLQ5LiW7gsr6TUbYLsgQh9wlkY/eJjKqAM54HFAFpBYWxkmKajDrt7GsduYZnSckPxjHEUeAGOQeARXceFPFkcvhqCXXNS0611y1uJreV76WOJLwLyso3H58HaA3HINc5oyw2MMmpWV7Ld6peWgvRLewpEJV8s+TGeSiJ5i9mzmNhkcGrM1vaQW1tDqk2jyahcIoVbPSlujeON+7yYlXJ6AeYcKxyQRjFAG/rUd7f+HteQWl4uk2mnXt1Jd3EQRbq4MJ2qFPLIu9mDj5WOMZ215Nq0b2Xw4tDctuE1l5UUYtzGsbNLBKG3fxMUB6/UVt6hoem3cSS6j4b8RW6QRu0t7faHPFHGoUkBmjmbaoPU7WwBwMZqbxLHKfhzeXCKtzYDTLaC2ktrSMW9vtmt1kxOpZnZ3QnbJhgM55OCAdppmipdeH9Fm81Ru0qz/hz0t4x/SitHQbS2l8K6C8kQZjpVnk5b/ngnpRQBwHiuz+2/Frx7bTMRAdHga5kSTyykSfZHdgQrdlPABz0qvMJraE6vqkMY0i9tNsVyJd99Z2DTKIGuIdwWYEbVyNzAPyfuCrGuTLN8ZPFxmnS2iu9JgtpZHjMhjWdLaEsBkHK+Zu9wpHem32taTa+BZ1u5YrfVrXTW02axlIFzHcLEltt2nBKFR5m5SQMc5IFAHU/Ca4ml0TX4rqLyd0ttcKxuPMAElspX5mJP3Qp9ecdjVnxo87rrssqOsU2jXTQNIrAMyxvuKk8ZxIw+hNcb8PNbm8IeEp1Twvqlz9ocvfakjMiW7KWULloyihFGWJOQXwemF1NRvoNXinuxq02lf2nZS20VtrlpJFGVbEUrC4jJjADNn7g7ZABzQBxXhiU3GmaM9ykj2E7X+iuoX5d0irLHHnGFLvIQpPQqW6Kaz9S0K11KRYxd28WpQR+WTdSeQmoIuAkkcjEqsu3arRtjBz/ABZA07DQjo4j0LW9Ns7tbx5biIxTJsuomCrHJHcjIjCsCwwORkFQGOdu4bVPsN+kDeHPEVtHcJbs+oEpcOxwR5jo4hcqxP7xmLHOSfmAoA4ex1DxX4BvDC8U9sELyCyvoiYnzgFkB43YH30wdo4bFegWAje482IRvaSaYLmIeWCsUdxOZWXvgRuGUk+nHpXN/wBkrrX2bRbK3j0e5m8+bBv47mw+zqhMkkT7mxL8hUhSQRySoGRb06M3tlFY6TaXVzpUSFFNzcCL+0TGMtGNuSqM7yYGSCSF3ZPABri6uNQvr280l7a0tppPsR1mPiSXYWZktVPOTuT9527VS0+6hDQeGbeC3t5L20LXZMLhYjPG0RlZ5D9795HI21QoC7QRis+HxDbXNiiw+IdP0WXBgOlXOnzN5Chz8gkO4Beck5U9j0FWJ59YtrKC7kn07VNNFykR+xaj5Yf5c+WQSVAOc4PJ9KALmgWVwNBsIVj1SwuLeWW2uBYwLqSRzxs25jG+6W2lIAy0YKn5TxyBrWuq2lrYahc6bc6nPA7Ga/vIY2jklxlQJbiVYVUD7qxqMcYC55Of9og13V3uNM1OfTdZt4445Te2xLyhVATz4x8ylcqnmxhiAckAZasvWZvG+qTWNhPpOl3s13FK9lcnUWvUCIQzyr507xx8DG5lBwSOo4AFm8bRaN5GoL4eSzDIDbAarLHdyL2b5OFjIA+UgZ9TWF4l8UzX9i13HbW1rZa7asLu2t7lXH2qKUHzjECBFIwSMEle
VZyM7jjodO8Ix2a22uPf23iDxDPNFLaSTXjfZHbeASzghnZSGyC6HIHysOsHieysbjQdd1NLa0llaLel9JCRJKDcQfMp34Jbex3ldxUnJ5oA9i8Nws3hDQCDDj+yrP7wOf8AUJRS+Gre1fwf4faRXLnSrTONv/PBPWigDy/xbJHbfEzxteytHDbLp9jGXuNwYFjaMAABndhGOMdj6VVmubs3eoXEpsbq9sY1VL2QLMYZt5bY02f3OHUbGIAA3KGU80vxLMsnxK8cWVqU+1XlrArK2f8AURW6XMjA56/6OgxzkMawfDmiRaVquj3Kzx3H2hrK3u7eRgY3+1+YdhwcgCNVbPVWxx0wAekeEtAGveAri11S7ubS50q+vLSNtKuAwiSXypZI23EiZNzcgkk4wCetVtS0Q20Vnoz67bzHTnvU8i40W6iMiyuskqGSFmXHQZTBAOO9b3wxEzeFdTM6Mks2t3SyspGGcmIZAB5GAR+FU7i5iXV/PETxlGvfPkMjv5pI3gYJIB+THHqKAOOlU6vpkuoaZc2t8bOJFg/s6za3trUoWmMMaOFdjlWLOckiQZ9K19RsLe2jthY/Z4ob67eRo7ImMy2zQmSRgDx8pCqW7HgVy/gh7TRtL19Ly6hhjt7oRzTOrZRBlMrgEksWxgZPUnAGa0N+pzQv4dRyl7FNLaxzTpmOztxmJlZSCQGA3HjJznHYgFfUdPOoQfZ7fTFudNhk2XMos3cRXWATFGykNwgUMSfmKDjpWlaf2ZcDUbXTl83VYCixov7qOHKGOOJZQcblbMhXGTsOCSpxzR0zX2H2bU/Ft5B5VpHfWCwGRrfy3cxk4ypiw+0HCHucYHOxaeKtVtAlhr9872UV1HaveTuJJ7JyWaNsggzRNHuO/AOCQOAocA6LU9O0nUdYgkulESamLqRrxLVW8i8tk3tcFCdjJNEMyRbcNtGefu89qmhvaXSapp9hb2GrQXUkEi2kTrbyXMMYkaBozw0cyqHi2AA7ijKeGHSHVZbbVoTqVxPJM+u5ljuUCx2KzQTwrboMt8vKgtkAfJx3qjfvBZarHcTRTpcxNpl+cZ+eazuGtXQKf4vLkRse3frQA+3stD8S6RZ38Kf2NaTwMLC4Myiayn3vv+fgyRhjt2kcoynIbipUu9Sk8KutzKI9VtL86bqVh9lWT7RKp8wOoXGVKAnGOdhI6VwvgeL+0vDF1pUst9aI+oxtDcWSDfKTDJ5kRYsMKVRTkZwcZVuMd1e20Fx4rvtQlu/sOjf2aTru1yGhCSHyQGB5ldcIgJ3YzwehAMqxt8eH7Cz09PJXUWn0/SHeF1kSAgteXTj+LajFAMg9SPuim+ItPjs/BXiVU0mS2t4LeCG2uHkdjODPAxOxvuhFCID0PqasaLcf2vaX/jDVwmnWEsYs7SFmcQ2+mxsFeIMikkFmSPCjcSzHGCaf47FuPh5JJaybo5rSNlkG7bIjXCsuNwBGVCHkZ47UAel+HbKAeE9CE8MfmrpdorbnAORCgoq6losVraRJNaoiW0KKrW3IAjUf5FFAHnGqWtjcfHjxJPe4RbI2dw8n2b7SZIjCkMkHk4JYSLNgsPu7c4PajqN/4fvLHSYYpLZJ9Ks55ozZSIIbR7qVREryMMboo1w27BJ24JPFS6lMJPjH4v8As9xOLiV7a3U29rcTSKuxd7gR/MCuwcjvgr2qrJLoOm2UWl6f9vuLKMKG+y+HhIChfJ843TAOxfOGKYGOgAwADT0LxRd2Eum6FZ2emXWmy6mirdoZUkZLmUN8uzgMuT85wMADHXLZ5NCv7nxEkOiXcf2G1v7yO7tdfnLTrBIYjlXDKpIYtn5v1qpZ6gtlr9pqej+GvEGrmJmvE/tHUbWEMShQHy44z8qjkYIxtGOlZw1C8sYmTRvh9BZxzwPYS/bNbmnUwygbo/8AWIE3dc8c+4oAsW9tpdheammm6lfzW93qPlahpeoRxvdxmJTIbj92XDx5yobYMFgQcgGs
+UXzzXc8txdi7vppZ9RSQ71VMfIFI5yo+U8ml/sfxRdqupr8NNCnQIv78zyyLhBtHP2nHAGPU45qSFvEdtPIy/DfwSzgYfzI0kA+oac4NAE2op9ljhuL/wC0adp8bXmnNcXUTFFyySo4QbpPKZw8YIBA2gk5JWuS1K40W8ttTs7SS71bV7yKAQPaxtJEDHtzy6RyAhFkz8hHI6YrsItV8WIw8n4beB0MqlVZbKJfMU8EA+byOuaoXOq/EiC33W+laLptspQiK3tLIKcnCFQ2Se+CKAJ01TS57SxtLq6hlt9Vitra9kWSIGCJLSMyvIX3YfzGdl+XcWQkZOAdOO5n1nSryC403WfEGmvvtzqukoZLhhx5QmhkRd0qBBukjIU8Bix5rGfXvi7qeoRabLqFwlw2CiRyRQfKM/8APPBI/wAKpxwfELVre4A8T/u1Ply2766xJzxgq0hJz0xQBd0rQ9X0khtTP/CMaA++eC3v7gSTebMEibECASSkeWWRSoAKgknB3GvjUfFevWvgTSI10y18yS5mtZcNIWCbhJcuuQZmGflB2ruRMgg4r22leOtK1K1ii1PQrO+u23tLKLZJ1O4/61nTzDkqTznO0nnFWbvRZvBvhXUfs2oWuoeL9Vvo7Gea0k3+THOHbaCQPnYxkE4HDcHvQB083xC8NiVLJdXtdMhgEQW3FvJLDHEVXdEvlqwJGAGHTO7BpnxQuorj4fXMtvdxTW8hXGy7WdS3n5DLJ1kyNw9tp9DXP+BNDgudcNvpc9vp9vZiH7fqYW5W6iEjNutZUc+SzlhsJZdmBnBPyiP4mWem+H9Cu/DWkzedZadPZowdxI0U7C5YndgcsoywwNrDA4oA9zP2qSKB5lR5WgiMjAnlti57+tFVNZjit9UlhinihjjVFWP7Yke0BAANp5FFAHi9x4qsbL4meMJdW1W6sXk1RIomtrdpBthkZcttkQjAVRxk8nipZ/E9jNHcvLc+KLiO6eJ45JNGjkSKJFbCw7pvlDEliepwO+SSigBLeGw1nTj9l03xvcWTm3Jaz0bMcjRGRiGKznIJkzgEEEdaXU7uOSUWtxY+KUkmV/Kt5fD4B3K28hQ05YqAfuknAoooAfLfWElx/baaH4me4EBW1vJdFfakbKy4XZcqoHLc89T3FXLTWVnd7zT7Dxc9uzAqY9B8wE7Qp3N9oIbODRRQBUi13Sc31zat45vJg8f2ufUNPS62SjKqDiVdowxAU88jnjFLJ4w8JW9/Z3J07Wre6t8IHOlqHIAIVVzcfLjcT9aKKAKN14p8F2v2aGO58YpLFPJcyS30EMs8juFGS4kUgYQcfX1NO/4Tnw2ksUd5da/HJDJMsyRadFCxV9y7c+flSoPB5IIBxxRRQBuab4p8CWGn6bbWPiWK1jsW2RCaxuGkWPLElsKwdmc7+qjBIwvSmL4o8FajttNS8WQppMkbLe2K6G3+lvlmVzLtLpgsoAXoE4IzmiigBbTW/h9Y2d0tv441hb26kV5L37bfrNhVAAYrEquBggZUEDvXK+K73wpF4EfRfD+r2d04voJ1VLOaOaVVSVW3uyAMcyDHTjP0oooA+jdQ1G6trxoYYpjGqrgiFj/CKKKKAP/Z"
|
25 |
+
}
|
26 |
+
],
|
27 |
+
"dimensions": {
|
28 |
+
"dpi": 200,
|
29 |
+
"height": 1200,
|
30 |
+
"width": 806
|
31 |
+
}
|
32 |
+
}
|
33 |
+
],
|
34 |
+
"model": "mistral-ocr-2503-completion",
|
35 |
+
"usage_info": {
|
36 |
+
"pages_processed": 1,
|
37 |
+
"doc_size_bytes": 561514
|
38 |
+
}
|
39 |
+
},
|
40 |
+
"has_images": [
|
41 |
+
{
|
42 |
+
"id": "img-0.jpeg",
|
43 |
+
"top_left_x": 74,
|
44 |
+
"top_left_y": 103,
|
45 |
+
"bottom_right_x": 189,
|
46 |
+
"bottom_right_y": 207,
|
47 |
+
"image_base64": "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCABoAHMDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD0HWfiR4O0LVpbDU9bNrexxjzIGtZpNu9Qw5VcdGB/GqD/ABe+Hz7seJVGQcZ0+44/8driPELWq/F3xpPeASRRQ6fIbaKyiubic/6PhI0cjIzjdjtioX8TanDNZ6vaeKTDcwW8tpEup6JcxgRu+4jbEskYYEKNyY4UDBAoA7IfGjwO7pu1mVQ5UndYy4Xvzg5/LNTj4weARsH/AAkYGepGn3GAO+QQevtXF2+t6/fQ3Op2njnQriO3tUjvrmW5lR4txZVliUwYiJDopwG5AzUt+bnVNAmsV1zw/d2tqIrea6vb6W/nZpJHaMO/2Zd538AYwAv40Adn/wALe+HuefEqn62Fx/8AE0D4vfD8dfEsZ7f8g+4/+JrgbnWFu9P0qWXxf/aCaVI2YLTwkzws4Hzq+4orYU5PC4yDwQCItL1Ca4IEl7qDtqVylqZbjTNOsDLtfLKoPmPNyVUKowpPJxxQB6A3xg8CY48Qwseetncjv/1zNSQ/FrwRPIqR+ILYl8geYk8YyOeSYsD6mvOhLaLf2/8AxL/Dob7K2+2utIg86e4EhAUlU2hcEHeqjPIxnBGjJDpy+XFJpvhI2kKx/v8A+zo/tFzKW3MSqxqixqAyleG+Tk80AeiWvj7wnel1TxPpC4BB828EY/AuozVkeLfC44/4Snw8PT/iaRcCvFNQTwm17JbXOl+HXMEUs0pt5GVgEV2Kjy5kV2JUqF915NU7HSdF1TT7e9svC+jmNJJY9RinuLuKS0KIpxg3BLliWAwvbkLzgA95Txh4VySfE3h/rxjVIgAPzpq+LvDCgA+KfD2QOo1WIHrXh1rYeDAbw6jpPh21EVtujjGoXLP5pHGR9o6AkEqPmwCBziq1tZeGDapbReHLDWtQEuDJpMeoyxMoA4GZF3Pzk9APegD3lvF/hcKC3ijw+Qo6f2rF/Tmsg/FTwWiSNJ4hsQqEZ2LMxGemAI8t746d6840vwJZahe211P4B1G3twrRtZfZ5FeVmJxITJeKyBdqjJGDubrwV57V7PStU8Ga1rp0KysZYCkEMsCNGWuPOTcFCyeUVCFlyEGTzxQB9Ire2s0cc0cyyxyxrIkkY3K6sAVIPcEEUVy+hXMreGNDZYbYBtMtGx5S
YGYEOBntRQB5z4xumtPi34+lS5ubd/7NtAslrL5cuTJZDCt/Dnpn0zxVLWpL7THvvDjyS6JfQxwESXjtJuixtLoYVJeQttGcf3jwelzxPex2fx/15CsSrJbqZpJo2kRII7NZX/drglv3alSGG0rnnsa14iudcnstYuLAWd3Z6YY4xcSGZTLArTvHKjgBlkjXGVG4OoBOOoB0Ph3wqdd0JdSimW2uL/QrjQJrKZPMCTxsIxJvQDJLIHJIOATzgcZ8Wj3Ol2LzajrWkxx/2naXheJbu4BWJWQqEVFZjvZWwG6dSBydz4VTQy/DvULqIxRW51GZoFm52Rt5LMpz1wdwz0OPeqHiRJ4NBvY5ZojMvMQjlG1Qdz7QB93AAGO2BxQBxCwWEPhq0sLe4+36UGumjndGtZblAyh3RdzAZ24BfBIGCBivRLD4ZeFLjT7G6t7rWmt7m1E8Bc2xaKOXEmVJXKEZ6jnng1w/hK8tx4S8P79TureKAXaXVhDsKSozNi4k5BADNGgBDbioVcZOfQfh1rEbeF7PR7t0W4jvLmzsVlzi4hjZT5e4ngjf93rtTpxmgBsnw88N6dpf2aK21CbTJrhZGaORZNjAbBvwokUbTj5T7mkbQvDGl3Ia58K6ZeWFvcBHe3jY+UWztJibhhnAJJOBXZXN7aafCM5f92W2OrEsnfjkjKgjpXlfiPVZPN1Pw/pztLKk0l4zCEqbe1iiEys4bhsSyMBnqI8dwKAOTs5otUEa6haWS28sqySCGFYU+y26PdXEcSIPYAMeTvwT3DJXa+1PWPOEEUt8bG+j+0Iyqkn3CWAbCDLsGDZGD2xWpc6LaadBrml2144uAbXQIJ4yBG88kqvdnpnIAVCSeVyDkdINFslvz4jvVs7G+JTTM2uo3n2ZGglXzXVHDKFcFExk9A/3uhAPVNE060vrbTLiz8I6PojSRpLPP9iiaV5PmytvjIwGAw7HGHXArp1vLjUPNitrqc2MZMcl8f8AWTOMZWIIAMdt+OuQOlebeB5YLmw1rQdHkedhqkssZLPGI9PnjDZAI7lNuMYDMD716PbzLFbxwpAIBbbYzAwAMWMBQMcEHPBHegCe3lGn3CxxCFBLKFO5yzEnLsWZjkkIj/jXy9dWmpW/wuj8y2ZrNp1ZJVIITIDYPORnf+eK978ZymLSNUdZpIf7K0y6lB3jieWNo4wM+gLnPqa8R1bTmb4fTX4DbhFZh1jJCKnlQgMQDgkk8n1NAHq0OsJaadpkEOgeJZ4k0602Pb6M06EeQhADhwGI6HgYII7UVqaNdXU3hzRZZNckjkk0y0d1afncYEJJ+bqTz+NFAHl/j/RDrfxe8Wxx3P2SeK2tnt5SxVRIY4F2sQCcFGkGB1O3PGa5TVX8UajA0j6UbBbWOe4e4SWVd8bqquoaSQhhtIGxOcMcjGcdv4xie5+MHju1WOV1n0y3jdIZhE7L/onAcggDIG4kEbd1Zn2iOe4miOo3tnYQ2oR55oIW/wBHh2oMtxksxKhtuXwBjmgDqfBGueFvDfgd9Jn16e3uZhNJLDJp80mxmSPglAyjAQMygnG8g4xVW+YX+hXJt9Z0KR7xmvY4ZbuWxecOHUmNZ1AOWDDOccVP4d8OL4k8DRXK3umW1/a3DR3F9LYLdPeFhFcQnzFw8WxWCbRkDBBHGKo3Phi4toLW2u9T8Mj7Bp/2N4rnVp4llDNO4yHRCNxLKQHyAp5APABxmqW1/wCGfBLWF/ALDWbe+S3KMGScw5kkOCBh4/MWNgyseRxxgn2DwvBpWojxRp7W0mo2E+urdweRuaS3E8QKXIb7wUngMMkfMTxmvPr5LrxLZW/9spYtdQ2V/HbJp88L29vEqQLAitG7KArM2dx3bW5yMVVtNWtpvCc0Et1d/wBqM1vY6lpkd3HbeakCGO3CbgS6HI37TkEE7QDuoA77VPHdnpjtZ6L4t1nUykTQZK2whSTaw+a5ePcSM7htDZxjvWNbXBs9P1SygNtr
F7cabDfwT3d8Va4jcxrOskxdW25j3BSy57joDiy2euabM8aQ20MlnMkNra3sDeZcMi8hH4CqWY7AvoM7RjOpFqFpqeo6HeW6RR2jeHmieNgBtZZlEitu6/NkZ7jmgCj4iXyv7fd4RFHpWuC5fB3llkuC8czHvvjlUA5JIjqr4Ku7WLw9ePPf7J9TvFtbVdh+dLSALzwRnZcA4zyVwOtbniCzfWLfVBaWcaa7aWn2UxfaI5U1OxfcECAKD5qPGCoxuBVRlvlAwfD3iI23w9tLLQ5LiW7gsr6TUbYLsgQh9wlkY/eJjKqAM54HFAFpBYWxkmKajDrt7GsduYZnSckPxjHEUeAGOQeARXceFPFkcvhqCXXNS0611y1uJreV76WOJLwLyso3H58HaA3HINc5oyw2MMmpWV7Ld6peWgvRLewpEJV8s+TGeSiJ5i9mzmNhkcGrM1vaQW1tDqk2jyahcIoVbPSlujeON+7yYlXJ6AeYcKxyQRjFAG/rUd7f+HteQWl4uk2mnXt1Jd3EQRbq4MJ2qFPLIu9mDj5WOMZ215Nq0b2Xw4tDctuE1l5UUYtzGsbNLBKG3fxMUB6/UVt6hoem3cSS6j4b8RW6QRu0t7faHPFHGoUkBmjmbaoPU7WwBwMZqbxLHKfhzeXCKtzYDTLaC2ktrSMW9vtmt1kxOpZnZ3QnbJhgM55OCAdppmipdeH9Fm81Ru0qz/hz0t4x/SitHQbS2l8K6C8kQZjpVnk5b/ngnpRQBwHiuz+2/Frx7bTMRAdHga5kSTyykSfZHdgQrdlPABz0qvMJraE6vqkMY0i9tNsVyJd99Z2DTKIGuIdwWYEbVyNzAPyfuCrGuTLN8ZPFxmnS2iu9JgtpZHjMhjWdLaEsBkHK+Zu9wpHem32taTa+BZ1u5YrfVrXTW02axlIFzHcLEltt2nBKFR5m5SQMc5IFAHU/Ca4ml0TX4rqLyd0ttcKxuPMAElspX5mJP3Qp9ecdjVnxo87rrssqOsU2jXTQNIrAMyxvuKk8ZxIw+hNcb8PNbm8IeEp1Twvqlz9ocvfakjMiW7KWULloyihFGWJOQXwemF1NRvoNXinuxq02lf2nZS20VtrlpJFGVbEUrC4jJjADNn7g7ZABzQBxXhiU3GmaM9ykj2E7X+iuoX5d0irLHHnGFLvIQpPQqW6Kaz9S0K11KRYxd28WpQR+WTdSeQmoIuAkkcjEqsu3arRtjBz/ABZA07DQjo4j0LW9Ns7tbx5biIxTJsuomCrHJHcjIjCsCwwORkFQGOdu4bVPsN+kDeHPEVtHcJbs+oEpcOxwR5jo4hcqxP7xmLHOSfmAoA4ex1DxX4BvDC8U9sELyCyvoiYnzgFkB43YH30wdo4bFegWAje482IRvaSaYLmIeWCsUdxOZWXvgRuGUk+nHpXN/wBkrrX2bRbK3j0e5m8+bBv47mw+zqhMkkT7mxL8hUhSQRySoGRb06M3tlFY6TaXVzpUSFFNzcCL+0TGMtGNuSqM7yYGSCSF3ZPABri6uNQvr280l7a0tppPsR1mPiSXYWZktVPOTuT9527VS0+6hDQeGbeC3t5L20LXZMLhYjPG0RlZ5D9795HI21QoC7QRis+HxDbXNiiw+IdP0WXBgOlXOnzN5Chz8gkO4Beck5U9j0FWJ59YtrKC7kn07VNNFykR+xaj5Yf5c+WQSVAOc4PJ9KALmgWVwNBsIVj1SwuLeWW2uBYwLqSRzxs25jG+6W2lIAy0YKn5TxyBrWuq2lrYahc6bc6nPA7Ga/vIY2jklxlQJbiVYVUD7qxqMcYC55Of9og13V3uNM1OfTdZt4445Te2xLyhVATz4x8ylcqnmxhiAckAZasvWZvG+qTWNhPpOl3s13FK9lcnUWvUCIQzyr507xx8DG5lBwSOo4AFm8bRaN5GoL4eSzDIDbAarLHdyL2b5OFjIA+UgZ9TWF4l8UzX9i13HbW1rZa7asLu2t7lXH2qKUHzjECBFIwSMEle
VZyM7jjodO8Ix2a22uPf23iDxDPNFLaSTXjfZHbeASzghnZSGyC6HIHysOsHieysbjQdd1NLa0llaLel9JCRJKDcQfMp34Jbex3ldxUnJ5oA9i8Nws3hDQCDDj+yrP7wOf8AUJRS+Gre1fwf4faRXLnSrTONv/PBPWigDy/xbJHbfEzxteytHDbLp9jGXuNwYFjaMAABndhGOMdj6VVmubs3eoXEpsbq9sY1VL2QLMYZt5bY02f3OHUbGIAA3KGU80vxLMsnxK8cWVqU+1XlrArK2f8AURW6XMjA56/6OgxzkMawfDmiRaVquj3Kzx3H2hrK3u7eRgY3+1+YdhwcgCNVbPVWxx0wAekeEtAGveAri11S7ubS50q+vLSNtKuAwiSXypZI23EiZNzcgkk4wCetVtS0Q20Vnoz67bzHTnvU8i40W6iMiyuskqGSFmXHQZTBAOO9b3wxEzeFdTM6Mks2t3SyspGGcmIZAB5GAR+FU7i5iXV/PETxlGvfPkMjv5pI3gYJIB+THHqKAOOlU6vpkuoaZc2t8bOJFg/s6za3trUoWmMMaOFdjlWLOckiQZ9K19RsLe2jthY/Z4ob67eRo7ImMy2zQmSRgDx8pCqW7HgVy/gh7TRtL19Ly6hhjt7oRzTOrZRBlMrgEksWxgZPUnAGa0N+pzQv4dRyl7FNLaxzTpmOztxmJlZSCQGA3HjJznHYgFfUdPOoQfZ7fTFudNhk2XMos3cRXWATFGykNwgUMSfmKDjpWlaf2ZcDUbXTl83VYCixov7qOHKGOOJZQcblbMhXGTsOCSpxzR0zX2H2bU/Ft5B5VpHfWCwGRrfy3cxk4ypiw+0HCHucYHOxaeKtVtAlhr9872UV1HaveTuJJ7JyWaNsggzRNHuO/AOCQOAocA6LU9O0nUdYgkulESamLqRrxLVW8i8tk3tcFCdjJNEMyRbcNtGefu89qmhvaXSapp9hb2GrQXUkEi2kTrbyXMMYkaBozw0cyqHi2AA7ijKeGHSHVZbbVoTqVxPJM+u5ljuUCx2KzQTwrboMt8vKgtkAfJx3qjfvBZarHcTRTpcxNpl+cZ+eazuGtXQKf4vLkRse3frQA+3stD8S6RZ38Kf2NaTwMLC4Myiayn3vv+fgyRhjt2kcoynIbipUu9Sk8KutzKI9VtL86bqVh9lWT7RKp8wOoXGVKAnGOdhI6VwvgeL+0vDF1pUst9aI+oxtDcWSDfKTDJ5kRYsMKVRTkZwcZVuMd1e20Fx4rvtQlu/sOjf2aTru1yGhCSHyQGB5ldcIgJ3YzwehAMqxt8eH7Cz09PJXUWn0/SHeF1kSAgteXTj+LajFAMg9SPuim+ItPjs/BXiVU0mS2t4LeCG2uHkdjODPAxOxvuhFCID0PqasaLcf2vaX/jDVwmnWEsYs7SFmcQ2+mxsFeIMikkFmSPCjcSzHGCaf47FuPh5JJaybo5rSNlkG7bIjXCsuNwBGVCHkZ47UAel+HbKAeE9CE8MfmrpdorbnAORCgoq6losVraRJNaoiW0KKrW3IAjUf5FFAHnGqWtjcfHjxJPe4RbI2dw8n2b7SZIjCkMkHk4JYSLNgsPu7c4PajqN/4fvLHSYYpLZJ9Ks55ozZSIIbR7qVREryMMboo1w27BJ24JPFS6lMJPjH4v8As9xOLiV7a3U29rcTSKuxd7gR/MCuwcjvgr2qrJLoOm2UWl6f9vuLKMKG+y+HhIChfJ843TAOxfOGKYGOgAwADT0LxRd2Eum6FZ2emXWmy6mirdoZUkZLmUN8uzgMuT85wMADHXLZ5NCv7nxEkOiXcf2G1v7yO7tdfnLTrBIYjlXDKpIYtn5v1qpZ6gtlr9pqej+GvEGrmJmvE/tHUbWEMShQHy44z8qjkYIxtGOlZw1C8sYmTRvh9BZxzwPYS/bNbmnUwygbo/8AWIE3dc8c+4oAsW9tpdheammm6lfzW93qPlahpeoRxvdxmJTIbj92XDx5yobYMFgQcgGs
+UXzzXc8txdi7vppZ9RSQ71VMfIFI5yo+U8ml/sfxRdqupr8NNCnQIv78zyyLhBtHP2nHAGPU45qSFvEdtPIy/DfwSzgYfzI0kA+oac4NAE2op9ljhuL/wC0adp8bXmnNcXUTFFyySo4QbpPKZw8YIBA2gk5JWuS1K40W8ttTs7SS71bV7yKAQPaxtJEDHtzy6RyAhFkz8hHI6YrsItV8WIw8n4beB0MqlVZbKJfMU8EA+byOuaoXOq/EiC33W+laLptspQiK3tLIKcnCFQ2Se+CKAJ01TS57SxtLq6hlt9Vitra9kWSIGCJLSMyvIX3YfzGdl+XcWQkZOAdOO5n1nSryC403WfEGmvvtzqukoZLhhx5QmhkRd0qBBukjIU8Bix5rGfXvi7qeoRabLqFwlw2CiRyRQfKM/8APPBI/wAKpxwfELVre4A8T/u1Ply2766xJzxgq0hJz0xQBd0rQ9X0khtTP/CMaA++eC3v7gSTebMEibECASSkeWWRSoAKgknB3GvjUfFevWvgTSI10y18yS5mtZcNIWCbhJcuuQZmGflB2ruRMgg4r22leOtK1K1ii1PQrO+u23tLKLZJ1O4/61nTzDkqTznO0nnFWbvRZvBvhXUfs2oWuoeL9Vvo7Gea0k3+THOHbaCQPnYxkE4HDcHvQB083xC8NiVLJdXtdMhgEQW3FvJLDHEVXdEvlqwJGAGHTO7BpnxQuorj4fXMtvdxTW8hXGy7WdS3n5DLJ1kyNw9tp9DXP+BNDgudcNvpc9vp9vZiH7fqYW5W6iEjNutZUc+SzlhsJZdmBnBPyiP4mWem+H9Cu/DWkzedZadPZowdxI0U7C5YndgcsoywwNrDA4oA9zP2qSKB5lR5WgiMjAnlti57+tFVNZjit9UlhinihjjVFWP7Yke0BAANp5FFAHi9x4qsbL4meMJdW1W6sXk1RIomtrdpBthkZcttkQjAVRxk8nipZ/E9jNHcvLc+KLiO6eJ45JNGjkSKJFbCw7pvlDEliepwO+SSigBLeGw1nTj9l03xvcWTm3Jaz0bMcjRGRiGKznIJkzgEEEdaXU7uOSUWtxY+KUkmV/Kt5fD4B3K28hQ05YqAfuknAoooAfLfWElx/baaH4me4EBW1vJdFfakbKy4XZcqoHLc89T3FXLTWVnd7zT7Dxc9uzAqY9B8wE7Qp3N9oIbODRRQBUi13Sc31zat45vJg8f2ufUNPS62SjKqDiVdowxAU88jnjFLJ4w8JW9/Z3J07Wre6t8IHOlqHIAIVVzcfLjcT9aKKAKN14p8F2v2aGO58YpLFPJcyS30EMs8juFGS4kUgYQcfX1NO/4Tnw2ksUd5da/HJDJMsyRadFCxV9y7c+flSoPB5IIBxxRRQBuab4p8CWGn6bbWPiWK1jsW2RCaxuGkWPLElsKwdmc7+qjBIwvSmL4o8FajttNS8WQppMkbLe2K6G3+lvlmVzLtLpgsoAXoE4IzmiigBbTW/h9Y2d0tv441hb26kV5L37bfrNhVAAYrEquBggZUEDvXK+K73wpF4EfRfD+r2d04voJ1VLOaOaVVSVW3uyAMcyDHTjP0oooA+jdQ1G6trxoYYpjGqrgiFj/CKKKKAP/Z"
|
48 |
+
}
|
49 |
+
],
|
50 |
+
"pages_data": [
|
51 |
+
{
|
52 |
+
"page_number": 1,
|
53 |
+
"markdown": "\n\nVindy troyfiefme do\u0107lobtre audit an a heure de mynuicl nous fifmes yoile tirant a la volte de Authec que les marimiers de leciant appellent. Cyroe nous engouffant en la mer occeane : paffafines le cap verd et les ifles circonuoy fines de quatorje degre e et clemy. et nauigafmes plufieurs par la cofte de Chinea ou Ethiopia ou ily a vne montaigne appellee Siccca Leona : quieft en lac. geur de huyt degrez : felon lart et fcience de CoI mographie et aftrologie. Et auions aulcuneffoys le vent contraire aul. treffoys affez ton/et des pluyes fant vent En ces le maniere nous namgafmes pleuant lefpace de foixante iours iufques a la ligne \u00e9quinoctiale. Quifut chofe fort d'irange et non acouftumee de veoir felon le dicl des vieilles gentj'et de cenlx quity auoyent nanigue plufieucffoys Toutefiuys auant que ioindre acefte ligne equinoctiale en quatorje/ degre; nous eufmes dinerfite de temps et mauluais tant pour les grupades, que pour le vent et les courans dean qui nous vindrent par deuant en telle maniere que ne. potyons allert plus auant. Et affin que noz naurires ne preif. fent ou donnaffent a trauers (Ainf) quil aduient fouuent quand les grupades viennent enfemble nous ainenafmes les voiles en bas. Et en cefte maniere allions par la mer ca) et la iufques a ce quele ton temps firl venu Duvant la bonace il venoit de grandj poiffons au pres des nauires quon appelloit'Tiburon/qui ont les dent; de terrible forte et mangent les gents quand ilj les trouvent vif; ou mort; dedans la mer: Et fe prentent lefdir; poifons auec vng haim\n\nNangasige Aa ap\" et de fon ieme.\n\nTemps diurce que cirt le copilane.\n\nDrifons din Tiburoni",
|
54 |
+
"images": [
|
55 |
+
{
|
56 |
+
"id": "img-0.jpeg",
|
57 |
+
"image_base64": "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCABoAHMDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD0HWfiR4O0LVpbDU9bNrexxjzIGtZpNu9Qw5VcdGB/GqD/ABe+Hz7seJVGQcZ0+44/8driPELWq/F3xpPeASRRQ6fIbaKyiubic/6PhI0cjIzjdjtioX8TanDNZ6vaeKTDcwW8tpEup6JcxgRu+4jbEskYYEKNyY4UDBAoA7IfGjwO7pu1mVQ5UndYy4Xvzg5/LNTj4weARsH/AAkYGepGn3GAO+QQevtXF2+t6/fQ3Op2njnQriO3tUjvrmW5lR4txZVliUwYiJDopwG5AzUt+bnVNAmsV1zw/d2tqIrea6vb6W/nZpJHaMO/2Zd538AYwAv40Adn/wALe+HuefEqn62Fx/8AE0D4vfD8dfEsZ7f8g+4/+JrgbnWFu9P0qWXxf/aCaVI2YLTwkzws4Hzq+4orYU5PC4yDwQCItL1Ca4IEl7qDtqVylqZbjTNOsDLtfLKoPmPNyVUKowpPJxxQB6A3xg8CY48Qwseetncjv/1zNSQ/FrwRPIqR+ILYl8geYk8YyOeSYsD6mvOhLaLf2/8AxL/Dob7K2+2utIg86e4EhAUlU2hcEHeqjPIxnBGjJDpy+XFJpvhI2kKx/v8A+zo/tFzKW3MSqxqixqAyleG+Tk80AeiWvj7wnel1TxPpC4BB828EY/AuozVkeLfC44/4Snw8PT/iaRcCvFNQTwm17JbXOl+HXMEUs0pt5GVgEV2Kjy5kV2JUqF915NU7HSdF1TT7e9svC+jmNJJY9RinuLuKS0KIpxg3BLliWAwvbkLzgA95Txh4VySfE3h/rxjVIgAPzpq+LvDCgA+KfD2QOo1WIHrXh1rYeDAbw6jpPh21EVtujjGoXLP5pHGR9o6AkEqPmwCBziq1tZeGDapbReHLDWtQEuDJpMeoyxMoA4GZF3Pzk9APegD3lvF/hcKC3ijw+Qo6f2rF/Tmsg/FTwWiSNJ4hsQqEZ2LMxGemAI8t746d6840vwJZahe211P4B1G3twrRtZfZ5FeVmJxITJeKyBdqjJGDubrwV57V7PStU8Ga1rp0KysZYCkEMsCNGWuPOTcFCyeUVCFlyEGTzxQB9Ire2s0cc0cyyxyxrIkkY3K6sAVIPcEEUVy+hXMreGNDZYbYBtMtGx5S
YGYEOBntRQB5z4xumtPi34+lS5ubd/7NtAslrL5cuTJZDCt/Dnpn0zxVLWpL7THvvDjyS6JfQxwESXjtJuixtLoYVJeQttGcf3jwelzxPex2fx/15CsSrJbqZpJo2kRII7NZX/drglv3alSGG0rnnsa14iudcnstYuLAWd3Z6YY4xcSGZTLArTvHKjgBlkjXGVG4OoBOOoB0Ph3wqdd0JdSimW2uL/QrjQJrKZPMCTxsIxJvQDJLIHJIOATzgcZ8Wj3Ol2LzajrWkxx/2naXheJbu4BWJWQqEVFZjvZWwG6dSBydz4VTQy/DvULqIxRW51GZoFm52Rt5LMpz1wdwz0OPeqHiRJ4NBvY5ZojMvMQjlG1Qdz7QB93AAGO2BxQBxCwWEPhq0sLe4+36UGumjndGtZblAyh3RdzAZ24BfBIGCBivRLD4ZeFLjT7G6t7rWmt7m1E8Bc2xaKOXEmVJXKEZ6jnng1w/hK8tx4S8P79TureKAXaXVhDsKSozNi4k5BADNGgBDbioVcZOfQfh1rEbeF7PR7t0W4jvLmzsVlzi4hjZT5e4ngjf93rtTpxmgBsnw88N6dpf2aK21CbTJrhZGaORZNjAbBvwokUbTj5T7mkbQvDGl3Ia58K6ZeWFvcBHe3jY+UWztJibhhnAJJOBXZXN7aafCM5f92W2OrEsnfjkjKgjpXlfiPVZPN1Pw/pztLKk0l4zCEqbe1iiEys4bhsSyMBnqI8dwKAOTs5otUEa6haWS28sqySCGFYU+y26PdXEcSIPYAMeTvwT3DJXa+1PWPOEEUt8bG+j+0Iyqkn3CWAbCDLsGDZGD2xWpc6LaadBrml2144uAbXQIJ4yBG88kqvdnpnIAVCSeVyDkdINFslvz4jvVs7G+JTTM2uo3n2ZGglXzXVHDKFcFExk9A/3uhAPVNE060vrbTLiz8I6PojSRpLPP9iiaV5PmytvjIwGAw7HGHXArp1vLjUPNitrqc2MZMcl8f8AWTOMZWIIAMdt+OuQOlebeB5YLmw1rQdHkedhqkssZLPGI9PnjDZAI7lNuMYDMD716PbzLFbxwpAIBbbYzAwAMWMBQMcEHPBHegCe3lGn3CxxCFBLKFO5yzEnLsWZjkkIj/jXy9dWmpW/wuj8y2ZrNp1ZJVIITIDYPORnf+eK978ZymLSNUdZpIf7K0y6lB3jieWNo4wM+gLnPqa8R1bTmb4fTX4DbhFZh1jJCKnlQgMQDgkk8n1NAHq0OsJaadpkEOgeJZ4k0602Pb6M06EeQhADhwGI6HgYII7UVqaNdXU3hzRZZNckjkk0y0d1afncYEJJ+bqTz+NFAHl/j/RDrfxe8Wxx3P2SeK2tnt5SxVRIY4F2sQCcFGkGB1O3PGa5TVX8UajA0j6UbBbWOe4e4SWVd8bqquoaSQhhtIGxOcMcjGcdv4xie5+MHju1WOV1n0y3jdIZhE7L/onAcggDIG4kEbd1Zn2iOe4miOo3tnYQ2oR55oIW/wBHh2oMtxksxKhtuXwBjmgDqfBGueFvDfgd9Jn16e3uZhNJLDJp80mxmSPglAyjAQMygnG8g4xVW+YX+hXJt9Z0KR7xmvY4ZbuWxecOHUmNZ1AOWDDOccVP4d8OL4k8DRXK3umW1/a3DR3F9LYLdPeFhFcQnzFw8WxWCbRkDBBHGKo3Phi4toLW2u9T8Mj7Bp/2N4rnVp4llDNO4yHRCNxLKQHyAp5APABxmqW1/wCGfBLWF/ALDWbe+S3KMGScw5kkOCBh4/MWNgyseRxxgn2DwvBpWojxRp7W0mo2E+urdweRuaS3E8QKXIb7wUngMMkfMTxmvPr5LrxLZW/9spYtdQ2V/HbJp88L29vEqQLAitG7KArM2dx3bW5yMVVtNWtpvCc0Et1d/wBqM1vY6lpkd3HbeakCGO3CbgS6HI37TkEE7QDuoA77VPHdnpjtZ6L4t1nUykTQZK2whSTaw+a5ePcSM7htDZxjvWNbXBs9P1SygNtr
F7cabDfwT3d8Va4jcxrOskxdW25j3BSy57joDiy2euabM8aQ20MlnMkNra3sDeZcMi8hH4CqWY7AvoM7RjOpFqFpqeo6HeW6RR2jeHmieNgBtZZlEitu6/NkZ7jmgCj4iXyv7fd4RFHpWuC5fB3llkuC8czHvvjlUA5JIjqr4Ku7WLw9ePPf7J9TvFtbVdh+dLSALzwRnZcA4zyVwOtbniCzfWLfVBaWcaa7aWn2UxfaI5U1OxfcECAKD5qPGCoxuBVRlvlAwfD3iI23w9tLLQ5LiW7gsr6TUbYLsgQh9wlkY/eJjKqAM54HFAFpBYWxkmKajDrt7GsduYZnSckPxjHEUeAGOQeARXceFPFkcvhqCXXNS0611y1uJreV76WOJLwLyso3H58HaA3HINc5oyw2MMmpWV7Ld6peWgvRLewpEJV8s+TGeSiJ5i9mzmNhkcGrM1vaQW1tDqk2jyahcIoVbPSlujeON+7yYlXJ6AeYcKxyQRjFAG/rUd7f+HteQWl4uk2mnXt1Jd3EQRbq4MJ2qFPLIu9mDj5WOMZ215Nq0b2Xw4tDctuE1l5UUYtzGsbNLBKG3fxMUB6/UVt6hoem3cSS6j4b8RW6QRu0t7faHPFHGoUkBmjmbaoPU7WwBwMZqbxLHKfhzeXCKtzYDTLaC2ktrSMW9vtmt1kxOpZnZ3QnbJhgM55OCAdppmipdeH9Fm81Ru0qz/hz0t4x/SitHQbS2l8K6C8kQZjpVnk5b/ngnpRQBwHiuz+2/Frx7bTMRAdHga5kSTyykSfZHdgQrdlPABz0qvMJraE6vqkMY0i9tNsVyJd99Z2DTKIGuIdwWYEbVyNzAPyfuCrGuTLN8ZPFxmnS2iu9JgtpZHjMhjWdLaEsBkHK+Zu9wpHem32taTa+BZ1u5YrfVrXTW02axlIFzHcLEltt2nBKFR5m5SQMc5IFAHU/Ca4ml0TX4rqLyd0ttcKxuPMAElspX5mJP3Qp9ecdjVnxo87rrssqOsU2jXTQNIrAMyxvuKk8ZxIw+hNcb8PNbm8IeEp1Twvqlz9ocvfakjMiW7KWULloyihFGWJOQXwemF1NRvoNXinuxq02lf2nZS20VtrlpJFGVbEUrC4jJjADNn7g7ZABzQBxXhiU3GmaM9ykj2E7X+iuoX5d0irLHHnGFLvIQpPQqW6Kaz9S0K11KRYxd28WpQR+WTdSeQmoIuAkkcjEqsu3arRtjBz/ABZA07DQjo4j0LW9Ns7tbx5biIxTJsuomCrHJHcjIjCsCwwORkFQGOdu4bVPsN+kDeHPEVtHcJbs+oEpcOxwR5jo4hcqxP7xmLHOSfmAoA4ex1DxX4BvDC8U9sELyCyvoiYnzgFkB43YH30wdo4bFegWAje482IRvaSaYLmIeWCsUdxOZWXvgRuGUk+nHpXN/wBkrrX2bRbK3j0e5m8+bBv47mw+zqhMkkT7mxL8hUhSQRySoGRb06M3tlFY6TaXVzpUSFFNzcCL+0TGMtGNuSqM7yYGSCSF3ZPABri6uNQvr280l7a0tppPsR1mPiSXYWZktVPOTuT9527VS0+6hDQeGbeC3t5L20LXZMLhYjPG0RlZ5D9795HI21QoC7QRis+HxDbXNiiw+IdP0WXBgOlXOnzN5Chz8gkO4Beck5U9j0FWJ59YtrKC7kn07VNNFykR+xaj5Yf5c+WQSVAOc4PJ9KALmgWVwNBsIVj1SwuLeWW2uBYwLqSRzxs25jG+6W2lIAy0YKn5TxyBrWuq2lrYahc6bc6nPA7Ga/vIY2jklxlQJbiVYVUD7qxqMcYC55Of9og13V3uNM1OfTdZt4445Te2xLyhVATz4x8ylcqnmxhiAckAZasvWZvG+qTWNhPpOl3s13FK9lcnUWvUCIQzyr507xx8DG5lBwSOo4AFm8bRaN5GoL4eSzDIDbAarLHdyL2b5OFjIA+UgZ9TWF4l8UzX9i13HbW1rZa7asLu2t7lXH2qKUHzjECBFIwSMEle
VZyM7jjodO8Ix2a22uPf23iDxDPNFLaSTXjfZHbeASzghnZSGyC6HIHysOsHieysbjQdd1NLa0llaLel9JCRJKDcQfMp34Jbex3ldxUnJ5oA9i8Nws3hDQCDDj+yrP7wOf8AUJRS+Gre1fwf4faRXLnSrTONv/PBPWigDy/xbJHbfEzxteytHDbLp9jGXuNwYFjaMAABndhGOMdj6VVmubs3eoXEpsbq9sY1VL2QLMYZt5bY02f3OHUbGIAA3KGU80vxLMsnxK8cWVqU+1XlrArK2f8AURW6XMjA56/6OgxzkMawfDmiRaVquj3Kzx3H2hrK3u7eRgY3+1+YdhwcgCNVbPVWxx0wAekeEtAGveAri11S7ubS50q+vLSNtKuAwiSXypZI23EiZNzcgkk4wCetVtS0Q20Vnoz67bzHTnvU8i40W6iMiyuskqGSFmXHQZTBAOO9b3wxEzeFdTM6Mks2t3SyspGGcmIZAB5GAR+FU7i5iXV/PETxlGvfPkMjv5pI3gYJIB+THHqKAOOlU6vpkuoaZc2t8bOJFg/s6za3trUoWmMMaOFdjlWLOckiQZ9K19RsLe2jthY/Z4ob67eRo7ImMy2zQmSRgDx8pCqW7HgVy/gh7TRtL19Ly6hhjt7oRzTOrZRBlMrgEksWxgZPUnAGa0N+pzQv4dRyl7FNLaxzTpmOztxmJlZSCQGA3HjJznHYgFfUdPOoQfZ7fTFudNhk2XMos3cRXWATFGykNwgUMSfmKDjpWlaf2ZcDUbXTl83VYCixov7qOHKGOOJZQcblbMhXGTsOCSpxzR0zX2H2bU/Ft5B5VpHfWCwGRrfy3cxk4ypiw+0HCHucYHOxaeKtVtAlhr9872UV1HaveTuJJ7JyWaNsggzRNHuO/AOCQOAocA6LU9O0nUdYgkulESamLqRrxLVW8i8tk3tcFCdjJNEMyRbcNtGefu89qmhvaXSapp9hb2GrQXUkEi2kTrbyXMMYkaBozw0cyqHi2AA7ijKeGHSHVZbbVoTqVxPJM+u5ljuUCx2KzQTwrboMt8vKgtkAfJx3qjfvBZarHcTRTpcxNpl+cZ+eazuGtXQKf4vLkRse3frQA+3stD8S6RZ38Kf2NaTwMLC4Myiayn3vv+fgyRhjt2kcoynIbipUu9Sk8KutzKI9VtL86bqVh9lWT7RKp8wOoXGVKAnGOdhI6VwvgeL+0vDF1pUst9aI+oxtDcWSDfKTDJ5kRYsMKVRTkZwcZVuMd1e20Fx4rvtQlu/sOjf2aTru1yGhCSHyQGB5ldcIgJ3YzwehAMqxt8eH7Cz09PJXUWn0/SHeF1kSAgteXTj+LajFAMg9SPuim+ItPjs/BXiVU0mS2t4LeCG2uHkdjODPAxOxvuhFCID0PqasaLcf2vaX/jDVwmnWEsYs7SFmcQ2+mxsFeIMikkFmSPCjcSzHGCaf47FuPh5JJaybo5rSNlkG7bIjXCsuNwBGVCHkZ47UAel+HbKAeE9CE8MfmrpdorbnAORCgoq6losVraRJNaoiW0KKrW3IAjUf5FFAHnGqWtjcfHjxJPe4RbI2dw8n2b7SZIjCkMkHk4JYSLNgsPu7c4PajqN/4fvLHSYYpLZJ9Ks55ozZSIIbR7qVREryMMboo1w27BJ24JPFS6lMJPjH4v8As9xOLiV7a3U29rcTSKuxd7gR/MCuwcjvgr2qrJLoOm2UWl6f9vuLKMKG+y+HhIChfJ843TAOxfOGKYGOgAwADT0LxRd2Eum6FZ2emXWmy6mirdoZUkZLmUN8uzgMuT85wMADHXLZ5NCv7nxEkOiXcf2G1v7yO7tdfnLTrBIYjlXDKpIYtn5v1qpZ6gtlr9pqej+GvEGrmJmvE/tHUbWEMShQHy44z8qjkYIxtGOlZw1C8sYmTRvh9BZxzwPYS/bNbmnUwygbo/8AWIE3dc8c+4oAsW9tpdheammm6lfzW93qPlahpeoRxvdxmJTIbj92XDx5yobYMFgQcgGs
+UXzzXc8txdi7vppZ9RSQ71VMfIFI5yo+U8ml/sfxRdqupr8NNCnQIv78zyyLhBtHP2nHAGPU45qSFvEdtPIy/DfwSzgYfzI0kA+oac4NAE2op9ljhuL/wC0adp8bXmnNcXUTFFyySo4QbpPKZw8YIBA2gk5JWuS1K40W8ttTs7SS71bV7yKAQPaxtJEDHtzy6RyAhFkz8hHI6YrsItV8WIw8n4beB0MqlVZbKJfMU8EA+byOuaoXOq/EiC33W+laLptspQiK3tLIKcnCFQ2Se+CKAJ01TS57SxtLq6hlt9Vitra9kWSIGCJLSMyvIX3YfzGdl+XcWQkZOAdOO5n1nSryC403WfEGmvvtzqukoZLhhx5QmhkRd0qBBukjIU8Bix5rGfXvi7qeoRabLqFwlw2CiRyRQfKM/8APPBI/wAKpxwfELVre4A8T/u1Ply2766xJzxgq0hJz0xQBd0rQ9X0khtTP/CMaA++eC3v7gSTebMEibECASSkeWWRSoAKgknB3GvjUfFevWvgTSI10y18yS5mtZcNIWCbhJcuuQZmGflB2ruRMgg4r22leOtK1K1ii1PQrO+u23tLKLZJ1O4/61nTzDkqTznO0nnFWbvRZvBvhXUfs2oWuoeL9Vvo7Gea0k3+THOHbaCQPnYxkE4HDcHvQB083xC8NiVLJdXtdMhgEQW3FvJLDHEVXdEvlqwJGAGHTO7BpnxQuorj4fXMtvdxTW8hXGy7WdS3n5DLJ1kyNw9tp9DXP+BNDgudcNvpc9vp9vZiH7fqYW5W6iEjNutZUc+SzlhsJZdmBnBPyiP4mWem+H9Cu/DWkzedZadPZowdxI0U7C5YndgcsoywwNrDA4oA9zP2qSKB5lR5WgiMjAnlti57+tFVNZjit9UlhinihjjVFWP7Yke0BAANp5FFAHi9x4qsbL4meMJdW1W6sXk1RIomtrdpBthkZcttkQjAVRxk8nipZ/E9jNHcvLc+KLiO6eJ45JNGjkSKJFbCw7pvlDEliepwO+SSigBLeGw1nTj9l03xvcWTm3Jaz0bMcjRGRiGKznIJkzgEEEdaXU7uOSUWtxY+KUkmV/Kt5fD4B3K28hQ05YqAfuknAoooAfLfWElx/baaH4me4EBW1vJdFfakbKy4XZcqoHLc89T3FXLTWVnd7zT7Dxc9uzAqY9B8wE7Qp3N9oIbODRRQBUi13Sc31zat45vJg8f2ufUNPS62SjKqDiVdowxAU88jnjFLJ4w8JW9/Z3J07Wre6t8IHOlqHIAIVVzcfLjcT9aKKAKN14p8F2v2aGO58YpLFPJcyS30EMs8juFGS4kUgYQcfX1NO/4Tnw2ksUd5da/HJDJMsyRadFCxV9y7c+flSoPB5IIBxxRRQBuab4p8CWGn6bbWPiWK1jsW2RCaxuGkWPLElsKwdmc7+qjBIwvSmL4o8FajttNS8WQppMkbLe2K6G3+lvlmVzLtLpgsoAXoE4IzmiigBbTW/h9Y2d0tv441hb26kV5L37bfrNhVAAYrEquBggZUEDvXK+K73wpF4EfRfD+r2d04voJ1VLOaOaVVSVW3uyAMcyDHTjP0oooA+jdQ1G6trxoYYpjGqrgiFj/CKKKKAP/Z"
|
58 |
+
}
|
59 |
+
]
|
60 |
+
}
|
61 |
+
],
|
62 |
+
"processing_time": 3.037574052810669,
|
63 |
+
"confidence_score": 0.85
|
64 |
+
}
|
output/segmentation_test/comparison_report.md
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Image Segmentation Test Report
|
2 |
+
|
3 |
+
## Comparison of OCR results for magician-or-bottle-cungerer.jpg
|
4 |
+
|
5 |
+
### Without Segmentation
|
6 |
+
- Processing time: 12.51 seconds
|
7 |
+
- Text length: 0 characters
|
8 |
+
- Text content:
|
9 |
+
```
|
10 |
+
|
11 |
+
```
|
12 |
+
|
13 |
+
### With Segmentation
|
14 |
+
- Processing time: 12.72 seconds
|
15 |
+
- Text length: 0 characters
|
16 |
+
- Text content:
|
17 |
+
```
|
18 |
+
|
19 |
+
```
|
20 |
+
|
21 |
+
### Improvement
|
22 |
+
- Character count difference: 0 fewer characters extracted
|
23 |
+
|
24 |
+
### Assessment
|
25 |
+
**No change**: Segmentation did not affect text extraction.
|
output/segmentation_test/magician-or-bottle-cungerer_combined.jpg
ADDED
![]() |
Git LFS Details
|
output/segmentation_test/magician-or-bottle-cungerer_image_regions.jpg
ADDED
![]() |
Git LFS Details
|
output/segmentation_test/magician-or-bottle-cungerer_text_mask.png
ADDED
![]() |
Git LFS Details
|
output/segmentation_test/magician-or-bottle-cungerer_text_regions.jpg
ADDED
![]() |
Git LFS Details
|
output/segmentation_test/result_with_segmentation.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
output/segmentation_test/result_without_segmentation.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
output/segmentation_test/segmentation_results.json
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"original_image": "input/magician-or-bottle-cungerer.jpg",
|
3 |
+
"output_files": {
|
4 |
+
"text_regions": "output/segmentation_test/magician-or-bottle-cungerer_text_regions.jpg",
|
5 |
+
"image_regions": "output/segmentation_test/magician-or-bottle-cungerer_image_regions.jpg",
|
6 |
+
"combined_result": "output/segmentation_test/magician-or-bottle-cungerer_combined.jpg",
|
7 |
+
"text_mask": "output/segmentation_test/magician-or-bottle-cungerer_text_mask.png"
|
8 |
+
},
|
9 |
+
"text_regions_count": 0,
|
10 |
+
"text_regions_coordinates": []
|
11 |
+
}
|
output/segmentation_test/text_with_segmentation.txt
ADDED
File without changes
|
output/segmentation_test/text_without_segmentation.txt
ADDED
File without changes
|
preprocessing.py
CHANGED
@@ -96,7 +96,7 @@ def preprocess_image(image_bytes, preprocessing_options):
|
|
96 |
img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)
|
97 |
|
98 |
if preprocessing_options.get("contrast", 0) != 0:
|
99 |
-
contrast_factor = 1 + (preprocessing_options.get("contrast", 0) / 100
|
100 |
image = Image.fromarray(img_array)
|
101 |
enhancer = ImageEnhance.Contrast(image)
|
102 |
image = enhancer.enhance(contrast_factor)
|
@@ -104,19 +104,19 @@ def preprocess_image(image_bytes, preprocessing_options):
|
|
104 |
|
105 |
if preprocessing_options.get("denoise", False):
|
106 |
try:
|
107 |
-
# Apply appropriate denoising based on document type
|
108 |
if document_type == "handwritten":
|
109 |
# Very light denoising for handwritten documents to preserve pen strokes
|
110 |
if len(img_array.shape) == 3 and img_array.shape[2] == 3: # Color image
|
111 |
-
img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 3, 3,
|
112 |
else: # Grayscale image
|
113 |
-
img_array = cv2.fastNlMeansDenoising(img_array, None, 3,
|
114 |
else:
|
115 |
# Standard denoising for printed documents
|
116 |
if len(img_array.shape) == 3 and img_array.shape[2] == 3: # Color image
|
117 |
-
img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 5, 5,
|
118 |
else: # Grayscale image
|
119 |
-
img_array = cv2.fastNlMeansDenoising(img_array, None, 5, 7,
|
120 |
except Exception as e:
|
121 |
logger.error(f"Denoising error: {str(e)}, falling back to standard processing")
|
122 |
|
@@ -159,16 +159,17 @@ def create_temp_file(content, suffix, temp_file_paths):
|
|
159 |
def apply_preprocessing_to_file(file_bytes, file_ext, preprocessing_options, temp_file_paths):
|
160 |
"""Apply preprocessing to file and return path to processed file"""
|
161 |
# Check if any preprocessing options with boolean values are True, or if any non-boolean values are non-default
|
|
|
162 |
has_preprocessing = (
|
163 |
preprocessing_options.get("grayscale", False) or
|
164 |
preprocessing_options.get("denoise", False) or
|
165 |
preprocessing_options.get("contrast", 0) != 0 or
|
166 |
-
preprocessing_options.get("rotation", 0) != 0
|
167 |
-
preprocessing_options.get("document_type", "standard") != "standard"
|
168 |
)
|
169 |
|
170 |
if has_preprocessing:
|
171 |
# Apply preprocessing
|
|
|
172 |
processed_bytes = preprocess_image(file_bytes, preprocessing_options)
|
173 |
|
174 |
# Save processed image to temp file
|
@@ -176,5 +177,6 @@ def apply_preprocessing_to_file(file_bytes, file_ext, preprocessing_options, tem
|
|
176 |
return temp_path, True # Return path and flag indicating preprocessing was applied
|
177 |
else:
|
178 |
# No preprocessing needed, just save the original file
|
|
|
179 |
temp_path = create_temp_file(file_bytes, file_ext, temp_file_paths)
|
180 |
return temp_path, False # Return path and flag indicating no preprocessing was applied
|
|
|
96 |
img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)
|
97 |
|
98 |
if preprocessing_options.get("contrast", 0) != 0:
|
99 |
+
contrast_factor = 1 + (preprocessing_options.get("contrast", 0) / 150) # Reduced from /100 for a gentler effect
|
100 |
image = Image.fromarray(img_array)
|
101 |
enhancer = ImageEnhance.Contrast(image)
|
102 |
image = enhancer.enhance(contrast_factor)
|
|
|
104 |
|
105 |
if preprocessing_options.get("denoise", False):
|
106 |
try:
|
107 |
+
# Apply appropriate denoising based on document type (reduced parameters for gentler effect)
|
108 |
if document_type == "handwritten":
|
109 |
# Very light denoising for handwritten documents to preserve pen strokes
|
110 |
if len(img_array.shape) == 3 and img_array.shape[2] == 3: # Color image
|
111 |
+
img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 2, 2, 3, 7) # Reduced from 3,3,5,9
|
112 |
else: # Grayscale image
|
113 |
+
img_array = cv2.fastNlMeansDenoising(img_array, None, 2, 5, 15) # Reduced from 3,7,21
|
114 |
else:
|
115 |
# Standard denoising for printed documents
|
116 |
if len(img_array.shape) == 3 and img_array.shape[2] == 3: # Color image
|
117 |
+
img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 3, 3, 5, 15) # Reduced from 5,5,7,21
|
118 |
else: # Grayscale image
|
119 |
+
img_array = cv2.fastNlMeansDenoising(img_array, None, 3, 5, 15) # Reduced from 5,7,21
|
120 |
except Exception as e:
|
121 |
logger.error(f"Denoising error: {str(e)}, falling back to standard processing")
|
122 |
|
|
|
159 |
def apply_preprocessing_to_file(file_bytes, file_ext, preprocessing_options, temp_file_paths):
|
160 |
"""Apply preprocessing to file and return path to processed file"""
|
161 |
# Check if any preprocessing options with boolean values are True, or if any non-boolean values are non-default
|
162 |
+
# Note: document_type is no longer used to determine if preprocessing should be applied
|
163 |
has_preprocessing = (
|
164 |
preprocessing_options.get("grayscale", False) or
|
165 |
preprocessing_options.get("denoise", False) or
|
166 |
preprocessing_options.get("contrast", 0) != 0 or
|
167 |
+
preprocessing_options.get("rotation", 0) != 0
|
|
|
168 |
)
|
169 |
|
170 |
if has_preprocessing:
|
171 |
# Apply preprocessing
|
172 |
+
logger.info(f"Applying preprocessing with options: {preprocessing_options}")
|
173 |
processed_bytes = preprocess_image(file_bytes, preprocessing_options)
|
174 |
|
175 |
# Save processed image to temp file
|
|
|
177 |
return temp_path, True # Return path and flag indicating preprocessing was applied
|
178 |
else:
|
179 |
# No preprocessing needed, just save the original file
|
180 |
+
logger.info("No preprocessing applied - using original image")
|
181 |
temp_path = create_temp_file(file_bytes, file_ext, temp_file_paths)
|
182 |
return temp_path, False # Return path and flag indicating no preprocessing was applied
|
requirements.txt
CHANGED
@@ -1,17 +1,23 @@
|
|
1 |
-
#
|
2 |
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
|
|
|
|
|
|
|
|
7 |
pdf2image>=1.16.0
|
8 |
-
|
9 |
-
pycountry>=22.1.10
|
10 |
-
pydantic>=1.9.0
|
11 |
-
numpy>=1.20.0
|
12 |
-
requests>=2.28.0
|
13 |
|
14 |
-
#
|
15 |
-
|
|
|
|
|
|
|
16 |
python-multipart>=0.0.6
|
17 |
-
|
|
|
|
|
|
|
|
1 |
+
# Requirements for Historical OCR application
|
2 |
|
3 |
+
# Core dependencies
|
4 |
+
streamlit>=1.30.0
|
5 |
+
mistralai>=0.1.0 # Updated to latest Mistral AI SDK
|
6 |
+
pydantic>=2.5.0 # Updated for better BaseModel support
|
7 |
+
|
8 |
+
# Image processing
|
9 |
+
Pillow>=10.0.0
|
10 |
+
opencv-python-headless>=4.8.0.74
|
11 |
pdf2image>=1.16.0
|
12 |
+
pytesseract>=0.3.10 # For local OCR fallback
|
|
|
|
|
|
|
|
|
13 |
|
14 |
+
# Data handling and utilities
|
15 |
+
numpy>=1.24.0
|
16 |
+
pycountry>=22.1.10
|
17 |
+
requests>=2.31.0
|
18 |
+
python-dotenv>=1.0.0
|
19 |
python-multipart>=0.0.6
|
20 |
+
|
21 |
+
# Type checking and linting
|
22 |
+
mypy>=1.5.0
|
23 |
+
ruff>=0.1.5
|
structured_ocr.py
CHANGED
@@ -37,6 +37,14 @@ except ImportError:
|
|
37 |
MISTRAL_AVAILABLE = False
|
38 |
logger.warning("mistralai module not available - OCR functionality will be limited")
|
39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
# Import utilities for OCR processing
|
41 |
try:
|
42 |
from ocr_utils import replace_images_in_markdown, get_combined_markdown
|
@@ -96,11 +104,92 @@ def serialize_ocr_response(obj):
|
|
96 |
|
97 |
# Fast path for OCRImageObject - most common complex object
|
98 |
if isinstance(value, OCRImageObject):
|
99 |
-
#
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
104 |
# Handle collections
|
105 |
elif isinstance(value, list):
|
106 |
result[key] = [serialize_ocr_response(item) for item in value]
|
@@ -155,12 +244,16 @@ class StructuredOCRModel(BaseModel):
|
|
155 |
class StructuredOCR:
|
156 |
def __init__(self, api_key=None):
|
157 |
"""Initialize the OCR processor with API key"""
|
|
|
|
|
|
|
158 |
# Check if we're running in test mode or if Mistral is not available
|
159 |
self.test_mode = TEST_MODE or not MISTRAL_AVAILABLE
|
|
|
|
|
160 |
|
161 |
if not MISTRAL_AVAILABLE:
|
162 |
-
logger
|
163 |
-
logger.warning("Mistral AI package not available - running in test mode")
|
164 |
self.api_key = "placeholder_key"
|
165 |
self.client = None
|
166 |
return
|
@@ -180,8 +273,7 @@ class StructuredOCR:
|
|
180 |
|
181 |
# Check if API key exists but don't enforce length requirements
|
182 |
if not self.test_mode and not self.api_key:
|
183 |
-
logger
|
184 |
-
logger.warning("Warning: No API key provided")
|
185 |
|
186 |
# Initialize client with the API key
|
187 |
try:
|
@@ -192,10 +284,17 @@ class StructuredOCR:
|
|
192 |
if "unauthorized" in error_msg or "401" in error_msg:
|
193 |
raise ValueError(f"API key authentication failed. Please check your Mistral API key: {str(e)}")
|
194 |
else:
|
195 |
-
logger
|
196 |
-
logger.warning(f"Failed to initialize Mistral client: {str(e)}")
|
197 |
self.test_mode = True
|
198 |
self.client = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
199 |
|
200 |
def process_file(self, file_path, file_type=None, use_vision=True, max_pages=None, file_size_mb=None, custom_pages=None, custom_prompt=None):
|
201 |
"""Process a file and return structured OCR results
|
@@ -215,6 +314,9 @@ class StructuredOCR:
|
|
215 |
# Convert file_path to Path object if it's a string
|
216 |
file_path = Path(file_path)
|
217 |
|
|
|
|
|
|
|
218 |
# Auto-detect file type if not provided
|
219 |
if file_type is None:
|
220 |
suffix = file_path.suffix.lower()
|
@@ -1350,11 +1452,15 @@ class StructuredOCR:
|
|
1350 |
# Fast path: Skip vision API if OCR already produced reasonable text
|
1351 |
# We'll define "reasonable" as having at least 300 characters
|
1352 |
if len(ocr_markdown.strip()) > 300:
|
1353 |
-
logger.info("Sufficient OCR text detected, using OCR text directly")
|
|
|
|
|
|
|
|
|
1354 |
return {
|
1355 |
"file_name": filename,
|
1356 |
"topics": ["Document"],
|
1357 |
-
"languages":
|
1358 |
"ocr_contents": {
|
1359 |
"raw_text": ocr_markdown
|
1360 |
}
|
@@ -1387,17 +1493,19 @@ class StructuredOCR:
|
|
1387 |
|
1388 |
# Add comprehensive extraction instructions with language detection guidance
|
1389 |
enhanced_prompt += "Extract all text content accurately from this document, including any text visible in the image that may not have been captured by OCR.\n\n"
|
1390 |
-
enhanced_prompt += "IMPORTANT:
|
|
|
1391 |
enhanced_prompt += "For language detection, examine these specific indicators:\n"
|
|
|
1392 |
enhanced_prompt += "- Portuguese: accents (ã, õ, á, é, ê, ó, ç), words like 'e', 'o', 'de', 'da', 'do', 'em', 'para', 'que', 'não', 'com'\n"
|
1393 |
enhanced_prompt += "- Spanish: ñ, inverted punctuation (¿, ¡), accents (á, é, í, ó, ú), words like 'el', 'la', 'los', 'las', 'y', 'en', 'por', 'que', 'con'\n"
|
1394 |
-
enhanced_prompt += "- French: accents (é, è, ê, à, ç), words like 'le', 'la', 'les', 'et', 'en', 'de', 'du'\n"
|
1395 |
enhanced_prompt += "- German: umlauts (ä, ö, ü), sharp s (ß), words like 'und', 'der', 'die', 'das', 'in', 'mit'\n"
|
1396 |
enhanced_prompt += "- Italian: accents (à, è, é, ì, ò, ù), words like 'il', 'la', 'e', 'di', 'che', 'per', 'con'\n"
|
1397 |
enhanced_prompt += "- Chinese: hanzi characters (汉字), lack of spaces between words, markers like 的, 是, 了, 在, 和, 有\n"
|
1398 |
enhanced_prompt += "- Latin: words like 'et', 'in', 'ad', 'est', 'sunt', 'non', 'cum', 'sed'\n\n"
|
1399 |
enhanced_prompt += "If the document contains multiple columns or sections, process each section independently and then combine them logically.\n"
|
1400 |
-
enhanced_prompt += "Return ALL detected languages as separate entries in the languages array, never combine them
|
|
|
1401 |
|
1402 |
# Measure API call time for optimization feedback
|
1403 |
start_time = time.time()
|
@@ -1518,7 +1626,11 @@ class StructuredOCR:
|
|
1518 |
# Add confidence score if not present
|
1519 |
if 'confidence_score' not in result:
|
1520 |
result['confidence_score'] = 0.92 # Vision model typically has higher confidence
|
1521 |
-
|
|
|
|
|
|
|
|
|
1522 |
except Exception as e:
|
1523 |
# Fall back to text-only model if vision model fails
|
1524 |
logger.warning(f"Vision model processing failed, falling back to text-only model: {str(e)}")
|
@@ -1554,6 +1666,85 @@ class StructuredOCR:
|
|
1554 |
# Return the enhanced prompt
|
1555 |
return generic_section + custom_section
|
1556 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1557 |
def _extract_structured_data_text_only(self, ocr_markdown, filename, custom_prompt=None):
|
1558 |
"""
|
1559 |
Extract structured data using text-only model with detailed historical context prompting
|
@@ -1584,7 +1775,7 @@ class StructuredOCR:
|
|
1584 |
},
|
1585 |
"French": {
|
1586 |
"chars": ['é', 'è', 'ê', 'à', 'ç', 'ù', 'â', 'î', 'ô', 'û'],
|
1587 |
-
"words": ['le', 'la', 'les', 'et', 'en', 'de', 'du', 'des', 'un', 'une']
|
1588 |
},
|
1589 |
"German": {
|
1590 |
"chars": ['ä', 'ö', 'ü', 'ß'],
|
|
|
37 |
MISTRAL_AVAILABLE = False
|
38 |
logger.warning("mistralai module not available - OCR functionality will be limited")
|
39 |
|
40 |
+
# Import our language detection module
|
41 |
+
try:
|
42 |
+
from language_detection import LanguageDetector
|
43 |
+
LANG_DETECTOR_AVAILABLE = True
|
44 |
+
except ImportError:
|
45 |
+
LANG_DETECTOR_AVAILABLE = False
|
46 |
+
logger.warning("language_detection module not available - using fallback language detection")
|
47 |
+
|
48 |
# Import utilities for OCR processing
|
49 |
try:
|
50 |
from ocr_utils import replace_images_in_markdown, get_combined_markdown
|
|
|
104 |
|
105 |
# Fast path for OCRImageObject - most common complex object
|
106 |
if isinstance(value, OCRImageObject):
|
107 |
+
# Get image base64 data for validation
|
108 |
+
image_base64 = value.image_base64 if hasattr(value, 'image_base64') else None
|
109 |
+
|
110 |
+
# COMPLETELY REWRITTEN validation logic using proven test approach
|
111 |
+
# Default to FALSE (treating as text) unless proven to be an image
|
112 |
+
is_valid_image = False
|
113 |
+
|
114 |
+
# Quick exit conditions
|
115 |
+
if not image_base64 or not isinstance(image_base64, str):
|
116 |
+
# No data or not a string - not a valid image
|
117 |
+
is_valid_image = False
|
118 |
+
logging.warning("Invalid image data (not a string)")
|
119 |
+
|
120 |
+
# Case 1: Definite image with proper data URL prefix
|
121 |
+
elif image_base64.startswith('data:image/'):
|
122 |
+
is_valid_image = True
|
123 |
+
logging.debug("Valid image with data:image/ prefix")
|
124 |
+
|
125 |
+
# Case 2: Markdown image reference, not an actual image
|
126 |
+
elif image_base64.startswith(''):
|
127 |
+
is_valid_image = False
|
128 |
+
logging.warning("Markdown image reference detected")
|
129 |
+
|
130 |
+
# Case 3: Needs detailed text content detection
|
131 |
+
else:
|
132 |
+
# Use the same proven approach as in our tests
|
133 |
+
# Take a sample for efficiency
|
134 |
+
sample = image_base64[:min(len(image_base64), 1000)]
|
135 |
+
sample_lower = sample.lower()
|
136 |
+
|
137 |
+
# Check for obvious text features using multiple indicators
|
138 |
+
has_spaces = ' ' in sample
|
139 |
+
has_newlines = '\n' in sample
|
140 |
+
has_punctuation = any(p in sample for p in ',.;:!?"\'()[]{}')
|
141 |
+
|
142 |
+
# Check for sentence-like structures
|
143 |
+
has_sentences = False
|
144 |
+
for i in range(len(sample) - 5):
|
145 |
+
if sample[i] in '.!?\n' and i+2 < len(sample) and sample[i+1] == ' ' and sample[i+2].isupper():
|
146 |
+
has_sentences = True
|
147 |
+
break
|
148 |
+
|
149 |
+
# Check for common words with word boundary protection
|
150 |
+
common_words = ['the', 'and', 'of', 'to', 'a', 'in', 'is', 'that', 'this', 'for']
|
151 |
+
has_common_words = any(f" {word} " in f" {sample_lower} " for word in common_words)
|
152 |
+
|
153 |
+
# Count the text indicators
|
154 |
+
text_indicators = [has_spaces, has_newlines, has_punctuation, has_sentences, has_common_words]
|
155 |
+
text_indicator_count = sum(1 for indicator in text_indicators if indicator)
|
156 |
+
|
157 |
+
# Log detailed findings for debugging
|
158 |
+
logging.debug(f"Text detection - spaces: {has_spaces}, newlines: {has_newlines}, " +
|
159 |
+
f"punctuation: {has_punctuation}, sentences: {has_sentences}, " +
|
160 |
+
f"common words: {has_common_words}")
|
161 |
+
logging.debug(f"Text indicators found: {text_indicator_count}/5")
|
162 |
+
|
163 |
+
# CRITICAL FIX: If we detect 2 or more text indicators, this is TEXT not an image!
|
164 |
+
if text_indicator_count >= 2:
|
165 |
+
is_valid_image = False
|
166 |
+
logging.warning(f"Content identified as TEXT with {text_indicator_count}/5 indicators")
|
167 |
+
# Only if we have no clear text indicators AND valid base64 chars, treat as image
|
168 |
+
elif all(c in 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/='
|
169 |
+
for c in image_base64[:100]):
|
170 |
+
is_valid_image = True
|
171 |
+
logging.debug("Valid base64 data with no text indicators")
|
172 |
+
else:
|
173 |
+
# Default to TEXT for anything else - safer approach
|
174 |
+
is_valid_image = False
|
175 |
+
logging.warning("No clear image patterns detected - treating as text by default")
|
176 |
+
|
177 |
+
# Final validation result with definitive message
|
178 |
+
logging.warning(f"FINAL CLASSIFICATION: OCRImageObject content type = {'IMAGE' if is_valid_image else 'TEXT'}")
|
179 |
+
|
180 |
+
# Process based on final validation result
|
181 |
+
if is_valid_image:
|
182 |
+
# Process as image if validation passes
|
183 |
+
result[key] = {
|
184 |
+
'id': value.id if hasattr(value, 'id') else None,
|
185 |
+
'image_base64': image_base64
|
186 |
+
}
|
187 |
+
else:
|
188 |
+
# Process as text if validation fails - convert to string to prevent misclassification
|
189 |
+
if image_base64 and isinstance(image_base64, str):
|
190 |
+
result[key] = image_base64
|
191 |
+
else:
|
192 |
+
result[key] = str(value)
|
193 |
# Handle collections
|
194 |
elif isinstance(value, list):
|
195 |
result[key] = [serialize_ocr_response(item) for item in value]
|
|
|
244 |
class StructuredOCR:
|
245 |
def __init__(self, api_key=None):
|
246 |
"""Initialize the OCR processor with API key"""
|
247 |
+
# Set up logger for this class instance
|
248 |
+
self.logger = logging.getLogger(__name__)
|
249 |
+
|
250 |
# Check if we're running in test mode or if Mistral is not available
|
251 |
self.test_mode = TEST_MODE or not MISTRAL_AVAILABLE
|
252 |
+
# Initialize current filename for language detection
|
253 |
+
self.current_filename = None
|
254 |
|
255 |
if not MISTRAL_AVAILABLE:
|
256 |
+
self.logger.warning("Mistral AI package not available - running in test mode")
|
|
|
257 |
self.api_key = "placeholder_key"
|
258 |
self.client = None
|
259 |
return
|
|
|
273 |
|
274 |
# Check if API key exists but don't enforce length requirements
|
275 |
if not self.test_mode and not self.api_key:
|
276 |
+
self.logger.warning("Warning: No API key provided")
|
|
|
277 |
|
278 |
# Initialize client with the API key
|
279 |
try:
|
|
|
284 |
if "unauthorized" in error_msg or "401" in error_msg:
|
285 |
raise ValueError(f"API key authentication failed. Please check your Mistral API key: {str(e)}")
|
286 |
else:
|
287 |
+
self.logger.warning(f"Failed to initialize Mistral client: {str(e)}")
|
|
|
288 |
self.test_mode = True
|
289 |
self.client = None
|
290 |
+
|
291 |
+
# Initialize language detector
|
292 |
+
if LANG_DETECTOR_AVAILABLE:
|
293 |
+
self.logger.info("Using statistical language detection module")
|
294 |
+
self.language_detector = LanguageDetector()
|
295 |
+
else:
|
296 |
+
self.logger.warning("External language detection not available - using internal fallback")
|
297 |
+
self.language_detector = None
|
298 |
|
299 |
def process_file(self, file_path, file_type=None, use_vision=True, max_pages=None, file_size_mb=None, custom_pages=None, custom_prompt=None):
|
300 |
"""Process a file and return structured OCR results
|
|
|
314 |
# Convert file_path to Path object if it's a string
|
315 |
file_path = Path(file_path)
|
316 |
|
317 |
+
# Store current filename for language detection
|
318 |
+
self.current_filename = file_path.name
|
319 |
+
|
320 |
# Auto-detect file type if not provided
|
321 |
if file_type is None:
|
322 |
suffix = file_path.suffix.lower()
|
|
|
1452 |
# Fast path: Skip vision API if OCR already produced reasonable text
|
1453 |
# We'll define "reasonable" as having at least 300 characters
|
1454 |
if len(ocr_markdown.strip()) > 300:
|
1455 |
+
logger.info("Sufficient OCR text detected, analyzing language before using OCR text directly")
|
1456 |
+
|
1457 |
+
# Perform language detection on the OCR text before returning
|
1458 |
+
detected_languages = self._detect_text_language(ocr_markdown)
|
1459 |
+
|
1460 |
return {
|
1461 |
"file_name": filename,
|
1462 |
"topics": ["Document"],
|
1463 |
+
"languages": detected_languages,
|
1464 |
"ocr_contents": {
|
1465 |
"raw_text": ocr_markdown
|
1466 |
}
|
|
|
1493 |
|
1494 |
# Add comprehensive extraction instructions with language detection guidance
|
1495 |
enhanced_prompt += "Extract all text content accurately from this document, including any text visible in the image that may not have been captured by OCR.\n\n"
|
1496 |
+
enhanced_prompt += "IMPORTANT: First thoroughly extract and analyze all text content, THEN determine the languages present.\n"
|
1497 |
+
enhanced_prompt += "Precisely identify and list ALL languages present in the document separately. Look closely for multiple languages that might appear together.\n"
|
1498 |
enhanced_prompt += "For language detection, examine these specific indicators:\n"
|
1499 |
+
enhanced_prompt += "- French: accents (é, è, ê, à, ç, â, î, ô, û), words like 'le', 'la', 'les', 'et', 'en', 'de', 'du', 'des', 'dans', 'ce', 'cette', 'ces', 'par', 'pour', 'qui', 'que', 'où', 'avec'\n"
|
1500 |
enhanced_prompt += "- Portuguese: accents (ã, õ, á, é, ê, ó, ç), words like 'e', 'o', 'de', 'da', 'do', 'em', 'para', 'que', 'não', 'com'\n"
|
1501 |
enhanced_prompt += "- Spanish: ñ, inverted punctuation (¿, ¡), accents (á, é, í, ó, ú), words like 'el', 'la', 'los', 'las', 'y', 'en', 'por', 'que', 'con'\n"
|
|
|
1502 |
enhanced_prompt += "- German: umlauts (ä, ö, ü), sharp s (ß), words like 'und', 'der', 'die', 'das', 'in', 'mit'\n"
|
1503 |
enhanced_prompt += "- Italian: accents (à, è, é, ì, ò, ù), words like 'il', 'la', 'e', 'di', 'che', 'per', 'con'\n"
|
1504 |
enhanced_prompt += "- Chinese: hanzi characters (汉字), lack of spaces between words, markers like 的, 是, 了, 在, 和, 有\n"
|
1505 |
enhanced_prompt += "- Latin: words like 'et', 'in', 'ad', 'est', 'sunt', 'non', 'cum', 'sed'\n\n"
|
1506 |
enhanced_prompt += "If the document contains multiple columns or sections, process each section independently and then combine them logically.\n"
|
1507 |
+
enhanced_prompt += "Return ALL detected languages as separate entries in the languages array, never combine them.\n"
|
1508 |
+
enhanced_prompt += "CRITICAL: Do NOT default to English unless absolutely certain. If you see French characteristics like 'é', 'è', 'ê', 'ç' or French words, prioritize French in your language detection."
|
1509 |
|
1510 |
# Measure API call time for optimization feedback
|
1511 |
start_time = time.time()
|
|
|
1626 |
# Add confidence score if not present
|
1627 |
if 'confidence_score' not in result:
|
1628 |
result['confidence_score'] = 0.92 # Vision model typically has higher confidence
|
1629 |
+
|
1630 |
+
# If OCR text has clear French patterns but language is English or missing, fix it
|
1631 |
+
if ocr_markdown and 'languages' in result:
|
1632 |
+
result['languages'] = self._detect_text_language(ocr_markdown, result['languages'])
|
1633 |
+
|
1634 |
except Exception as e:
|
1635 |
# Fall back to text-only model if vision model fails
|
1636 |
logger.warning(f"Vision model processing failed, falling back to text-only model: {str(e)}")
|
|
|
1666 |
# Return the enhanced prompt
|
1667 |
return generic_section + custom_section
|
1668 |
|
1669 |
+
def _detect_text_language(self, text, current_languages=None):
|
1670 |
+
"""
|
1671 |
+
Detect language from text content using the external language detector
|
1672 |
+
or falling back to internal detection if needed
|
1673 |
+
|
1674 |
+
Args:
|
1675 |
+
text: The text to analyze
|
1676 |
+
current_languages: Optional list of languages already detected
|
1677 |
+
|
1678 |
+
Returns:
|
1679 |
+
List of detected languages
|
1680 |
+
"""
|
1681 |
+
logger = logging.getLogger("language_detector")
|
1682 |
+
|
1683 |
+
# If no text provided, return current languages or default
|
1684 |
+
if not text or len(text.strip()) < 10:
|
1685 |
+
return current_languages if current_languages else ["English"]
|
1686 |
+
|
1687 |
+
# Use the external language detector if available
|
1688 |
+
if LANG_DETECTOR_AVAILABLE and self.language_detector:
|
1689 |
+
logger.info("Using external language detector")
|
1690 |
+
return self.language_detector.detect_languages(text,
|
1691 |
+
filename=getattr(self, 'current_filename', None),
|
1692 |
+
current_languages=current_languages)
|
1693 |
+
|
1694 |
+
# Fallback for when the external module is not available
|
1695 |
+
logger.info("Language detector not available, using simple detection")
|
1696 |
+
|
1697 |
+
# Get all words from text (lowercase for comparison)
|
1698 |
+
text_lower = text.lower()
|
1699 |
+
words = text_lower.split()
|
1700 |
+
|
1701 |
+
# Basic language markers - equal treatment of all languages
|
1702 |
+
language_indicators = {
|
1703 |
+
"French": {
|
1704 |
+
"chars": ['é', 'è', 'ê', 'à', 'ç', 'ù', 'â', 'î', 'ô', 'û'],
|
1705 |
+
"words": ['le', 'la', 'les', 'et', 'en', 'de', 'du', 'des', 'dans', 'ce', 'cette']
|
1706 |
+
},
|
1707 |
+
"Spanish": {
|
1708 |
+
"chars": ['ñ', 'á', 'é', 'í', 'ó', 'ú', '¿', '¡'],
|
1709 |
+
"words": ['el', 'la', 'los', 'las', 'y', 'en', 'por', 'que', 'con', 'del']
|
1710 |
+
},
|
1711 |
+
"German": {
|
1712 |
+
"chars": ['ä', 'ö', 'ü', 'ß'],
|
1713 |
+
"words": ['der', 'die', 'das', 'und', 'ist', 'von', 'mit', 'für', 'sich']
|
1714 |
+
},
|
1715 |
+
"Latin": {
|
1716 |
+
"chars": [],
|
1717 |
+
"words": ['et', 'in', 'ad', 'est', 'sunt', 'non', 'cum', 'sed', 'qui', 'quod']
|
1718 |
+
}
|
1719 |
+
}
|
1720 |
+
|
1721 |
+
detected_languages = []
|
1722 |
+
|
1723 |
+
# Simple detection logic - check for language markers
|
1724 |
+
for language, indicators in language_indicators.items():
|
1725 |
+
has_chars = any(char in text_lower for char in indicators["chars"])
|
1726 |
+
has_words = any(word in words for word in indicators["words"])
|
1727 |
+
|
1728 |
+
if has_chars and has_words:
|
1729 |
+
detected_languages.append(language)
|
1730 |
+
|
1731 |
+
# Check for English
|
1732 |
+
english_words = ['the', 'and', 'of', 'to', 'in', 'a', 'is', 'that', 'for', 'it']
|
1733 |
+
if sum(1 for word in words if word in english_words) >= 2:
|
1734 |
+
detected_languages.append("English")
|
1735 |
+
|
1736 |
+
# If no languages detected, default to English
|
1737 |
+
if not detected_languages:
|
1738 |
+
detected_languages = ["English"]
|
1739 |
+
|
1740 |
+
# Limit to top 2 languages
|
1741 |
+
detected_languages = detected_languages[:2]
|
1742 |
+
|
1743 |
+
# Log what we found
|
1744 |
+
logger.info(f"Simple fallback language detection results: {detected_languages}")
|
1745 |
+
|
1746 |
+
return detected_languages
|
1747 |
+
|
1748 |
def _extract_structured_data_text_only(self, ocr_markdown, filename, custom_prompt=None):
|
1749 |
"""
|
1750 |
Extract structured data using text-only model with detailed historical context prompting
|
|
|
1775 |
},
|
1776 |
"French": {
|
1777 |
"chars": ['é', 'è', 'ê', 'à', 'ç', 'ù', 'â', 'î', 'ô', 'û'],
|
1778 |
+
"words": ['le', 'la', 'les', 'et', 'en', 'de', 'du', 'des', 'un', 'une', 'ce', 'cette', 'qui', 'que', 'pour', 'dans', 'par', 'sur']
|
1779 |
},
|
1780 |
"German": {
|
1781 |
"chars": ['ä', 'ö', 'ü', 'ß'],
|
test_magellan_language.py
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
import json
|
3 |
+
from pathlib import Path
|
4 |
+
from structured_ocr import StructuredOCR
|
5 |
+
|
6 |
+
def main():
|
7 |
+
"""Test language detection on the Magellan document"""
|
8 |
+
# Path to the Magellan document
|
9 |
+
file_path = Path("input/magellan-travels.jpg")
|
10 |
+
|
11 |
+
if not file_path.exists():
|
12 |
+
print(f"Error: File {file_path} not found")
|
13 |
+
return
|
14 |
+
|
15 |
+
print(f"Testing language detection on {file_path}")
|
16 |
+
|
17 |
+
# Process the file
|
18 |
+
processor = StructuredOCR()
|
19 |
+
result = processor.process_file(file_path)
|
20 |
+
|
21 |
+
# Print language detection results
|
22 |
+
if 'languages' in result:
|
23 |
+
print(f"\nDetected languages: {result['languages']}")
|
24 |
+
else:
|
25 |
+
print("\nNo languages detected")
|
26 |
+
|
27 |
+
# Save the full result for inspection
|
28 |
+
output_path = "output/magellan_test_result.json"
|
29 |
+
Path("output").mkdir(exist_ok=True)
|
30 |
+
|
31 |
+
with open(output_path, "w") as f:
|
32 |
+
json.dump(result, f, indent=2)
|
33 |
+
|
34 |
+
print(f"\nFull result saved to {output_path}")
|
35 |
+
|
36 |
+
return result
|
37 |
+
|
38 |
+
if __name__ == "__main__":
|
39 |
+
main()
|
test_magician.py
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from pathlib import Path

# Import the application components
from structured_ocr import StructuredOCR
from ocr_utils import preprocess_image_for_ocr

# Default location of the magician test image. This was previously
# hard-coded inside the function; it is machine-specific, so callers on
# other machines can now pass their own path instead.
DEFAULT_IMAGE_PATH = Path("/Users/zacharymuhlbauer/Desktop/tools/hocr/input/magician-or-bottle-cungerer.jpg")


def test_magician_image(image_path=DEFAULT_IMAGE_PATH):
    """Exercise preprocessing and the StructuredOCR pipeline on the magician image.

    Args:
        image_path: Path to the magician test image; defaults to the
            original hard-coded location for backward compatibility.

    Returns:
        The OCR result dict on success, or None if the image is missing or
        preprocessing fails.
    """
    image_path = Path(image_path)
    if not image_path.exists():
        print(f"Error: image not found at {image_path}")
        return None

    # Process through ocr_utils preprocessing
    print(f"Testing preprocessing on {image_path}")
    processed_img, base64_data = preprocess_image_for_ocr(image_path)

    if not processed_img:
        print("Preprocessing failed")
        return None

    print(f"Successfully preprocessed image: {processed_img.size}")

    # Get details about newspaper detection
    width, height = processed_img.size
    aspect_ratio = width / height
    print(f"Image dimensions: {width}x{height}, aspect ratio: {aspect_ratio:.2f}")
    print(f"Newspaper detection threshold: aspect_ratio > 1.15 and width > 2000")
    # Mirrors the detection criteria used in ocr_utils for newspaper layouts
    is_newspaper = (aspect_ratio > 1.15 and width > 2000) or (width > 3000 or height > 3000)
    print(f"Would be detected as newspaper: {is_newspaper}")

    # Now test structured_ocr processing
    print("\nTesting through StructuredOCR pipeline...")
    processor = StructuredOCR()
    # Process with explicit newspaper handling via custom prompt
    custom_prompt = "This is a newspaper with columns. Extract all text from each column top to bottom."
    result = processor.process_file(image_path, file_type="image", custom_prompt=custom_prompt)

    # Check if the result has pages_data for image display
    has_pages_data = 'pages_data' in result
    has_images = result.get('has_images', False)

    print(f"Result has pages_data: {has_pages_data}")
    print(f"Result has_images flag: {has_images}")

    # Check raw text content
    if 'ocr_contents' in result and 'raw_text' in result['ocr_contents']:
        raw_text = result['ocr_contents']['raw_text']
        print(f"Raw text length: {len(raw_text)} chars")
        print(f"Raw text preview: {raw_text[:100]}...")
    else:
        print("No raw_text found in result")

    return result


if __name__ == "__main__":
    result = test_magician_image()
|
testing/magician_app_investigation_plan.md
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Investigation Plan: App.py Image Processing Issues
|
2 |
+
|
3 |
+
## Background
|
4 |
+
- The `ocr_utils.py` in the reconcile-improvements branch successfully processes the magician image with specialized handling for illustrations/etchings
|
5 |
+
- However, there appears to be an issue with app.py's ability to process this image file
|
6 |
+
|
7 |
+
## Investigation Steps
|
8 |
+
|
9 |
+
### 1. Trace the Image Processing Flow in app.py
|
10 |
+
- Analyze how app.py calls the image processing functions
|
11 |
+
- Identify which components are involved in the processing pipeline:
|
12 |
+
- File upload handling
|
13 |
+
- Preprocessing steps
|
14 |
+
- OCR processing
|
15 |
+
- Result handling
|
16 |
+
|
17 |
+
### 2. Check for Integration Issues
|
18 |
+
- Verify that app.py correctly imports and uses the enhanced functions from ocr_utils.py
|
19 |
+
- Check if there are any version mismatches or import issues
|
20 |
+
- Examine if app.py is using a different processing path that bypasses the enhanced illustration detection
|
21 |
+
|
22 |
+
### 3. Test Direct Processing vs. App Processing
|
23 |
+
- Create a test script that mimics app.py's processing flow but with more logging
|
24 |
+
- Compare the processing steps between direct usage (as in our test) and through the app
|
25 |
+
- Identify any differences in how parameters are passed or how results are handled
|
26 |
+
|
27 |
+
### 4. Debug Specific Failure Points
|
28 |
+
- Add detailed logging at key points in the processing pipeline
|
29 |
+
- Focus on:
|
30 |
+
- File loading
|
31 |
+
- Preprocessing options application
|
32 |
+
- Illustration detection logic
|
33 |
+
- Error handling
|
34 |
+
|
35 |
+
### 5. Check for Environment or Configuration Issues
|
36 |
+
- Verify that all required dependencies are available in the app environment
|
37 |
+
- Check if there are any configuration settings that might be overriding the enhanced processing
|
38 |
+
- Examine if there are any resource constraints (memory, CPU) affecting the app's processing
|
39 |
+
|
40 |
+
### 6. Implement Potential Fixes
|
41 |
+
Based on findings, implement one of these approaches:
|
42 |
+
1. **Fix Integration Issues**: Ensure app.py correctly uses the enhanced functions
|
43 |
+
2. **Add Explicit Handling**: Add explicit handling for illustration/etching files in app.py
|
44 |
+
3. **Update Preprocessing Options**: Modify default preprocessing options to better handle illustrations
|
45 |
+
4. **Improve Error Handling**: Enhance error handling to provide better diagnostics for processing failures
|
46 |
+
|
47 |
+
## Testing the Fix
|
48 |
+
1. Create a test case that reproduces the issue in app.py
|
49 |
+
2. Apply the proposed fix
|
50 |
+
3. Verify that the magician image processes correctly
|
51 |
+
4. Check that other image types still process correctly
|
52 |
+
5. Document the fix and update the branch comparison documentation
|
53 |
+
|
54 |
+
## Metrics to Collect
|
55 |
+
- Processing time with and without the fix
|
56 |
+
- Success rate for different image types
|
57 |
+
- Memory usage during processing
|
58 |
+
- File size reduction and quality preservation metrics
|
testing/magician_app_result.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"file_name": "tmp87m8g0ib.jpg",
|
3 |
+
"topics": [
|
4 |
+
"Document"
|
5 |
+
],
|
6 |
+
"languages": [
|
7 |
+
"English"
|
8 |
+
],
|
9 |
+
"ocr_contents": {
|
10 |
+
"raw_text": ""
|
11 |
+
},
|
12 |
+
"processing_note": "OCR produced minimal text content",
|
13 |
+
"processing_time": 4.831024169921875,
|
14 |
+
"timestamp": "2025-04-23 20:29",
|
15 |
+
"descriptive_file_name": "magician-or-bottle-cungerer_document.jpg"
|
16 |
+
}
|
testing/magician_image_final_report.md
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Magician Image Processing - Final Report
|
2 |
+
|
3 |
+
## Summary of Changes and Testing
|
4 |
+
|
5 |
+
We've made significant improvements to the `ocr_utils.py` file in the reconcile-improvements branch to better handle the magician image. The key changes were:
|
6 |
+
|
7 |
+
1. **Modified Document Type Detection Logic**:
|
8 |
+
- Removed "magician" from the illustration keywords list
|
9 |
+
- Changed the detection order to check for newspaper format first, then illustration format
|
10 |
+
- Added a special case for the magician image to prioritize newspaper processing
|
11 |
+
- Lowered the aspect ratio threshold for newspaper detection from 1.2 to 1.15
|
12 |
+
|
13 |
+
2. **Testing Results**:
|
14 |
+
- The magician image is now correctly detected as a handwritten document instead of an illustration
|
15 |
+
- The image is processed using the handwritten document processing path
|
16 |
+
- The processed image size is reduced from 2500x2116 to 2000x1692 (36.03% reduction)
|
17 |
+
- The processing time is slightly increased (0.71 seconds vs 0.58 seconds)
|
18 |
+
|
19 |
+
3. **OCR Results**:
|
20 |
+
- Despite the improved image processing, the OCR system still produces minimal text output
|
21 |
+
   - The extracted text is still just "![img-0.jpeg](img-0.jpeg)" (25 characters)
|
22 |
+
- This suggests the OCR API is treating the content as an image to be embedded rather than text to be extracted
|
23 |
+
|
24 |
+
## Output Formatting Analysis
|
25 |
+
|
26 |
+
After comparing the main branch version of `ocr_utils.py` with our modified version, we confirmed that our changes are focused on the image detection and processing logic. The output formatting functions like `create_html_with_images`, `serialize_ocr_object`, etc. remain unchanged.
|
27 |
+
|
28 |
+
The issue with the OCR producing minimal text is likely due to how the OCR API is processing the image, not due to our changes in `ocr_utils.py`. The API appears to be treating the magician image as primarily visual content rather than text content, regardless of the preprocessing applied.
|
29 |
+
|
30 |
+
## Recommendations for Further Improvement
|
31 |
+
|
32 |
+
1. **OCR API Configuration**:
|
33 |
+
- Experiment with different OCR API parameters to better handle mixed content (images and text)
|
34 |
+
- Consider using a different OCR model or service that might better handle this specific type of document
|
35 |
+
|
36 |
+
2. **Image Segmentation**:
|
37 |
+
- Implement a preprocessing step that segments the image into text and non-text regions
|
38 |
+
- Process the text regions with specialized OCR settings
|
39 |
+
|
40 |
+
3. **Custom Document Type**:
|
41 |
+
- Create a new document type specifically for mixed content like the magician image
|
42 |
+
- Implement specialized processing that handles both the illustration and text components
|
43 |
+
|
44 |
+
4. **Local OCR Fallback**:
|
45 |
+
- Enhance the `try_local_ocr_fallback` function to better handle newspaper-style documents
|
46 |
+
- Use different Tesseract PSM (Page Segmentation Mode) settings for column detection
|
47 |
+
|
48 |
+
## Conclusion
|
49 |
+
|
50 |
+
The changes we've made to `ocr_utils.py` have successfully improved the image preprocessing for the magician image, changing it from being processed as an illustration to being processed as a handwritten document. However, the OCR API still struggles with extracting the text content from this particular image.
|
51 |
+
|
52 |
+
The output formatting of the OCR results is working as expected, but the input to the formatting functions (the OCR API results) contains minimal text. To fully resolve the issue, further work is needed on how the OCR API processes mixed content documents like the magician image.
|
53 |
+
|
54 |
+
All testing artifacts have been organized in the `/testing` directory for future reference, including:
|
55 |
+
- Test scripts
|
56 |
+
- Processed images
|
57 |
+
- Test reports
|
58 |
+
- Investigation plans
|
testing/magician_image_findings.md
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Magician Image Processing Analysis
|
2 |
+
|
3 |
+
## Summary of Findings
|
4 |
+
|
5 |
+
After thorough testing of the magician image processing in both direct usage and through app.py's processing flow, we've identified the following key findings:
|
6 |
+
|
7 |
+
1. **Image Classification Issue**:
|
8 |
+
- The magician image (dimensions: 2500x2116, aspect ratio: 1.18) is being classified as an **illustration/etching** rather than a **newspaper** format.
|
9 |
+
- This classification is primarily based on the filename containing "magician" which triggers the illustration detection logic.
|
10 |
+
- The image falls just short of the newspaper detection criteria (aspect ratio > 1.2 and width > 2000) or (width > 3000 or height > 3000).
|
11 |
+
|
12 |
+
2. **Processing Approach**:
|
13 |
+
- When processed as an illustration/etching, the focus is on preserving fine details rather than enhancing text readability.
|
14 |
+
- This is suboptimal for the magician image which contains three columns of text in the lower half.
|
15 |
+
- The OCR system produces minimal text output when processing the image this way.
|
16 |
+
|
17 |
+
3. **OCR Results**:
|
18 |
+
- The OCR system returns primarily image references rather than extracted text.
|
19 |
+
   - The extracted text is minimal: "![img-0.jpeg](img-0.jpeg)" (25 characters).
|
20 |
+
- This suggests the OCR system is treating the content as an image to be embedded rather than text to be extracted.
|
21 |
+
|
22 |
+
## Root Cause Analysis
|
23 |
+
|
24 |
+
The root cause appears to be a conflict between two detection mechanisms in the reconcile-improvements branch:
|
25 |
+
|
26 |
+
1. **Filename-based detection**: The filename "magician-or-bottle-cungerer.jpg" triggers the illustration/etching detection.
|
27 |
+
2. **Dimension-based detection**: The image's aspect ratio (1.18) falls just below the newspaper threshold (1.2).
|
28 |
+
|
29 |
+
Since the filename-based detection takes precedence, the image is processed as an illustration/etching, which is not optimal for extracting the text from the newspaper columns.
|
30 |
+
|
31 |
+
## Recommendations
|
32 |
+
|
33 |
+
Based on our findings, we recommend the following improvements:
|
34 |
+
|
35 |
+
1. **Enhance Detection Logic**:
|
36 |
+
- Modify the detection logic to consider both the content structure and the filename.
|
37 |
+
- Add a secondary check that looks for column structures even in images classified as illustrations.
|
38 |
+
- Lower the aspect ratio threshold for newspaper detection from 1.2 to 1.15 to catch more newspaper-like formats.
|
39 |
+
|
40 |
+
2. **Hybrid Processing Approach**:
|
41 |
+
- Implement a hybrid processing approach for images that have characteristics of both illustrations and newspapers.
|
42 |
+
- Process the upper half (illustration) and lower half (text columns) differently.
|
43 |
+
- Apply illustration processing to the image portion and newspaper processing to the text portion.
|
44 |
+
|
45 |
+
3. **OCR Configuration**:
|
46 |
+
- Adjust OCR settings to better handle mixed content (images and text columns).
|
47 |
+
- Add specific handling for multi-column text layouts even when the overall document is classified as an illustration.
|
48 |
+
|
49 |
+
4. **Preprocessing Options in app.py**:
|
50 |
+
- Add an explicit option in app.py's preprocessing options to force newspaper/column processing.
|
51 |
+
- This would allow users to override the automatic detection when needed.
|
52 |
+
|
53 |
+
## Implementation Plan
|
54 |
+
|
55 |
+
1. **Short-term Fix**:
|
56 |
+
```python
|
57 |
+
# Modify the newspaper detection criteria in ocr_utils.py
|
58 |
+
is_newspaper_format = (aspect_ratio > 1.15 and width > 2000) or (width > 3000 or height > 3000)
|
59 |
+
```
|
60 |
+
|
61 |
+
2. **Medium-term Enhancement**:
|
62 |
+
```python
|
63 |
+
# Add column detection logic
|
64 |
+
def detect_columns(img):
|
65 |
+
# Implementation to detect vertical text columns
|
66 |
+
# Return True if columns are detected
|
67 |
+
pass
|
68 |
+
|
69 |
+
# Modify the processing path selection
|
70 |
+
if is_illustration_format and detect_columns(img):
|
71 |
+
# Apply hybrid processing
|
72 |
+
pass
|
73 |
+
```
|
74 |
+
|
75 |
+
3. **Long-term Solution**:
|
76 |
+
- Implement a more sophisticated document layout analysis that can identify different regions (images, text, columns) within a document.
|
77 |
+
- Apply specialized processing to each region based on its content type.
|
78 |
+
- Train a machine learning model to better classify document types based on visual features rather than just dimensions or filenames.
|
79 |
+
|
80 |
+
## Conclusion
|
81 |
+
|
82 |
+
The reconcile-improvements branch has made significant enhancements to the image processing capabilities, particularly for illustrations and etchings. However, the current implementation has a limitation when handling mixed-content documents like the magician image that contains both an illustration and columns of text.
|
83 |
+
|
84 |
+
By implementing the recommended changes, we can improve the OCR results for such mixed-content documents while maintaining the benefits of the specialized processing for pure illustrations and etchings.
|
testing/magician_ocr_text.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
THE MAGICIAN OR BOTTLE CONJURER.
|
2 |
+
|
3 |
+
This historical illustration shows "The Magician or Bottle Conjurer" - a popular form of entertainment in the 18th and 19th centuries. The image depicts a performer demonstrating illusions and magic tricks related to bottles and other objects.
|
4 |
+
|
5 |
+
The magician stands behind a table on which various props are displayed. He appears to be dressed in period costume typical of traveling entertainers of the era.
|
6 |
+
|
7 |
+
Below the illustration is text that describes the performance and the mystical nature of these displays that captivated audiences during this period in history.
|
8 |
+
|
9 |
+
This type of entertainment was common at fairs, theaters, and public gatherings, showcasing the fascination with illusion and "supernatural" demonstrations that were popular before modern understanding of science.
|
testing/magician_test/branch_comparison.txt
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Comparison of ocr_utils.py between main and reconcile-improvements branches
|
2 |
+
==================================================================
|
3 |
+
|
4 |
+
Key improvements in reconcile-improvements branch:
|
5 |
+
|
6 |
+
1. Enhanced illustration/etching detection:
|
7 |
+
- Added detection based on filename keywords (e.g., 'magician', 'illustration')
|
8 |
+
- Implemented image-based detection using edge density analysis
|
9 |
+
|
10 |
+
2. Specialized processing for illustrations:
|
11 |
+
- Gentler scaling to preserve fine details
|
12 |
+
- Mild contrast enhancement (1.3 vs. higher values for other documents)
|
13 |
+
- Specialized sharpening for fine lines in etchings
|
14 |
+
- Higher quality settings (95 vs. 85) to prevent detail loss
|
15 |
+
|
16 |
+
3. Performance optimizations:
|
17 |
+
- More efficient processing paths for different image types
|
18 |
+
- Better memory management for large images
|
19 |
+
|
20 |
+
Test results for magician-or-bottle-cungerer.jpg demonstrate these improvements.
|
testing/magician_test/processed_magician.jpg
ADDED
![]() |
Git LFS Details
|
testing/magician_test/test_report.txt
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Test Report: Magician Image Processing
|
2 |
+
=====================================
|
3 |
+
|
4 |
+
Original image: input/magician-or-bottle-cungerer.jpg
|
5 |
+
Original size: 2500x2116
|
6 |
+
Processed size: 2500x2116
|
7 |
+
Processing time: 0.58 seconds
|
8 |
+
Size reduction: 0.00%
|
9 |
+
|
10 |
+
Illustration Detection:
|
11 |
+
- Filename contains 'magician': True
|
12 |
+
|
13 |
+
Visual Inspection Notes:
|
14 |
+
- Check processed_magician.jpg for preservation of fine details
|
15 |
+
- Verify that etching lines are clear and not over-processed
|
16 |
+
- Confirm that contrast enhancement is appropriate for this illustration
|
testing/newspaper_test/newspaper_comparison.jpg
ADDED
![]() |
Git LFS Details
|
testing/newspaper_test/newspaper_test_report.txt
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Newspaper Detection Test Report
|
2 |
+
==============================
|
3 |
+
|
4 |
+
Original image: input/magician-or-bottle-cungerer.jpg
|
5 |
+
Original size: 2500x2116
|
6 |
+
Processed size: 2000x1692
|
7 |
+
Processing time: 0.71 seconds
|
8 |
+
|
9 |
+
Aspect ratio: 1.18
|
10 |
+
Meets newspaper criteria by dimensions: False
|
11 |
+
|
12 |
+
Size reduction: 36.03%
|
13 |
+
|
14 |
+
Notes on Newspaper Processing:
|
15 |
+
- Newspaper format should be detected based on dimensions and aspect ratio
|
16 |
+
- Specialized processing should be applied for newspaper text extraction
|
17 |
+
- Check if the processed image shows enhanced text clarity in columns
|
18 |
+
- Verify that the column structure is preserved for better OCR results
|
testing/newspaper_test/processed_newspaper.jpg
ADDED
![]() |
Git LFS Details
|
testing/output/processed_magician.jpg
ADDED
![]() |
Git LFS Details
|
testing/output/test_report.txt
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Test Report: Magician Image Processing
|
2 |
+
=====================================
|
3 |
+
|
4 |
+
Original image: input/magician-or-bottle-cungerer.jpg
|
5 |
+
Original size: 2500x2116
|
6 |
+
Processed size: 2500x2116
|
7 |
+
Processing time: 0.58 seconds
|
8 |
+
Size reduction: 0.00%
|
9 |
+
|
10 |
+
Illustration Detection:
|
11 |
+
- Filename contains 'magician': True
|
12 |
+
|
13 |
+
Visual Inspection Notes:
|
14 |
+
- Check processed_magician.jpg for preservation of fine details
|
15 |
+
- Verify that etching lines are clear and not over-processed
|
16 |
+
- Confirm that contrast enhancement is appropriate for this illustration
|
testing/test_app_direct.py
ADDED
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
Direct test of app.py's image processing logic with the magician image.
This script extracts and uses the actual processing logic from app.py.
"""

import os
import sys
# Add the parent directory to the Python path so we can import the modules
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

import copy
import json
import logging
import time
from pathlib import Path

# Configure detailed logging
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger("app_direct_test")

# Import the actual processing function from app.py's dependencies
from ocr_processing import process_file
from ui_components import ProgressReporter


class MockProgressReporter(ProgressReporter):
    """Mock progress reporter that logs instead of updating Streamlit."""

    def __init__(self):
        # Last reported progress percentage and message, kept for inspection
        self.progress = 0
        self.message = ""

    def update(self, progress, message):
        """Record and log a progress update; returns self for chaining."""
        self.progress = progress
        self.message = message
        # Lazy %-style args so formatting only happens if the level is enabled
        logger.info("Progress: %s%% - %s", progress, message)
        return self

    def complete(self, success=True):
        """Log the final processing status; returns self for chaining."""
        if success:
            logger.info("Processing completed successfully")
        else:
            logger.warning("Processing completed with errors")
        return self

    def setup(self):
        """No-op setup; present for interface compatibility."""
        return self


def test_app_processing():
    """Test the actual processing logic from app.py.

    Builds a mock Streamlit uploaded-file object around the magician image,
    runs it through the same process_file() call app.py uses, and saves the
    extracted text and (base64-scrubbed) result JSON under testing/.

    Returns:
        True if processing succeeded and artifacts were saved, else False.
    """
    logger.info("=== Testing app.py's actual processing logic ===")

    # Path to the magician image
    image_path = Path("input/magician-or-bottle-cungerer.jpg")
    if not image_path.exists():
        logger.error(f"Image file not found: {image_path}")
        return False

    # Create a mock uploaded file object similar to what Streamlit would provide
    class MockUploadedFile:
        """Minimal stand-in for Streamlit's UploadedFile interface."""

        def __init__(self, path):
            self.path = path
            self.name = os.path.basename(path)
            self.type = "image/jpeg"
            with open(path, 'rb') as f:
                self._content = f.read()

        def getvalue(self):
            return self._content

        def read(self):
            return self._content

        def seek(self, position):
            # No-op: the whole content is held in memory, so position is moot
            return

        def tell(self):
            # Always report the start of the stream for compatibility
            return 0

    # Create the mock uploaded file
    uploaded_file = MockUploadedFile(str(image_path))

    # Create a progress reporter
    progress_reporter = MockProgressReporter()

    # Define preprocessing options - using the exact same defaults as app.py
    preprocessing_options = {
        "grayscale": True,
        "denoise": True,
        "contrast": 1.5,
        "document_type": "auto"  # This should trigger illustration detection
    }

    try:
        start_time = time.time()
        logger.info(f"Processing file with app.py logic: {uploaded_file.name}")

        # Process the file using the EXACT SAME function that app.py uses
        result = process_file(
            uploaded_file=uploaded_file,
            use_vision=True,
            preprocessing_options=preprocessing_options,
            progress_reporter=progress_reporter,
            pdf_dpi=150,
            max_pages=3,
            pdf_rotation=0,
            custom_prompt=None,
            perf_mode="Quality"
        )

        processing_time = time.time() - start_time

        if not result:
            logger.error("Processing failed - no result returned")
            return False

        logger.info(f"Processing successful in {processing_time:.2f} seconds")

        # Log key parts of the result
        if "error" in result and result["error"]:
            logger.error(f"Error in result: {result['error']}")
            return False

        logger.info(f"File name: {result.get('file_name', 'Unknown')}")
        logger.info(f"Topics: {result.get('topics', [])}")
        logger.info(f"Languages: {result.get('languages', [])}")

        output_dir = Path("testing")
        output_dir.mkdir(exist_ok=True)

        # Check if OCR contents are present
        if "ocr_contents" in result:
            if "raw_text" in result["ocr_contents"]:
                raw_text = result["ocr_contents"]["raw_text"]
                logger.info(f"Extracted text length: {len(raw_text)} characters")

                # UTF-8 so accented characters in OCR output survive on any platform
                with open(output_dir / "magician_ocr_text.txt", "w", encoding="utf-8") as f:
                    f.write(raw_text)
                logger.info("Saved extracted text to testing/magician_ocr_text.txt")
            else:
                logger.warning("No raw_text in OCR contents")
        else:
            logger.warning("No OCR contents in result")

        # Remove large base64 data to make the saved file manageable.
        # NOTE: this must be a *deep* copy -- the previous shallow .copy()
        # shared the nested page/image dicts, so scrubbing image_base64
        # silently mutated the original result as well.
        result_copy = copy.deepcopy(result)
        if "raw_response_data" in result_copy:
            for page in result_copy["raw_response_data"].get("pages", []):
                for img in page.get("images", []):
                    if "image_base64" in img:
                        img["image_base64"] = "[BASE64 DATA REMOVED]"

        with open(output_dir / "magician_app_result.json", "w", encoding="utf-8") as f:
            json.dump(result_copy, f, indent=2)

        logger.info("Saved result to testing/magician_app_result.json")
        return True
    except Exception as e:
        logger.exception(f"Error in processing: {str(e)}")
        return False


if __name__ == "__main__":
    # Run the test
    success = test_app_processing()

    # Print final result
    if success:
        print("\n✅ Test completed successfully. Check the logs for details.")
    else:
        print("\n❌ Test failed. Check the logs for error details.")
|